Loading in dataset

library(data.table)
H_all<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/H_AlleleFreqs.frq?token=AEcFit8v6R38egNl4eIPZSKjoL-K7OS9ks5Y2Sg4wA%3D%3D",header=TRUE)

S_all<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/S_AlleleFreqs.frq?token=AEcFiuViD3MRkBng1Ks8TpfhwPtvQm5Uks5Y2ShSwA%3D%3D",header=TRUE)

fst<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst?token=AEcFirmZh4UWmNCQJ0tWCqqicvq4F298ks5Y2SbJwA%3D%3D")

Look at the data

str(H_all)
## Classes 'data.table' and 'data.frame':   5317 obs. of  6 variables:
##  $ CHROM    : chr  "TRINITY_DN47185_c0_g1_TRINITY_DN47185_c0_g1_i2_g.24943_m.24943" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" ...
##  $ POS      : int  7456 5715 5723 5726 5728 6313 6354 6396 6481 6647 ...
##  $ N_ALLELES: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ N_CHR    : int  14 14 14 14 14 10 8 12 14 10 ...
##  $ H_REF    : num  0.929 0.929 0.929 0.929 0.929 ...
##  $ H_ALT    : num  0.0714 0.0714 0.0714 0.0714 0.0714 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(S_all)
## Classes 'data.table' and 'data.frame':   5317 obs. of  6 variables:
##  $ CHROM    : chr  "TRINITY_DN47185_c0_g1_TRINITY_DN47185_c0_g1_i2_g.24943_m.24943" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" ...
##  $ POS      : int  7456 5715 5723 5726 5728 6313 6354 6396 6481 6647 ...
##  $ N_ALLELES: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ N_CHR    : int  22 24 24 24 24 26 28 26 24 26 ...
##  $ S_REF    : num  1 0.958 1 1 1 ...
##  $ S_HALT   : num  0 0.0417 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(fst)
## Classes 'data.table' and 'data.frame':   1513 obs. of  3 variables:
##  $ CHROM                 : chr  "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" ...
##  $ POS                   : int  4733 5850 5865 5869 5874 6096 6146 6201 6289 6325 ...
##  $ WEIR_AND_COCKERHAM_FST: num  -0.0224 -0.0207 -0.0224 -0.0224 -0.0224 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Data manipulations

# Since these files have identical numbers of SNPs in the exact same order, we can concatenate them together into one large dataframe:
All_freq <- merge(H_all, S_all, by=c("CHROM", "POS"))

# Check the results of your merge to make sure things look OK
str(All_freq) # shows the structure of the data
## Classes 'data.table' and 'data.frame':   5317 obs. of  10 variables:
##  $ CHROM      : chr  "TRINITY_DN15785_c0_g1_TRINITY_DN15785_c0_g1_i1_g.1357_m.1357" "TRINITY_DN27892_c0_g1_TRINITY_DN27892_c0_g1_i1_g.3123_m.3123" "TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467" "TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467" ...
##  $ POS        : int  174 41 282 300 210 60 324 387 388 389 ...
##  $ N_ALLELES.x: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ N_CHR.x    : int  16 16 16 16 16 16 16 14 16 16 ...
##  $ H_REF      : num  0.875 1 1 0.938 1 ...
##  $ H_ALT      : num  0.125 0 0 0.0625 0 ...
##  $ N_ALLELES.y: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ N_CHR.y    : int  22 28 24 24 28 28 24 26 26 26 ...
##  $ S_REF      : num  0.909 1 0.958 0.917 0.964 ...
##  $ S_HALT     : num  0.0909 0 0.0417 0.0833 0.0357 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "CHROM" "POS"
head(All_freq)
##                                                           CHROM POS
## 1: TRINITY_DN15785_c0_g1_TRINITY_DN15785_c0_g1_i1_g.1357_m.1357 174
## 2: TRINITY_DN27892_c0_g1_TRINITY_DN27892_c0_g1_i1_g.3123_m.3123  41
## 3: TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467 282
## 4: TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467 300
## 5: TRINITY_DN35598_c0_g1_TRINITY_DN35598_c0_g1_i1_g.5802_m.5802 210
## 6: TRINITY_DN35845_c0_g1_TRINITY_DN35845_c0_g1_i1_g.5934_m.5934  60
##    N_ALLELES.x N_CHR.x  H_REF  H_ALT N_ALLELES.y N_CHR.y    S_REF
## 1:           2      16 0.8750 0.1250           2      22 0.909091
## 2:           2      16 1.0000 0.0000           2      28 1.000000
## 3:           2      16 1.0000 0.0000           2      24 0.958333
## 4:           2      16 0.9375 0.0625           2      24 0.916667
## 5:           2      16 1.0000 0.0000           2      28 0.964286
## 6:           2      16 0.9375 0.0625           2      28 0.928571
##       S_HALT
## 1: 0.0909091
## 2: 0.0000000
## 3: 0.0416667
## 4: 0.0833333
## 5: 0.0357143
## 6: 0.0714286
# Looks good, now let's calculate the difference in minor allele frequency at each SNP and plot as a histogram
All_freq$diff <- (All_freq$H_ALT - All_freq$S_HALT)

hist(All_freq$diff, breaks=50, col="red", main="Allele frequency difference (H-S)")

#hist(All_freq$diff)

Fst

# Looks like most loci show little difference (i.e., likely drift), but perhaps a few show very large differences between healthy and sick (drift or selection?)

# How do these highly divergent frequenices compare to Fst at the same SNPs?
fst <- read.table("../data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst", header=T)

#fst <- read.table("/data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst", header=T)

All_freq.fst <- merge(All_freq, fst, by=c("CHROM", "POS"))

plot(All_freq.fst$diff, All_freq.fst$WEIR_AND_COCKERHAM_FST, xlab="Allele frequency difference (H-S)", ylab="Fst", main="Healthy vs. Sick SNP divergence")

# Which are the genes that are showing the highest divergence between Healthy and Sick?
All_freq.fst[which(All_freq.fst$WEIR_AND_COCKERHAM_FST>0.2),]
##                                                              CHROM  POS
##  1:   TRINITY_DN39079_c3_g1_TRINITY_DN39079_c3_g1_i1_g.8354_m.8354  279
##  2: TRINITY_DN42225_c1_g1_TRINITY_DN42225_c1_g1_i1_g.12458_m.12458  255
##  3: TRINITY_DN42856_c0_g1_TRINITY_DN42856_c0_g1_i4_g.13698_m.13698  138
##  4: TRINITY_DN44346_c2_g2_TRINITY_DN44346_c2_g2_i8_g.16772_m.16772  479
##  5: TRINITY_DN45174_c0_g1_TRINITY_DN45174_c0_g1_i2_g.18771_m.18771  348
##  6: TRINITY_DN45186_c3_g1_TRINITY_DN45186_c3_g1_i3_g.18787_m.18787 1754
##  7: TRINITY_DN45559_c4_g3_TRINITY_DN45559_c4_g3_i2_g.19710_m.19710  210
##  8: TRINITY_DN46297_c0_g1_TRINITY_DN46297_c0_g1_i2_g.21856_m.21856  101
##  9: TRINITY_DN46874_c5_g2_TRINITY_DN46874_c5_g2_i2_g.23776_m.23776 1308
## 10: TRINITY_DN46874_c5_g2_TRINITY_DN46874_c5_g2_i2_g.23776_m.23776 1722
## 11: TRINITY_DN47139_c0_g1_TRINITY_DN47139_c0_g1_i1_g.24693_m.24693   96
##     N_ALLELES.x N_CHR.x  H_REF  H_ALT N_ALLELES.y N_CHR.y    S_REF
##  1:           2      16 0.7500 0.2500           2      28 1.000000
##  2:           2      16 0.7500 0.2500           2      28 1.000000
##  3:           2      16 0.6875 0.3125           2      28 0.964286
##  4:           2      16 0.8125 0.1875           2      28 1.000000
##  5:           2      16 0.6250 0.3750           2      28 0.928571
##  6:           2      16 0.6875 0.3125           2      28 1.000000
##  7:           2      16 0.5000 0.5000           2      28 0.892857
##  8:           2      16 0.7500 0.2500           2      28 1.000000
##  9:           2      16 0.8125 0.1875           2      28 1.000000
## 10:           2      16 0.8125 0.1875           2      28 1.000000
## 11:           2      16 0.8125 0.1875           2      28 1.000000
##        S_HALT      diff WEIR_AND_COCKERHAM_FST
##  1: 0.0000000 0.2500000               0.241535
##  2: 0.0000000 0.2500000               0.241535
##  3: 0.0357143 0.2767857               0.234998
##  4: 0.0000000 0.1875000               0.209825
##  5: 0.0714286 0.3035714               0.229294
##  6: 0.0000000 0.3125000               0.366978
##  7: 0.1071430 0.3928570               0.303730
##  8: 0.0000000 0.2500000               0.290125
##  9: 0.0000000 0.1875000               0.209825
## 10: 0.0000000 0.1875000               0.209825
## 11: 0.0000000 0.1875000               0.209825

Romiguier paper

#rom<-read.csv("../data/2017-03-20_popgenomics_part3/Romiguier_nature13685-s3.csv")
rom<-read.csv("data/2017-03-20_popgenomics_part3/Romiguier_nature13685-s3.csv")

plot(log(rom$piS),log(rom$piNpiS),pch=20,col="blue",xlab="log synonymous nucleotide diversity (piS)",ylab="log Ratio of Nonsynonymous to Synonymous Diversity (piN/piS)")
points(log(0.00585312),log(0.264041),pch=17,cex=3,col="red",bg="red")

reg<-lm(log(rom$piNpiS)~log(rom$piS))
abline(reg)

Just taking out echinoderms

echino <- rom[which(rom$Phylum=="Echinodermata"),] # subsets the data
points(log(echino$piS), log(echino$piNpiS), pch=21, bg="red") # adds the points

Calculate Ne

\(\pi_s = 4 * Ne * \mu\)

\(\mu\) is 4x10^ -9 rearrange and solve for Ne

\(Ne = \frac{\pi_s}{4\mu}\)

Ne!!

0.00585312/(4* 4E-9)
## [1] 365820