Loading in dataset
library(data.table)
H_all<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/H_AlleleFreqs.frq?token=AEcFit8v6R38egNl4eIPZSKjoL-K7OS9ks5Y2Sg4wA%3D%3D",header=TRUE)
S_all<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/S_AlleleFreqs.frq?token=AEcFiuViD3MRkBng1Ks8TpfhwPtvQm5Uks5Y2ShSwA%3D%3D",header=TRUE)
fst<-fread("https://raw.githubusercontent.com/adnguyen/2017_Ecological_Genomics/master/data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst?token=AEcFirmZh4UWmNCQJ0tWCqqicvq4F298ks5Y2SbJwA%3D%3D")
Look at the data
str(H_all)
## Classes 'data.table' and 'data.frame': 5317 obs. of 6 variables:
## $ CHROM : chr "TRINITY_DN47185_c0_g1_TRINITY_DN47185_c0_g1_i2_g.24943_m.24943" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" ...
## $ POS : int 7456 5715 5723 5726 5728 6313 6354 6396 6481 6647 ...
## $ N_ALLELES: int 2 2 2 2 2 2 2 2 2 2 ...
## $ N_CHR : int 14 14 14 14 14 10 8 12 14 10 ...
## $ H_REF : num 0.929 0.929 0.929 0.929 0.929 ...
## $ H_ALT : num 0.0714 0.0714 0.0714 0.0714 0.0714 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(S_all)
## Classes 'data.table' and 'data.frame': 5317 obs. of 6 variables:
## $ CHROM : chr "TRINITY_DN47185_c0_g1_TRINITY_DN47185_c0_g1_i2_g.24943_m.24943" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" "TRINITY_DN47298_c2_g2_TRINITY_DN47298_c2_g2_i5_g.25412_m.25412" ...
## $ POS : int 7456 5715 5723 5726 5728 6313 6354 6396 6481 6647 ...
## $ N_ALLELES: int 2 2 2 2 2 2 2 2 2 2 ...
## $ N_CHR : int 22 24 24 24 24 26 28 26 24 26 ...
## $ S_REF : num 1 0.958 1 1 1 ...
## $ S_HALT : num 0 0.0417 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(fst)
## Classes 'data.table' and 'data.frame': 1513 obs. of 3 variables:
## $ CHROM : chr "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" "TRINITY_DN47302_c3_g2_TRINITY_DN47302_c3_g2_i1_g.25464_m.25464" ...
## $ POS : int 4733 5850 5865 5869 5874 6096 6146 6201 6289 6325 ...
## $ WEIR_AND_COCKERHAM_FST: num -0.0224 -0.0207 -0.0224 -0.0224 -0.0224 ...
## - attr(*, ".internal.selfref")=<externalptr>
Data manipulations
# Since these files have identical numbers of SNPs in the exact same order, we can concatenate them together into one large dataframe:
All_freq <- merge(H_all, S_all, by=c("CHROM", "POS"))
# Check the results of your merge to make sure things look OK
str(All_freq) # shows the structure of the data
## Classes 'data.table' and 'data.frame': 5317 obs. of 10 variables:
## $ CHROM : chr "TRINITY_DN15785_c0_g1_TRINITY_DN15785_c0_g1_i1_g.1357_m.1357" "TRINITY_DN27892_c0_g1_TRINITY_DN27892_c0_g1_i1_g.3123_m.3123" "TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467" "TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467" ...
## $ POS : int 174 41 282 300 210 60 324 387 388 389 ...
## $ N_ALLELES.x: int 2 2 2 2 2 2 2 2 2 2 ...
## $ N_CHR.x : int 16 16 16 16 16 16 16 14 16 16 ...
## $ H_REF : num 0.875 1 1 0.938 1 ...
## $ H_ALT : num 0.125 0 0 0.0625 0 ...
## $ N_ALLELES.y: int 2 2 2 2 2 2 2 2 2 2 ...
## $ N_CHR.y : int 22 28 24 24 28 28 24 26 26 26 ...
## $ S_REF : num 0.909 1 0.958 0.917 0.964 ...
## $ S_HALT : num 0.0909 0 0.0417 0.0833 0.0357 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "CHROM" "POS"
head(All_freq)
## CHROM POS
## 1: TRINITY_DN15785_c0_g1_TRINITY_DN15785_c0_g1_i1_g.1357_m.1357 174
## 2: TRINITY_DN27892_c0_g1_TRINITY_DN27892_c0_g1_i1_g.3123_m.3123 41
## 3: TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467 282
## 4: TRINITY_DN29134_c0_g1_TRINITY_DN29134_c0_g1_i1_g.3467_m.3467 300
## 5: TRINITY_DN35598_c0_g1_TRINITY_DN35598_c0_g1_i1_g.5802_m.5802 210
## 6: TRINITY_DN35845_c0_g1_TRINITY_DN35845_c0_g1_i1_g.5934_m.5934 60
## N_ALLELES.x N_CHR.x H_REF H_ALT N_ALLELES.y N_CHR.y S_REF
## 1: 2 16 0.8750 0.1250 2 22 0.909091
## 2: 2 16 1.0000 0.0000 2 28 1.000000
## 3: 2 16 1.0000 0.0000 2 24 0.958333
## 4: 2 16 0.9375 0.0625 2 24 0.916667
## 5: 2 16 1.0000 0.0000 2 28 0.964286
## 6: 2 16 0.9375 0.0625 2 28 0.928571
## S_HALT
## 1: 0.0909091
## 2: 0.0000000
## 3: 0.0416667
## 4: 0.0833333
## 5: 0.0357143
## 6: 0.0714286
# Looks good, now let's calculate the difference in minor allele frequency at each SNP and plot as a histogram
All_freq$diff <- (All_freq$H_ALT - All_freq$S_HALT)
hist(All_freq$diff, breaks=50, col="red", main="Allele frequency difference (H-S)")
#hist(All_freq$diff)
Fst
# Looks like most loci show little difference (i.e., likely drift), but perhaps a few show very large differences between healthy and sick (drift or selection?)
# How do these highly divergent frequenices compare to Fst at the same SNPs?
fst <- read.table("../data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst", header=T)
#fst <- read.table("/data/2017-03-20_popgenomics_part3/HvS_Fst.weir.fst", header=T)
All_freq.fst <- merge(All_freq, fst, by=c("CHROM", "POS"))
plot(All_freq.fst$diff, All_freq.fst$WEIR_AND_COCKERHAM_FST, xlab="Allele frequency difference (H-S)", ylab="Fst", main="Healthy vs. Sick SNP divergence")
# Which are the genes that are showing the highest divergence between Healthy and Sick?
All_freq.fst[which(All_freq.fst$WEIR_AND_COCKERHAM_FST>0.2),]
## CHROM POS
## 1: TRINITY_DN39079_c3_g1_TRINITY_DN39079_c3_g1_i1_g.8354_m.8354 279
## 2: TRINITY_DN42225_c1_g1_TRINITY_DN42225_c1_g1_i1_g.12458_m.12458 255
## 3: TRINITY_DN42856_c0_g1_TRINITY_DN42856_c0_g1_i4_g.13698_m.13698 138
## 4: TRINITY_DN44346_c2_g2_TRINITY_DN44346_c2_g2_i8_g.16772_m.16772 479
## 5: TRINITY_DN45174_c0_g1_TRINITY_DN45174_c0_g1_i2_g.18771_m.18771 348
## 6: TRINITY_DN45186_c3_g1_TRINITY_DN45186_c3_g1_i3_g.18787_m.18787 1754
## 7: TRINITY_DN45559_c4_g3_TRINITY_DN45559_c4_g3_i2_g.19710_m.19710 210
## 8: TRINITY_DN46297_c0_g1_TRINITY_DN46297_c0_g1_i2_g.21856_m.21856 101
## 9: TRINITY_DN46874_c5_g2_TRINITY_DN46874_c5_g2_i2_g.23776_m.23776 1308
## 10: TRINITY_DN46874_c5_g2_TRINITY_DN46874_c5_g2_i2_g.23776_m.23776 1722
## 11: TRINITY_DN47139_c0_g1_TRINITY_DN47139_c0_g1_i1_g.24693_m.24693 96
## N_ALLELES.x N_CHR.x H_REF H_ALT N_ALLELES.y N_CHR.y S_REF
## 1: 2 16 0.7500 0.2500 2 28 1.000000
## 2: 2 16 0.7500 0.2500 2 28 1.000000
## 3: 2 16 0.6875 0.3125 2 28 0.964286
## 4: 2 16 0.8125 0.1875 2 28 1.000000
## 5: 2 16 0.6250 0.3750 2 28 0.928571
## 6: 2 16 0.6875 0.3125 2 28 1.000000
## 7: 2 16 0.5000 0.5000 2 28 0.892857
## 8: 2 16 0.7500 0.2500 2 28 1.000000
## 9: 2 16 0.8125 0.1875 2 28 1.000000
## 10: 2 16 0.8125 0.1875 2 28 1.000000
## 11: 2 16 0.8125 0.1875 2 28 1.000000
## S_HALT diff WEIR_AND_COCKERHAM_FST
## 1: 0.0000000 0.2500000 0.241535
## 2: 0.0000000 0.2500000 0.241535
## 3: 0.0357143 0.2767857 0.234998
## 4: 0.0000000 0.1875000 0.209825
## 5: 0.0714286 0.3035714 0.229294
## 6: 0.0000000 0.3125000 0.366978
## 7: 0.1071430 0.3928570 0.303730
## 8: 0.0000000 0.2500000 0.290125
## 9: 0.0000000 0.1875000 0.209825
## 10: 0.0000000 0.1875000 0.209825
## 11: 0.0000000 0.1875000 0.209825
Romiguier paper
#rom<-read.csv("../data/2017-03-20_popgenomics_part3/Romiguier_nature13685-s3.csv")
rom<-read.csv("data/2017-03-20_popgenomics_part3/Romiguier_nature13685-s3.csv")
plot(log(rom$piS),log(rom$piNpiS),pch=20,col="blue",xlab="log synonymous nucleotide diversity (piS)",ylab="log Ratio of Nonsynonymous to Synonymous Diversity (piN/piS)")
points(log(0.00585312),log(0.264041),pch=17,cex=3,col="red",bg="red")
reg<-lm(log(rom$piNpiS)~log(rom$piS))
abline(reg)
Just taking out echinoderms
echino <- rom[which(rom$Phylum=="Echinodermata"),] # subsets the data
points(log(echino$piS), log(echino$piNpiS), pch=21, bg="red") # adds the points
Calculate Ne
\(\pi_s = 4 * Ne * \mu\)
\(\mu\) is 4x10^ -9 rearrange and solve for Ne
\(Ne = \frac{\pi_s}{4\mu}\)
Ne!!
0.00585312/(4* 4E-9)
## [1] 365820