Installing packages and loading libraries

#install.packages("devtools")
library(devtools)
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
biocLite("qvalue")
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
## Installing package(s) 'qvalue'
## 
## The downloaded binary packages are in
##  /var/folders/s6/w7hjkkmd6zg9rt0152bhx8br0000gn/T//Rtmpmuf4LI/downloaded_packages
#install_github("whitelock/OutFLANK")

# load libraries
library(OutFLANK)
## Loading required package: qvalue
library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.4.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(adegenet)
## Loading required package: ade4
## 
##    /// adegenet 2.0.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

Loading in data

taking geno file and transposed

#ssw.geno.in<-read.fwf("data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf.geno",width=rep(1,24))

#has to be geno format
ssw.geno.in<-read.fwf("../data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf.geno",width=rep(1,24))
dim(ssw.geno.in)
## [1] 5317   24
ssw.geno<-t(ssw.geno.in)
dim(ssw.geno)
## [1]   24 5317
#read in meta data
#ssw_meta<-read.table("data/2017-04-03_Karl_outliertests/ssw_healthloc.txt",T)
ssw_meta<-read.table("../data/2017-04-03_Karl_outliertests/ssw_healthloc.txt",T)

# reorder the meta data by individual by numerical value
ssw_meta2<-ssw_meta[order(ssw_meta$Individual),]

ssw_meta2$Trajectory
##  [1] SS SS HS HS HH SS HS HS HS SS SS HH SS HH SS SS HH HH HH HH HH SS MM
## [24] MM
## Levels: HH HS MM SS
#ssw_meta3<-droplevels(subset(ssw_meta2,ssw_meta2$Trajectory!="MM"))
ssw_meta2$Trajectory[which(ssw_meta2$Trajectory=="MM")] = NA
ssw_meta2
##    Individual Trajectory Location SNPs
## 9           3         SS      INT    Y
## 10          7         SS      INT    Y
## 4           8         HS      INT    Y
## 5           9         HS      INT    Y
## 1          10         HH      INT    Y
## 11         14         SS      INT    Y
## 6          15         HS      INT    Y
## 7          19         HS      INT    Y
## 8          20         HS      INT    Y
## 12         22         SS      INT    Y
## 13         23         SS      INT    Y
## 2          24         HH      INT    Y
## 14         26         SS      INT    Y
## 3          27         HH      INT    Y
## 15         28         SS      INT    Y
## 16         29         SS      INT    Y
## 17         31         HH      SUB    Y
## 18         32         HH      SUB    Y
## 19         33         HH      SUB    Y
## 20         34         HH      SUB    Y
## 21         35         HH      SUB    Y
## 22         36         SS      SUB    Y
## 23         37       <NA>      SUB    Y
## 24         38       <NA>      SUB    Y
# now we can use outflank
OF_SNPs<-MakeDiploidFSTMat(ssw.geno,locusNames=seq(1,dim(ssw.geno)[2],1),popNames=ssw_meta2$Trajectory)
## Calculating FSTs, may take a few minutes...
head(OF_SNPs)
##   LocusName         He          FST            T1         T2  FSTNoCorr
## 1         1 0.05401235  0.000000000  0.0000000000 0.02777778 0.08404669
## 2         2 0.09972299 -0.019001996 -0.0009666314 0.05086999 0.06168156
## 3         3 0.05124654  0.007530054  0.0001986229 0.02637735 0.08688006
## 4         4 0.05124654  0.007530054  0.0001986229 0.02637735 0.08688006
## 5         5 0.05124654  0.007530054  0.0001986229 0.02637735 0.08688006
## 6         6 0.15277778 -0.017110266 -0.0013351135 0.07802997 0.06411525
##      T1NoCorr   T2NoCorr meanAlleleFreq
## 1 0.002336449 0.02779941     0.02777778
## 2 0.003142655 0.05094967     0.05263158
## 3 0.002295198 0.02641800     0.02631579
## 4 0.002295198 0.02641800     0.02631579
## 5 0.002295198 0.02641800     0.02631579
## 6 0.005006676 0.07808869     0.08333333
#let's llok at fst distribution
hist(OF_SNPs$FST)

OF_out<-OutFLANK(FstDataFrame = OF_SNPs,NumberOfSamples = 3,qthreshold=0.1)
str(OF_out)  
## List of 6
##  $ FSTbar               : num 0.00309
##  $ FSTNoCorrbar         : num 0.0789
##  $ dfInferred           : num 4.91
##  $ numberLowFstOutliers : int 0
##  $ numberHighFstOutliers: int 7
##  $ results              :'data.frame':   5317 obs. of  15 variables:
##   ..$ LocusName       : num [1:5317] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ He              : num [1:5317] 0.054 0.0997 0.0512 0.0512 0.0512 ...
##   ..$ FST             : num [1:5317] 0 -0.019 0.00753 0.00753 0.00753 ...
##   ..$ T1              : num [1:5317] 0 -0.000967 0.000199 0.000199 0.000199 ...
##   ..$ T2              : num [1:5317] 0.0278 0.0509 0.0264 0.0264 0.0264 ...
##   ..$ FSTNoCorr       : num [1:5317] 0.084 0.0617 0.0869 0.0869 0.0869 ...
##   ..$ T1NoCorr        : num [1:5317] 0.00234 0.00314 0.0023 0.0023 0.0023 ...
##   ..$ T2NoCorr        : num [1:5317] 0.0278 0.0509 0.0264 0.0264 0.0264 ...
##   ..$ meanAlleleFreq  : num [1:5317] 0.0278 0.0526 0.0263 0.0263 0.0263 ...
##   ..$ indexOrder      : int [1:5317] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ GoodH           : Factor w/ 2 levels "goodH","lowH": 2 2 2 2 2 1 1 2 2 1 ...
##   ..$ qvalues         : num [1:5317] 0.548 0.548 0.548 0.548 0.548 ...
##   ..$ pvalues         : num [1:5317] 0.754 0.879 0.715 0.715 0.715 ...
##   ..$ pvaluesRightTail: num [1:5317] 0.377 0.561 0.357 0.357 0.357 ...
##   ..$ OutlierFlag     : logi [1:5317] FALSE FALSE FALSE FALSE FALSE FALSE ...
OutFLANKResultsPlotter(OF_out,withOutliers =T,NoCorr=T,Hmin=.1,binwidth=0.005)

outliers<-which(OF_out$results$OutlierFlag=="TRUE")
outliers
## [1] 1223 1452 3067 3068 3546 3602 4374

Extract annotations

Goal: Extract info abou the outliers by reading in the cvf file and match outliers with annotations.

#vcf1<-read.vcfR("data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf")
vcf1<-read.vcfR("../data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf") # read in vcf file
## 
Meta line 8 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5317
## Character matrix gt cols: 33
## skip: 0
## nrows: 5317
## row_num: 0
## 
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5317
## All variants processed
vcfann<-as.data.frame(getFIX(vcf1)) # converting to data frame
vcfann[outliers,] # matching the rows with outlier row IDs
##                                                               CHROM  POS
## 1223 TRINITY_DN46509_c0_g1_TRINITY_DN46509_c0_g1_i1_g.22498_m.22498 1036
## 1452 TRINITY_DN46269_c0_g1_TRINITY_DN46269_c0_g1_i1_g.21774_m.21774 1242
## 3067 TRINITY_DN46834_c0_g2_TRINITY_DN46834_c0_g2_i3_g.23511_m.23511  408
## 3068 TRINITY_DN46834_c0_g2_TRINITY_DN46834_c0_g2_i3_g.23511_m.23511  411
## 3546 TRINITY_DN43783_c3_g1_TRINITY_DN43783_c3_g1_i6_g.15491_m.15491  110
## 3602 TRINITY_DN42222_c4_g2_TRINITY_DN42222_c4_g2_i1_g.12455_m.12455   61
## 4374 TRINITY_DN44735_c6_g1_TRINITY_DN44735_c6_g1_i3_g.17655_m.17655   80
##        ID REF ALT QUAL FILTER
## 1223 <NA>   G   A <NA>   PASS
## 1452 <NA>   C   T <NA>   PASS
## 3067 <NA>   A   G <NA>   PASS
## 3068 <NA>   G   A <NA>   PASS
## 3546 <NA>   T   C <NA>   PASS
## 3602 <NA>   C   T <NA>   PASS
## 4374 <NA>   C   G <NA>   PASS

What’s next

  1. Now that you have the ID’s of the transcripts, locate them in the “.cds” file on the server
  2. BLAST search it get an annotation (what the gene does)