Installing packages and loading libraries
#install.packages("devtools")
library(devtools)
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
biocLite("qvalue")
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
## Installing package(s) 'qvalue'
##
## The downloaded binary packages are in
## /var/folders/s6/w7hjkkmd6zg9rt0152bhx8br0000gn/T//Rtmpmuf4LI/downloaded_packages
#install_github("whitelock/OutFLANK")
# load libraries
library(OutFLANK)
## Loading required package: qvalue
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.4.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.0.1 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
Loading in data
taking geno file and transposed
#ssw.geno.in<-read.fwf("data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf.geno",width=rep(1,24))
#has to be geno format
ssw.geno.in<-read.fwf("../data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf.geno",width=rep(1,24))
dim(ssw.geno.in)
## [1] 5317 24
ssw.geno<-t(ssw.geno.in)
dim(ssw.geno)
## [1] 24 5317
#read in meta data
#ssw_meta<-read.table("data/2017-04-03_Karl_outliertests/ssw_healthloc.txt",T)
ssw_meta<-read.table("../data/2017-04-03_Karl_outliertests/ssw_healthloc.txt",T)
# reorder the meta data by individual by numerical value
ssw_meta2<-ssw_meta[order(ssw_meta$Individual),]
ssw_meta2$Trajectory
## [1] SS SS HS HS HH SS HS HS HS SS SS HH SS HH SS SS HH HH HH HH HH SS MM
## [24] MM
## Levels: HH HS MM SS
#ssw_meta3<-droplevels(subset(ssw_meta2,ssw_meta2$Trajectory!="MM"))
ssw_meta2$Trajectory[which(ssw_meta2$Trajectory=="MM")] = NA
ssw_meta2
## Individual Trajectory Location SNPs
## 9 3 SS INT Y
## 10 7 SS INT Y
## 4 8 HS INT Y
## 5 9 HS INT Y
## 1 10 HH INT Y
## 11 14 SS INT Y
## 6 15 HS INT Y
## 7 19 HS INT Y
## 8 20 HS INT Y
## 12 22 SS INT Y
## 13 23 SS INT Y
## 2 24 HH INT Y
## 14 26 SS INT Y
## 3 27 HH INT Y
## 15 28 SS INT Y
## 16 29 SS INT Y
## 17 31 HH SUB Y
## 18 32 HH SUB Y
## 19 33 HH SUB Y
## 20 34 HH SUB Y
## 21 35 HH SUB Y
## 22 36 SS SUB Y
## 23 37 <NA> SUB Y
## 24 38 <NA> SUB Y
# now we can use outflank
OF_SNPs<-MakeDiploidFSTMat(ssw.geno,locusNames=seq(1,dim(ssw.geno)[2],1),popNames=ssw_meta2$Trajectory)
## Calculating FSTs, may take a few minutes...
head(OF_SNPs)
## LocusName He FST T1 T2 FSTNoCorr
## 1 1 0.05401235 0.000000000 0.0000000000 0.02777778 0.08404669
## 2 2 0.09972299 -0.019001996 -0.0009666314 0.05086999 0.06168156
## 3 3 0.05124654 0.007530054 0.0001986229 0.02637735 0.08688006
## 4 4 0.05124654 0.007530054 0.0001986229 0.02637735 0.08688006
## 5 5 0.05124654 0.007530054 0.0001986229 0.02637735 0.08688006
## 6 6 0.15277778 -0.017110266 -0.0013351135 0.07802997 0.06411525
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 0.002336449 0.02779941 0.02777778
## 2 0.003142655 0.05094967 0.05263158
## 3 0.002295198 0.02641800 0.02631579
## 4 0.002295198 0.02641800 0.02631579
## 5 0.002295198 0.02641800 0.02631579
## 6 0.005006676 0.07808869 0.08333333
#let's llok at fst distribution
hist(OF_SNPs$FST)
OF_out<-OutFLANK(FstDataFrame = OF_SNPs,NumberOfSamples = 3,qthreshold=0.1)
str(OF_out)
## List of 6
## $ FSTbar : num 0.00309
## $ FSTNoCorrbar : num 0.0789
## $ dfInferred : num 4.91
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 7
## $ results :'data.frame': 5317 obs. of 15 variables:
## ..$ LocusName : num [1:5317] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ He : num [1:5317] 0.054 0.0997 0.0512 0.0512 0.0512 ...
## ..$ FST : num [1:5317] 0 -0.019 0.00753 0.00753 0.00753 ...
## ..$ T1 : num [1:5317] 0 -0.000967 0.000199 0.000199 0.000199 ...
## ..$ T2 : num [1:5317] 0.0278 0.0509 0.0264 0.0264 0.0264 ...
## ..$ FSTNoCorr : num [1:5317] 0.084 0.0617 0.0869 0.0869 0.0869 ...
## ..$ T1NoCorr : num [1:5317] 0.00234 0.00314 0.0023 0.0023 0.0023 ...
## ..$ T2NoCorr : num [1:5317] 0.0278 0.0509 0.0264 0.0264 0.0264 ...
## ..$ meanAlleleFreq : num [1:5317] 0.0278 0.0526 0.0263 0.0263 0.0263 ...
## ..$ indexOrder : int [1:5317] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : Factor w/ 2 levels "goodH","lowH": 2 2 2 2 2 1 1 2 2 1 ...
## ..$ qvalues : num [1:5317] 0.548 0.548 0.548 0.548 0.548 ...
## ..$ pvalues : num [1:5317] 0.754 0.879 0.715 0.715 0.715 ...
## ..$ pvaluesRightTail: num [1:5317] 0.377 0.561 0.357 0.357 0.357 ...
## ..$ OutlierFlag : logi [1:5317] FALSE FALSE FALSE FALSE FALSE FALSE ...
OutFLANKResultsPlotter(OF_out,withOutliers =T,NoCorr=T,Hmin=.1,binwidth=0.005)
outliers<-which(OF_out$results$OutlierFlag=="TRUE")
outliers
## [1] 1223 1452 3067 3068 3546 3602 4374
Extract annotations
Goal: Extract info abou the outliers by reading in the cvf file and match outliers with annotations.
#vcf1<-read.vcfR("data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf")
vcf1<-read.vcfR("../data/2017-04-03_Karl_outliertests/SSW_all_biallelic.MAF0.02.Miss0.8.recode.vcf") # read in vcf file
##
Meta line 8 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5317
## Character matrix gt cols: 33
## skip: 0
## nrows: 5317
## row_num: 0
##
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5317
## All variants processed
vcfann<-as.data.frame(getFIX(vcf1)) # converting to data frame
vcfann[outliers,] # matching the rows with outlier row IDs
## CHROM POS
## 1223 TRINITY_DN46509_c0_g1_TRINITY_DN46509_c0_g1_i1_g.22498_m.22498 1036
## 1452 TRINITY_DN46269_c0_g1_TRINITY_DN46269_c0_g1_i1_g.21774_m.21774 1242
## 3067 TRINITY_DN46834_c0_g2_TRINITY_DN46834_c0_g2_i3_g.23511_m.23511 408
## 3068 TRINITY_DN46834_c0_g2_TRINITY_DN46834_c0_g2_i3_g.23511_m.23511 411
## 3546 TRINITY_DN43783_c3_g1_TRINITY_DN43783_c3_g1_i6_g.15491_m.15491 110
## 3602 TRINITY_DN42222_c4_g2_TRINITY_DN42222_c4_g2_i1_g.12455_m.12455 61
## 4374 TRINITY_DN44735_c6_g1_TRINITY_DN44735_c6_g1_i3_g.17655_m.17655 80
## ID REF ALT QUAL FILTER
## 1223 <NA> G A <NA> PASS
## 1452 <NA> C T <NA> PASS
## 3067 <NA> A G <NA> PASS
## 3068 <NA> G A <NA> PASS
## 3546 <NA> T C <NA> PASS
## 3602 <NA> C T <NA> PASS
## 4374 <NA> C G <NA> PASS
What’s next
- Now that you have the ID’s of the transcripts, locate them in the “.cds” file on the server
- BLAST search it get an annotation (what the gene does)