In [2]:
### Notebook 5
### Data set 5 (Heliconius)
### Authors: Nadeau et al. (2013)
### Data Location: ERP000991
In [1]:
%%bash
## make a new directory for this analysis
mkdir -p empirical_5/fastq/
In [1]:
## ENA sample accessions included in this analysis
subsamples = ['ERS070268','ERS070269','ERS070270','ERS070271','ERS070272',
              'ERS070273','ERS070274','ERS070275','ERS070276','ERS070277',
              'ERS070246','ERS070247','ERS070248','ERS070249','ERS070245',
              'ERS070252','ERS070253','ERS070254','ERS070256','ERS070255',
              'ERS074398','ERS074399','ERS074400','ERS074401','ERS074402',
              'ERS074403','ERS074404','ERS074405','ERS074406','ERS074407',
              'ERS074408','ERS074409','ERS074410','ERS074411','ERS074412',
              'ERS074413','ERS074414','ERS074415','ERS074416','ERS074417',
              'ERS074418','ERS074419','ERS074420','ERS074421','ERS074422',
              'ERS074423','ERS074424','ERS074425','ERS074426','ERS074427',
              'ERS074428','ERS074429','ERS070250','ERS070251']
len(subsamples)
Out[1]:
54
In [82]:
## IPython code
import pandas as pd
import numpy as np
import urllib2
import os
## open the SRA run table from github url
url = "https://raw.githubusercontent.com/"+\
"dereneaton/RADmissing/master/empirical_9_EraRunTable.txt"
intable = urllib2.urlopen(url)
indata = pd.read_table(intable, sep="\t")
## print first few rows
print indata.head()
In [42]:
def wget_download_ERR(ERR, outdir, outname):
    """ Python function to fetch fastq data for a run accession from
    the EBI ftp site and write it to outdir under a new name, using
    a bash call to wget """
    ## get output name
    output = os.path.join(outdir, outname+".fastq.gz")
    ## create a call string
    call = "wget -q -r -nH --cut-dirs=9 -O "+output+" "+\
           "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/"+\
           "{}/{}/{}_1.fastq.gz".format(ERR[:6], ERR, ERR)
    ## call wget through bash
    ! $call
Here we pass the ERR run accession and the sample name to the wget_download_ERR
function so that the files are saved under their sample names.
In [43]:
for ID, ERS, ERR in zip(indata.scientific_name,
                        indata.secondary_sample_accession,
                        indata.run_accession):
    if ERS in subsamples:
        name = ID.split()[1]
        name += "_"+ERS+"_"+ERR
        print "{:<35}\t{}\t{}".format(ID, ERS, ERR)
        wget_download_ERR(ERR, "empirical_5/fastq/", name)
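As a sanity check, the ENA fastq path can be composed by hand: the first six
characters of the run accession name its parent directory on the ftp site. A
minimal sketch with a hypothetical accession:
In [ ]:
## sketch only: show the ftp path the download function above builds.
## "ERR123456" is a hypothetical run accession, not one of our samples.
ERR = "ERR123456"
print "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/" +\
      "{}/{}/{}_1.fastq.gz".format(ERR[:6], ERR, ERR)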
In [59]:
%%bash
mkdir -p empirical_5/fastq/merged
In [62]:
## IPython code
import glob
import os

## get sample names (minus the run accession) from the fastq files
taxa = [i.split("/")[-1].rsplit('_', 1)[0] for i in \
        glob.glob("empirical_5/fastq/*.gz")]

## iterate over individuals and create a merged file for each
for taxon in set(taxa):
    print taxon, "merged"
    ## get replicate files for this individual
    reps = glob.glob("empirical_5/fastq/"+taxon+"*")
    if len(reps) > 1:
        ## concatenate the replicates into a single gzip file
        with open("empirical_5/fastq/merged/"+taxon+".fastq.gz", 'wb') as out:
            for rep in reps:
                with open(rep, 'rb') as tempin:
                    out.write(tempin.read())
    else:
        ## single file: just move it into the merged directory
        dirs, ff = os.path.split(reps[0])
        os.rename(os.path.join(dirs, ff),
                  os.path.join(dirs, "merged", taxon+".fastq.gz"))
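Concatenating the raw bytes of gzip files yields a valid multi-member gzip
stream, so the merged files decompress as a single fastq. A quick sketch to
check the read count in one merged file (the sample name is just an example
from this dataset):
In [ ]:
## sketch: count reads in a merged file to confirm the concatenation worked.
## Python's gzip module reads multi-member gzip streams transparently.
import gzip
with gzip.open("empirical_5/fastq/merged/timareta_ERS070251.fastq.gz") as infastq:
    nlines = sum(1 for line in infastq)
print nlines/4, "reads"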
In [64]:
%%bash
pyrad --version
In [65]:
%%bash
## remove old params file if it exists
rm -f params.txt
## create a new default params file
pyrad -n
In [68]:
%%bash
## substitute new parameters into file
sed -i '/## 1. /c\empirical_5/ ## 1. working directory ' params.txt
sed -i '/## 6. /c\TGCAG ## 6. cutters ' params.txt
sed -i '/## 7. /c\20 ## 7. N processors ' params.txt
sed -i '/## 9. /c\6 ## 9. NQual ' params.txt
sed -i '/## 10./c\.85 ## 10. clust threshold ' params.txt
sed -i '/## 12./c\4 ## 12. MinCov ' params.txt
sed -i '/## 13./c\10 ## 13. maxSH ' params.txt
sed -i '/## 14./c\empirical_5_m4 ## 14. output name ' params.txt
sed -i '/## 18./c\empirical_5/fastq/merged/*.gz ## 18. data location ' params.txt
sed -i '/## 20./c\64 ## 20. phred offset ' params.txt
sed -i '/## 29./c\2,2 ## 29. trim overhang ' params.txt
sed -i '/## 30./c\p,n,s ## 30. output formats ' params.txt
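Each sed call above uses the `c\` (change) command: any line matching the
`## N.` comment pattern is replaced wholesale with the new parameter line. A
rough Python equivalent of one substitution, for illustration only:
In [ ]:
## sketch only: a Python equivalent of the sed '/## 10./c\...' call above;
## sed's c\ command replaces the entire matching line with new text.
lines = open("params.txt").readlines()
with open("params.txt", 'w') as out:
    for line in lines:
        if "## 10." in line:
            line = ".85                 ## 10. clust threshold \n"
        out.write(line)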
In [69]:
cat params.txt
In [ ]:
%%bash
pyrad -p params.txt -s 234567 >> log.txt 2>&1
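pyrad's -s flag selects which assembly steps to run: the call above runs
steps 2 through 7 under the MinCov=4 settings. Because MinCov is applied only
during the final filtering and output step, the MinCov=2 assembly below only
needs to re-run step 7.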
In [ ]:
%%bash
sed -i '/## 12./c\2 ## 12. MinCov ' params.txt
sed -i '/## 14./c\empirical_5_m2 ## 14. output name ' params.txt
In [ ]:
%%bash
pyrad -p params.txt -s 7 >> log.txt 2>&1
In [3]:
## read in the data
s2dat = pd.read_table("empirical_5/stats/s2.rawedit.txt", header=0, nrows=55)
## print summary stats
print s2dat["passed.total"].describe()
## find which sample has the most raw data
maxraw = s2dat["passed.total"].max()
print "\nmost raw data in sample:"
print s2dat['sample '][s2dat['passed.total']==maxraw]
In [10]:
## read in the s3 results
s5dat = pd.read_table("empirical_5/stats/s3.clusters.txt", header=0, nrows=54)
## print summary stats
print "summary of means\n=================="
print s5dat['dpt.me'].describe()
## print summary stats
print "\nsummary of std\n=================="
print s5dat['dpt.sd'].describe()
## print summary stats
print "\nsummary of proportion lowdepth\n=================="
print pd.Series(1-s5dat['d>5.tot']/s5dat["total"]).describe()
## find which sample has the greatest depth of retained loci
max_hiprop = (s5dat["d>5.tot"]/s5dat["total"]).max()
print "\nhighest coverage in sample:"
print s5dat['taxa'][s5dat['d>5.tot']/s5dat["total"]==max_hiprop]
In [13]:
## print mean and std of coverage for the highest coverage sample
with open("empirical_5/clust.85/timareta_ERS070251.depths", 'rb') as indat:
depths = np.array(indat.read().strip().split(","), dtype=int)
print depths.mean(), depths.std()
In [16]:
import toyplot
import toyplot.svg
import numpy as np

## read in the depth information for this sample
with open("empirical_5/clust.85/timareta_ERS070251.depths", 'rb') as indat:
    depths = np.array(indat.read().strip().split(","), dtype=int)

## make a barplot in Toyplot
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.axes(xlabel="Depth of coverage (N reads)",
                   ylabel="N loci",
                   label="dataset5/sample=timareta_ERS070251")

## select the loci with depth > 5 (kept)
keeps = depths[depths > 5]

## plot kept and discarded loci
edat = np.histogram(depths, range(30))  # density=True
kdat = np.histogram(keeps, range(30))   # density=True
axes.bars(edat)
axes.bars(kdat)
#toyplot.svg.render(canvas, "empirical_5_depthplot.svg")
Out[16]:
[toyplot barplot: depth of coverage distribution for timareta_ERS070251]
In [94]:
cat empirical_5/stats/empirical_5_m4.stats
In [18]:
%%bash
head -n 20 empirical_5/stats/empirical_5_m2.stats
In [ ]:
%%bash
## run raxml on the m4 assembly: rapid bootstrapping plus a search for the
## best-scoring ML tree (-f a), 100 bootstrap replicates, 20 threads
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 20 \
-w /home/deren/Documents/RADmissing/empirical_5/ \
-n empirical_5_m4 -s empirical_5/outfiles/empirical_5_m4.phy
In [ ]:
%%bash
## run raxml on the m2 assembly with the same settings
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 20 \
-w /home/deren/Documents/RADmissing/empirical_5/ \
-n empirical_5_m2 -s empirical_5/outfiles/empirical_5_m2.phy
In [96]:
%%bash
head -n 20 empirical_5/RAxML_info.empirical_5_m4
In [19]:
%%bash
head -n 20 empirical_5/RAxML_info.empirical_5_m2
In [ ]:
%load_ext rpy2.ipython
In [27]:
%%R
library(ape)
ltre <- read.tree()
mean(cophenetic.phylo(ltre))
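Note that cophenetic.phylo() returns the full symmetric matrix of pairwise
patristic distances, so mean() here averages over all entries including the
zero diagonal; for n tips this equals the mean pairwise distance scaled by
(n-1)/n.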