In [14]:
### Notebook 9
### Data set 9 (Ohomopterus)
### Authors: Takahashi et al. 2014
### Data Location: DRP001067
Sequence data for this study are archived in the DDBJ sequence read archive under accession DRP001067 (mirrored to NCBI SRA). Below we download the data for experiments DRX011602–DRX011633 directly from the DDBJ FTP site.
In [15]:
%%bash
## make a new directory for this analysis
## (-p: create parent dirs as needed, no error if it already exists)
mkdir -p empirical_9/fastq/
In [19]:
import os
def wget_download_ddbj(SRR, outdir):
    """ Fetch SRA-format data from the DDBJ FTP mirror
    (ftp.ddbj.nig.ac.jp) and write it into outdir using a bash
    call to wget. Despite the name, `SRR` is the 3-digit numeric
    suffix of a DDBJ experiment accession (DRX011xxx), not an SRR
    run number. Only runs inside IPython (uses `!` shell magic). """
    ## create a call string; -r recurses into the accession directory,
    ## -nH --cut-dirs=9 strip the FTP path so files land directly in outdir
    call = "wget -q -r -nH --cut-dirs=9 -P "+outdir+" "+\
    "ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sra/ByExp/"+\
    "sra/DRX/DRX011/DRX011{:03d}".format(SRR)
    ## run wget call (IPython shell magic)
    ! $call
In [20]:
## download every experiment in the accession range
## DRX011602 .. DRX011633 into the fastq directory
target_dir = "empirical_9/fastq/"
for experiment_number in range(602, 634):
    wget_download_ddbj(experiment_number, target_dir)
Here we pass each DDBJ experiment number (602–633) and the output directory to the wget_download_ddbj
function, which downloads the corresponding DRX011xxx archive into empirical_9/fastq/.
In [21]:
%%bash
## convert sra files to fastq using fastq-dump tool
## output as gzipped into the fastq directory
## BUGFIX: chain with && so the .sra source files are only deleted
## if the conversion actually succeeded (plain `rm` would destroy
## the raw downloads even after a failed fastq-dump)
fastq-dump --gzip -O empirical_9/fastq/ empirical_9/fastq/*.sra &&
  rm empirical_9/fastq/*.sra
In [22]:
%%bash
## list the downloaded/converted files with human-readable sizes
ls -lh empirical_9/fastq/
In [23]:
%%bash
## record the pyrad version used for this assembly (provenance)
pyrad --version
In [24]:
%%bash
## remove old params file if it exists
## (-f: no error when the file is absent, matching the comment's intent;
## plain `rm` would fail the cell on a fresh run)
rm -f params.txt
## create a new default params file
pyrad -n
In [25]:
%%bash
## substitute new parameters into file
## each sed -c replaces the whole line whose trailing comment matches
## the "## N." tag written by `pyrad -n`
sed -i '/## 1. /c\empirical_9/ ## 1. working directory ' params.txt
sed -i '/## 6. /c\TGCAG ## 6. cutters ' params.txt
sed -i '/## 7. /c\20 ## 7. N processors ' params.txt
sed -i '/## 9. /c\6 ## 9. NQual ' params.txt
sed -i '/## 10./c\.85 ## 10. clust threshold ' params.txt
sed -i '/## 12./c\4 ## 12. MinCov ' params.txt
sed -i '/## 13./c\10 ## 13. maxSH ' params.txt
sed -i '/## 14./c\empirical_9_m4 ## 14. output name ' params.txt
sed -i '/## 18./c\empirical_9/fastq/*.gz ## 18. data location ' params.txt
sed -i '/## 29./c\2,2 ## 29. trim overhang ' params.txt
sed -i '/## 30./c\p,n,s ## 30. output formats ' params.txt
In [26]:
cat params.txt
In [9]:
os.path.splitext(fil)
Out[9]:
In [12]:
import glob
import itertools
import gzip
import os
for infile in glob.glob("empirical_9/fastq/DRR*"):
iofile = gzip.open(infile, 'rb')
dire, fil = os.path.split(infile)
fastq = os.path.splitext(fil)[0]
outhandle = os.path.join(dire, "T_"+fastq)
outfile = open(outhandle, 'wb')
data = iter(iofile)
store = []
while 1:
try:
line = data.next()
except StopIteration:
break
if len(line) < 80:
store.append(line)
else:
store.append(line[5:])
if len(store) == 10000:
outfile.write("".join(store))
store = []
iofile.close()
outfile.close()
! gzip $outhandle
In [ ]:
%%bash
## run pyrad assembly steps 2-7, appending stdout+stderr to log.txt
pyrad -p params.txt -s 234567 >> log.txt 2>&1
In [13]:
%%bash
## lower the minimum taxon coverage to 2 and rename the output so the
## next run builds an "m2" assembly without overwriting the m4 results
sed -i '/## 12./c\2 ## 12. MinCov ' params.txt
sed -i '/## 14./c\empirical_9_m2 ## 14. output name ' params.txt
In [47]:
%%bash
## re-run only the final pyrad step (7) to build output files for the
## m2 settings; earlier steps are shared with the m4 run
pyrad -p params.txt -s 7 >> log.txt 2>&1
In [1]:
import pandas as pd
import numpy as np
## read in the per-sample step-2 (raw edit) stats written by pyrad
s2dat = pd.read_table("empirical_9/stats/s2.rawedit.txt", header=0, nrows=32)
## print summary stats of reads passing filters
print s2dat["passed.total"].describe()
## find which sample has the most raw data
maxraw = s2dat["passed.total"].max()
print "\nmost raw data in sample:"
## NOTE: the trailing space in 'sample ' matches the column header as
## written by pyrad -- do not "fix" it
print s2dat['sample '][s2dat['passed.total']==maxraw]
In [3]:
## read in the per-sample step-3 (clustering) stats written by pyrad
s9dat = pd.read_table("empirical_9/stats/s3.clusters.txt", header=0, nrows=33)
## print summary stats of mean cluster depth per sample
print "summary of means\n=================="
print s9dat['dpt.me'].describe()
## print summary stats of cluster depth std dev per sample
print "\nsummary of std\n=================="
print s9dat['dpt.sd'].describe()
## print summary stats: fraction of clusters at or below depth 5
print "\nsummary of proportion lowdepth\n=================="
print pd.Series(1-s9dat['d>5.tot']/s9dat["total"]).describe()
## find which sample has the greatest proportion of high-depth clusters
max_hiprop = (s9dat["d>5.tot"]/s9dat["total"]).max()
print "\nhighest coverage in sample:"
print s9dat['taxa'][s9dat['d>5.tot']/s9dat["total"]==max_hiprop]
In [4]:
maxprop =(s9dat['d>5.tot']/s9dat['total']).max()
print "\nhighest prop coverage in sample:"
print s9dat['taxa'][s9dat['d>5.tot']/s9dat['total']==maxprop]
In [5]:
## print mean and std of coverage for the highest coverage sample
## (the .depths file is a single comma-separated line of per-cluster depths)
with open("empirical_9/clust.85/T_DRR021966.depths", 'rb') as indat:
    depths = np.array(indat.read().strip().split(","), dtype=int)
print "Means for sample T_DRR021966"
print depths.mean(), depths.std()
## same statistics restricted to clusters above the depth cutoff (>5 reads)
print depths[depths>5].mean(), depths[depths>5].std()
In [8]:
import toyplot
import toyplot.svg
import numpy as np
## read in the depth information for this sample
with open("empirical_9/clust.85/T_DRR021966.depths", 'rb') as indat:
    depths = np.array(indat.read().strip().split(","), dtype=int)
## make a barplot in Toyplot
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.axes(xlabel="Depth of coverage (N reads)",
                   ylabel="N loci",
                   label="dataset9/sample=T_DRR021966")
## select the loci with depth > 5 (kept)
keeps = depths[depths>5]
## histogram all clusters vs. kept clusters over depths 0-29; the
## (counts, edges) tuples are passed directly to axes.bars so the kept
## bars overlay the full distribution
edat = np.histogram(depths, range(30)) # density=True)
kdat = np.histogram(keeps, range(30)) #, density=True)
axes.bars(edat)
axes.bars(kdat)
## uncomment to save the figure to disk
#toyplot.svg.render(canvas, "empirical_9_depthplot.svg")
Out[8]:
In [26]:
cat empirical_9/stats/empirical_9_m4.stats
In [9]:
%%bash
## show the summary header of the m2 (MinCov=2) assembly stats
head -n 20 empirical_9/stats/empirical_9_m2.stats
In [ ]:
%%bash
## raxml arguments: rapid bootstrap + ML search (-f a), GTRGAMMA model,
## 100 bootstrap replicates, fixed seeds, 20 threads, on the m4 alignment
## NOTE(review): -w requires an absolute path; this one is machine-specific
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 20 \
-w /home/deren/Documents/RADmissing/empirical_9/ \
-n empirical_9_m4 -s empirical_9/outfiles/empirical_9_m4.phy
In [ ]:
%%bash
## raxml arguments: rapid bootstrap + ML search (-f a), GTRGAMMA model,
## 100 bootstrap replicates, fixed seeds, 20 threads, on the m2 alignment
## NOTE(review): -w requires an absolute path; this one is machine-specific
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 20 \
-w /home/deren/Documents/RADmissing/empirical_9/ \
-n empirical_9_m2 -s empirical_9/outfiles/empirical_9_m2.phy
In [28]:
%%bash
## show the run summary from the m4 RAxML info file
head -n 20 empirical_9/RAxML_info.empirical_9_m4
In [ ]:
%%bash
## show the run summary from the m2 RAxML info file
head -n 20 empirical_9/RAxML_info.empirical_9_m2
In [10]:
%load_ext rpy2.ipython
In [12]:
%%R -h 800 -w 800
library(ape)
## read the RAxML bootstrap-annotated best tree
## NOTE(review): the raxml runs above write outputs suffixed _m4/_m2;
## this file name has no suffix -- confirm which result is plotted
tre <- read.tree("empirical_9/RAxML_bipartitions.empirical_9")
ltre <- ladderize(tre)
## side-by-side: cladogram with bootstrap labels, and an unrooted view
par(mfrow=c(1,2))
plot(ltre, use.edge.length=F)
nodelabels(ltre$node.label)
plot(ltre, type='u')
In [13]:
%%R
## mean pairwise patristic (branch-length) distance across all tips
mean(cophenetic.phylo(ltre))