In [99]:
import os
workDir = '/var/seq_data/ncbi_db/genome/Jan2016/'
genomeDir = os.path.join(workDir, 'bac_complete_rn')
primerFile = '/home/nick/notebook/SIPSim/dev/515F-806R.fna'
In [100]:
%load_ext rpy2.ipython
%load_ext pushnote
In [101]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
In [102]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [104]:
#catGenomeFile = 'bac_complete_rn.fna'
#!rm -f $catGenomeFile
#!find $genomeDir -name "*.fna" | xargs -I % bash -c "cat %; echo" >> $catGenomeFile
In [105]:
# fixing issues of no line breaks for new sequence IDs in fastas
#!perl -pi -e 's/>/\n>/ if /.+>/' $catGenomeFile
In [109]:
# checking number of sequences
#!grep -c ">" $catGenomeFile
In [110]:
# checking for duplicate sequence IDs
#ret = !grep ">" $catGenomeFile | sort | uniq -c
#for x in ret:
# x = x.lstrip().split(' ')
# x[0] = int(x[0])
# if x[0] > 1:
# print x
In [112]:
rnammerDir = os.path.join(workDir, 'rnammer')
if not os.path.isdir(rnammerDir):
os.makedirs(rnammerDir)
In [ ]:
%%bash -s "$genomeDir" "$rnammerDir"
find $1 -name "*.fna" | \
perl -pe 's/.+\/|\.fna//g' | \
xargs -n 1 -I % -P 30 bash -c \
"rnammer -S bac -m ssu -gff $2/%_rrn.gff -f $2/%_rrn.fna -xml $2/%_rrn.xml < $1/%.fna"
In [ ]:
!cd $rnammerDir; \
egrep -v "^#" *.gff | \
grep "16s_rRNA" | \
perl -pe 's/:/\t/' > ssu_summary.txt
In [ ]:
%%bash -s "$rnammerDir"
cd $1
printf "ssu gene length distribution:\n"
cut -d $'\t' -f 7 ssu_summary.txt | NY_misc_perl stats_descriptive
In [ ]:
# combining sequences
!cd $rnammerDir; \
cat *_rrn.fna > ssu_all.fna
!printf "Number of sequences: "
!cd $rnammerDir; \
grep -c ">" ssu_all.fna
In [ ]:
%pushnote rnammer complete
In [10]:
seqs = !grep -v ">" $primerFile
oligoFile = os.path.splitext(primerFile)[0] + '.oligo'
with open(oligoFile, 'wb') as outFH:
for i,x in enumerate(seqs):
if i == 0:
primer = 'forward'
elif i == 1:
primer = 'reverse'
else:
break
outFH.write('{} {}\n'.format(primer, x))
# checking output
!head $oligoFile
In [ ]:
cmd = 'mothur "#pcr.seqs(fasta={}, oligos={}, pdiffs=1, processors=24)"'.format(catGenomeFile, oligoFile)
!$cmd
In [79]:
!seq_tools fasta_info --sn --sl bac_complete.pcr.fna > bac_complete.pcr_len.txt
!head -n 3 bac_complete.pcr_len.txt
In [74]:
%%R -i workDir -h 300 -w 600
F = file.path(workDir, 'bac_complete.pcr_len.txt')
df = read.delim(F, sep='\t', header=FALSE)
ggplot(df, aes(V3)) +
geom_histogram() +
labs(x='amplicon length (bp)')
In [75]:
%%R -h 300 -w 600
df.f = df %>%
filter(V3 < 500)
ggplot(df.f, aes(V3)) +
geom_histogram() +
labs(x='amplicon length (bp)')
In [84]:
%%R
outFile = 'bac_complete.pcr.accnos'
write.table(df.f %>% select(V1, V2), outFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE)
In [85]:
cmd = 'mothur "#get.seqs(fasta={}, accnos={})"'.format('bac_complete.pcr.fna', 'bac_complete.pcr.accnos')
!$cmd
In [86]:
cmd = 'mothur "#summary.seqs(fasta={})"'.format('bac_complete.pcr.pick.fna')
!$cmd
In [87]:
# checking for multiple amplicons from the same genome
ret = !grep ">" bac_complete.pcr.pick.fna
cnt = {}
for x in ret:
x = x.split('\t')[0]
try:
cnt[x] += 1
except KeyError:
cnt[x] = 1
for x,y in cnt.items():
if y > 1:
print x
cnt = None
In [ ]: