In [1]:
!source activate qiime
import re
import sys
In [ ]:
def fix_silva(silva_fp, output_fp):
with open(output_fp, 'w') as f_o:
with open(silva_fp, 'r') as f_i:
is_target = False
for l in f:
if l.startswith('>'):
is_target = False
if l.split(' ')[1].startswith('Bacteria'):
is_target = True
if l.split(' ')[1].startswith('Archaea'):
is_target = True
if is_target:
f_o.write(l.rstrip() + '\n')
else:
seq = l.rstrip()
if is_target:
f_o.write(seq.replace('U','T') + '\n')
return
In [ ]:
silva_db_99_fp = 'SILVA_128_SSURef_Nr99_tax_silva.fasta'
silva_db_100_fp = 'SILVA_128_SSURef_tax_silva.fasta'
fix_silva(silva_db_99_fp, 'SILVA_128_SSURef_Nr99_tax_silva.prok.fasta')
fix_silva(silva_db_100_fp, 'SILVA_128_SSURef_tax_silva.prok.fasta')
In [5]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
'--id 1.0 '
'--maxaccepts 1000 '
'--maxrejects 32 '
'--db /home/jgsanders/ref_data/gg_13_8_otus/rep_set/99_otus.fasta '
'--uc ~/emp/mapping/Ghits_99_all.uc '
'--dbnotmatched /home/jgsanders/emp/mapping/dbs/gg_99_otus.unmatched.all.fasta '
'--dbmatched /home/jgsanders/emp/mapping/dbs/gg_99_otus.matched.all.fasta '
'--notmatched /home/jgsanders/emp/mapping/Ghits_99_unmatched.all.fasta '
'--matched /home/jgsanders/emp/mapping/Ghits_99_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N gg99 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00
In [18]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
'--id 1.0 '
'--maxaccepts 1000 '
'--maxrejects 32 '
'--db /home/jgsanders/ref_data/gg_13_8_otus/gg_13_5.fasta '
'--uc ~/emp/mapping/Ghits_100_all.uc '
'--dbnotmatched /home/jgsanders/emp/mapping/dbs/gg_100_otus.unmatched.all.fasta '
'--dbmatched /home/jgsanders/emp/mapping/dbs/gg_100_otus.matched.all.fasta '
'--notmatched /home/jgsanders/emp/mapping/Ghits_100_unmatched.all.fasta '
'--matched /home/jgsanders/emp/mapping/Ghits_100_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N gg100 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00
In [26]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
'--id 1.0 '
'--maxaccepts 1000 '
'--maxrejects 32 '
'--db /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.fasta '
'--uc ~/emp/mapping/Shits_99_all.uc '
'--dbnotmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.unmatched.all.fasta '
'--dbmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.matched.all.fasta '
'--notmatched /home/jgsanders/emp/mapping/Shits_99_unmatched.all.fasta '
'--matched /home/jgsanders/emp/mapping/Shits_99_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N silva99 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00
In [27]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
'--id 1.0 '
'--maxaccepts 1000 '
'--maxrejects 32 '
'--db /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.fasta '
'--uc ~/emp/mapping/Shits_100_all.uc '
'--dbnotmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.unmatched.all.fasta '
'--dbmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.matched.all.fasta '
'--notmatched /home/jgsanders/emp/mapping/Shits_100_unmatched.fasta '
'--notmatched /home/jgsanders/emp/mapping/Shits_100_unmatched.all.fasta '
'--matched /home/jgsanders/emp/mapping/Shits_100_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N silva100 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00
In [34]:
#get number of original seqs
deblur_seqs = !grep -c '>' /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa
gg_99_seqs = !grep -c '>' /home/jgsanders/ref_data/gg_13_8_otus/rep_set/99_otus.fasta
silva_99_seqs = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.fasta
gg_100_seqs = !grep -c '>' /home/jgsanders/ref_data/gg_13_8_otus/gg_13_5.fasta
silva_100_seqs = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.fasta
print('Deblur seqs: {0}\nGG 99 seqs: {1}\nSILVA 99 seqs: {2}\n'
'GG 100 seqs: {3}\nSILVA 100 seqs: {4}'.format(deblur_seqs[0], gg_99_seqs[0], silva_99_seqs[0],
gg_100_seqs[0], silva_100_seqs[0]))
In [44]:
#get number of gg 100 seqs matched
deblur_matched_gg = !grep -c '>' /home/jgsanders/emp/mapping/Ghits_100_matched.all.fasta
gg_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/gg_100_otus.matched.all.fasta
print('GG 100 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(gg_matched_deblur[0], float(gg_matched_deblur[0])/float(gg_100_seqs[0])*100))
print('Deblur seqs matching GG 100: {0} ({1:03.1f}%)'.format(deblur_matched_gg[0], float(deblur_matched_gg[0])/float(deblur_seqs[0])*100))
In [45]:
#get number of gg 99 seqs matched
deblur_matched_gg = !grep -c '>' /home/jgsanders/emp/mapping/Ghits_99_matched.all.fasta
gg_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/gg_99_otus.matched.all.fasta
print('GG 99 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(gg_matched_deblur[0], float(gg_matched_deblur[0])/float(gg_99_seqs[0])*100))
print('Deblur seqs matching GG 99: {0} ({1:03.1f}%)'.format(deblur_matched_gg[0], float(deblur_matched_gg[0])/float(deblur_seqs[0])*100))
In [46]:
#get number of silva 100 seqs matched
deblur_matched_silva = !grep -c '>' /home/jgsanders/emp/mapping/Shits_100_matched.all.fasta
silva_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.matched.all.fasta
print('Silva 100 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(silva_matched_deblur[0], float(silva_matched_deblur[0])/float(silva_100_seqs[0])*100))
print('Deblur seqs matching Silva 100: {0} ({1:03.1f}%)'.format(deblur_matched_silva[0], float(deblur_matched_silva[0])/float(deblur_seqs[0])*100))
In [43]:
#get number of silva 99 seqs matched
deblur_matched_silva = !grep -c '>' /home/jgsanders/emp/mapping/Shits_99_matched.all.fasta
silva_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.matched.all.fasta
print('Silva 99 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(silva_matched_deblur[0], float(silva_matched_deblur[0])/float(silva_99_seqs[0])*100))
print('Deblur seqs matching Silva 99: {0} ({1:03.1f}%)'.format(deblur_matched_silva[0], float(deblur_matched_silva[0])/float(deblur_seqs[0])*100))
In [47]:
import pandas as pd
In [54]:
gg100_df = pd.read_csv('../Ghits_100_all.uc',sep='\t',header=None)
gg100_hits = set(gg100_df[8])
In [55]:
silva100_df = pd.read_csv('../Shits_100.uc',sep='\t',header=None)
silva100_hits = set(silva100_df[8])
In [56]:
len(gg100_hits)
Out[56]:
In [57]:
len(silva100_hits)
Out[57]:
In [58]:
len(gg100_hits | silva100_hits)
Out[58]:
In [59]:
len(silva100_hits - gg100_hits)
Out[59]:
In [60]:
len(gg100_hits - silva100_hits)
Out[60]:
In [ ]: