author: jonsan@gmail.com
date: 9 Oct 2017
language: Python 3.5
license: BSD3

matches_deblur_to_gg_silva.ipynb


In [1]:
!source activate qiime
import re
import sys


discarding /home/jgsanders/miniconda/envs/qiime/bin from PATH
prepending /home/jgsanders/miniconda/envs/qiime/bin to PATH

Remove eukaryotic sequences from Silva DB

Also change 'U' to 'T' to match DNA sequences from EMP


In [ ]:
def fix_silva(silva_fp, output_fp):
    with open(output_fp, 'w') as f_o:
        with open(silva_fp, 'r') as f_i:
            is_target = False
            for l in f:
                if l.startswith('>'):
                    is_target = False
                    if l.split(' ')[1].startswith('Bacteria'):
                        is_target = True
                    if l.split(' ')[1].startswith('Archaea'):
                        is_target = True
                    if is_target:
                        f_o.write(l.rstrip() + '\n')    
                else:
                    seq = l.rstrip()
                    if is_target:
                        f_o.write(seq.replace('U','T') + '\n')
                        
    return

In [ ]:
silva_db_99_fp = 'SILVA_128_SSURef_Nr99_tax_silva.fasta'
silva_db_100_fp = 'SILVA_128_SSURef_tax_silva.fasta'

fix_silva(silva_db_99_fp, 'SILVA_128_SSURef_Nr99_tax_silva.prok.fasta')
fix_silva(silva_db_100_fp, 'SILVA_128_SSURef_tax_silva.prok.fasta')

Search deblur against all (or at least a max of 1000 identical) hits

Commands are given in a qsub framework for submission to a Torque cluster


In [5]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
       '--id 1.0 '
       '--maxaccepts 1000 '
       '--maxrejects 32 '
       '--db /home/jgsanders/ref_data/gg_13_8_otus/rep_set/99_otus.fasta '
       '--uc ~/emp/mapping/Ghits_99_all.uc '
       '--dbnotmatched /home/jgsanders/emp/mapping/dbs/gg_99_otus.unmatched.all.fasta '
       '--dbmatched /home/jgsanders/emp/mapping/dbs/gg_99_otus.matched.all.fasta '
       '--notmatched /home/jgsanders/emp/mapping/Ghits_99_unmatched.all.fasta '
       '--matched /home/jgsanders/emp/mapping/Ghits_99_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N gg99 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00


181685.barnacle.ucsd.edu

In [18]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
       '--id 1.0 '
       '--maxaccepts 1000 '
       '--maxrejects 32 '
       '--db /home/jgsanders/ref_data/gg_13_8_otus/gg_13_5.fasta '
       '--uc ~/emp/mapping/Ghits_100_all.uc '
       '--dbnotmatched /home/jgsanders/emp/mapping/dbs/gg_100_otus.unmatched.all.fasta '
       '--dbmatched /home/jgsanders/emp/mapping/dbs/gg_100_otus.matched.all.fasta '
       '--notmatched /home/jgsanders/emp/mapping/Ghits_100_unmatched.all.fasta '
       '--matched /home/jgsanders/emp/mapping/Ghits_100_matched.all.fasta')
!echo "source activate qiime; $cmd" | qsub -k eo -N gg100 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00


181687.barnacle.ucsd.edu

In [26]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
       '--id 1.0 '
       '--maxaccepts 1000 '
       '--maxrejects 32 '
       '--db /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.fasta '
       '--uc ~/emp/mapping/Shits_99_all.uc '
       '--dbnotmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.unmatched.all.fasta '
       '--dbmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.matched.all.fasta '
       '--notmatched /home/jgsanders/emp/mapping/Shits_99_unmatched.all.fasta '
       '--matched /home/jgsanders/emp/mapping/Shits_99_matched.all.fasta')

!echo "source activate qiime; $cmd" | qsub -k eo -N silva99 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00


181689.barnacle.ucsd.edu

In [27]:
cmd = ('vsearch --usearch_global /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa '
       '--id 1.0 '
       '--maxaccepts 1000 '
       '--maxrejects 32 '
       '--db /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.fasta '
       '--uc ~/emp/mapping/Shits_100_all.uc '
       '--dbnotmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.unmatched.all.fasta '
       '--dbmatched /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.matched.all.fasta '
       '--notmatched /home/jgsanders/emp/mapping/Shits_100_unmatched.fasta '
       '--notmatched /home/jgsanders/emp/mapping/Shits_100_unmatched.all.fasta '
       '--matched /home/jgsanders/emp/mapping/Shits_100_matched.all.fasta')

!echo "source activate qiime; $cmd" | qsub -k eo -N silva100 -l nodes=1:ppn=32 -l pmem=4gb -l walltime=12:00:00


181690.barnacle.ucsd.edu

In [34]:
#get number of original seqs
deblur_seqs = !grep -c '>' /projects/emp/03-otus/04-deblur/emp.90.min25.deblur.seq.fa
gg_99_seqs = !grep -c '>' /home/jgsanders/ref_data/gg_13_8_otus/rep_set/99_otus.fasta
silva_99_seqs = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.fasta
gg_100_seqs = !grep -c '>' /home/jgsanders/ref_data/gg_13_8_otus/gg_13_5.fasta
silva_100_seqs = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.fasta

print('Deblur seqs: {0}\nGG 99 seqs: {1}\nSILVA 99 seqs: {2}\n'
      'GG 100 seqs: {3}\nSILVA 100 seqs: {4}'.format(deblur_seqs[0], gg_99_seqs[0], silva_99_seqs[0],
                                                     gg_100_seqs[0], silva_100_seqs[0]))


Deblur seqs: 345624
GG 99 seqs: 203452
SILVA 99 seqs: 576542
GG 100 seqs: 1262986
SILVA 100 seqs: 1783660

In [44]:
#get number of gg 100 seqs matched
deblur_matched_gg = !grep -c '>' /home/jgsanders/emp/mapping/Ghits_100_matched.all.fasta
gg_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/gg_100_otus.matched.all.fasta

print('GG 100 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(gg_matched_deblur[0], float(gg_matched_deblur[0])/float(gg_100_seqs[0])*100))

print('Deblur seqs matching GG 100: {0} ({1:03.1f}%)'.format(deblur_matched_gg[0], float(deblur_matched_gg[0])/float(deblur_seqs[0])*100))


GG 100 seqs with Deblur hits: 579156 (45.9%)
Deblur seqs matching GG 100: 34807 (10.1%)

In [45]:
#get number of gg 99 seqs matched
deblur_matched_gg = !grep -c '>' /home/jgsanders/emp/mapping/Ghits_99_matched.all.fasta
gg_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/gg_99_otus.matched.all.fasta

print('GG 99 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(gg_matched_deblur[0], float(gg_matched_deblur[0])/float(gg_99_seqs[0])*100))

print('Deblur seqs matching GG 99: {0} ({1:03.1f}%)'.format(deblur_matched_gg[0], float(deblur_matched_gg[0])/float(deblur_seqs[0])*100))


GG 99 seqs with Deblur hits: 121598 (59.8%)
Deblur seqs matching GG 99: 32419 (9.4%)

In [46]:
#get number of silva 100 seqs matched
deblur_matched_silva = !grep -c '>' /home/jgsanders/emp/mapping/Shits_100_matched.all.fasta
silva_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_tax_silva.prok.matched.all.fasta

print('Silva 100 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(silva_matched_deblur[0], float(silva_matched_deblur[0])/float(silva_100_seqs[0])*100))

print('Deblur seqs matching Silva 100: {0} ({1:03.1f}%)'.format(deblur_matched_silva[0], float(deblur_matched_silva[0])/float(deblur_seqs[0])*100))


Silva 100 seqs with Deblur hits: 797766 (44.7%)
Deblur seqs matching Silva 100: 44247 (12.8%)

In [43]:
#get number of silva 99 seqs matched
deblur_matched_silva = !grep -c '>' /home/jgsanders/emp/mapping/Shits_99_matched.all.fasta
silva_matched_deblur = !grep -c '>' /home/jgsanders/emp/mapping/dbs/SILVA_128_SSURef_Nr99_tax_silva.prok.matched.all.fasta

print('Silva 99 seqs with Deblur hits: {0} ({1:03.1f}%)'.format(silva_matched_deblur[0], float(silva_matched_deblur[0])/float(silva_99_seqs[0])*100))

print('Deblur seqs matching Silva 99: {0} ({1:03.1f}%)'.format(deblur_matched_silva[0], float(deblur_matched_silva[0])/float(deblur_seqs[0])*100))


Silva 99 seqs with Deblur hits: 361459 (62.7%)
Deblur seqs matching Silva 99: 42317 (12.2%)

Are matching sets nonredundant?


In [47]:
import pandas as pd

In [54]:
gg100_df = pd.read_csv('../Ghits_100_all.uc',sep='\t',header=None)
gg100_hits = set(gg100_df[8])

In [55]:
silva100_df = pd.read_csv('../Shits_100.uc',sep='\t',header=None)
silva100_hits = set(silva100_df[8])

In [56]:
len(gg100_hits)


Out[56]:
34807

In [57]:
len(silva100_hits)


Out[57]:
44247

In [58]:
len(gg100_hits | silva100_hits)


Out[58]:
44473

In [59]:
len(silva100_hits - gg100_hits)


Out[59]:
9666

In [60]:
len(gg100_hits - silva100_hits)


Out[60]:
226

In [ ]: