author: josenavasmolina@gmail.com
date: 25 Sep 2017
license: BSD3

open_reference_otu_picking.ipynb

Commands to execute open reference OTU picking


In [2]:
from os.path import join

adaptor_cleanup_dir = '/path/to/output/cleanup_dir/'
open_ref_output = '/path/to/open_ref_output/'
gg_ref_fna = '/path/to/greengenes/97_otus.fasta'
gg_ref_tax = '/path/to/greengenes/97_otu_taxonomy.txt'
arc_mask = '/path/to/arc.arc-bac.rfam12.1183-1s.1508c.mask'
bac_mask = '/path/to/bac.arc-bac.rfam12.1183-1s.1582c.mask'

In [ ]:
%%bash
seqsfp=''
for i in `ls $adaptor_cleanup_dir/*/filtered_seqs.fna`
do
    seqsfp=$seqsfp','$i
done
seqsfp=${seqsfp:1}

echo "pick_otus:threads 31" > $open_ref_output/or_params.txt

# This will run Open Reference on Iterative mode
pick_open_reference_otus.py -i $seqsfp \
                            -r $gg_ref_fna \
                            -o $open_ref_output/open_ref_otus \
                            -a -O 100 \
                            -p $open_ref_output/or_params.txt \
                            -m sortmerna_sumaclust

cp $open_ref_output/open_ref_otus/otu_table_mc2_w_tax.biom \
   $open_ref_output/emp_or_gg_13_8.incl_sample_singletons.biom
cp $open_ref_output/open_ref_otus/rep_set.fna \
   $open_ref_output/rep_set.incl_sample_singletons.fna

Generate the tree


In [ ]:
%%bash
# Filter OTUs that are present in a single sample
filter_otus_from_otu_table.py -i $open_ref_output/emp_or_gg_13_8.incl_sample_singletons.biom \
                              -o $open_ref_output/emp_or_gg_13_8.no_sample_singletons.biom \
                              -s 2

# Drop the representative sequences for those OTUs
filter_fasta.py -i $open_ref_output/rep_set.incl_sample_singletons.fna \
                -o $open_ref_output/rep_set.no_sample_singletons.fna \
                -b $open_ref_output/emp_or_gg_13_8.no_sample_singletons.biom

# Align sequences - this commands are specific for a supercomputer
pushd $open_ref_output
echo -e 'echo "cd $PWD; \n" | qsub -N ssu-align -l pmem=32gb -l walltime=240:00:00' > presuf.txt
ssu-prep $open_ref_output/rep_set.no_sample_singletons.fna parallel_100 100 presuf.txt -f
./parallel_100.ssu-align.sh
ssu-mask -a parallel_100/parallel_100.archaea.stk \
         -s $arc_mask --key-out arc-bac-mask
ssu-mask -a parallel_100/parallel_100.bacteria.stk \
         -s $bac_mask --key-out arc-bac-mask
popd $open_ref_output

# Run FastTree
export OMP_NUM_THREADS=8
FastTreeMP -nosupport -fastest -nt \
    $open_ref_output/parallel_100.all.arc-bac-mask.afa \
    > $open_ref_output/rep_set.no_sample_singletons.tre