In [2]:
from os.path import join
adaptor_cleanup_dir = '/path/to/output/cleanup_dir/'
open_ref_output = '/path/to/open_ref_output/'
gg_ref_fna = '/path/to/greengenes/97_otus.fasta'
gg_ref_tax = '/path/to/greengenes/97_otu_taxonomy.txt'
arc_mask = '/path/to/arc.arc-bac.rfam12.1183-1s.1508c.mask'
bac_mask = '/path/to/bac.arc-bac.rfam12.1183-1s.1582c.mask'
In [ ]:
%%bash
seqsfp=''
for i in `ls $adaptor_cleanup_dir/*/filtered_seqs.fna`
do
seqsfp=$seqsfp','$i
done
seqsfp=${seqsfp:1}
echo "pick_otus:threads 31" > $open_ref_output/or_params.txt
# This will run Open Reference on Iterative mode
pick_open_reference_otus.py -i $seqsfp \
-r $gg_ref_fna \
-o $open_ref_output/open_ref_otus \
-a -O 100 \
-p $open_ref_output/or_params.txt \
-m sortmerna_sumaclust
cp $open_ref_output/open_ref_otus/otu_table_mc2_w_tax.biom \
$open_ref_output/emp_or_gg_13_8.incl_sample_singletons.biom
cp $open_ref_output/open_ref_otus/rep_set.fna \
$open_ref_output/rep_set.incl_sample_singletons.fna
In [ ]:
%%bash
# Filter OTUs that are present in a single sample
filter_otus_from_otu_table.py -i $open_ref_output/emp_or_gg_13_8.incl_sample_singletons.biom \
-o $open_ref_output/emp_or_gg_13_8.no_sample_singletons.biom \
-s 2
# Drop the representative sequences for those OTUs
filter_fasta.py -i $open_ref_output/rep_set.incl_sample_singletons.fna \
-o $open_ref_output/rep_set.no_sample_singletons.fna \
-b $open_ref_output/emp_or_gg_13_8.no_sample_singletons.biom
# Align sequences - this commands are specific for a supercomputer
pushd $open_ref_output
echo -e 'echo "cd $PWD; \n" | qsub -N ssu-align -l pmem=32gb -l walltime=240:00:00' > presuf.txt
ssu-prep $open_ref_output/rep_set.no_sample_singletons.fna parallel_100 100 presuf.txt -f
./parallel_100.ssu-align.sh
ssu-mask -a parallel_100/parallel_100.archaea.stk \
-s $arc_mask --key-out arc-bac-mask
ssu-mask -a parallel_100/parallel_100.bacteria.stk \
-s $bac_mask --key-out arc-bac-mask
popd $open_ref_output
# Run FastTree
export OMP_NUM_THREADS=8
FastTreeMP -nosupport -fastest -nt \
$open_ref_output/parallel_100.all.arc-bac-mask.afa \
> $open_ref_output/rep_set.no_sample_singletons.tre