In [1]:
cd ~/Desktop/SSUsearch/
In [2]:
mkdir -p ./workdir
In [3]:
#check seqfile files to process in data directory (make sure you still remember the data directory)
!ls ./data/test/data
Set "Cpu" higher number.
make more copies of this notebook (click "File" then "Make a copy" in menu bar), so you can run the step on multiple files at the same time.
(Again we assume the "Seqfile" is quality trimmed.)
e.g. for "/usr/local/notebooks/data/test/data/1c.fa", "1c" will the label of this sample.
In [4]:
Seqfile='./data/test/data/1c.fa'
In [5]:
Cpu='1' # number of maxixum threads for search and alignment
Hmm='./data/SSUsearch_db/Hmm.ssu.hmm' # hmm model for ssu
Gene='ssu'
Script_dir='./scripts'
Gene_model_org='./data/SSUsearch_db/Gene_model_org.16s_ecoli_J01695.fasta'
Ali_template='./data/SSUsearch_db/Ali_template.silva_ssu.fasta'
Start='577' #pick regions for de novo clustering
End='727'
Len_cutoff='100' # min length for reads picked for the region
Gene_tax='./data/SSUsearch_db/Gene_tax.silva_taxa_family.tax' # silva 108 ref
Gene_db='./data/SSUsearch_db/Gene_db.silva_108_rep_set.fasta'
Gene_tax_cc='./data/SSUsearch_db/Gene_tax_cc.greengene_97_otus.tax' # greengene 2012.10 ref for copy correction
Gene_db_cc='./data/SSUsearch_db/Gene_db_cc.greengene_97_otus.fasta'
In [6]:
# first part of file basename will the label of this sample
import os
Filename=os.path.basename(Seqfile)
Tag=Filename.split('.')[0]
In [8]:
import os
New_path = '{}:{}'.format('~/Desktop/SSUsearch/external_tools/bin/', os.environ['PATH'])
Hmm=os.path.abspath(Hmm)
Seqfile=os.path.abspath(Seqfile)
Script_dir=os.path.abspath(Script_dir)
Gene_model_org=os.path.abspath(Gene_model_org)
Ali_template=os.path.abspath(Ali_template)
Gene_tax=os.path.abspath(Gene_tax)
Gene_db=os.path.abspath(Gene_db)
Gene_tax_cc=os.path.abspath(Gene_tax_cc)
Gene_db_cc=os.path.abspath(Gene_db_cc)
os.environ.update(
{'PATH':New_path,
'Cpu':Cpu,
'Hmm':os.path.abspath(Hmm),
'Gene':Gene,
'Seqfile':os.path.abspath(Seqfile),
'Filename':Filename,
'Tag':Tag,
'Script_dir':os.path.abspath(Script_dir),
'Gene_model_org':os.path.abspath(Gene_model_org),
'Ali_template':os.path.abspath(Ali_template),
'Start':Start,
'End':End,
'Len_cutoff':Len_cutoff,
'Gene_tax':os.path.abspath(Gene_tax),
'Gene_db':os.path.abspath(Gene_db),
'Gene_tax_cc':os.path.abspath(Gene_tax_cc),
'Gene_db_cc':os.path.abspath(Gene_db_cc)})
In [9]:
!echo "*** make sure: parameters are right"
!echo "Seqfile: $Seqfile\nCpu: $Cpu\nFilename: $Filename\nTag: $Tag"
In [15]:
cd workdir
In [11]:
mkdir -p $Tag.ssu.out
In [12]:
### start hmmsearch
In [24]:
%%bash
echo "*** hmmsearch starting"
time hmmsearch --incE 10 --incdomE 10 --cpu $Cpu \
--domtblout $Tag.ssu.out/$Tag.qc.$Gene.hmmdomtblout \
-o /dev/null -A $Tag.ssu.out/$Tag.qc.$Gene.sto \
$Hmm $Seqfile
echo "*** hmmsearch finished"
In [25]:
!python $Script_dir/get-seq-from-hmmout.py \
$Tag.ssu.out/$Tag.qc.$Gene.hmmdomtblout \
$Tag.ssu.out/$Tag.qc.$Gene.sto \
$Tag.ssu.out/$Tag.qc.$Gene
In [26]:
%%bash
echo "*** Starting mothur align"
cat $Gene_model_org $Tag.ssu.out/$Tag.qc.$Gene > $Tag.ssu.out/$Tag.qc.$Gene.RFadded
# mothur does not allow tab between its flags, thus no indents here
time mothur "#align.seqs(candidate=$Tag.ssu.out/$Tag.qc.$Gene.RFadded, template=$Ali_template, threshold=0.5, flip=t, processors=$Cpu)"
rm -f mothur.*.logfile
In [27]:
!python $Script_dir/mothur-align-report-parser-cutoff.py \
$Tag.ssu.out/$Tag.qc.$Gene.align.report \
$Tag.ssu.out/$Tag.qc.$Gene.align \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter \
0.5
In [28]:
!python $Script_dir/remove-gap.py $Tag.ssu.out/$Tag.qc.$Gene.align.filter $Tag.ssu.out/$Tag.qc.$Gene.align.filter.fa
In [29]:
!python $Script_dir/region-cut.py $Tag.ssu.out/$Tag.qc.$Gene.align.filter $Start $End $Len_cutoff
!mv $Tag.ssu.out/$Tag.qc.$Gene.align.filter."$Start"to"$End".cut.lenscreen $Tag.ssu.out/$Tag.forclust
In [32]:
%%bash
rm -f $Tag.ssu.out/$Tag.qc.$Gene.align.filter.silva_taxa_family*.taxonomy
mothur "#classify.seqs(fasta=$Tag.ssu.out/$Tag.qc.$Gene.align.filter.fa, template=$Gene_db, taxonomy=$Gene_tax, cutoff=50, processors=$Cpu)"
mv $Tag.ssu.out/$Tag.qc.$Gene.align.filter.silva_taxa_family*.taxonomy \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.silva.taxonomy
In [34]:
!python $Script_dir/count-taxon.py \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.silva.taxonomy \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.silva.taxonomy.count
!rm -f mothur.*.logfile
In [38]:
%%bash
rm -f $Tag.ssu.out/$Tag.qc.$Gene.align.filter.greengene_97_otus*.taxonomy
mothur "#classify.seqs(fasta=$Tag.ssu.out/$Tag.qc.$Gene.align.filter.fa, template=$Gene_db_cc, taxonomy=$Gene_tax_cc, cutoff=50, processors=$Cpu)"
mv $Tag.ssu.out/$Tag.qc.$Gene.align.filter.greengene_97_otus*.taxonomy \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.gg.taxonomy
In [39]:
!python $Script_dir/count-taxon.py \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.gg.taxonomy \
$Tag.ssu.out/$Tag.qc.$Gene.align.filter.wang.gg.taxonomy.count
!rm -f mothur.*.logfile
In [40]:
# check the output directory
!ls $Tag.ssu.out
Following are files useful for community analysis:
In [41]:
!echo "*** pipeline runs successsfully :)"
In [ ]: