In [32]:
import os
baseDir = '/home/nick/notebook/SIPSim/dev/fullCyc/'
workDir = os.path.join(baseDir, 'CD-HIT')
rnammerSeqs = '/home/nick/notebook/SIPSim/dev/bac_genome1147/rnammer/bac_genome1147_16S.fna'
otuRepFile = '/home/nick/notebook/SIPSim/dev/fullCyc/OTU_reps.fna'
otuTaxFile = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7/OTU_binning/otusn_tax/otusn_tax_assignments.txt'
genomeDir = '/var/seq_data/ncbi_db/genome/Jan2016/bac_complete_spec-rep1_rn/'
In [33]:
import re
import glob
import itertools
import random
from pprint import pprint
%load_ext rpy2.ipython
In [34]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
In [35]:
if not os.path.isdir(workDir):
os.makedirs(workDir)
%cd $workDir
In [36]:
# concatenating sequences
!cat $otuRepFile $rnammerSeqs > ssu_all.fna
!printf 'Number of sequences: '
!grep -c ">" ssu_all.fna
In [9]:
!cd-hit-est -i ssu_all.fna -o ssu_all_cdhit -c 0.97 -d 0
In [10]:
inFile = os.path.join(workDir, 'ssu_all_cdhit.clstr')
tbl = {}
with open(inFile, 'rb') as inFH:
clst_id = None
for line in inFH:
line = line.rstrip()
if line.startswith('>'):
clst_id = line.lstrip('>Cluster ')
tbl[clst_id] = []
else:
tbl[clst_id].append(re.split('\t|, ', line))
print "Number of clusters loaded: {}".format(len(tbl.keys()))
In [11]:
# clusters that have '>OTU' and '>rRNA' (OTU and genome)
def shared_clust(x):
otus = any([y[2].startswith('>OTU') for y in x])
genomes = any([y[2].startswith('>rRNA') for y in x])
return otus == True and genomes == True
tbl_f = {x:y for x,y in tbl.items() if shared_clust(y)}
print "Number of clusters with OTUs and genomes: {}".format(len(tbl_f.keys()))
In [14]:
# loading tax file
tax = {}
with open(otuTaxFile, 'rb') as inFH:
for line in inFH:
line = line.rstrip()
if not line.startswith('OTU'):
continue
otu,cls,boot,_ = line.split('\t')
cls = [x.lstrip(' __') for x in cls.split(';')]
for i in range(8):
try:
len(cls[i])
except IndexError:
cls.append('Unclassified')
tax[otu] = cls
In [15]:
def printDict(d, n=10):
cnt = 0
for x,y in d.items():
pprint(x)
print(y)
cnt += 1
if cnt >= n:
break
In [16]:
printDict(tax, n=3)
In [17]:
# adding taxonomic classifications to OTUs in target clusters
for clstr,x in tbl_f.items():
for y in x:
ID = y[2].lstrip('>')
ID = re.sub('\.\.\..+','', ID)
#print 'ID: "{}"'.format(ID)
try:
y.append(tax[ID])
except KeyError:
y.append(None)
In [18]:
# gut check: manual check of OTU classifications & genome names
for clstr,x in tbl_f.items():
print 'Cluster: {}'.format(clstr)
for y in x:
ID = y[2].lstrip('>')
if ID.startswith('OTU'):
# classifications
try:
print ':'.join(y[3])[:100]
except IndexError:
print ':'.join(y[3])
elif ID.startswith('rRNA'):
# genome names
try:
print ID[:100]
except IndexError:
print ID
Notes:
In [24]:
# making a index of scaffolds for each genome
genome_list = glob.glob(os.path.join(genomeDir, '*.fna'))
def fasta_seqID(fastaFiles):
file_seqID = {}
for f in fastaFiles:
with open(f, 'rb') as iFH:
for line in iFH:
if line.startswith('>'):
line = line.lstrip('>').rstrip()
file_seqID[line] = f
return file_seqID
seqID_fastaID = fasta_seqID(genome_list)
print 'Index length: {}'.format(len(seqID_fastaID))
In [25]:
def write_targets(tbl, oFH, seqID_fastaID):
"""
Args:
tbl -- cd-hit results
oFH -- output file handle
seqID_fastaID -- dict(seqName : fastaFileName)
"""
oFH.write('\t'.join(['cluster', 'ssu_ID', 'genome_fileID', 'genomeID',
'genome_seqID', 'OTU', 'OTU_taxonomy']) + '\n')
for clstr,rows in tbl.items():
# parsing cluster; getting all OTUs and genome_IDs
targets = []
otus = []
for row in rows:
ID = row[2].lstrip('>')
ID = re.sub('\.\.\..+','', ID)
if ID.startswith('OTU'):
otu = [ID, ':'.join(row[3])]
otus.append(otu)
elif ID.startswith('rRNA'):
targets.append(ID)
# writing out list
## one 1 randomly selected genome is associated with OTU
random.shuffle(targets)
for otu, target in zip(otus, itertools.cycle(targets)):
# genome sequence name
seqID = target[5:] # removing rRNA_
seqID = re.sub('_\d+-\d+_DIR.+', '', seqID)
# genome file name
try:
fileID = seqID_fastaID[seqID]
except KeyError:
msg = 'Cannot find "{}"'
print msg.format(seqID)
fileID = ''
genomeID = ''
else:
# genome name
x = os.path.split(fileID)
genomeID = os.path.splitext(x[1])[0]
# writing row
oFH.write('\t'.join([clstr, target, fileID, genomeID, seqID] + otu) + '\n')
In [26]:
outFile = os.path.join(workDir, 'target_taxa.txt')
with open(outFile, 'wb') as oFH:
write_targets(tbl_f, oFH, seqID_fastaID)
In [27]:
!printf "Number of rows in table: "
!cd $workDir; \
tail -n +2 target_taxa.txt | wc -l
In [37]:
!printf "Number of clusters: "
!tail -n +2 target_taxa.txt | cut -f 1 | sort -u | wc -l
In [38]:
!printf "Number of target genomes: "
!tail -n +2 target_taxa.txt | cut -f 3 | sort -u | wc -l
In [39]:
!printf "Number of OTUs: "
!tail -n +2 target_taxa.txt | cut -f 4 | sort -u | wc -l
In [54]:
repGenomeAmplicons = '/home/nick/notebook/SIPSim/dev/fullCyc/CD-HIT/ssu_all.fna'
fullCycAmplicons = '/var/seq_data/fullCyc/MiSeq_16SrRNA/515f-806r/lib1-7rs/OTU_binning/otusn.pick.fasta'
In [56]:
!seq_tools fasta_unwrap ssu_all.fna | grep ">OTU" -A 1| egrep -v "^--$" > ssu_all_OTU.fna
In [57]:
!seq_tools fasta_info --fn --tgc ssu_all_OTU.fna $repGenomeAmplicons $fullCycAmplicons
In [ ]: