In [194]:
import os
import sys
import cPickle as pickle
def loadPkl(fname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,fname)
db = pickle.load(open(f))
return db
def savePkl(db,pklFname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,pklFname)
pickle.dump(db,open(f,'w'))
return
In [195]:
import time
accNbr_taxID_dict = loadPkl('accNbr_taxID.dict.pkl')
taxID_accNbr_dict = loadPkl('taxID_accNbr.dict.pkl')
cog_spList_dict = loadPkl('cog_spList.dict.pkl')
sp_cogList_dict = loadPkl('sp_cogList.dict.pkl')
cog_rankMembers_dict = loadPkl('cog_rankMembers.dict_dict.pkl')
sp_cogList = loadPkl('sp_cogList.dict.pkl')
org_cogList = loadPkl('org_cogList.dict.pkl')
genus_cogList = loadPkl('genus_cogList.dict.pkl')
print "Dictionaries loaded",time.ctime()
In [94]:
# After collapsing the genome sequences to species level, I want to redo the same plots to map out the diversity
# of the sampling.
# (a) Rank1(i.e. kingdom):[list of spp],... (b) COG1:[List of rank members (for Rank1)],[List of members (for Rank2)]
import os
import collections
from ete2 import NCBITaxa, AttrFace, TreeStyle
ncbi = NCBITaxa()
cog_rankMembers_dict = dict()
def make_RankDictionary(spList):
rankName_memberList = collections.defaultdict(list)
allSp = [item.replace('_',' ') for item in spList]
name2taxid = ncbi.get_name_translator(allSp)
for [taxid] in name2taxid.values():
lineage = ncbi.get_lineage(taxid)
taxid_rankName_dict = ncbi.get_rank(lineage)
for taxid,rankName in taxid_rankName_dict.items():
taxid2name = ncbi.get_taxid_translator([taxid])
name = taxid2name[taxid]
rankName_memberList[rankName].append(name)
return rankName_memberList
for cog, spList in cog_spList_dict.items():
rankName_memberList_dict = make_RankDictionary(spList)
cog_rankMembers_dict[cog] = rankName_memberList_dict
savePkl(cog_rankMembers_dict,'cog_rankMembers.dict_dict.pkl')
print "COG : [Rank:List],.."
In [175]:
from ete2 import NCBITaxa, AttrFace, TreeStyle
ncbi = NCBITaxa()
def my_phyloTree(taxid_list):
taxid2name = ncbi.get_taxid_translator(taxid_list)
tree = ncbi.get_topology(taxid_list,rank_limit=None,collapse_subspecies=False)
return tree
# custom layout: adds "rank" on top of branches, and sci_name as tip names
def my_layout(node):
if getattr(node, "rank", None):
rank_face = AttrFace("sci_name", fsize=12, fgcolor="#d55e00")
node.add_face(rank_face, column=0, position="branch-top")
if node.is_leaf():
sciname_face = AttrFace("sci_name", fsize=12, fgcolor="#0072B2")
node.add_face(sciname_face, column=0, position="branch-right")
In [147]:
from ete2 import NCBITaxa, AttrFace, TreeStyle
ncbi = NCBITaxa()
treeFig_dir = '/home/jaggu/research/projectFiles/operons/figures/treeFigures'
treeDir = '/home/jaggu/research/projectFiles/operons/tree_files'
allStrains_dir = '/home/jaggu/research/allGenomePttFiles'
allTaxID = list()
for path, dirnames, files in os.walk(allStrains_dir):
for f in files:
accNbr = os.path.split(f)[1][:-4]
taxid = accNbr_taxID_dict.get(accNbr,None)
allTaxID.append(taxid)
allTaxID = set(allTaxID)
ts = TreeStyle()
ts.layout_fn = my_layout
ts.show_leaf_name = False
ts.mode = "c"
#ts.arc_start = 0 # 0 degrees = 3 o'clock
#ts.arc_span = 180
tree = my_phyloTree(allTaxID)
#tree.show(tree_style=ts)
# Save Tree as figure
fname = 'allbacterial.tree.'
f = os.path.join(treeFig_dir,fname)
#tree.write(features=[],outfile=f,format=1)
tree.show(tree_style=ts)
#print "Rendered", f
In [63]:
# The idea is to get the statistics of the database I am working with; How many species in each category etc.
# The rank nomenclature is : 1. Root 2.Cellular organisms; 3.Superkingdom; 4.Phylum; 5.Class; 6.Order; 7.Family;
# 8.Genus; 9.Species; 10.Strains
# Will get the counts for ranks = Phylum, class, order, family, genus, species
import collections
def populateRANK(allTaxID_list):
allRANK_dict = collections.defaultdict(list)
for taxid in allTaxID_list:
lineage = ncbi.get_lineage(taxid)
rank_dict = ncbi.get_rank(lineage)
for rankID,rankName in rank_dict.items():
allRANK_dict[rankName].append(rankID)
return allRANK_dict
def getAllTaxID():
allStrains_dir = '/home/jaggu/research/allGenomePttFiles'
allTaxID = list()
for path, dirnames, files in os.walk(allStrains_dir):
for f in files:
accNbr = os.path.split(f)[1][:-4]
taxid = accNbr_taxID_dict.get(accNbr,None)
allTaxID.append(taxid)
return allTaxID
def listMakeUp(idList):
name_Counts_dict = collections.defaultdict(int)
for taxid in idList:
[name] = ncbi.translate_to_names([taxid])
name_Counts_dict[name]+=1
return name_Counts_dict
allTaxID = getAllTaxID()
allTaxID_list = list(set(allTaxID))
print "Number of organisms (unique Taxon ID) : ",len(allTaxID_list)
allRANK_dict = populateRANK(allTaxID_list)
rankNames = allRANK_dict.keys()
composition_dict = dict()
for rankName, allID in allRANK_dict.items():
name_Counts_dict = listMakeUp(allID)
composition_dict[rankName] = name_Counts_dict
allPhylum = allRANK_dict['phylum']
allClass = allRANK_dict['class']
allOrder = allRANK_dict['order']
allFamily = allRANK_dict['family']
allGenus = allRANK_dict['genus']
allSpecies = allRANK_dict['species']
print "Number of unique species",len(allSpecies)
print composition_dict['phylum']
In [197]:
# Making new plots after the collapse of organisms (strains) to species level.
# Needs the sp_cogList.pkl for getting the list
# Taxonomic ranks considered - superkingdom (domain), phylum, class, order, family, genus, species
# But the taxonomy id maps it to more stratified divisions namely -
from __future__ import division
% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
def plot_domainPhylaDistribution(imgDir, rankName_memberList_dict,level='sp'):
#Plotting the composition figure (a) pie chart for domain and (b) bar chart for phylum
def getXYList(rank):
rank_dict = getMembers(rank,rankName_memberList_dict)
nameCount_list = list()
for name, counts in rank_dict.items():
nameCount_list.append((name,counts))
nameCount_list = sorted(nameCount_list,key=itemgetter(1))
xNames,yList = zip(*nameCount_list)
xList = np.arange(len(xNames))
return xList, yList, xNames
# Bar chart highlighting the phyla represented
#rank = 'phylum'
#rank = 'class'
rank = 'order'
width = 0.5
xList, yList, xNames = getXYList(rank)
#new_yList = [item/sum(yList) for item in yList]
rects = plt.barh(xList, yList, width, color='#0072B2')
plt.yticks(xList+width,xNames)
plt.ylabel('Number of genomes present')
plt.title('Composition at the rank of '+rank + ' and level - '+level)
#plt.ylim([0,1000])
# Saving figure
fname = level+'-level.'+rank+'_composition.bar.svg'
f = os.path.join(imgDir,fname)
print f
plt.savefig(f,dpi=300)
plt.show()
# Pie chart highlighting the different superkingdoms represented
rank = 'superkingdom'
xList, yList, xNames = getXYList(rank)
colorList = ['#E69F00','#D55E00']
plt.pie(yList,labels=xNames,autopct='%1.1f%%',shadow=True,startangle=90,colors=colorList)
plt.axis('equal')
# Saving figure
fname = level+'-level.superkingdom_compositon.pie.svg'
f = os.path.join(imgDir,fname)
print f
plt.savefig(f,dpi=300)
plt.show()
return True
def listMakeUp(itemList):
name_counts_dict = collections.defaultdict(int)
for item in itemList:
name_counts_dict[item]+=1
return name_counts_dict
def make_RankDictionary(taxNameList,accNbr=False):
rankName_memberList = collections.defaultdict(list)
if accNbr:
allTaxID = [accNbr_taxID_dict.get(accNbr,None) for accNbr in taxNameList]
taxid_list = [item for item in allTaxID if item]
taxid2name = ncbi.get_taxid_translator(taxid_list)
allSp = taxid2name.values()
else:
allSp = [item.replace('_',' ') for item in taxNameList if not item == ''] #IMPORTANT
name2taxid = ncbi.get_name_translator(allSp)
for taxid_asList in name2taxid.values():
if len(taxid_asList) == 1: [taxid] = taxid_asList
else: taxid = max(taxid_asList) #There are multiple taxid mapped
lineage = ncbi.get_lineage(taxid)
taxid_rankName_dict = ncbi.get_rank(lineage)
for taxid,rankName in taxid_rankName_dict.items():
taxid2name = ncbi.get_taxid_translator([taxid])
name = taxid2name[taxid]
rankName_memberList[rankName].append(name)
return rankName_memberList
def getMembers(rank,rankName_memberList_dict):
member_counts = listMakeUp(rankName_memberList_dict[rank])
return member_counts
def org_levelDistribution():
allOrgs_accNbr = [item[1] for item in org_cogList.keys()]
org_rankName_memberList_dict = make_RankDictionary(allOrgs_accNbr,accNbr=True)
print org_rankName_memberList_dict.keys()
org_imgDir = '/home/jaggu/research/projectFiles/operons/newFigures/org_level_figures'
if not os.path.exists(org_imgDir): os.makedirs(org_imgDir)
plot_domainPhylaDistribution(org_imgDir, org_rankName_memberList_dict,level='org')
def sp_levelDistribution():
allSpecies = sp_cogList.keys()
sp_rankName_memberList_dict = make_RankDictionary(allSpecies)
sp_imgDir = '/home/jaggu/research/projectFiles/operons/newFigures/sp_level_figures'
if not os.path.exists(sp_imgDir): os.makedirs(sp_imgDir)
plot_domainPhylaDistribution(sp_imgDir, sp_rankName_memberList_dict,level='sp')
def genus_levelDistribution():
allGenus = genus_cogList.keys()
genus_rankName_memberList_dict = make_RankDictionary(allGenus)
genus_imgDir = '/home/jaggu/research/projectFiles/operons/newFigures/genus_level_figures'
if not os.path.exists(genus_imgDir): os.makedirs(genus_imgDir)
plot_domainPhylaDistribution(genus_imgDir, genus_rankName_memberList_dict,level='sp')
#genus_levelDistribution()
sp_levelDistribution()
#org_levelDistribution()
print "Composition of different ranks done"
In [164]:
lineage = ncbi.get_lineage(404)
print lineage
In [130]:
In [223]:
# Composition dictionary is made; Now I have to plot it as appropriate bar or pie chart
imgDir = '/home/jaggu/research/projectFiles/operons/figures'
#% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
print "Composition dictionary keys : ",composition_dict.keys()
def getXYList(rank):
nameCount_list = list()
name_counts_dict = composition_dict[rank]
for name, counts in name_counts_dict.items():
nameCount_list.append((name,counts))
xNames,yList = zip(*nameCount_list)
xList = np.arange(len(xNames))
return xList, yList, xNames
rank = 'phylum'
width = 0.5
xList, yList, xNames = getXYList(rank)
rects = plt.bar(xList, yList, width, color='#0072B2')
plt.xticks(xList+width,xNames,rotation='vertical')
plt.ylabel('Number of genomes present')
plt.title('Composition at the rank of '+rank)
# Saving figure
fname = 'phylum_composition.bar.svg'
f = os.path.join(imgDir,fname)
print f
plt.savefig(f,dpi=300)
plt.show()
# Pie chart highlighting the different superkingdoms represented
rank = 'superkingdom'
xList, yList, xNames = getXYList(rank)
explode = (0.75, 0.5,0) # only "explode" the 2nd slice (i.e. 'Hogs')
colorList = ['#E69F00','#D55E00','#0072B2']
plt.pie(yList,explode=explode,labels=xNames,autopct='%1.1f%%',shadow=True,startangle=90,colors=colorList)
plt.axis('equal')
# Saving figure
fname = 'superkingdom_compositon.pie.svg'
f = os.path.join(imgDir,fname)
print f
plt.savefig(f,dpi=300)
plt.show()
In [ ]:
In [222]:
# Plotting a tree of the phylum - proteobacteria; Getting the taxon ID of all the species from the given phylum
treeFig_dir = '/home/jaggu/research/projectFiles/operons/figures/treeFigures'
treeDir = '/home/jaggu/research/projectFiles/operons/treeFiles'
print "Keys in allRANK_dict",allRANK_dict.keys()
def drawTree(tree):
# Tree manipulation
ts = TreeStyle()
ts.layout_fn = my_layout
ts.show_leaf_name = False
ts.mode = "c"
fname = 'proteobacteria.tree.newick'
f = os.path.join(treeDir,fname)
tree.write(outfile=f,format=1)
print "Written",f
#tree.render("%%inline",tree_style = ts)
#tree.show(tree_style=ts)
def get_taxonID_fromRank(rankID,allTaxID_list):
filtered_list = list()
for taxid in allTaxID_list:
lineage = ncbi.get_lineage(taxid)
if rankID in lineage:
filtered_list.append(taxid)
return filtered_list
def listMakeUp(idList):
name_Counts_dict = collections.defaultdict(int)
for taxid in idList:
[name] = ncbi.translate_to_names([taxid])
name_Counts_dict[name]+=1
return name_Counts_dict
allTaxID = getAllTaxID()
allTaxID_list = list(set(allTaxID))
print "Number of organisms (unique Taxon ID) : ",len(allTaxID_list)
#print allRANK_dict['phylum']
def getComposition(name):
xNames = list()
yList = list()
name2taxid = ncbi.get_name_translator([name])
[rankID] = name2taxid[name]
filtered_taxonList = get_taxonID_fromRank(rankID,allTaxID_list)
tree = my_phyloTree(filtered_taxonList)
for node in tree.get_children():
taxid = int(node.name)
[name] = ncbi.translate_to_names([taxid])
xNames.append(name)
yList.append(len(node.get_leaves()))
xList = np.arange(len(xNames))
return xList, yList, xNames
#name = 'Proteobacteria'
#name = 'Firmicutes'
name = 'Bacilli'
#name = 'Gammaproteobacteria'
#name = 'Enterobacteriaceae'
#name = 'Escherichia'
xList, yList, xNames = getComposition(name)
print xList, xNames, yList
print sum(yList)
width = 0.5
rects = plt.bar(xList, yList, width, color='#0072B2')
plt.xticks(xList+0.25,xNames,rotation='vertical')
plt.ylabel('Number of genomes present')
plt.title('Composition of '+name)
# Saving figure
fname = name + '_composition.bar.svg'
f = os.path.join(imgDir,fname)
print f
plt.savefig(f,dpi=300)
plt.show()
In [40]:
allTaxID_list = list(allTaxID)
type(allTaxID_list)
taxid = allTaxID_list[0]
lineage = ncbi.get_lineage(1420014)
print lineage
lineage = ncbi.get_lineage(taxid)
print lineage
print ncbi.translate_to_names([1239])
print ncbi.translate_to_names(lineage)
db = ncbi.get_rank(lineage)
for rankid,rankname in db.items():
if rankname == 'phylum':
print rankid, rankname
In [70]:
from ete2 import Tree, TreeStyle
t = Tree()
t.populate(10)
ts = TreeStyle()
ts.show_leaf_name = True
ts.branch_vertical_margin = 10 # 10 pixels between adjacent branches
t.render("%%inline",tree_style=ts)
Out[70]:
In [ ]: