In [5]:
from __future__ import division
import pickle
import os
import time
import collections
def loadPkl(fname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,fname)
db = pickle.load(open(f))
return db
def savePkl(db,pklFname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,pklFname)
pickle.dump(db,open(f,'w'))
return
In [6]:
org_cogPair = loadPkl('org_cogPair.dict.pkl')
org_eggNOGPair = loadPkl('org_eggNOGPair.dict.pkl')
cogPair_sp_fgocInfo = loadPkl('cogPair_fgocInfo.spp.dict.pkl')
cogPair_genus_fgocInfo = loadPkl('cogPair_fgocInfo.genus.dict.pkl')
eNogPair_sp_fgocInfo = loadPkl('eNOGPair_fgocInfo.spp.dict.pkl')
eNogPair_genus_fgocInfo = loadPkl('eNOGPair_fgocInfo.genus.dict.pkl')
print "Loaded dictionaries",time.ctime()
In [21]:
# A general way of collapsing to species and genus leves; What you require is org:[(OG1,OG2),(OG2,OG3),...]
# Can currently handle the COG and eggNOG
# Making species and genus level dictionary: i.e sp_cogPair; genus_cogPair
# sp:[(COG1,COG2),(COG2,COG3),..]
# genus:[(COG1,COG2),(COG2,COG3),..]
# The basic idea is to collapse the cogpairs to be the union of all the cogpairs found in the encompassing orgs (strains)
class CollapsedRank:
def __init__(self):
self.sp_allOGPairs = collections.defaultdict(list)
self.genus_allOGPairs = collections.defaultdict(list)
self.sp_ogPair = dict()
self.sp_ogList = dict()
self.genus_ogPair = dict()
self.genus_ogList = dict()
def unionOgPair(self,ogPair_list):
ogPair_set = [set(i) for i in ogPair_list]
ogPair_union = list(set.union(*ogPair_set))
return ogPair_union
def getOgList(self,ogPair_list):
ogList = list()
for pair in ogPair_list:
ogList.extend(pair)
return list(set(ogList))
def collapseSpeciesGenus(self,org_ogPair):
for (orgName,accNbr),ogPair_list in org_ogPair.items():
sp = orgName.split('_')[0]+'_'+orgName.split('_')[1]
genus = orgName.split('_')[0]
self.sp_allOGPairs[sp].append(ogPair_list)
self.genus_allOGPairs[genus].append(ogPair_list)
for sp, ogPair_list in self.sp_allOGPairs.items():
ogPair_union = self.unionOgPair(ogPair_list)
self.sp_ogPair[sp] = ogPair_union
ogList = self.getOgList(ogPair_union)
self.sp_ogList[sp] = ogList
for genus, ogPair_list in self.genus_allOGPairs.items():
ogPair_union = self.unionOgPair(ogPair_list)
self.genus_ogPair[genus] = ogPair_union
ogList = self.getOgList(ogPair_union)
self.genus_ogList[genus] = ogList
eggNOG = CollapsedRank()
eggNOG.collapseSpeciesGenus(org_eggNOGPair)
savePkl(eggNOG.sp_ogPair,'sp_eggNOGPair.dict.pkl')
savePkl(eggNOG.sp_ogList,'sp_eggNOGList.dict.pkl')
savePkl(eggNOG.genus_ogPair,'genus_eggNOGPair.dict.pkl')
savePkl(eggNOG.genus_ogList,'genus_eggNOGcogList.dict.pkl')
print "Collapsed done; Species and Genus level collapse of COGPairs done;",time.ctime()
In [26]:
# Calculating FGOC and dFGOC score for each COGPair
# ['COG-A','COG-B','COUNT','dirFGOC','FGOC','COUNT(COG-A)','COUNT(COG-B)','\n']
# I am going to run through the list of species_cogPair and make a dictionary for the COG Pair;
# Similar function for the genus level
# Making the following intermediate dictionaries -
# (a) cogPair = #(A --> B); COUNT
# (b) cogA = sp1,sp2,...; (c) cogB = sp1,sp2,...
cogPair_fgocInfo_sp = collections.defaultdict(list)
cogPair_fgocInfo_genus = collections.defaultdict(list)
def getCogPair_fgocInfo(db):
cogPair_count = collections.defaultdict(int)
cog_nameList_duplicated = collections.defaultdict(list)
cogPair_fgocInfo = dict()
cog_nameList = dict()
for name,cogPair_list in db.items():
for cogPair in cogPair_list:
cogA, cogB = cogPair
if cogA == '-' and cogB == '-': continue;
else:
cog_nameList_duplicated[cogA].append(name)
cog_nameList_duplicated[cogB].append(name)
cogPair_count[cogPair]+=1
# Due to the way the spp name was added to the dictionary, there are duplicates for each COG
for cog,sp_duplicated in cog_nameList_duplicated.items():
cog_nameList[cog] = list(set(sp_duplicated))
for cogPair,count in cogPair_count.items():
cogA, cogB = cogPair
if not (cogA == '-' or cogB == '-'): #Both COG is annotated
countA = len(cog_nameList[cogA])
countB = len(cog_nameList[cogB])
fgoc = count / (countA + countB)
dfgoc = count / (countA)
else:
if cogA == '-': countA = 'nan'
elif cogB == '-': countB = 'nan'
else: raise SystemError("Not possible; Should have been eliminated")
if countB == 'nan':
countA = len(cog_nameList[cogA])
fgoc = 'nan'
dfgoc = count/(countA)
else:
countB = len(cog_nameList[cogB])
fgoc = 'nan'
dfgoc = 'nan'
fgocInfo = [cogA, cogB, count, dfgoc, fgoc, countA, countB]
cogPair_fgocInfo[(cogA,cogB)] = fgocInfo
return cogPair_count, cog_nameList, cogPair_fgocInfo
def makeFGOC(ogName):
if ogName == 'COG':
cogPair_sp_count, cog_spList, cogPair_sp_fgocInfo = getCogPair_fgocInfo(sp_cogPair)
savePkl(cogPair_sp_fgocInfo,'cogPair_fgocInfo.spp.dict.pkl')
savePkl(cog_spList, 'cog_spList.dict.pkl')
cogPair_genus_count, cog_genusList, cogPair_genus_fgocInfo = getCogPair_fgocInfo(genus_cogPair)
savePkl(cogPair_genus_fgocInfo,'cogPair_fgocInfo.genus.dict.pkl')
savePkl(cog_genusList, 'cog_genusList.dict.pkl')
elif ogName == 'eggNOG':
eNogPair_sp_count, eNog_spList, eNogPair_sp_fgocInfo = getCogPair_fgocInfo(sp_eNOGPair)
savePkl(eNogPair_sp_fgocInfo,'eNOGPair_fgocInfo.spp.dict.pkl')
savePkl(eNog_spList, 'eNOG_spList.dict.pkl')
eNogPair_genus_count, eNog_genusList, eNogPair_genus_fgocInfo = getCogPair_fgocInfo(genus_eNOGPair)
savePkl(eNogPair_genus_fgocInfo,'eNOGPair_fgocInfo.genus.dict.pkl')
savePkl(eNog_genusList, 'eNOG_genusList.dict.pkl')
makeFGOC('eggNOG')
print "OG_Pair FGOC info dictionaries created;",time.ctime()
In [12]:
# Making Graph; With cutoffs 0, 0.1 and 0.3 at the species and genus levels
# Need cogPair_sp_fgocInfo and cogPair_genus_fgocInfo
# For eggNOG graphs - there are many orthogroups that are generated by non-supervised clustering; These are
# named as ENOG; The other groups are COG
sourceDestDir = '/home/jaggu/research/projectFiles/operons/graphFiles'
def makeGraphFile(ogName, rank,cutoff,nanStatus,rank_fgocInfo,filtered=True):
if filtered:
print "Filtered by counts"
countCutoff = 10
graphDestDir = os.path.join(sourceDestDir,ogName,rank,'countFilter')
if not os.path.exists(graphDestDir): os.makedirs(graphDestDir)
f = 'basic.'+rank+'.cutoff_'+str(cutoff)+'.'+str(nanStatus)+ '.countCutoff_'+str(countCutoff)+ '.tab'
else:
print "Not filtered"
countCutoff = 0
graphDestDir = os.path.join(sourceDestDir,ogName,rank)
if not os.path.exists(graphDestDir): os.makedirs(graphDestDir)
f = 'basic.'+rank+'.cutoff_'+str(cutoff)+'.'+str(nanStatus)+'.tab'
ofname = os.path.join(graphDestDir,f)
ofile = open(ofname,'w')
header = '\t'.join(['OG-A','OG-B','COUNT','dirFGOC','FGOC','COUNT(OG-A)','COUNT(OG-B)','\n'])
ofile.write(header)
for cogPair, fgocInfo in rank_fgocInfo.items():
# fgocInfo = [cogA, cogB, count, dfgoc, fgoc, countA, countB]
[cogA, cogB, count, dfgoc, fgoc, countA, countB] = fgocInfo
if ogName == 'eggNOG':
cogA = cogA[1:]
cogB = cogB[1:]
fgocInfo = [cogA,cogB,count,dfgoc,fgoc,countA,countB]
if dfgoc > cutoff:
if count > countCutoff:
if nanStatus == 'noNaN':
if not fgoc == 'nan': #Because if either cogA or cogB is nan; then fgoc = nan
l = '\t'.join(map(str,fgocInfo)) + '\n'
ofile.write(l)
else:
l = '\t'.join(map(str,fgocInfo)) + '\n'
ofile.write(l)
ofile.close()
return ofname
def writeGraphFile(rank,cutoff,nanStatus,ogName,filtered):
if ogName == 'COG':
if rank == 'species':
f = makeGraphFile(ogName,rank,cutoff,nanStatus,cogPair_sp_fgocInfo,filtered) #for species
if rank == 'genus':
f = makeGraphFile(ogName,rank,cutoff,nanStatus,cogPair_genus_fgocInfo,filtered) #for genus
if ogName == 'eggNOG':
if rank == 'species':
f = makeGraphFile(ogName,rank,cutoff,nanStatus,eNogPair_sp_fgocInfo,filtered) #for species
if rank == 'genus':
f = makeGraphFile(ogName,rank,cutoff,nanStatus,eNogPair_genus_fgocInfo,filtered) #for genus
return True
ogName = 'eggNOG'
for rank in ('genus','species'):
for cutoff in (0.0,0.1,0.2,0.3):
for nanStatus in ('nan','noNaN'):
for filtered in (True,False):
print "Processing ",ogName,"...",rank,cutoff,nanStatus
writeGraphFile(rank,cutoff,nanStatus,ogName,filtered)
In [ ]: