Author: Jagannath

Date: 25th Sept 2015

Last modified : 25th Sept 2015

The file parses the .ptt file to extract information about the CDS (protein coding genes).
Total number of species analyzed : 2,774
Total number of genomes (including plasmids) : 5,220

Example of the format (tab delimited)- Acaryochloris marina MBIC11017 chromosome, complete genome - 1..6503724 6254 proteins
Location Strand Length PID Gene Synonym Code COG Product 1627..2319 - 230 158333234 - AM1_0003 - COG1051F NUDIX hydrolase
Locus_tag is used as the identifier although PID (protein ID) is also provided. The dictionaries created are -

{ locus_tag : [accNbr, rank, geneName, PID (protein ID), (startPos,endPos), COG (COG number with Code)] }

{ COG (w/ code) : ( [locus_tag1, locus_tag2, ...],Count of locus_tags ) }

{ COG(A)-COG(B) : count } # Count of number of times when COG(A) before COG(B); A --> B on the same strand

{ (orgName,accNbr):( (COG(A),COG(B)),....) )

{ COG(A)-COG(B) : set([(orgName,accNbr),...])} </ol> [! In future, maybe can think of A --> B (but A and B are on different strands); Also this NCBI link has details about its precomputed clusters link



In [2]:

    
from __future__ import division
import os
from collections import defaultdict
import fnmatch
import pickle
import re
import time
import sys
sys.path.append('/home/jaggu/research/downloads/vienna/viennaRNA/lib/python2.7/site-packages')
import RNA

class PTTfile:

    def __init__(self,pttFile):
        self.pttFile = pttFile
        self.accNbr = os.path.split(pttFile)[-1].split('.')[0]
        self.orgName = os.path.split(os.path.split(pttFile)[0])[1]
        self.fnaFile = self.getFnaFile(pttFile) 
        #This is the fasta file of the entire sequence; Note there is substitution of directory
    
    def getFnaFile(self,pttFile):
        s = pttFile
        p = re.compile('allGenomePttFiles')
        fnaFile = p.sub('allGenomeFnaFiles',s)
        fnaFile = fnaFile[:-4]+'.fna'
        return fnaFile
    
    def openPttFile(self):
        f = self.pttFile
        ifile = open(f,'r')
        allLines = ifile.readlines()
        orgInfo = allLines[0]
        self.genome_size = int(orgInfo.strip().split('..')[-1])
        lines = allLines[3:] #Skipping the header
        ifile.close()
        return lines
        
    def getPos(self,loc,strand):
        if strand is '+':
            startPos, endPos = map(int,loc.split('..'))
        else:
            endPos, startPos = map(int,loc.split('..'))
        return (startPos,endPos)

    def makeLinePairs(self,lines):
        # Makes pairs of adjacent genes; By default all genomes are considered circular. So the last pair in the lines 
        # is the last gene --> first gene
        # Returns list of [(line0,line1,rank0),(line1,line2,rank1),...(line4175,line0,rank4175)]
        nbrGenes = len(lines)
        line_first = lines[0]
        line_last = lines[-1]
        lines_shift = lines[1:]
        lines_shift.append(line_first)
        linePairs = zip(lines,lines_shift,range(nbrGenes))
        return linePairs

    def getCDSinfo(self,line,rank):
        # Column information
        # [0]Location	[1]Strand	[2]Length	[3]PID	[4]Gene	[5]Synonym	[6]Code	[7]COG	[8]Product
        (loc, strand, pid, gene, locus_tag, COG) = (line.split('\t')[i] for i in [0,1,3,4,5,7])
        if ',' in COG: # There are some wierd .ptt files wherein the COG is written as COG1,COG1
            COG = COG.split(',')[0]
        if not COG.startswith('-'):
            [COG] = re.findall("COG\d+",COG)
        #pos = getPos(loc,strand)
        pos = map(int,loc.split('..'))
        cds_info = [self.accNbr, rank, locus_tag, gene, pid, pos, strand, COG]
        return cds_info

    def getCOGpair(self,geneA,geneB):
        # Returns a tuple of COGpairs; If orientation is '-' and '-', then the cogpairs are reversed accordingly
        cogPair = ('-','-')
        strA = geneA[-2]
        strB = geneB[-2]
        if strA == strB == '+':
            cogPair = (geneA[-1],geneB[-1])
        elif strA == strB == '-':
            cogPair = (geneB[-1],geneA[-1])
        else:
            cogPair = ('-','-')
        return cogPair
    
    def get_DNAsequence(self):
        ifile = open(self.fnaFile,'r')
        lines = ifile.readlines()
        ifile.close()
        allDNA = ''
        for l in lines[1:]:
            allDNA+=l.strip()
        return allDNA
    
    def getSeq(self,start,end):
        return self.allDNA[start:end]

    def getGC(self,seq):
        try:
            gc = (seq.count('G')+seq.count('C'))/len(seq)
        except ZeroDivisionError:
            gc = 0
        return gc
    
    def getStartEnd(self,lposB,rposA):
        dist = lposB - rposA
        if dist >= 0:
            start = rposA
            end = lposB
        else:
            if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
                startA = rposA
                endA = self.genome_size
                startB = 0
                endB = lposB
                return ((startA,endA),(startB,endB))
            else:
                start = lposB
                end = rposA
        return ((start,end),True)

    def calculate_intergenicDistance(self,lposB,rposA):
        dist = lposB - rposA
        if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
            dist = (self.genome_size - rposA) + lposB
        return dist
    
    def getGCcontent(self,(lposA,rposA),(lposB,rposB)):
        seqA = self.getSeq(lposA,rposA)
        seqB = self.getSeq(lposB,rposB)
        geneA_GC = self.getGC(seqA)
        geneB_GC = self.getGC(seqB)
        
        # Intergenic
        between1,between2 = self.getStartEnd(lposB,rposA)
        if between2 is True:
            start, end = between1
            seq = self.getSeq(start,end)
        else: #If these are the last gene pairs on the chromosome
            startA,endA = between1
            seq1 = self.getSeq(startA,endA)
            startB,endB = between2
            seq2 = self.getSeq(startB,endB)
            seq = seq1 + seq2

        intergenic_GC = self.getGC(seq)
        return geneA_GC,geneB_GC,intergenic_GC,seq
    
    def getSecStr(self,dnaSeq):
        dotNot = RNA.fold(dnaSeq)[0]
        re.findall("\(+",dotNot)
        _rep = re.sub("\(+","S",dotNot)
        _rep = re.sub("\)+","S",_rep)
        slNotation = re.sub("\.+","L",_rep)
        return slNotation
    
    def populateDict(self):
        self.allDNA = self.get_DNAsequence()
        lines = self.openPttFile()
        linePairs = self.makeLinePairs(lines)
        for pair in linePairs:
            cogPair = ('-','-')
            lineA,lineB,rankA = pair
            geneA = self.getCDSinfo(lineA,rankA)
            geneB = self.getCDSinfo(lineB,rankA+1)
            lTagA = geneA[2]; lTagB = geneB[2]
            cogPair = self.getCOGpair(geneA,geneB)
            cogPair_count_dict[cogPair]+=1
            cog_locusTagList_dict[geneA[-1]].append(geneA[2])
            locus_cog_dict[geneA[2]] = geneA[7]
            lposA,rposA = geneA[5]
            lposB,rposB = geneB[5]
            geneA_GC,geneB_GC, intergenic_GC,intergenic_seq = self.getGCcontent((lposA,rposA),(lposB,rposB))
            #org_lTagPairInterGeneSeq_dict[(self.orgName,self.accNbr)].append([lTagA,lTagB,intergenic_seq])
            secStr = self.getSecStr(intergenic_seq)
            print secStr
            bp_dist = self.calculate_intergenicDistance(lposB,rposA)
            if not cogPair == ('-','-'):
                org_cogPair_dict[(self.orgName,self.accNbr)].append(cogPair)
                org_cog_dict[(self.orgName,self.accNbr)].append(cogPair[0])
                cogPair_org_dict[cogPair].append((self.orgName,self.accNbr))
                cogPair_bpdist_dict[cogPair].append(bp_dist)
                cogPair_GCcontent_dict[cogPair].append((geneA_GC,geneB_GC,intergenic_GC))
                org_lTagPair_dict[(self.orgName,self.accNbr)].append((lTagA,lTagB))
                
# Defining dictionary
cogPair_count_dict = defaultdict(int)
cog_locusTagList_dict = defaultdict(list)
locus_cog_dict = dict()
org_cogPair_dict = defaultdict(list) 
org_cog_dict = defaultdict(list)
cogPair_org_dict = defaultdict(list)
cogPair_bpdist_dict = defaultdict(list)
cogPair_GCcontent_dict = defaultdict(list)
org_lTagPair_dict = defaultdict(list)
org_lTagPairInterGeneSeq_dict = defaultdict(list)

sourceDir = '/home/jaggu/research/allGenomePttFiles'
pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'

def locate(pattern, root=os.curdir):
    '''Locate all files matching supplied filename pattern in and
    below supplied root directory.'''
    allFiles = []
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for filename in fnmatch.filter(files, pattern):
            allFiles.append(os.path.join(path,filename))
    return allFiles

def savePkl(db,pklFname):
    f = os.path.join(pklPath,pklFname)
    pickle.dump(db,open(f,'w'))
    return 

def main(sourceDir):
    allPttFiles = locate('*.ptt',sourceDir)
    for pttFile in allPttFiles:
        #print "Parsing %s ...."%(pttFile)
        genome = PTTfile(pttFile)
        genome.populateDict()
    savePkl(cogPair_count_dict,'cogPair_count.dict.pkl')
    savePkl(cog_locusTagList_dict,'cog_locusTag.dict.pkl')
    savePkl(locus_cog_dict,'locus_cog.dict.pkl')
    savePkl(org_cogPair_dict,'org_cogPair.dict.pkl')
    savePkl(org_cog_dict,'org_cogList.dict.pkl')
    savePkl(cogPair_org_dict,'cogPair_org.dict.pkl')
    savePkl(cogPair_bpdist_dict,'cogPair_bpdist.dict.pkl')
    savePkl(cogPair_GCcontent_dict,'cogPair_GCcontent.dict.pkl')
    savePkl(org_lTagPair_dict,'org_locusTagPair.dict.pkl')
    savePkl(org_lTagPairInterGeneSeq_dict,'org_locusTagPairInterGeneSeq.dict.pkl')
    print "Parsed all files at",time.ctime()
        
def test_case():
    orgName = 'Bacillus_subtilis_168_uid57675'
    pttFile = 'NC_000964.ptt'
    test = PTTfile(os.path.join(sourceDir,orgName,pttFile))
    test.get_DNAsequence()
    test.populateDict()
    
    #print org_cogPair_dict.items()[0]
    #print len(set(org_cog_dict.values()[0]))
    #print len(org_cog_dict.values()[0])
    #print cogPair_count_dict
    #print cogPair_bpdist_dict
    #print cogPair_GCcontent_dict.items()[10]

print "Starting ",time.ctime()
#main(sourceDir)
test_case()
print "Parsing all genomes done",time.ctime()









    



Starting  Thu Dec 24 14:02:46 2015
LSLSLSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL
LSLSLSLSSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL
LSLSL
L
LSLSLSLSSLSLSLSLSL
LSLSSLSSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-2-90aae8b3c07d> in <module>()
    244 print "Starting ",time.ctime()
    245 #main(sourceDir)
--> 246 test_case()
    247 print "Parsing all genomes done",time.ctime()

<ipython-input-2-90aae8b3c07d> in test_case()
    233     test = PTTfile(os.path.join(sourceDir,orgName,pttFile))
    234     test.get_DNAsequence()
--> 235     test.populateDict()
    236 
    237     #print org_cogPair_dict.items()[0]

<ipython-input-2-90aae8b3c07d> in populateDict(self)
    170             geneA_GC,geneB_GC, intergenic_GC,intergenic_seq = self.getGCcontent((lposA,rposA),(lposB,rposB))
    171             #org_lTagPairInterGeneSeq_dict[(self.orgName,self.accNbr)].append([lTagA,lTagB,intergenic_seq])
--> 172             secStr = self.getSecStr(intergenic_seq)
    173             print secStr
    174             bp_dist = self.calculate_intergenicDistance(lposB,rposA)

<ipython-input-2-90aae8b3c07d> in getSecStr(self, dnaSeq)
    145 
    146     def getSecStr(self,dnaSeq):
--> 147         dotNot = RNA.fold(dnaSeq)[0]
    148         re.findall("\(+",dotNot)
    149         _rep = re.sub("\(+","S",dotNot)

KeyboardInterrupt:



In [26]:

    
# Mapping KEGG orthogroup (kog) to COG orthogroup or a list of COG (cog)
# This is a many to many set mapping; The file was downloaded from KEGG database.
import os
import re
from collections import defaultdict
import cPickle as pickle

sourceDir = '/home/jaggu/research/downloads/cogDB'
fname = 'kog2cog.txt'
ifile = open(os.path.join(sourceDir,fname),'r')
lines = ifile.readlines()
ifile.close()

def savePkl(db,pklFname):
    pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'
    f = os.path.join(pklPath,pklFname)
    pickle.dump(db,open(f,'w'))
    return 

# Dictionary 
kog_cogList_dict = dict()
cog_kogList_dict = defaultdict(list)

for line in lines[1:]:
    kog,cogStr = line.split('\t')
    cogList = re.findall("COG\d+",cogStr)
    kog_cogList_dict[kog]=cogList
    for cog in cogList:
        cog_kogList_dict[cog].append(kog) 

savePkl(kog_cogList_dict,'kog_cogList.dict.pkl')
savePkl(cog_kogList_dict,'cog_kogList.dict.pkl')

print "Dictionaries pickled"









    



Dictionaries pickled



In [1]:

    
import time
import os
import pickle

def loadPkl(fname):
    pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
    f = os.path.join(pklDir,fname)
    db = pickle.load(open(f))
    return db

def savePkl(db,pklFname):
    pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'    
    f = os.path.join(pklDir,pklFname)
    pickle.dump(db,open(f,'w'))
    return 

#cogPair_fgocInfo_dict = loadPkl('cogPair_fgocInfo.dict.pkl')
#cog_func_dict = loadPkl('cogFunc.dict.pkl')
#cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')
print "Dictionaries loaded",time.ctime()









    



Dictionaries loaded Wed Dec 23 16:01:24 2015



In [10]:

    
cog_spList = loadPkl('cog_spList.dict.pkl')
cog_func_dict = loadPkl('cogFunc.dict.pkl')
cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')


print "Dictionaries loaded",time.ctime()









    



Dictionaries loaded Wed Dec 23 16:21:15 2015



In [12]:

    
# Statistics of the COG functions
# Redid this figure at species level collapse
import os
import pickle
import collections
from operator import itemgetter

imgDir = '/home/jaggu/research/projectFiles/operons/figures'

% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

code_fname = os.path.join('/home/jaggu/research/downloads/cogDB','fun2003-2014.tab')
ifile = open(code_fname)
lines = ifile.readlines()
ifile.close()

code_category_dict = dict()
for line in lines[1:]:
    code, name = line.strip().split('\t')
    code_category_dict[code]=name

category_counts_dict = collections.defaultdict(int)

for cog, ltag in cog_locusTag_dict.items():
    codeNames, func = cog_func_dict.get(cog,(None,None))
    if func:
        if len(codeNames)>1:
            catSplit = list(codeNames)
            for code in catSplit: 
                category_counts_dict[code]+=len(ltag)
        else:
            category_counts_dict[codeNames]+=len(ltag)

            
xNames = list()
xList = list()
yList = list()
xyList = list()


for code, nbr in sorted(category_counts_dict.items()):
    wname = code + ' : ' + code_category_dict[code]
    xyList.append([wname,nbr])
    
xyList = sorted(xyList,key=itemgetter(1))
xNames,yList = zip(*xyList)
print "Total number of genes mapped : ",sum(yList)
width = 0.5
xList = np.arange(len(xNames))
rects = plt.barh(xList, yList, width, color='#0072B2')
plt.yticks(xList+0.2,xNames,rotation='horizontal')
plt.ylabel('Number of categories')
plt.title('Composition of COG functional categories ')
plt.xlim([0,400000])
# Saving figure
fname = 'COG_function_composition.totalLtag.bar.svg'
f = os.path.join(imgDir,fname)
print f
#plt.savefig(f,dpi=300)
plt.show()









    



Total number of genes mapped :  4046852
/home/jaggu/research/projectFiles/operons/figures/COG_function_composition.totalLtag.bar.svg

Generating the entire orthologous pair for eggNOG

I first had to download the mapping file - eggNOG uses something called proteinid (PID or pid here) = taxid.seqid; This file is - eggnog4.protein_id_conversion.tsv; I ran a shell The other files are mainly members.tsv file downloaded from [eggNOG website](http://eggnogdb.embl.de/#/app/downloads) on 18th December 2015.
The files are - (a) NOG.members.tsv (b) bactNOG.members.tsv and (c) arNOG.members.tsv

Dictionaries

Finally going to make the dictionaries -
1. org : [(eNOG1,eNOG2),(eNOG2,eNOG3)...] --> org_eNOGPair_dict
2. org : [eNOG1,eNOG2,...] --> org_eNOGList_dict
5. eNOG : [sp1,sp2,...] (unique spp) --> eNOG_spList_dict
6.



In [19]:

    
# Converting locusTag to taxid.seqID (this is PID - protein ID); 
# Then there is a tab delimited file, where eggNOG members are represented by this PID  
# Dictionary - (a) locusTag : PID (taxid.seqid); (b) PID : locusTag
# (c)eggNOG : [locusTag1,...] (from the members file) (d) locusTag : eggNOG
import collections

def convertLocusTag_PID(f):
    lTag_PID_dict = dict()
    PID_lTag_dict = dict()
    for line in open(f):
        taxid,seqid,keggid,srcDB = [line.strip().split('\t')[i] for i in (0,1,2,3)]
        lTag = keggid.split(':')[-1]
        pid = taxid + '.' + seqid
        lTag_PID_dict[lTag] = pid
        PID_lTag_dict[pid] = lTag
    return lTag_PID_dict, PID_lTag_dict

eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'
lTag_eNOG_dict = dict()

convFname = 'eggNOG.keggid_conversion.tsv' #Made this file in shell
f1 = os.path.join(eggNOGsourceDir,convFname)

lTag_PID_dict,PID_lTag_dict = convertLocusTag_PID(f1)
savePkl(lTag_PID_dict,'locusTag_PID.dict.pkl')
savePkl(PID_lTag_dict,'PID_locusTag.dict.pkl')
print "locus Tag <-> PID mapped at",time.ctime()









    



locus Tag <-> PID mapped at Fri Dec 18 16:48:26 2015



In [5]:

    
# Loading dictionaries
lTag_PID_dict = loadPkl('locusTag_PID.dict.pkl')
PID_lTag_dict = loadPkl('PID_locusTag.dict.pkl')
org_lTagPair_dict = loadPkl('org_locusTagPair.dict.pkl')
lTag_eNOG_dict = loadPkl('locusTag_eggNOG.dict.pkl')

print "Dictionaries loaded ",time.ctime()









    



Dictionaries loaded  Fri Dec 18 20:33:18 2015



In [8]:

    
# Using the NOG members file to create the dictionaries.
import collections

def getENOG_lTag(f):
    lTag_eNOG_dict = dict()
    for line in open(f):
        NOG,pid_list = [line.strip().split('\t')[i] for i in (1,5)]
        eNOG = 'e'+NOG
        for pid in pid_list.split(','): 
            try:
                lTag = PID_lTag_dict[pid]
                lTag_eNOG_dict[lTag] = eNOG
            except KeyError: pass
    return lTag_eNOG_dict
            
eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'

NOG_membersFname = 'NOG.members.tsv'
f2 = os.path.join(eggNOGsourceDir,NOG_membersFname)

lTag_eNOG_dict = getENOG_lTag(f2)
savePkl(lTag_eNOG_dict,'locusTag_eggNOG.dict.pkl')

print "locus Tag eggNOG dictionary created",time.ctime()









    



locus Tag eggNOG dictionary created Fri Dec 18 17:18:20 2015



In [6]:

    
# Now I have locus Tag : eggNOG dictionary. Will use the org: (locusTagPairs)... to generate an org:(eNOG1,eNOG2),..
# dictionary. This can be collapsed to species and genus level.
import collections
import time

def get_eNOG(lTag):
    eNOG = lTag_eNOG_dict.get(lTag,'-')
    return eNOG

org_eggNOGPair = collections.defaultdict(list)

for org,lTagPair_list in org_lTagPair_dict.items():
    for [lTag1,lTag2] in lTagPair_list:
        eNOG1 = get_eNOG(lTag1)
        eNOG2 = get_eNOG(lTag2)
        if not (eNOG1 == '-' and eNOG2 == '-'):
            org_eggNOGPair[org].extend([(eNOG1,eNOG2)])

savePkl(org_eggNOGPair,'org_eggNOGPair.dict.pkl')
print "organism:eggNOGPair List created",time.ctime()









    



organism:eggNOGPair List created Fri Dec 18 20:33:35 2015



In [ ]:

    
# Creating a eggNOG functional file; Making a node attribute