Author: Jagannath

Date: 25th Sept 2015

Last modified : 25th Sept 2015

The file parses the .ptt file to extract information about the CDS (protein coding genes).
Total number of species analyzed : 2,774
Total number of genomes (including plasmids) : 5,220

Example of the format (tab delimited)- Acaryochloris marina MBIC11017 chromosome, complete genome - 1..6503724 6254 proteins
Location Strand Length PID Gene Synonym Code COG Product 1627..2319 - 230 158333234 - AM1_0003 - COG1051F NUDIX hydrolase
Locus_tag is used as the identifier although PID (protein ID) is also provided. The dictionaries created are -

  1. { locus_tag : [accNbr, rank, geneName, PID (protein ID), (startPos,endPos), COG (COG number with Code)] }
  2. { COG (w/ code) : ( [locus_tag1, locus_tag2, ...],Count of locus_tags ) }
  3. { COG(A)-COG(B) : count } # Count of number of times when COG(A) before COG(B); A --> B on the same strand
  4. { (orgName,accNbr):( (COG(A),COG(B)),....) )
  5. { COG(A)-COG(B) : set([(orgName,accNbr),...])} </ol> [! In future, maybe can think of A --> B (but A and B are on different strands); Also this NCBI link has details about its precomputed clusters link
  6. 
    
    In [2]:
    from __future__ import division
    import os
    from collections import defaultdict
    import fnmatch
    import pickle
    import re
    import time
    import sys
    sys.path.append('/home/jaggu/research/downloads/vienna/viennaRNA/lib/python2.7/site-packages')
    import RNA
    
    class PTTfile:
    
        def __init__(self,pttFile):
            self.pttFile = pttFile
            self.accNbr = os.path.split(pttFile)[-1].split('.')[0]
            self.orgName = os.path.split(os.path.split(pttFile)[0])[1]
            self.fnaFile = self.getFnaFile(pttFile) 
            #This is the fasta file of the entire sequence; Note there is substitution of directory
        
        def getFnaFile(self,pttFile):
            s = pttFile
            p = re.compile('allGenomePttFiles')
            fnaFile = p.sub('allGenomeFnaFiles',s)
            fnaFile = fnaFile[:-4]+'.fna'
            return fnaFile
        
        def openPttFile(self):
            f = self.pttFile
            ifile = open(f,'r')
            allLines = ifile.readlines()
            orgInfo = allLines[0]
            self.genome_size = int(orgInfo.strip().split('..')[-1])
            lines = allLines[3:] #Skipping the header
            ifile.close()
            return lines
            
        def getPos(self,loc,strand):
            if strand is '+':
                startPos, endPos = map(int,loc.split('..'))
            else:
                endPos, startPos = map(int,loc.split('..'))
            return (startPos,endPos)
    
        def makeLinePairs(self,lines):
            # Makes pairs of adjacent genes; By default all genomes are considered circular. So the last pair in the lines 
            # is the last gene --> first gene
            # Returns list of [(line0,line1,rank0),(line1,line2,rank1),...(line4175,line0,rank4175)]
            nbrGenes = len(lines)
            line_first = lines[0]
            line_last = lines[-1]
            lines_shift = lines[1:]
            lines_shift.append(line_first)
            linePairs = zip(lines,lines_shift,range(nbrGenes))
            return linePairs
    
        def getCDSinfo(self,line,rank):
            # Column information
            # [0]Location	[1]Strand	[2]Length	[3]PID	[4]Gene	[5]Synonym	[6]Code	[7]COG	[8]Product
            (loc, strand, pid, gene, locus_tag, COG) = (line.split('\t')[i] for i in [0,1,3,4,5,7])
            if ',' in COG: # There are some wierd .ptt files wherein the COG is written as COG1,COG1
                COG = COG.split(',')[0]
            if not COG.startswith('-'):
                [COG] = re.findall("COG\d+",COG)
            #pos = getPos(loc,strand)
            pos = map(int,loc.split('..'))
            cds_info = [self.accNbr, rank, locus_tag, gene, pid, pos, strand, COG]
            return cds_info
    
        def getCOGpair(self,geneA,geneB):
            # Returns a tuple of COGpairs; If orientation is '-' and '-', then the cogpairs are reversed accordingly
            cogPair = ('-','-')
            strA = geneA[-2]
            strB = geneB[-2]
            if strA == strB == '+':
                cogPair = (geneA[-1],geneB[-1])
            elif strA == strB == '-':
                cogPair = (geneB[-1],geneA[-1])
            else:
                cogPair = ('-','-')
            return cogPair
        
        def get_DNAsequence(self):
            ifile = open(self.fnaFile,'r')
            lines = ifile.readlines()
            ifile.close()
            allDNA = ''
            for l in lines[1:]:
                allDNA+=l.strip()
            return allDNA
        
        def getSeq(self,start,end):
            return self.allDNA[start:end]
    
        def getGC(self,seq):
            try:
                gc = (seq.count('G')+seq.count('C'))/len(seq)
            except ZeroDivisionError:
                gc = 0
            return gc
        
        def getStartEnd(self,lposB,rposA):
            dist = lposB - rposA
            if dist >= 0:
                start = rposA
                end = lposB
            else:
                if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
                    startA = rposA
                    endA = self.genome_size
                    startB = 0
                    endB = lposB
                    return ((startA,endA),(startB,endB))
                else:
                    start = lposB
                    end = rposA
            return ((start,end),True)
    
        def calculate_intergenicDistance(self,lposB,rposA):
            dist = lposB - rposA
            if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
                dist = (self.genome_size - rposA) + lposB
            return dist
        
        def getGCcontent(self,(lposA,rposA),(lposB,rposB)):
            seqA = self.getSeq(lposA,rposA)
            seqB = self.getSeq(lposB,rposB)
            geneA_GC = self.getGC(seqA)
            geneB_GC = self.getGC(seqB)
            
            # Intergenic
            between1,between2 = self.getStartEnd(lposB,rposA)
            if between2 is True:
                start, end = between1
                seq = self.getSeq(start,end)
            else: #If these are the last gene pairs on the chromosome
                startA,endA = between1
                seq1 = self.getSeq(startA,endA)
                startB,endB = between2
                seq2 = self.getSeq(startB,endB)
                seq = seq1 + seq2
    
            intergenic_GC = self.getGC(seq)
            return geneA_GC,geneB_GC,intergenic_GC,seq
        
        def getSecStr(self,dnaSeq):
            dotNot = RNA.fold(dnaSeq)[0]
            re.findall("\(+",dotNot)
            _rep = re.sub("\(+","S",dotNot)
            _rep = re.sub("\)+","S",_rep)
            slNotation = re.sub("\.+","L",_rep)
            return slNotation
        
        def populateDict(self):
            self.allDNA = self.get_DNAsequence()
            lines = self.openPttFile()
            linePairs = self.makeLinePairs(lines)
            for pair in linePairs:
                cogPair = ('-','-')
                lineA,lineB,rankA = pair
                geneA = self.getCDSinfo(lineA,rankA)
                geneB = self.getCDSinfo(lineB,rankA+1)
                lTagA = geneA[2]; lTagB = geneB[2]
                cogPair = self.getCOGpair(geneA,geneB)
                cogPair_count_dict[cogPair]+=1
                cog_locusTagList_dict[geneA[-1]].append(geneA[2])
                locus_cog_dict[geneA[2]] = geneA[7]
                lposA,rposA = geneA[5]
                lposB,rposB = geneB[5]
                geneA_GC,geneB_GC, intergenic_GC,intergenic_seq = self.getGCcontent((lposA,rposA),(lposB,rposB))
                #org_lTagPairInterGeneSeq_dict[(self.orgName,self.accNbr)].append([lTagA,lTagB,intergenic_seq])
                secStr = self.getSecStr(intergenic_seq)
                print secStr
                bp_dist = self.calculate_intergenicDistance(lposB,rposA)
                if not cogPair == ('-','-'):
                    org_cogPair_dict[(self.orgName,self.accNbr)].append(cogPair)
                    org_cog_dict[(self.orgName,self.accNbr)].append(cogPair[0])
                    cogPair_org_dict[cogPair].append((self.orgName,self.accNbr))
                    cogPair_bpdist_dict[cogPair].append(bp_dist)
                    cogPair_GCcontent_dict[cogPair].append((geneA_GC,geneB_GC,intergenic_GC))
                    org_lTagPair_dict[(self.orgName,self.accNbr)].append((lTagA,lTagB))
                    
    # Defining dictionary
    cogPair_count_dict = defaultdict(int)
    cog_locusTagList_dict = defaultdict(list)
    locus_cog_dict = dict()
    org_cogPair_dict = defaultdict(list) 
    org_cog_dict = defaultdict(list)
    cogPair_org_dict = defaultdict(list)
    cogPair_bpdist_dict = defaultdict(list)
    cogPair_GCcontent_dict = defaultdict(list)
    org_lTagPair_dict = defaultdict(list)
    org_lTagPairInterGeneSeq_dict = defaultdict(list)
    
    sourceDir = '/home/jaggu/research/allGenomePttFiles'
    pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'
    
    def locate(pattern, root=os.curdir):
        '''Locate all files matching supplied filename pattern in and
        below supplied root directory.'''
        allFiles = []
        for path, dirs, files in os.walk(os.path.abspath(root)):
            for filename in fnmatch.filter(files, pattern):
                allFiles.append(os.path.join(path,filename))
        return allFiles
    
    def savePkl(db,pklFname):
        f = os.path.join(pklPath,pklFname)
        pickle.dump(db,open(f,'w'))
        return 
    
    def main(sourceDir):
        allPttFiles = locate('*.ptt',sourceDir)
        for pttFile in allPttFiles:
            #print "Parsing %s ...."%(pttFile)
            genome = PTTfile(pttFile)
            genome.populateDict()
        savePkl(cogPair_count_dict,'cogPair_count.dict.pkl')
        savePkl(cog_locusTagList_dict,'cog_locusTag.dict.pkl')
        savePkl(locus_cog_dict,'locus_cog.dict.pkl')
        savePkl(org_cogPair_dict,'org_cogPair.dict.pkl')
        savePkl(org_cog_dict,'org_cogList.dict.pkl')
        savePkl(cogPair_org_dict,'cogPair_org.dict.pkl')
        savePkl(cogPair_bpdist_dict,'cogPair_bpdist.dict.pkl')
        savePkl(cogPair_GCcontent_dict,'cogPair_GCcontent.dict.pkl')
        savePkl(org_lTagPair_dict,'org_locusTagPair.dict.pkl')
        savePkl(org_lTagPairInterGeneSeq_dict,'org_locusTagPairInterGeneSeq.dict.pkl')
        print "Parsed all files at",time.ctime()
            
    def test_case():
        orgName = 'Bacillus_subtilis_168_uid57675'
        pttFile = 'NC_000964.ptt'
        test = PTTfile(os.path.join(sourceDir,orgName,pttFile))
        test.get_DNAsequence()
        test.populateDict()
        
        #print org_cogPair_dict.items()[0]
        #print len(set(org_cog_dict.values()[0]))
        #print len(org_cog_dict.values()[0])
        #print cogPair_count_dict
        #print cogPair_bpdist_dict
        #print cogPair_GCcontent_dict.items()[10]
    
    print "Starting ",time.ctime()
    #main(sourceDir)
    test_case()
    print "Parsing all genomes done",time.ctime()
    
    
    
    
    Starting  Thu Dec 24 14:02:46 2015
    LSLSLSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL
    LSLSLSLSSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL
    LSLSL
    L
    LSLSLSLSSLSLSLSLSL
    LSLSSLSSLSLSLSLSLSLSLSLSLSLSLSLSLSLSL
    
    ---------------------------------------------------------------------------
    KeyboardInterrupt                         Traceback (most recent call last)
    <ipython-input-2-90aae8b3c07d> in <module>()
        244 print "Starting ",time.ctime()
        245 #main(sourceDir)
    --> 246 test_case()
        247 print "Parsing all genomes done",time.ctime()
    
    <ipython-input-2-90aae8b3c07d> in test_case()
        233     test = PTTfile(os.path.join(sourceDir,orgName,pttFile))
        234     test.get_DNAsequence()
    --> 235     test.populateDict()
        236 
        237     #print org_cogPair_dict.items()[0]
    
    <ipython-input-2-90aae8b3c07d> in populateDict(self)
        170             geneA_GC,geneB_GC, intergenic_GC,intergenic_seq = self.getGCcontent((lposA,rposA),(lposB,rposB))
        171             #org_lTagPairInterGeneSeq_dict[(self.orgName,self.accNbr)].append([lTagA,lTagB,intergenic_seq])
    --> 172             secStr = self.getSecStr(intergenic_seq)
        173             print secStr
        174             bp_dist = self.calculate_intergenicDistance(lposB,rposA)
    
    <ipython-input-2-90aae8b3c07d> in getSecStr(self, dnaSeq)
        145 
        146     def getSecStr(self,dnaSeq):
    --> 147         dotNot = RNA.fold(dnaSeq)[0]
        148         re.findall("\(+",dotNot)
        149         _rep = re.sub("\(+","S",dotNot)
    
    KeyboardInterrupt: 
    
    
    In [26]:
    # Mapping KEGG orthogroup (kog) to COG orthogroup or a list of COG (cog)
    # This is a many to many set mapping; The file was downloaded from KEGG database.
    import os
    import re
    from collections import defaultdict
    import cPickle as pickle
    
    sourceDir = '/home/jaggu/research/downloads/cogDB'
    fname = 'kog2cog.txt'
    ifile = open(os.path.join(sourceDir,fname),'r')
    lines = ifile.readlines()
    ifile.close()
    
    def savePkl(db,pklFname):
        pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'
        f = os.path.join(pklPath,pklFname)
        pickle.dump(db,open(f,'w'))
        return 
    
    # Dictionary 
    kog_cogList_dict = dict()
    cog_kogList_dict = defaultdict(list)
    
    for line in lines[1:]:
        kog,cogStr = line.split('\t')
        cogList = re.findall("COG\d+",cogStr)
        kog_cogList_dict[kog]=cogList
        for cog in cogList:
            cog_kogList_dict[cog].append(kog) 
    
    savePkl(kog_cogList_dict,'kog_cogList.dict.pkl')
    savePkl(cog_kogList_dict,'cog_kogList.dict.pkl')
    
    print "Dictionaries pickled"
    
    
    
    
    Dictionaries pickled
    
    
    
    In [1]:
    import time
    import os
    import pickle
    
    def loadPkl(fname):
        pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
        f = os.path.join(pklDir,fname)
        db = pickle.load(open(f))
        return db
    
    def savePkl(db,pklFname):
        pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'    
        f = os.path.join(pklDir,pklFname)
        pickle.dump(db,open(f,'w'))
        return 
    
    #cogPair_fgocInfo_dict = loadPkl('cogPair_fgocInfo.dict.pkl')
    #cog_func_dict = loadPkl('cogFunc.dict.pkl')
    #cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')
    print "Dictionaries loaded",time.ctime()
    
    
    
    
    Dictionaries loaded Wed Dec 23 16:01:24 2015
    
    
    
    In [10]:
    cog_spList = loadPkl('cog_spList.dict.pkl')
    cog_func_dict = loadPkl('cogFunc.dict.pkl')
    cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')
    
    
    print "Dictionaries loaded",time.ctime()
    
    
    
    
    Dictionaries loaded Wed Dec 23 16:21:15 2015
    
    
    
    In [12]:
    # Statistics of the COG functions
    # Redid this figure at species level collapse
    import os
    import pickle
    import collections
    from operator import itemgetter
    
    imgDir = '/home/jaggu/research/projectFiles/operons/figures'
    
    % matplotlib inline
    import numpy as np
    import matplotlib.pyplot as plt
    
    code_fname = os.path.join('/home/jaggu/research/downloads/cogDB','fun2003-2014.tab')
    ifile = open(code_fname)
    lines = ifile.readlines()
    ifile.close()
    
    code_category_dict = dict()
    for line in lines[1:]:
        code, name = line.strip().split('\t')
        code_category_dict[code]=name
    
    category_counts_dict = collections.defaultdict(int)
    
    for cog, ltag in cog_locusTag_dict.items():
        codeNames, func = cog_func_dict.get(cog,(None,None))
        if func:
            if len(codeNames)>1:
                catSplit = list(codeNames)
                for code in catSplit: 
                    category_counts_dict[code]+=len(ltag)
            else:
                category_counts_dict[codeNames]+=len(ltag)
    
                
    xNames = list()
    xList = list()
    yList = list()
    xyList = list()
    
    
    for code, nbr in sorted(category_counts_dict.items()):
        wname = code + ' : ' + code_category_dict[code]
        xyList.append([wname,nbr])
        
    xyList = sorted(xyList,key=itemgetter(1))
    xNames,yList = zip(*xyList)
    print "Total number of genes mapped : ",sum(yList)
    width = 0.5
    xList = np.arange(len(xNames))
    rects = plt.barh(xList, yList, width, color='#0072B2')
    plt.yticks(xList+0.2,xNames,rotation='horizontal')
    plt.ylabel('Number of categories')
    plt.title('Composition of COG functional categories ')
    plt.xlim([0,400000])
    # Saving figure
    fname = 'COG_function_composition.totalLtag.bar.svg'
    f = os.path.join(imgDir,fname)
    print f
    #plt.savefig(f,dpi=300)
    plt.show()
    
    
    
    
    Total number of genes mapped :  4046852
    /home/jaggu/research/projectFiles/operons/figures/COG_function_composition.totalLtag.bar.svg
    

    Generating the entire orthologous pair for eggNOG

    I first had to download the mapping file - eggNOG uses something called proteinid (PID or pid here) = taxid.seqid; This file is - eggnog4.protein_id_conversion.tsv; I ran a shell The other files are mainly members.tsv file downloaded from [eggNOG website](http://eggnogdb.embl.de/#/app/downloads) on 18th December 2015.
    The files are - (a) NOG.members.tsv (b) bactNOG.members.tsv and (c) arNOG.members.tsv

    Dictionaries

    Finally going to make the dictionaries -
    1. org : [(eNOG1,eNOG2),(eNOG2,eNOG3)...] --> org_eNOGPair_dict
    2. org : [eNOG1,eNOG2,...] --> org_eNOGList_dict
    5. eNOG : [sp1,sp2,...] (unique spp) --> eNOG_spList_dict
    6.
    
    
    In [19]:
    # Converting locusTag to taxid.seqID (this is PID - protein ID); 
    # Then there is a tab delimited file, where eggNOG members are represented by this PID  
    # Dictionary - (a) locusTag : PID (taxid.seqid); (b) PID : locusTag
    # (c)eggNOG : [locusTag1,...] (from the members file) (d) locusTag : eggNOG
    import collections
    
    def convertLocusTag_PID(f):
        lTag_PID_dict = dict()
        PID_lTag_dict = dict()
        for line in open(f):
            taxid,seqid,keggid,srcDB = [line.strip().split('\t')[i] for i in (0,1,2,3)]
            lTag = keggid.split(':')[-1]
            pid = taxid + '.' + seqid
            lTag_PID_dict[lTag] = pid
            PID_lTag_dict[pid] = lTag
        return lTag_PID_dict, PID_lTag_dict
    
    eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'
    lTag_eNOG_dict = dict()
    
    convFname = 'eggNOG.keggid_conversion.tsv' #Made this file in shell
    f1 = os.path.join(eggNOGsourceDir,convFname)
    
    lTag_PID_dict,PID_lTag_dict = convertLocusTag_PID(f1)
    savePkl(lTag_PID_dict,'locusTag_PID.dict.pkl')
    savePkl(PID_lTag_dict,'PID_locusTag.dict.pkl')
    print "locus Tag <-> PID mapped at",time.ctime()
    
    
    
    
    locus Tag <-> PID mapped at Fri Dec 18 16:48:26 2015
    
    
    
    In [5]:
    # Loading dictionaries
    lTag_PID_dict = loadPkl('locusTag_PID.dict.pkl')
    PID_lTag_dict = loadPkl('PID_locusTag.dict.pkl')
    org_lTagPair_dict = loadPkl('org_locusTagPair.dict.pkl')
    lTag_eNOG_dict = loadPkl('locusTag_eggNOG.dict.pkl')
    
    print "Dictionaries loaded ",time.ctime()
    
    
    
    
    Dictionaries loaded  Fri Dec 18 20:33:18 2015
    
    
    
    In [8]:
    # Using the NOG members file to create the dictionaries.
    import collections
    
    def getENOG_lTag(f):
        lTag_eNOG_dict = dict()
        for line in open(f):
            NOG,pid_list = [line.strip().split('\t')[i] for i in (1,5)]
            eNOG = 'e'+NOG
            for pid in pid_list.split(','): 
                try:
                    lTag = PID_lTag_dict[pid]
                    lTag_eNOG_dict[lTag] = eNOG
                except KeyError: pass
        return lTag_eNOG_dict
                
    eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'
    
    NOG_membersFname = 'NOG.members.tsv'
    f2 = os.path.join(eggNOGsourceDir,NOG_membersFname)
    
    lTag_eNOG_dict = getENOG_lTag(f2)
    savePkl(lTag_eNOG_dict,'locusTag_eggNOG.dict.pkl')
    
    print "locus Tag eggNOG dictionary created",time.ctime()
    
    
    
    
    locus Tag eggNOG dictionary created Fri Dec 18 17:18:20 2015
    
    
    
    In [6]:
    # Now I have locus Tag : eggNOG dictionary. Will use the org: (locusTagPairs)... to generate an org:(eNOG1,eNOG2),..
    # dictionary. This can be collapsed to species and genus level.
    import collections
    import time
    
    def get_eNOG(lTag):
        eNOG = lTag_eNOG_dict.get(lTag,'-')
        return eNOG
    
    org_eggNOGPair = collections.defaultdict(list)
    
    for org,lTagPair_list in org_lTagPair_dict.items():
        for [lTag1,lTag2] in lTagPair_list:
            eNOG1 = get_eNOG(lTag1)
            eNOG2 = get_eNOG(lTag2)
            if not (eNOG1 == '-' and eNOG2 == '-'):
                org_eggNOGPair[org].extend([(eNOG1,eNOG2)])
    
    savePkl(org_eggNOGPair,'org_eggNOGPair.dict.pkl')
    print "organism:eggNOGPair List created",time.ctime()
    
    
    
    
    organism:eggNOGPair List created Fri Dec 18 20:33:35 2015
    
    
    
    In [ ]:
    # Creating a eggNOG functional file; Making a node attribute