Author: Jagannath
Date: 25th Sept 2015
Last modified : 25th Sept 2015
The file parses the .ptt file to extract information about the CDS (protein coding genes).
Total number of species analyzed : 2,774
Total number of genomes (including plasmids) : 5,220
Example of the format (tab delimited)-
Acaryochloris marina MBIC11017 chromosome, complete genome - 1..6503724
6254 proteins
Location Strand Length PID Gene Synonym Code COG Product
1627..2319 - 230 158333234 - AM1_0003 - COG1051F NUDIX hydrolase
Locus_tag is used as the identifier although PID (protein ID) is also provided. The dictionaries created are -
In [2]:
from __future__ import division
import os
from collections import defaultdict
import fnmatch
import pickle
import re
import time
import sys
sys.path.append('/home/jaggu/research/downloads/vienna/viennaRNA/lib/python2.7/site-packages')
import RNA
class PTTfile:
def __init__(self,pttFile):
self.pttFile = pttFile
self.accNbr = os.path.split(pttFile)[-1].split('.')[0]
self.orgName = os.path.split(os.path.split(pttFile)[0])[1]
self.fnaFile = self.getFnaFile(pttFile)
#This is the fasta file of the entire sequence; Note there is substitution of directory
def getFnaFile(self,pttFile):
s = pttFile
p = re.compile('allGenomePttFiles')
fnaFile = p.sub('allGenomeFnaFiles',s)
fnaFile = fnaFile[:-4]+'.fna'
return fnaFile
def openPttFile(self):
f = self.pttFile
ifile = open(f,'r')
allLines = ifile.readlines()
orgInfo = allLines[0]
self.genome_size = int(orgInfo.strip().split('..')[-1])
lines = allLines[3:] #Skipping the header
ifile.close()
return lines
def getPos(self,loc,strand):
if strand is '+':
startPos, endPos = map(int,loc.split('..'))
else:
endPos, startPos = map(int,loc.split('..'))
return (startPos,endPos)
def makeLinePairs(self,lines):
# Makes pairs of adjacent genes; By default all genomes are considered circular. So the last pair in the lines
# is the last gene --> first gene
# Returns list of [(line0,line1,rank0),(line1,line2,rank1),...(line4175,line0,rank4175)]
nbrGenes = len(lines)
line_first = lines[0]
line_last = lines[-1]
lines_shift = lines[1:]
lines_shift.append(line_first)
linePairs = zip(lines,lines_shift,range(nbrGenes))
return linePairs
def getCDSinfo(self,line,rank):
# Column information
# [0]Location [1]Strand [2]Length [3]PID [4]Gene [5]Synonym [6]Code [7]COG [8]Product
(loc, strand, pid, gene, locus_tag, COG) = (line.split('\t')[i] for i in [0,1,3,4,5,7])
if ',' in COG: # There are some wierd .ptt files wherein the COG is written as COG1,COG1
COG = COG.split(',')[0]
if not COG.startswith('-'):
[COG] = re.findall("COG\d+",COG)
#pos = getPos(loc,strand)
pos = map(int,loc.split('..'))
cds_info = [self.accNbr, rank, locus_tag, gene, pid, pos, strand, COG]
return cds_info
def getCOGpair(self,geneA,geneB):
# Returns a tuple of COGpairs; If orientation is '-' and '-', then the cogpairs are reversed accordingly
cogPair = ('-','-')
strA = geneA[-2]
strB = geneB[-2]
if strA == strB == '+':
cogPair = (geneA[-1],geneB[-1])
elif strA == strB == '-':
cogPair = (geneB[-1],geneA[-1])
else:
cogPair = ('-','-')
return cogPair
def get_DNAsequence(self):
ifile = open(self.fnaFile,'r')
lines = ifile.readlines()
ifile.close()
allDNA = ''
for l in lines[1:]:
allDNA+=l.strip()
return allDNA
def getSeq(self,start,end):
return self.allDNA[start:end]
def getGC(self,seq):
try:
gc = (seq.count('G')+seq.count('C'))/len(seq)
except ZeroDivisionError:
gc = 0
return gc
def getStartEnd(self,lposB,rposA):
dist = lposB - rposA
if dist >= 0:
start = rposA
end = lposB
else:
if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
startA = rposA
endA = self.genome_size
startB = 0
endB = lposB
return ((startA,endA),(startB,endB))
else:
start = lposB
end = rposA
return ((start,end),True)
def calculate_intergenicDistance(self,lposB,rposA):
dist = lposB - rposA
if abs(dist) > self.genome_size/100: # Some large number; This is for gene pairs (first and last gene)
dist = (self.genome_size - rposA) + lposB
return dist
def getGCcontent(self,(lposA,rposA),(lposB,rposB)):
seqA = self.getSeq(lposA,rposA)
seqB = self.getSeq(lposB,rposB)
geneA_GC = self.getGC(seqA)
geneB_GC = self.getGC(seqB)
# Intergenic
between1,between2 = self.getStartEnd(lposB,rposA)
if between2 is True:
start, end = between1
seq = self.getSeq(start,end)
else: #If these are the last gene pairs on the chromosome
startA,endA = between1
seq1 = self.getSeq(startA,endA)
startB,endB = between2
seq2 = self.getSeq(startB,endB)
seq = seq1 + seq2
intergenic_GC = self.getGC(seq)
return geneA_GC,geneB_GC,intergenic_GC,seq
def getSecStr(self,dnaSeq):
dotNot = RNA.fold(dnaSeq)[0]
re.findall("\(+",dotNot)
_rep = re.sub("\(+","S",dotNot)
_rep = re.sub("\)+","S",_rep)
slNotation = re.sub("\.+","L",_rep)
return slNotation
def populateDict(self):
self.allDNA = self.get_DNAsequence()
lines = self.openPttFile()
linePairs = self.makeLinePairs(lines)
for pair in linePairs:
cogPair = ('-','-')
lineA,lineB,rankA = pair
geneA = self.getCDSinfo(lineA,rankA)
geneB = self.getCDSinfo(lineB,rankA+1)
lTagA = geneA[2]; lTagB = geneB[2]
cogPair = self.getCOGpair(geneA,geneB)
cogPair_count_dict[cogPair]+=1
cog_locusTagList_dict[geneA[-1]].append(geneA[2])
locus_cog_dict[geneA[2]] = geneA[7]
lposA,rposA = geneA[5]
lposB,rposB = geneB[5]
geneA_GC,geneB_GC, intergenic_GC,intergenic_seq = self.getGCcontent((lposA,rposA),(lposB,rposB))
#org_lTagPairInterGeneSeq_dict[(self.orgName,self.accNbr)].append([lTagA,lTagB,intergenic_seq])
secStr = self.getSecStr(intergenic_seq)
print secStr
bp_dist = self.calculate_intergenicDistance(lposB,rposA)
if not cogPair == ('-','-'):
org_cogPair_dict[(self.orgName,self.accNbr)].append(cogPair)
org_cog_dict[(self.orgName,self.accNbr)].append(cogPair[0])
cogPair_org_dict[cogPair].append((self.orgName,self.accNbr))
cogPair_bpdist_dict[cogPair].append(bp_dist)
cogPair_GCcontent_dict[cogPair].append((geneA_GC,geneB_GC,intergenic_GC))
org_lTagPair_dict[(self.orgName,self.accNbr)].append((lTagA,lTagB))
# Defining dictionary
cogPair_count_dict = defaultdict(int)
cog_locusTagList_dict = defaultdict(list)
locus_cog_dict = dict()
org_cogPair_dict = defaultdict(list)
org_cog_dict = defaultdict(list)
cogPair_org_dict = defaultdict(list)
cogPair_bpdist_dict = defaultdict(list)
cogPair_GCcontent_dict = defaultdict(list)
org_lTagPair_dict = defaultdict(list)
org_lTagPairInterGeneSeq_dict = defaultdict(list)
sourceDir = '/home/jaggu/research/allGenomePttFiles'
pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'
def locate(pattern, root=os.curdir):
'''Locate all files matching supplied filename pattern in and
below supplied root directory.'''
allFiles = []
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in fnmatch.filter(files, pattern):
allFiles.append(os.path.join(path,filename))
return allFiles
def savePkl(db,pklFname):
f = os.path.join(pklPath,pklFname)
pickle.dump(db,open(f,'w'))
return
def main(sourceDir):
allPttFiles = locate('*.ptt',sourceDir)
for pttFile in allPttFiles:
#print "Parsing %s ...."%(pttFile)
genome = PTTfile(pttFile)
genome.populateDict()
savePkl(cogPair_count_dict,'cogPair_count.dict.pkl')
savePkl(cog_locusTagList_dict,'cog_locusTag.dict.pkl')
savePkl(locus_cog_dict,'locus_cog.dict.pkl')
savePkl(org_cogPair_dict,'org_cogPair.dict.pkl')
savePkl(org_cog_dict,'org_cogList.dict.pkl')
savePkl(cogPair_org_dict,'cogPair_org.dict.pkl')
savePkl(cogPair_bpdist_dict,'cogPair_bpdist.dict.pkl')
savePkl(cogPair_GCcontent_dict,'cogPair_GCcontent.dict.pkl')
savePkl(org_lTagPair_dict,'org_locusTagPair.dict.pkl')
savePkl(org_lTagPairInterGeneSeq_dict,'org_locusTagPairInterGeneSeq.dict.pkl')
print "Parsed all files at",time.ctime()
def test_case():
orgName = 'Bacillus_subtilis_168_uid57675'
pttFile = 'NC_000964.ptt'
test = PTTfile(os.path.join(sourceDir,orgName,pttFile))
test.get_DNAsequence()
test.populateDict()
#print org_cogPair_dict.items()[0]
#print len(set(org_cog_dict.values()[0]))
#print len(org_cog_dict.values()[0])
#print cogPair_count_dict
#print cogPair_bpdist_dict
#print cogPair_GCcontent_dict.items()[10]
print "Starting ",time.ctime()
#main(sourceDir)
test_case()
print "Parsing all genomes done",time.ctime()
In [26]:
# Mapping KEGG orthogroup (kog) to COG orthogroup or a list of COG (cog)
# This is a many to many set mapping; The file was downloaded from KEGG database.
import os
import re
from collections import defaultdict
import cPickle as pickle
sourceDir = '/home/jaggu/research/downloads/cogDB'
fname = 'kog2cog.txt'
ifile = open(os.path.join(sourceDir,fname),'r')
lines = ifile.readlines()
ifile.close()
def savePkl(db,pklFname):
pklPath = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklPath,pklFname)
pickle.dump(db,open(f,'w'))
return
# Dictionary
kog_cogList_dict = dict()
cog_kogList_dict = defaultdict(list)
for line in lines[1:]:
kog,cogStr = line.split('\t')
cogList = re.findall("COG\d+",cogStr)
kog_cogList_dict[kog]=cogList
for cog in cogList:
cog_kogList_dict[cog].append(kog)
savePkl(kog_cogList_dict,'kog_cogList.dict.pkl')
savePkl(cog_kogList_dict,'cog_kogList.dict.pkl')
print "Dictionaries pickled"
In [1]:
import time
import os
import pickle
def loadPkl(fname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,fname)
db = pickle.load(open(f))
return db
def savePkl(db,pklFname):
pklDir = '/home/jaggu/research/projectFiles/operons/pklFiles'
f = os.path.join(pklDir,pklFname)
pickle.dump(db,open(f,'w'))
return
#cogPair_fgocInfo_dict = loadPkl('cogPair_fgocInfo.dict.pkl')
#cog_func_dict = loadPkl('cogFunc.dict.pkl')
#cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')
print "Dictionaries loaded",time.ctime()
In [10]:
cog_spList = loadPkl('cog_spList.dict.pkl')
cog_func_dict = loadPkl('cogFunc.dict.pkl')
cog_locusTag_dict = loadPkl('cog_locusTag.dict.pkl')
print "Dictionaries loaded",time.ctime()
In [12]:
# Statistics of the COG functions
# Redid this figure at species level collapse
import os
import pickle
import collections
from operator import itemgetter
imgDir = '/home/jaggu/research/projectFiles/operons/figures'
% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
code_fname = os.path.join('/home/jaggu/research/downloads/cogDB','fun2003-2014.tab')
ifile = open(code_fname)
lines = ifile.readlines()
ifile.close()
code_category_dict = dict()
for line in lines[1:]:
code, name = line.strip().split('\t')
code_category_dict[code]=name
category_counts_dict = collections.defaultdict(int)
for cog, ltag in cog_locusTag_dict.items():
codeNames, func = cog_func_dict.get(cog,(None,None))
if func:
if len(codeNames)>1:
catSplit = list(codeNames)
for code in catSplit:
category_counts_dict[code]+=len(ltag)
else:
category_counts_dict[codeNames]+=len(ltag)
xNames = list()
xList = list()
yList = list()
xyList = list()
for code, nbr in sorted(category_counts_dict.items()):
wname = code + ' : ' + code_category_dict[code]
xyList.append([wname,nbr])
xyList = sorted(xyList,key=itemgetter(1))
xNames,yList = zip(*xyList)
print "Total number of genes mapped : ",sum(yList)
width = 0.5
xList = np.arange(len(xNames))
rects = plt.barh(xList, yList, width, color='#0072B2')
plt.yticks(xList+0.2,xNames,rotation='horizontal')
plt.ylabel('Number of categories')
plt.title('Composition of COG functional categories ')
plt.xlim([0,400000])
# Saving figure
fname = 'COG_function_composition.totalLtag.bar.svg'
f = os.path.join(imgDir,fname)
print f
#plt.savefig(f,dpi=300)
plt.show()
In [19]:
# Converting locusTag to taxid.seqID (this is PID - protein ID);
# Then there is a tab delimited file, where eggNOG members are represented by this PID
# Dictionary - (a) locusTag : PID (taxid.seqid); (b) PID : locusTag
# (c)eggNOG : [locusTag1,...] (from the members file) (d) locusTag : eggNOG
import collections
def convertLocusTag_PID(f):
lTag_PID_dict = dict()
PID_lTag_dict = dict()
for line in open(f):
taxid,seqid,keggid,srcDB = [line.strip().split('\t')[i] for i in (0,1,2,3)]
lTag = keggid.split(':')[-1]
pid = taxid + '.' + seqid
lTag_PID_dict[lTag] = pid
PID_lTag_dict[pid] = lTag
return lTag_PID_dict, PID_lTag_dict
eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'
lTag_eNOG_dict = dict()
convFname = 'eggNOG.keggid_conversion.tsv' #Made this file in shell
f1 = os.path.join(eggNOGsourceDir,convFname)
lTag_PID_dict,PID_lTag_dict = convertLocusTag_PID(f1)
savePkl(lTag_PID_dict,'locusTag_PID.dict.pkl')
savePkl(PID_lTag_dict,'PID_locusTag.dict.pkl')
print "locus Tag <-> PID mapped at",time.ctime()
In [5]:
# Loading dictionaries
lTag_PID_dict = loadPkl('locusTag_PID.dict.pkl')
PID_lTag_dict = loadPkl('PID_locusTag.dict.pkl')
org_lTagPair_dict = loadPkl('org_locusTagPair.dict.pkl')
lTag_eNOG_dict = loadPkl('locusTag_eggNOG.dict.pkl')
print "Dictionaries loaded ",time.ctime()
In [8]:
# Using the NOG members file to create the dictionaries.
import collections
def getENOG_lTag(f):
lTag_eNOG_dict = dict()
for line in open(f):
NOG,pid_list = [line.strip().split('\t')[i] for i in (1,5)]
eNOG = 'e'+NOG
for pid in pid_list.split(','):
try:
lTag = PID_lTag_dict[pid]
lTag_eNOG_dict[lTag] = eNOG
except KeyError: pass
return lTag_eNOG_dict
eggNOGsourceDir = '/home/jaggu/research/downloads/eggNOG'
NOG_membersFname = 'NOG.members.tsv'
f2 = os.path.join(eggNOGsourceDir,NOG_membersFname)
lTag_eNOG_dict = getENOG_lTag(f2)
savePkl(lTag_eNOG_dict,'locusTag_eggNOG.dict.pkl')
print "locus Tag eggNOG dictionary created",time.ctime()
In [6]:
# Now I have locus Tag : eggNOG dictionary. Will use the org: (locusTagPairs)... to generate an org:(eNOG1,eNOG2),..
# dictionary. This can be collapsed to species and genus level.
import collections
import time
def get_eNOG(lTag):
eNOG = lTag_eNOG_dict.get(lTag,'-')
return eNOG
org_eggNOGPair = collections.defaultdict(list)
for org,lTagPair_list in org_lTagPair_dict.items():
for [lTag1,lTag2] in lTagPair_list:
eNOG1 = get_eNOG(lTag1)
eNOG2 = get_eNOG(lTag2)
if not (eNOG1 == '-' and eNOG2 == '-'):
org_eggNOGPair[org].extend([(eNOG1,eNOG2)])
savePkl(org_eggNOGPair,'org_eggNOGPair.dict.pkl')
print "organism:eggNOGPair List created",time.ctime()
In [ ]:
# Creating a eggNOG functional file; Making a node attribute