In [124]:
from sys import argv
import re
import pandas as pd
from Bio.KEGG import Enzyme
Get Unique EC Numbers from Tables of RS-only Clusters
In [125]:
# Input file locations. Both are hard-coded absolute local paths and must be
# edited before the notebook is run on another machine.
# NOTE(review): the trailing "argv[n]" remarks presumably record the
# command-line arguments these values stand in for -- confirm.
# path_table: text file that is scanned line by line for "EC a.b.c.d" strings.
path_table = "/Users/luke/singlecell/clusters/orthomcl-pro3/table.RSonly_pro.txt" # argv[1]
# path_enzyme: file handed to Bio.KEGG.Enzyme.parse to obtain enzyme records.
path_enzyme = "/Users/luke/singlecell/clusters/enzyme" # argv[2]
In [126]:
# Per-line regex hits from the table; filled in by the cell that reads the file.
ec_list = []
# Matches a fully specified EC number written as "EC a.b.c.d".
# Every field requires at least one digit ("+" rather than "*"), so a partial
# classification such as "EC 1.1.1.-" no longer produces the truncated hit
# "EC 1.1.1." and digit-free text such as "EC ..." no longer matches at all.
regex_ec = re.compile(r'EC [0-9]+\.[0-9]+\.[0-9]+\.[0-9]+')
In [127]:
# Scan the table line by line and record the EC numbers found on each line.
# ec_list ends up with one entry per line (a possibly empty list of matches).
# Slice assignment replaces the list's contents in place, so re-running this
# cell does not pile duplicate rows onto ec_list; the "with" block closes the
# file as soon as it has been read.
with open(path_table) as table_file:
    ec_list[:] = [regex_ec.findall(line) for line in table_file]

# Flatten the per-line hits and de-duplicate them, keeping first-seen order.
# Flattening directly (rather than padding ragged rows into a table and slicing
# off element 0) leaves no None filler to remove, so a real EC number is never
# dropped when the first line of the table already contains a match.
# dtype=object keeps the result an object array even when nothing matched.
ec_flat = [ec for hits in ec_list for ec in hits]
ec_unique = pd.Series(ec_flat, dtype=object).unique()
Parse KEGG Enzyme Records
In [128]:
# Read every KEGG Enzyme record while the file is open, then close it.
# Enzyme.parse produces records lazily from the handle, so they are collected
# into a list inside the "with" block. Holding a list also lets later cells
# iterate over `records` more than once; a bare generator is empty after a
# single pass, which made re-running the next cell silently yield nothing.
with open(path_enzyme) as enzyme_file:
    records = list(Enzyme.parse(enzyme_file))
In [129]:
# Gather all parsed enzyme records into a list so they can be looped over below.
rlist = []
rlist.extend(records)
In [130]:
# Report the pathways of every enzyme record whose EC number occurs in the table.
# One tab-separated line is printed per (enzyme, pathway) pair:
#     EC <entry> \t <pathway[1]> \t <pathway[2]>
# and a single "EC <entry> \t NA \t NA" line when the record lists no pathways.
# NOTE(review): pathway[1] / pathway[2] are presumably the pathway id and name
# of Biopython's (database, id, name) tuples -- confirm against Bio.KEGG.Enzyme.
#
# Matching is exact string equality between "EC <entry>" and the extracted EC
# strings. The earlier regex search used the entry as an unescaped pattern
# anchored only at the end, so "." matched any character and e.g. entry 1.1.1.1
# also matched "EC 3.1.1.111". A set lookup also tolerates None entries in
# ec_unique and replaces the records x ECs nested loop with one pass.
ec_wanted = set(ec_unique)
for record in rlist:
    if "EC " + record.entry not in ec_wanted:
        continue
    # print(...) with one parenthesised argument behaves the same on Python 2 and 3.
    if len(record.pathway) > 0:
        for pathway in record.pathway:
            print("EC %s\t%s\t%s" % (record.entry, pathway[1], pathway[2]))
    else:
        print("EC %s\tNA\tNA" % (record.entry))
In [ ]: