In [1]:
from sys import argv
import re
import numpy as np
import pandas as pd
from cogent.parse import kegg_ko
from collections import Counter
from collections import OrderedDict
import csv
Set Paths and Variables
In [2]:
path_goi = '/Users/luke/krse2011/ordination/CCA_genes_filt3_loadings_goi.tsv' #CCA_genes_loadings_goi_test.tsv # argv[1]
path_ko = '/Users/luke/kegg/ko' # argv[2]
path_out = '/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/cca_top_pathways_by_KO.csv' # argv[3]
In [167]:
top_level2 = 10
top_level1 = 10
num_cutoff = 5
Get KO Numbers from Table of CCA Loadings
In [4]:
# list of KO numbers from table
ko_list = []
# regular expression matching KO number
regex_ko = re.compile(r'K[0-9]{5}')
In [5]:
# go through each line and add any matching strings to list
for line in open(path_goi).readlines():
ko_list.append(regex_ko.findall(line))
# dataframe to series to get unique KO numbers, remove first 'None' value (neither should be necessary)
df = pd.DataFrame(ko_list)
ko_unique = pd.Series(df.values.ravel()).unique()[1:]
Parse KEGG KO Record
In [6]:
kdata = kegg_ko.parse_ko(open(path_ko))
In [7]:
klist = list(kdata)
Make Dictionary of My Chosen KOs and Their Pathways
In [311]:
# generate list of lists, with each entry having [KO, all pathway hierarchy], removing pathways named BR:koXXXXX and None
mykpath = []
regex_BR = re.compile(r'BR:ko')
for record in klist:
for ko in ko_unique:
if ko == record['ENTRY']:
filtered = []
filtered = [i for i in record['CLASS'] if i[0] is not None and not regex_BR.search(i[0])]
#filtered = record['CLASS'] #use this if don't wish to filter BR:ko
newpath = [record['ENTRY'], filtered]
mykpath.append(newpath)
In [312]:
len(mykpath)
Out[312]:
In [313]:
# convert list of [KO, all pathway hierarchy] to dictionary, and make a copy for lookup at the end
dict_mykpath = dict(zip([x[0] for x in mykpath], [x[1] for x in mykpath]))
dict_mykpath_copy = dict_mykpath.copy()
With Chosen KOs (GOIs), Find KOs with Level-2 Pathways Among Top
In [314]:
# extract pathway level-2, count list entries and sort
mykpath2 = []
for ko, paths in dict_mykpath.iteritems():
for path in paths:
mykpath2.append(path[1][2])
pathcount2 = Counter(mykpath2)
In [315]:
# get top level-2 pathways, convert to dict, find KOs that contain those pathways
top_pathcount2 = pathcount2.most_common(top_level2)
dict_top_pathcount2 = OrderedDict(zip([x[0] for x in top_pathcount2], [x[1] for x in top_pathcount2]))
ko_level2 = []
for ko, paths in dict_mykpath.iteritems():
for path in paths:
if path[1][2] in dict_top_pathcount2:
ko_level2.append(ko)
In [316]:
# get unique level-2 KOs, as set
ko_level2_unique = list(set(ko_level2))
In [317]:
# remove KOs that match top level-2 pathways
for ko in ko_level2_unique:
dict_mykpath.pop(ko, None)
With Remaining KOs, Find KOs with Level-1 Pathways Among Top
In [318]:
# extract pathway level-1, count list entries and sort
mykpath1 = []
for ko, paths in dict_mykpath.iteritems():
for path in paths:
mykpath1.append(path[1][1])
pathcount1 = Counter(mykpath1)
In [319]:
# get top level-1 pathways, convert to dict, find KOs that contain those pathways
top_pathcount1 = pathcount1.most_common(top_level1)
dict_top_pathcount1 = OrderedDict(zip([x[0] for x in top_pathcount1], [x[1] for x in top_pathcount1]))
ko_level1 = []
for ko, paths in dict_mykpath.iteritems():
for path in paths:
if path[1][1] in dict_top_pathcount1:
ko_level1.append(ko)
In [320]:
# get unique level-1 KOs, as set
ko_level1_unique = list(set(ko_level1))
In [321]:
# remove KOs that match top level-1 pathways
for ko in ko_level1_unique:
dict_mykpath.pop(ko, None)
With Remaining KOs, Group as Other
In [322]:
# get list of remaining KOs
ko_other = dict_mykpath.keys()
Assign KOs to Pathways: Top Level-2, Recycle Unassigned KOs, Top Level-1, then Other
In [323]:
# initialize dictionaries
dict_level2 = OrderedDict(zip(dict_top_pathcount2.keys(), [[] for x in dict_top_pathcount2.keys()]))
dict_level1 = OrderedDict(zip(dict_top_pathcount1.keys(), [[] for x in dict_top_pathcount1.keys()]))
dict_other = OrderedDict()
dict_other['Other'] = []
In [324]:
# level-2 assign highest pathway per KO, then remove pathways with too few counts, but recycle unassigned KOs
ko_bool_level2 = OrderedDict(zip([x for x in ko_level2_unique], [True for x in ko_level2_unique]))
for ko in ko_level2_unique:
for path, count in dict_top_pathcount2.iteritems():
if ko_bool_level2[ko]:
for entry in dict_mykpath_copy[ko]:
if entry[1][2] == path:
dict_level2[path].append(ko)
ko_bool_level2[ko] = False
break
ko_level1_unique_plus = list(ko_level1_unique)
for key, value in dict_level2.iteritems():
if len(value) < num_cutoff:
recycle = dict_level2.pop(key, None)
if len(recycle) > 0:
for element in recycle:
ko_level1_unique_plus.append(element)
print element
In [ ]:
In [325]:
# level-1 assign highest pathway per KO, then remove pathways with too few counts
ko_bool_level1 = OrderedDict(zip([x for x in ko_level1_unique_plus], [True for x in ko_level1_unique_plus]))
for ko in ko_level1_unique_plus:
for path, count in dict_top_pathcount1.iteritems():
if ko_bool_level1[ko]:
for entry in dict_mykpath_copy[ko]:
if entry[1][1] == path:
dict_level1[path].append(ko)
ko_bool_level1[ko] = False
break
ko_other_plus = list(ko_other)
for key, value in dict_level1.iteritems():
if len(value) < num_cutoff:
recycle1 = dict_level1.pop(key, None)
if len(recycle1) > 0:
for element in recycle1:
ko_other_plus.append(element)
print element
In [326]:
# other
for ko in ko_other_plus:
dict_other['Other'].append(ko)
# hacky solution for K01488, whose level-2 pathway (Purine metabolism)
# only has 2 counts but level-1 pathways (Nucleotide Metabolism and
# Immune System Diseases) are not in my list -- THIS IS A BUG!
dict_other['Other'].append('K01488')
Combine Dicts of Pathway => KOs and Write to File
In [327]:
# combine 3 dicts into 1
output_dict = OrderedDict(dict_level2.items() + dict_level1.items() + dict_other.items())
In [328]:
# remove commas from pathway names
output_dict2 = OrderedDict()
regex_comma = re.compile(r',')
for key, value in output_dict.iteritems():
output_dict2[regex_comma.sub('',key)] = value
In [329]:
# write output
writer = csv.writer(open(path_out, 'wb'))
for key, value in output_dict2.items():
writer.writerow([key, value])
Printing for Reference (note commas not removed in this output but are in written file)
In [330]:
mysum = 0
for path, count in dict_top_pathcount2.iteritems():
print path, count
mysum = mysum + count
print mysum
In [331]:
mysum = 0
for path, count in dict_top_pathcount1.iteritems():
print path, count
mysum = mysum + count
print mysum
In [332]:
len(ko_level2)
Out[332]:
In [333]:
len(ko_level2_unique)
Out[333]:
In [334]:
len(ko_level1)
Out[334]:
In [335]:
len(ko_level1_unique)
Out[335]:
In [336]:
len(ko_level1_unique_plus) # includes KOs from ko_level2_unique not matched due to low pathway count
Out[336]:
In [337]:
len(ko_other)
Out[337]:
In [338]:
len(ko_level2_unique) + len(ko_level1_unique) + len(ko_other)
Out[338]:
In [339]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test.txt", "w")
x = 0
for key, value in output_dict2.iteritems():
out_file.write('\n'.join(value))
out_file.write('\n')
#print key, '\t', len(value)
x = x + len(value)
print x
out_file.close()
In [340]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test2.txt", "w")
out_file.write('\n'.join(ko_level2_unique + ko_level1_unique + ko_other))
out_file.write('\n')
out_file.close()
In [ ]: