In [1]:
from sys import argv
import re
import numpy as np
import pandas as pd
from cogent.parse import kegg_ko
from collections import Counter
from collections import OrderedDict
import csv

Set Paths and Variables


In [2]:
path_goi = '/Users/luke/krse2011/ordination/CCA_genes_filt3_loadings_goi.tsv' #CCA_genes_loadings_goi_test.tsv # argv[1]
path_ko = '/Users/luke/kegg/ko' # argv[2]
path_out = '/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/cca_top_pathways_by_KO.csv' # argv[3]

In [167]:
top_level2 = 10
top_level1 = 10
num_cutoff = 5

Get KO Numbers from Table of CCA Loadings


In [4]:
# list of KO numbers from table
ko_list = []

# regular expression matching KO number
regex_ko = re.compile(r'K[0-9]{5}')

In [5]:
# go through each line and add any matching strings to list
for line in open(path_goi).readlines():
    ko_list.append(regex_ko.findall(line))

# dataframe to series to get unique KO numbers, remove first 'None' value (neither should be necessary)
df = pd.DataFrame(ko_list)
ko_unique = pd.Series(df.values.ravel()).unique()[1:]

Parse KEGG KO Record


In [6]:
kdata = kegg_ko.parse_ko(open(path_ko))

In [7]:
klist = list(kdata)

Make Dictionary of My Chosen KOs and Their Pathways


In [311]:
# generate list of lists, with each entry having [KO, all pathway hierarchy], removing pathways named BR:koXXXXX and None
mykpath = []
regex_BR = re.compile(r'BR:ko')
for record in klist:
    for ko in ko_unique:
        if ko == record['ENTRY']:
            filtered = []
            filtered = [i for i in record['CLASS'] if i[0] is not None and not regex_BR.search(i[0])]
            #filtered = record['CLASS'] #use this if don't wish to filter BR:ko
            newpath = [record['ENTRY'], filtered]
            mykpath.append(newpath)

In [312]:
len(mykpath)


Out[312]:
224

In [313]:
# convert list of [KO, all pathway hierarchy] to dictionary, and make a copy for lookup at the end
dict_mykpath = dict(zip([x[0] for x in mykpath], [x[1] for x in mykpath]))
dict_mykpath_copy = dict_mykpath.copy()

With Chosen KOs (GOIs), Find KOs with Level-2 Pathways Among Top


In [314]:
# extract pathway level-2, count list entries and sort
mykpath2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath2.append(path[1][2])
pathcount2 = Counter(mykpath2)

In [315]:
# get top level-2 pathways, convert to dict, find KOs that contain those pathways
top_pathcount2 = pathcount2.most_common(top_level2)
dict_top_pathcount2 = OrderedDict(zip([x[0] for x in top_pathcount2], [x[1] for x in top_pathcount2]))
ko_level2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][2] in dict_top_pathcount2:
            ko_level2.append(ko)

In [316]:
# get unique level-2 KOs, as set
ko_level2_unique = list(set(ko_level2))

In [317]:
# remove KOs that match top level-2 pathways
for ko in ko_level2_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Find KOs with Level-1 Pathways Among Top


In [318]:
# extract pathway level-1, count list entries and sort
mykpath1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath1.append(path[1][1])
pathcount1 = Counter(mykpath1)

In [319]:
# get top level-1 pathways, convert to dict, find KOs that contain those pathways
top_pathcount1 = pathcount1.most_common(top_level1)
dict_top_pathcount1 = OrderedDict(zip([x[0] for x in top_pathcount1], [x[1] for x in top_pathcount1]))
ko_level1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][1] in dict_top_pathcount1:
            ko_level1.append(ko)

In [320]:
# get unique level-1 KOs, as set
ko_level1_unique = list(set(ko_level1))

In [321]:
# remove KOs that match top level-1 pathways
for ko in ko_level1_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Group as Other


In [322]:
# get list of remaining KOs
ko_other = dict_mykpath.keys()

Assign KOs to Pathways: Top Level-2, Recycle Unassigned KOs, Top Level-1, then Other


In [323]:
# initialize dictionaries
dict_level2 = OrderedDict(zip(dict_top_pathcount2.keys(), [[] for x in dict_top_pathcount2.keys()]))
dict_level1 = OrderedDict(zip(dict_top_pathcount1.keys(), [[] for x in dict_top_pathcount1.keys()]))
dict_other = OrderedDict()
dict_other['Other'] = []

In [324]:
# level-2 assign highest pathway per KO, then remove pathways with too few counts, but recycle unassigned KOs
ko_bool_level2 = OrderedDict(zip([x for x in ko_level2_unique], [True for x in ko_level2_unique]))
for ko in ko_level2_unique:
    for path, count in dict_top_pathcount2.iteritems():
        if ko_bool_level2[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][2] == path:
                    dict_level2[path].append(ko)
                    ko_bool_level2[ko] = False
                    break
ko_level1_unique_plus = list(ko_level1_unique)
for key, value in dict_level2.iteritems():
    if len(value) < num_cutoff:
        recycle = dict_level2.pop(key, None)
        if len(recycle) > 0:
            for element in recycle:
                ko_level1_unique_plus.append(element)
                print element


K01692
K00249
K01488
K00958
K01610
K01644

In [ ]:


In [325]:
# level-1 assign highest pathway per KO, then remove pathways with too few counts
ko_bool_level1 = OrderedDict(zip([x for x in ko_level1_unique_plus], [True for x in ko_level1_unique_plus]))
for ko in ko_level1_unique_plus:
    for path, count in dict_top_pathcount1.iteritems():
        if ko_bool_level1[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][1] == path:
                    dict_level1[path].append(ko)
                    ko_bool_level1[ko] = False
                    break
ko_other_plus = list(ko_other)
for key, value in dict_level1.iteritems():
    if len(value) < num_cutoff:
        recycle1 = dict_level1.pop(key, None)
        if len(recycle1) > 0:
            for element in recycle1:
                ko_other_plus.append(element)
                print element


K01247
K01971
K01972
K03702
K01919
K01256
K01920
K00799
K02313
K03406
K07657

In [326]:
# other
for ko in ko_other_plus:
    dict_other['Other'].append(ko)
    
# hacky solution for K01488, whose level-2 pathway (Purine metabolism) 
# only has 2 counts but level-1 pathways (Nucleotide Metabolism and 
# Immune System Diseases) are not in my list -- THIS IS A BUG!
dict_other['Other'].append('K01488')

Combine Dicts of Pathway => KOs and Write to File


In [327]:
# combine 3 dicts into 1
output_dict = OrderedDict(dict_level2.items() + dict_level1.items() + dict_other.items())

In [328]:
# remove commas from pathway names
output_dict2 = OrderedDict()
regex_comma = re.compile(r',')
for key, value in output_dict.iteritems():
    output_dict2[regex_comma.sub('',key)] = value

In [329]:
# write output
writer = csv.writer(open(path_out, 'wb'))
for key, value in output_dict2.items():
   writer.writerow([key, value])

Printing for Reference (note commas not removed in this output but are in written file)


In [330]:
mysum = 0
for path, count in dict_top_pathcount2.iteritems():
    print path, count
    mysum = mysum + count
print mysum


Oxidative phosphorylation 23
ABC transporters 15
Pyrimidine metabolism 13
Carbon fixation pathways in prokaryotes 12
Glycine, serine and threonine metabolism 10
Ribosome 9
Propanoate metabolism 9
Purine metabolism 9
Porphyrin and chlorophyll metabolism 9
Citrate cycle (TCA cycle) 9
118

In [331]:
mysum = 0
for path, count in dict_top_pathcount1.iteritems():
    print path, count
    mysum = mysum + count
print mysum


Amino Acid Metabolism 19
Carbohydrate Metabolism 14
Energy Metabolism 10
Metabolism of Cofactors and Vitamins 10
Replication and Repair 10
Metabolism of Terpenoids and Polyketides 7
Metabolism of Other Amino Acids 7
Folding, Sorting and Degradation 6
Translation 5
Signal Transduction 4
92

In [332]:
len(ko_level2)


Out[332]:
118

In [333]:
len(ko_level2_unique)


Out[333]:
94

In [334]:
len(ko_level1)


Out[334]:
92

In [335]:
len(ko_level1_unique)


Out[335]:
65

In [336]:
len(ko_level1_unique_plus) # includes KOs from ko_level2_unique not matched due to low pathway count


Out[336]:
71

In [337]:
len(ko_other)


Out[337]:
65

In [338]:
len(ko_level2_unique) + len(ko_level1_unique) + len(ko_other)


Out[338]:
224

In [339]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test.txt", "w")

x = 0
for key, value in output_dict2.iteritems():
    out_file.write('\n'.join(value))
    out_file.write('\n')
    #print key, '\t', len(value)
    x = x + len(value)
print x

out_file.close()


224

In [340]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test2.txt", "w")
out_file.write('\n'.join(ko_level2_unique + ko_level1_unique + ko_other))
out_file.write('\n')
out_file.close()

In [ ]: