notebook.community

Edit and run



In [1]:

    
from sys import argv
import re
import numpy as np
import pandas as pd
from cogent.parse import kegg_ko
from collections import Counter
from collections import OrderedDict
import csv

Set Paths and Variables



In [2]:

    
path_goi = '/Users/luke/krse2011/ordination/CCA_genes_filt3_loadings_goi.tsv' #CCA_genes_loadings_goi_test.tsv # argv[1]
path_ko = '/Users/luke/kegg/ko' # argv[2]
path_out = '/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/cca_top_pathways_by_KO.csv' # argv[3]



In [167]:

    
top_level2 = 10
top_level1 = 10
num_cutoff = 5

Get KO Numbers from Table of CCA Loadings



In [4]:

    
# list of KO numbers from table
ko_list = []

# regular expression matching KO number
regex_ko = re.compile(r'K[0-9]{5}')



In [5]:

    
# go through each line and add any matching strings to list
for line in open(path_goi).readlines():
    ko_list.append(regex_ko.findall(line))

# dataframe to series to get unique KO numbers, remove first 'None' value (neither should be necessary)
df = pd.DataFrame(ko_list)
ko_unique = pd.Series(df.values.ravel()).unique()[1:]

Parse KEGG KO Record



In [6]:

    
kdata = kegg_ko.parse_ko(open(path_ko))



In [7]:

    
klist = list(kdata)

Make Dictionary of My Chosen KOs and Their Pathways



In [311]:

    
# generate list of lists, with each entry having [KO, all pathway hierarchy], removing pathways named BR:koXXXXX and None
mykpath = []
regex_BR = re.compile(r'BR:ko')
for record in klist:
    for ko in ko_unique:
        if ko == record['ENTRY']:
            filtered = []
            filtered = [i for i in record['CLASS'] if i[0] is not None and not regex_BR.search(i[0])]
            #filtered = record['CLASS'] #use this if don't wish to filter BR:ko
            newpath = [record['ENTRY'], filtered]
            mykpath.append(newpath)



In [312]:

    
len(mykpath)









    Out[312]:





224



In [313]:

    
# convert list of [KO, all pathway hierarchy] to dictionary, and make a copy for lookup at the end
dict_mykpath = dict(zip([x[0] for x in mykpath], [x[1] for x in mykpath]))
dict_mykpath_copy = dict_mykpath.copy()

With Chosen KOs (GOIs), Find KOs with Level-2 Pathways Among Top



In [314]:

    
# extract pathway level-2, count list entries and sort
mykpath2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath2.append(path[1][2])
pathcount2 = Counter(mykpath2)



In [315]:

    
# get top level-2 pathways, convert to dict, find KOs that contain those pathways
top_pathcount2 = pathcount2.most_common(top_level2)
dict_top_pathcount2 = OrderedDict(zip([x[0] for x in top_pathcount2], [x[1] for x in top_pathcount2]))
ko_level2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][2] in dict_top_pathcount2:
            ko_level2.append(ko)



In [316]:

    
# get unique level-2 KOs, as set
ko_level2_unique = list(set(ko_level2))



In [317]:

    
# remove KOs that match top level-2 pathways
for ko in ko_level2_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Find KOs with Level-1 Pathways Among Top



In [318]:

    
# extract pathway level-1, count list entries and sort
mykpath1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath1.append(path[1][1])
pathcount1 = Counter(mykpath1)



In [319]:

    
# get top level-1 pathways, convert to dict, find KOs that contain those pathways
top_pathcount1 = pathcount1.most_common(top_level1)
dict_top_pathcount1 = OrderedDict(zip([x[0] for x in top_pathcount1], [x[1] for x in top_pathcount1]))
ko_level1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][1] in dict_top_pathcount1:
            ko_level1.append(ko)



In [320]:

    
# get unique level-1 KOs, as set
ko_level1_unique = list(set(ko_level1))



In [321]:

    
# remove KOs that match top level-1 pathways
for ko in ko_level1_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Group as Other



In [322]:

    
# get list of remaining KOs
ko_other = dict_mykpath.keys()

Assign KOs to Pathways: Top Level-2, Recycle Unassigned KOs, Top Level-1, then Other



In [323]:

    
# initialize dictionaries
dict_level2 = OrderedDict(zip(dict_top_pathcount2.keys(), [[] for x in dict_top_pathcount2.keys()]))
dict_level1 = OrderedDict(zip(dict_top_pathcount1.keys(), [[] for x in dict_top_pathcount1.keys()]))
dict_other = OrderedDict()
dict_other['Other'] = []



In [324]:

    
# level-2 assign highest pathway per KO, then remove pathways with too few counts, but recycle unassigned KOs
ko_bool_level2 = OrderedDict(zip([x for x in ko_level2_unique], [True for x in ko_level2_unique]))
for ko in ko_level2_unique:
    for path, count in dict_top_pathcount2.iteritems():
        if ko_bool_level2[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][2] == path:
                    dict_level2[path].append(ko)
                    ko_bool_level2[ko] = False
                    break
ko_level1_unique_plus = list(ko_level1_unique)
for key, value in dict_level2.iteritems():
    if len(value) < num_cutoff:
        recycle = dict_level2.pop(key, None)
        if len(recycle) > 0:
            for element in recycle:
                ko_level1_unique_plus.append(element)
                print element



In [ ]:



In [325]:

    
# level-1 assign highest pathway per KO, then remove pathways with too few counts
ko_bool_level1 = OrderedDict(zip([x for x in ko_level1_unique_plus], [True for x in ko_level1_unique_plus]))
for ko in ko_level1_unique_plus:
    for path, count in dict_top_pathcount1.iteritems():
        if ko_bool_level1[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][1] == path:
                    dict_level1[path].append(ko)
                    ko_bool_level1[ko] = False
                    break
ko_other_plus = list(ko_other)
for key, value in dict_level1.iteritems():
    if len(value) < num_cutoff:
        recycle1 = dict_level1.pop(key, None)
        if len(recycle1) > 0:
            for element in recycle1:
                ko_other_plus.append(element)
                print element



In [326]:

    
# other
for ko in ko_other_plus:
    dict_other['Other'].append(ko)
    
# hacky solution for K01488, whose level-2 pathway (Purine metabolism) 
# only has 2 counts but level-1 pathways (Nucleotide Metabolism and 
# Immune System Diseases) are not in my list -- THIS IS A BUG!
dict_other['Other'].append('K01488')

Combine Dicts of Pathway => KOs and Write to File



In [327]:

    
# combine 3 dicts into 1
output_dict = OrderedDict(dict_level2.items() + dict_level1.items() + dict_other.items())



In [328]:

    
# remove commas from pathway names
output_dict2 = OrderedDict()
regex_comma = re.compile(r',')
for key, value in output_dict.iteritems():
    output_dict2[regex_comma.sub('',key)] = value



In [329]:

    
# write output
writer = csv.writer(open(path_out, 'wb'))
for key, value in output_dict2.items():
   writer.writerow([key, value])

Printing for Reference (note commas not removed in this output but are in written file)



In [330]:

    
mysum = 0
for path, count in dict_top_pathcount2.iteritems():
    print path, count
    mysum = mysum + count
print mysum









    



Oxidative phosphorylation 23
ABC transporters 15
Pyrimidine metabolism 13
Carbon fixation pathways in prokaryotes 12
Glycine, serine and threonine metabolism 10
Ribosome 9
Propanoate metabolism 9
Purine metabolism 9
Porphyrin and chlorophyll metabolism 9
Citrate cycle (TCA cycle) 9
118



In [331]:

    
mysum = 0
for path, count in dict_top_pathcount1.iteritems():
    print path, count
    mysum = mysum + count
print mysum









    



Amino Acid Metabolism 19
Carbohydrate Metabolism 14
Energy Metabolism 10
Metabolism of Cofactors and Vitamins 10
Replication and Repair 10
Metabolism of Terpenoids and Polyketides 7
Metabolism of Other Amino Acids 7
Folding, Sorting and Degradation 6
Translation 5
Signal Transduction 4
92



In [332]:

    
len(ko_level2)









    Out[332]:





118



In [333]:

    
len(ko_level2_unique)









    Out[333]:





94



In [334]:

    
len(ko_level1)









    Out[334]:





92



In [335]:

    
len(ko_level1_unique)









    Out[335]:





65



In [336]:

    
len(ko_level1_unique_plus) # includes KOs from ko_level2_unique not matched due to low pathway count









    Out[336]:





71



In [337]:

    
len(ko_other)









    Out[337]:





65



In [338]:

    
len(ko_level2_unique) + len(ko_level1_unique) + len(ko_other)









    Out[338]:





224



In [339]:

    
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test.txt", "w")

x = 0
for key, value in output_dict2.iteritems():
    out_file.write('\n'.join(value))
    out_file.write('\n')
    #print key, '\t', len(value)
    x = x + len(value)
print x

out_file.close()



In [340]:

    
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test2.txt", "w")
out_file.write('\n'.join(ko_level2_unique + ko_level1_unique + ko_other))
out_file.write('\n')
out_file.close()



In [ ]: