In [27]:
from sys import argv
import re
import numpy as np
import pandas as pd
from cogent.parse import kegg_ko
from cogent.parse import kegg_taxonomy
from collections import Counter
from collections import OrderedDict
from collections import defaultdict
import csv

Set Paths and Variables


In [28]:
path_cpm = '/Users/luke/vibrio/results/CPM.all.Plk.Swt.Vnt.descr.tsv'
path_gff = '/Users/luke/vibrio/genome/VfES114_fixed.CDS.tsv'
path_plk_swt = '/Users/luke/vibrio/results/mydeseq2.all.Plk.Swt.tsv'
path_plk_vnt = '/Users/luke/vibrio/results/mydeseq2.all.Plk.Vnt.tsv'
path_swt_vnt = '/Users/luke/vibrio/results/mydeseq2.all.Swt.Vnt.tsv'
path_vfi_ko = '/Users/luke/kegg/vfi/vfi_ko.list'
path_ko = '/Users/luke/kegg/ko'
path_out = '/Users/luke/vibrio/results/results_rpkm.csv'

Make RPKM DataFrame with KO Numbers Appended


In [29]:
# Import CPM data
cpm = pd.read_csv(path_cpm, sep='\t')

In [30]:
# Import GFF file
gff = pd.read_csv(path_gff, sep='\t', header=None)
gff.columns = ['CDS_number', 'length_bp', 'description']

In [31]:
# Convert CPM to length-normalized values (stored as *_rpkm columns): divide each sample's
# CPM by CDS length in bp, then rescale the sample so its values sum to 1e6
rpkm = pd.merge(cpm, gff, on='CDS_number', how='left')
del rpkm['description']
for i in ['Plk1', 'Plk2', 'Plk3', 'Swt1', 'Swt2', 'Swt3', 'Vnt1', 'Vnt2', 'Vnt3']:
    series_norm = rpkm[i]/rpkm['length_bp']
    rpkm['%s_rpkm' % i] = series_norm/series_norm.sum()*1e6
    del rpkm[i]
del rpkm['length_bp']
# Reorder columns
cols = rpkm.columns.tolist()
newcols = cols[0:2] + cols[3:12] + cols[2:3]
rpkm = rpkm[newcols]
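
A quick sanity check (a minimal sketch, assuming every CDS_number found a length_bp in the GFF table): after the length normalization and rescaling above, each per-sample *_rpkm column should sum to 1e6.


In [ ]:
# Sanity check: each sample's *_rpkm column should total 1e6 (up to floating-point error),
# provided no length_bp values were missing after the left merge
for col in [c for c in rpkm.columns if c.endswith('_rpkm')]:
    assert abs(rpkm[col].sum() - 1e6) < 1e-3, col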

In [32]:
# Add mean and std columns for plk, swt, vnt
for i in ['Plk', 'Swt', 'Vnt']:
    rpkm['%s_rpkm_mean' % i] = rpkm[['%s1_rpkm' % i, '%s2_rpkm' % i, '%s3_rpkm' % i]].mean(axis=1)
    rpkm['%s_rpkm_std' % i] = rpkm[['%s1_rpkm' % i, '%s2_rpkm' % i, '%s3_rpkm' % i]].std(axis=1)

In [33]:
# Reorder columns
cols = rpkm.columns.tolist()
newcols = cols[0:5] + cols[12:14] + cols[5:8] + cols[14:16] + cols[8:11] + cols[16:18] + cols[11:12]
rpkm = rpkm[newcols]

In [34]:
# Import the KEGG gene-to-KO mapping (two columns: vfi:<locus tag>, ko:<KO number>),
# strip the prefixes, and build a locus-tag-to-KO lookup
vfi_ko = pd.read_csv(path_vfi_ko, sep='\t', header=None)
vfi_ko[0] = vfi_ko[0].str.replace(r'vfi:', '')
vfi_ko[1] = vfi_ko[1].str.replace(r'ko:', '')
vf2ko = defaultdict(str, zip(vfi_ko[0], vfi_ko[1]))
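
Because vf2ko is a defaultdict(str), locus tags with no KO assignment come back as an empty string rather than raising a KeyError; a minimal illustration (the locus tag below is hypothetical):


In [ ]:
# Hypothetical lookup: an unmapped (or misspelled) locus tag returns '' instead of raising KeyError
vf2ko['VF_not_a_real_tag']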

In [35]:
# Map each VF locus tag to its KO number ('' if no KO is assigned)
rpkm['KO_number'] = [vf2ko[i] for i in rpkm.VF_number]

Parse KEGG KO Records and Append Pathway Classes ('Pathways' Column) to DataFrame


In [36]:
# Long step! Only run if necessary
kdata = kegg_ko.parse_ko(open(path_ko))
klist = list(kdata)
kframe = pd.DataFrame(klist)
ko2class = defaultdict(str, zip(kframe.ENTRY, kframe.CLASS))

In [37]:
# Add column for pathways ("Class")
rpkm['Pathways'] = [ko2class[i] for i in rpkm.KO_number]
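
To spot-check the annotations just added (an optional inspection cell; genes without a KO show an empty string in both new columns):


In [ ]:
# Peek at the new KO and pathway annotation columns
rpkm[['VF_number', 'KO_number', 'Pathways']].head()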

Import and Append DESeq2 Results


In [38]:
# Import DESeq2 data to DataFrames
deseq_plk_swt = pd.read_csv(path_plk_swt, sep='\t')
deseq_plk_vnt = pd.read_csv(path_plk_vnt, sep='\t')
deseq_swt_vnt = pd.read_csv(path_swt_vnt, sep='\t')

In [39]:
# Keep only the VF/CDS numbers (to be used as merge keys) plus log2FoldChange and padj
deseq_plk_swt = deseq_plk_swt[['VF_number', 'CDS_number', 'log2FoldChange', 'padj']]
deseq_plk_vnt = deseq_plk_vnt[['VF_number', 'CDS_number', 'log2FoldChange', 'padj']]
deseq_swt_vnt = deseq_swt_vnt[['VF_number', 'CDS_number', 'log2FoldChange', 'padj']]

In [40]:
# Reverse sign so fold changes read as Vnt-vs-Plk and Vnt-vs-Swt (Vnt as the numerator)
deseq_plk_vnt['log2FoldChange'] = deseq_plk_vnt['log2FoldChange'] * -1
deseq_swt_vnt['log2FoldChange'] = deseq_swt_vnt['log2FoldChange'] * -1

In [41]:
# Rename columns
deseq_plk_swt.columns = ['VF_number','CDS_number','Plk-vs-Swt_log2fc','Plk-vs-Swt_padj']
deseq_plk_vnt.columns = ['VF_number','CDS_number','Vnt-vs-Plk_log2fc','Vnt-vs-Plk_padj']
deseq_swt_vnt.columns = ['VF_number','CDS_number','Vnt-vs-Swt_log2fc','Vnt-vs-Swt_padj']

In [42]:
# Merge DESeq2 data into normalized count and KO/pathway table
rpkm = pd.merge(rpkm, deseq_plk_swt, how='outer')
rpkm = pd.merge(rpkm, deseq_plk_vnt, how='outer')
rpkm = pd.merge(rpkm, deseq_swt_vnt, how='outer')
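
A quick consistency check (a sketch; it assumes VF_number and CDS_number jointly identify each gene uniquely in all four tables, so the outer merges should not duplicate any rows):


In [ ]:
# If the merge keys are unique, the combined table has no duplicated gene identifiers
assert not rpkm.duplicated(['VF_number', 'CDS_number']).any()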

In [43]:
# Reorder columns
cols = rpkm.columns.tolist()
newcols = cols[0:17] + cols[20:26] + cols[17:20]
rpkm = rpkm[newcols]

In [44]:
# Save DataFrame as csv
rpkm.to_csv(path_out, index=False)

In [ ]:

Reformat as BIOM File and Mapping File


In [19]:
# Build a samples-by-genes table of RPKM values (sample libraries as rows, VF numbers as columns)
biom = rpkm[['VF_number', 'Plk1_rpkm', 'Plk2_rpkm', 'Plk3_rpkm', 'Swt1_rpkm', 'Swt2_rpkm', 'Swt3_rpkm', 'Vnt1_rpkm', 'Vnt2_rpkm', 'Vnt3_rpkm']].set_index('VF_number').T
biom.to_csv('vibrio_biom.csv', index=True)
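
If an actual .biom file is needed rather than the CSV written above, here is a minimal sketch assuming the biom-format Python package is installed; observations (VF numbers) go in rows and samples in columns, so the untransposed values are passed in:


In [ ]:
# Hedged sketch: build a BIOM Table from the RPKM values and write it as JSON.
# Assumes the biom-format package (biom.table.Table) is available; the output
# filename is hypothetical.
from biom.table import Table
sample_ids = ['Plk1_rpkm', 'Plk2_rpkm', 'Plk3_rpkm', 'Swt1_rpkm', 'Swt2_rpkm', 'Swt3_rpkm',
              'Vnt1_rpkm', 'Vnt2_rpkm', 'Vnt3_rpkm']
table = Table(rpkm[sample_ids].values, list(rpkm.VF_number), sample_ids)
with open('vibrio_rpkm.biom', 'w') as f:
    f.write(table.to_json('vibrio notebook'))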

In [20]:
# Minimal mapping file: one treatment label per sample, indexed by the sample names from biom
mapping = pd.DataFrame({'treatment': ['planktonic', 'planktonic', 'planktonic', 'SWT_medium', 'SWT_medium', 'SWT_medium', 'vented', 'vented', 'vented']}, index=biom.index)
mapping.to_csv('vibrio_mapping.csv', index=True)

In [ ]:

IGNORE BELOW FOR NOW


In [ ]:
top_level2 = 10
top_level1 = 10
num_cutoff = 5

Make Dictionary of My Chosen KOs and Their Pathways


In [ ]:
# convert list of [KO, all pathway hierarchy] to dictionary, and make a copy for lookup at the end
dict_mykpath = dict(zip([x[0] for x in mykpath], [x[1] for x in mykpath]))
dict_mykpath_copy = dict_mykpath.copy()

With Chosen KOs (GOIs), Find KOs with Level-2 Pathways Among Top


In [ ]:
# extract level-2 pathway names and count occurrences
mykpath2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath2.append(path[1][2])
pathcount2 = Counter(mykpath2)

In [ ]:
# get top level-2 pathways, convert to dict, find KOs that contain those pathways
top_pathcount2 = pathcount2.most_common(top_level2)
dict_top_pathcount2 = OrderedDict(zip([x[0] for x in top_pathcount2], [x[1] for x in top_pathcount2]))
ko_level2 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][2] in dict_top_pathcount2:
            ko_level2.append(ko)

In [ ]:
# get unique level-2 KOs (deduplicated via a set)
ko_level2_unique = list(set(ko_level2))

In [ ]:
# remove KOs that match top level-2 pathways
for ko in ko_level2_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Find KOs with Level-1 Pathways Among Top


In [ ]:
# extract level-1 pathway names and count occurrences
mykpath1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        mykpath1.append(path[1][1])
pathcount1 = Counter(mykpath1)

In [ ]:
# get top level-1 pathways, convert to dict, find KOs that contain those pathways
top_pathcount1 = pathcount1.most_common(top_level1)
dict_top_pathcount1 = OrderedDict(zip([x[0] for x in top_pathcount1], [x[1] for x in top_pathcount1]))
ko_level1 = []
for ko, paths in dict_mykpath.iteritems():
    for path in paths:
        if path[1][1] in dict_top_pathcount1:
            ko_level1.append(ko)

In [ ]:
# get unique level-1 KOs (deduplicated via a set)
ko_level1_unique = list(set(ko_level1))

In [ ]:
# remove KOs that match top level-1 pathways
for ko in ko_level1_unique:
    dict_mykpath.pop(ko, None)

With Remaining KOs, Group as Other


In [ ]:
# get list of remaining KOs
ko_other = dict_mykpath.keys()

Assign KOs to Pathways: Top Level-2, Recycle Unassigned KOs, Top Level-1, then Other


In [ ]:
# initialize dictionaries
dict_level2 = OrderedDict(zip(dict_top_pathcount2.keys(), [[] for x in dict_top_pathcount2.keys()]))
dict_level1 = OrderedDict(zip(dict_top_pathcount1.keys(), [[] for x in dict_top_pathcount1.keys()]))
dict_other = OrderedDict()
dict_other['Other'] = []

In [ ]:
# level-2: assign each KO to its highest-ranked top pathway, then drop pathways with too
# few KOs and recycle those KOs into the level-1 assignment
ko_bool_level2 = OrderedDict((ko, True) for ko in ko_level2_unique)
for ko in ko_level2_unique:
    for path, count in dict_top_pathcount2.iteritems():
        if ko_bool_level2[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][2] == path:
                    dict_level2[path].append(ko)
                    ko_bool_level2[ko] = False
                    break
ko_level1_unique_plus = list(ko_level1_unique)
# iterate over a snapshot of the items (items(), not iteritems()) so pathways can be popped inside the loop
for key, value in dict_level2.items():
    if len(value) < num_cutoff:
        recycle = dict_level2.pop(key, None)
        if len(recycle) > 0:
            for element in recycle:
                ko_level1_unique_plus.append(element)
                print element

In [ ]:


In [ ]:
# level-1: assign each KO to its highest-ranked top pathway, then drop pathways with too
# few KOs and recycle those KOs into the Other group
ko_bool_level1 = OrderedDict((ko, True) for ko in ko_level1_unique_plus)
for ko in ko_level1_unique_plus:
    for path, count in dict_top_pathcount1.iteritems():
        if ko_bool_level1[ko]:
            for entry in dict_mykpath_copy[ko]:
                if entry[1][1] == path:
                    dict_level1[path].append(ko)
                    ko_bool_level1[ko] = False
                    break
ko_other_plus = list(ko_other)
# iterate over a snapshot of the items (items(), not iteritems()) so pathways can be popped inside the loop
for key, value in dict_level1.items():
    if len(value) < num_cutoff:
        recycle1 = dict_level1.pop(key, None)
        if len(recycle1) > 0:
            for element in recycle1:
                ko_other_plus.append(element)
                print element

In [ ]:
# other
for ko in ko_other_plus:
    dict_other['Other'].append(ko)
    
# hacky workaround for K01488: its level-2 pathway (Purine metabolism) has only 2 counts,
# and its level-1 pathways (Nucleotide Metabolism and Immune System Diseases) are not in
# the top list, so it falls through both assignment steps unassigned -- THIS IS A BUG!
dict_other['Other'].append('K01488')

Combine Dicts of Pathway => KOs and Write to File


In [ ]:
# combine 3 dicts into 1
output_dict = OrderedDict(dict_level2.items() + dict_level1.items() + dict_other.items())

In [ ]:
# remove commas from pathway names
output_dict2 = OrderedDict()
regex_comma = re.compile(r',')
for key, value in output_dict.iteritems():
    output_dict2[regex_comma.sub('',key)] = value

In [ ]:
# write output (note: path_out is reused from the RPKM section above, so this overwrites that CSV)
writer = csv.writer(open(path_out, 'wb'))
for key, value in output_dict2.items():
    writer.writerow([key, value])

Printing for Reference (commas are not removed in this printed output, but they are removed in the written file)


In [ ]:
mysum = 0
for path, count in dict_top_pathcount2.iteritems():
    print path, count
    mysum = mysum + count
print mysum

In [ ]:
mysum = 0
for path, count in dict_top_pathcount1.iteritems():
    print path, count
    mysum = mysum + count
print mysum

In [ ]:
len(ko_level2)

In [ ]:
len(ko_level2_unique)

In [ ]:
len(ko_level1)

In [ ]:
len(ko_level1_unique)

In [ ]:
len(ko_level1_unique_plus) # includes KOs from ko_level2_unique not matched due to low pathway count

In [ ]:
len(ko_other)

In [ ]:
len(ko_level2_unique) + len(ko_level1_unique) + len(ko_other)

In [ ]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test.txt", "w")

x = 0
for key, value in output_dict2.iteritems():
    out_file.write('\n'.join(value))
    out_file.write('\n')
    #print key, '\t', len(value)
    x = x + len(value)
print x

out_file.close()

In [ ]:
out_file = open("/Users/luke/krse2011/ordination/cca-by-pathway-filt3-goi/test2.txt", "w")
out_file.write('\n'.join(ko_level2_unique + ko_level1_unique + ko_other))
out_file.write('\n')
out_file.close()

In [ ]: