KL to do list: see B18/p94: make the plots not repeat isomers for the KEGG compounds, but present # of isomers
In [5]:
#%reset
In [6]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.cluster import KMeans
import cPickle as cpk
import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from Bio.KEGG.REST import *
%matplotlib inline
In [7]:
#corrected bug in MATLAB code that resulted in duplicate row numbers
mtabFile = 'RImetabolites_isomers.2016.09.01.csv' #first column is RInumber
In [8]:
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')
In [9]:
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
del td
COnumbers.to_csv('exportCOnumbers.csv',header=True)
In [10]:
def findRInumber(dataIn,KEGGin):
#find possible RI numbers for a given KEGG number.
dataOut = []
for i,KEGG in enumerate(dataIn['KEGG']):
if KEGG == KEGGin:
t = dataIn.index[i]
dataOut.append(t)
return dataOut
##For example: this will give back one row, C18028 will be multiple
#m = findRInumber(forRelatedness,'C00078')
def convertRItoCO(dataIn,RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = dataIn.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value
#m = convertRItoCO(forRelatedness,'RI2')
#slight change, no need to send in a comparison file if it always the same thing
def convertRItoCO2(RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = CO_fromMATLAB.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value, also uses CO_fromMATLAB as input
In [11]:
#This grabs the CO/KO links from the KEGG website. The actual code is in the
#CreateHash_COtoKO.py that Harriet wrote. Note that since the exportCOnumbers.csv
#file is a unique list of C number we essentially already have a lookup table
#for all the metabolites of interest.
if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
#just read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
#need to make the file
filename = "CreateHash_COtoKO.py"
%run $filename exportCOnumbers.csv
#then read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
In [22]:
def SplitCODict(WorkingFile):
CO_withoutKO={}
CO_withKO={}
for CO in WorkingFile.keys():
if WorkingFile[CO]['Related KO']==[]:
CO_withoutKO[CO]=WorkingFile[CO]
else:
CO_withKO[CO]=WorkingFile[CO]
return CO_withoutKO, CO_withKO
CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'
In [23]:
AllKO=[]
AllCO=[]
for key in CO_withKO:
AllKO.append(CO_withKO[key]['Related KO'])
AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))
In [25]:
#go through CO_RawData_all one row at a time (inefficient for sure, but I understand
#what is happening), then make a new column in CO_RawData_all that is True/False
CO_fromMATLAB['inList'] = ""
for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
fc = CO_fromMATLAB.ix[idx,'cNumber']
if fc in AllCO:
CO_fromMATLAB.ix[idx,'inList'] = True
else:
CO_fromMATLAB.ix[idx,'inList'] = False
In [26]:
#can't quite figure out how to do this in one step.
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]
#this list of days is useful, so define it up front.
dayList = ['S1','S2','S3','S4','S5']
CO_RawData_pruned = m.loc[:,dayList]
del m
In [27]:
#Load PhytoKEGG Annotations
#from Harriet...doesn't work, complaining about header=False
#AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')
#try this instead (note double \\ at end)
#pathToData = 'Z:\KLtemp\TranscriptomicsData_Feb2016\\'
pathToData = 'C:\Users\Krista\Documents\Current projects\Kujawinski_Metabolomics_RIsamples\TranscriptomicsData_Feb2016\\'
AllPhytoKO_ann=pd.read_table((pathToData + 'AllPhytoKegg_annotated.tab'), delimiter='\t')
InsituCounts=pd.read_table((pathToData + 'AllInsitu_NoZero.tab'), index_col='gID')
In [28]:
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6
#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()
KO_RawData=InsituTPM.groupby('kID').sum()
In [29]:
def NormalizeToMean(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['mean']=DF.mean(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
DF_meanNorm=DF_meanNorm.T.drop('mean').T
return out
def NormalizeToMax(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['max']=DF.max(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['max']
DF_meanNorm=DF_meanNorm.T.drop('max').T
return out
def NormalizeToMean_CV(DF):
out=DF.copy()
out['mean']=DF.mean(axis=1)
out['SD']=DF.std(axis=1)
out['CV']=out['SD']/out['mean']
return out
In [30]:
#norm2Mean seems based (see plots in earlier notebooks)
KO_Norm2Mean=NormalizeToMean(KO_RawData)
##two versions of the CO data: (1) regular and (2) inverse
CO_Norm2Mean_regular=NormalizeToMean(CO_RawData_pruned)
In [31]:
#use _finalOption variable names to make life easier
KO_finalOption = KO_Norm2Mean.loc[AllKO].dropna()
CO_final_regular = CO_Norm2Mean_regular.dropna() #already 'limited' this before the normalization
In [ ]:
##Combine the CO and the KO data in preparation for K means clustering
#Combined_KO_CO_regular=KO_finalOption.append(CO_final_regular.loc[:,(dayList)])
In [32]:
CO_final_regular.head(4)
Out[32]:
In [33]:
CO_final_regular.insert(0,'KEGG',"") #add a column called 'KEGG'
In [34]:
for idx in range(0,len(CO_final_regular)):
t = CO_final_regular.iloc[idx,:].name
if t[0]=='R':
#go find the matching cNumber in CO_RawData_all
t2 = CO_fromMATLAB.loc[t,('cNumber')]
CO_final_regular.ix[idx,('KEGG')] = t2
elif t[0] == 'K':
#just copy the K number over
CO_final_regular.ix[idx,('KEGG')] = t
In [35]:
oneCpd = 'C06030'
rnList = kegg_link('reaction',oneCpd).read() #now go get the compounds for that reaction
len(rnList.splitlines())
Out[35]:
In [36]:
uniqueCO = CO_final_regular['KEGG'].drop_duplicates()
countReactions = pd.DataFrame(uniqueCO)
countReactions.insert(1,'nReactions',"")
countReactions.insert(1,'nPathways',"")
In [37]:
#shorten for testing
#countReactions.drop(countReactions.index[[range(3,500)]], inplace = True)
In [38]:
for idx in range(0,len(countReactions)):
#first figure out if this is a compound ...oops, can remove this bc only have compounds now
#print countReactions.iloc[idx]
oneCpd = countReactions.ix[idx,'KEGG']
rnList = kegg_link('reaction',oneCpd).read() #now go get the compounds for that reaction
countReactions.ix[idx,'nReactions'] = len(rnList.splitlines())
#now go count the pathways
rnList = kegg_link('pathway',oneCpd).read()
countReactions.ix[idx,'nPathways'] = len(rnList.splitlines())
In [39]:
countReactions.head(4)
Out[39]:
In [40]:
#next will need to sort by number...have to go home now though
In [45]:
countReactions.sort_values(by = 'nReactions',ascending = False,inplace = True)
In [119]:
countReactions.head(6)
Out[119]:
so...C00019 is SAM, C00026 is 2-oxoglurate, C00020 is AMP, C00042 is succinate, C00031 is glucose
In [120]:
countReactions['nPathways'].plot(kind = 'hist',bins = 40)
Out[120]:
In [121]:
countReactions['nReactions'].plot(kind = 'hist',bins = 40)
Out[121]:
In [122]:
oneCpd = 'C00026'
In [123]:
#how many pathways? #or, linked to how many genes?
rnList = kegg_link('pathway',oneCpd).read() #now go get the compounds for that reaction
print len(rnList.splitlines())
In [124]:
#how many pathways? #or, linked to how many genes?
rnList = kegg_link('reaction',oneCpd).read() #now go get the compounds for that reaction
print len(rnList.splitlines())
In [128]:
countReactions.head(5)
Out[128]:
In [ ]:
In [163]:
use = range(0,10)
labels = countReactions.ix[use,'KEGG']
tData = countReactions.ix[use]
ax = tData.plot(kind = 'bar',x = tData['KEGG'])
plt.savefig('topTen.png',bbox_inches='tight')
In [174]:
kegg_find('compound','C00353').read()
Out[174]:
In [126]:
#some other crazy ideas for plotting...leave here for now...
In [127]:
%debug #put here to force a stop...
In [117]:
import matplotlib.pyplot as plt
from pylab import *
ioff() # don't show figures
colors = [(102, 194, 165), (252, 141, 98), (141, 160, 203), (231, 138,195),
(166, 216, 84), (255, 217, 47), (171, 197, 233), (252, 205, 229)]
for icol in range(len(colors)):
red,green,blue = colors[icol]
colors[icol] = (red / 255., green / 255., blue / 255.)
fig = plt.figure(1, figsize=(8, 8))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
sizes_component_1 = countReactions.ix[range(0,len(countReactions)),'nPathways']
sizes_component_2 = countReactions.ix[range(0,len(countReactions)),'nReactions']
component_1 = 'exampleofalongtextthatiscutoff', '2', '3', '4','5'
component_2 = 'Unix', 'Mac', 'Windows7', 'Windows10', 'WindowsXP'
patches1, texts1, autotexts1 = ax.pie(sizes_component_1, radius=1, pctdistance=0.9, colors=colors, autopct='%1.1f%%', shadow=False, startangle=90)
patches2, texts2, autotexts2 = ax.pie(sizes_component_2, radius=0.8, pctdistance=0.6, colors=colors, autopct='%1.1f%%', shadow=False, startangle=90)
# To draw circular donuts
ax.axis('equal')
# Draw white circle
centre_circle = plt.Circle((0,0),0.6,color='black', fc='white')
ax.add_artist(centre_circle)
# Shrink current axis by 20%
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
#lgd1=ax.legend(patches1,component_1, frameon=False, loc='center left', bbox_to_anchor=(1.0, 0.8), borderaxespad=0.1)
#lgd2=ax.legend(patches2,component_2, frameon=False, loc='center left', ncol=len(patches2)/2, bbox_to_anchor=(0.0, -0.005), borderaxespad=0)
#ax_elem = ax.add_artist(lgd1)
fig.suptitle('Title', fontsize=16)
fig.savefig('donut.png',bbox_extra_artists=(lgd1,lgd2,), bbox_inches='tight')
#plt.gcf().clear() # clears buffer
In [94]:
from bokeh.charts import Donut, show, output_file
if False: #only do reactions
df = pd.DataFrame(countReactions.ix[range(0,len(countReactions)),('KEGG','nReactions')])
df = df.sort("nReactions", ascending=False)
df = pd.melt(df, id_vars=['KEGG'],
value_vars=['nReactions'],
value_name='group_count', var_name='group')
d = Donut(df, label=['KEGG', 'group'], values='group_count',
text_font_size='8pt', hover_text='group_count')
elif True:
df = countReactions.ix[range(0,len(countReactions))]
df = df.sort("nReactions", ascending=False)
df = pd.melt(df, id_vars=['KEGG'],
value_vars=['nPathways', 'nReactions'],
value_name='group_count', var_name='group')
d = Donut(df, label=['KEGG', 'group'], values='group_count',
text_font_size='8pt', hover_text='group_count')
output_file("donut.html", title="donut.py example")
show(d)
In [ ]: