In [1]:
import pandas as pd
import urllib2
from bs4 import BeautifulSoup
import re
from sklearn.cluster import KMeans
import palettable as pal
from itertools import chain
import os
import os.path #? not sure if I need both
import glob
import numpy as np
from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()()
import matplotlib.pyplot as plt
import matplotlib as mpl #KL moved this here 2/12/2016
#BUT...then messes with the plotting
mpl.rcParams['pdf.fonttype'] = 42
%matplotlib inline
from collections import Counter
import cPickle as cpk
from stackedBarGraph import StackedBarGrapher
SBG = StackedBarGrapher()
In [ ]:
#since notebook is just keeping the plotting examples, can start by reading in the data in gatherCounts
In [351]:
#and now...read in the file...can use this without having to go through the pain of rerunning gatherCounts
gatherCounts = cpk.load(open('gatherCounts_norm2mean.2016.03.31.pickle','rb'))
gatherCounts.head(2)
Out[351]:
In [352]:
colLabel
Out[352]:
In [353]:
newCols = colLabel[2:]
In [354]:
cpdCols = colLabel[2::2]
cpdCols
Out[354]:
In [355]:
geneCols = colLabel[3::2]
geneCols
Out[355]:
In [356]:
#only keep the ones where I have some value...no sense in tracking zeros
s = gatherCounts[(gatherCounts.loc[:,newCols].values > 0).any(axis=1)]
pGroup = pd.unique(s.pathwayGroup_A.ravel())
In [357]:
dfHighest = pd.DataFrame(index = pGroup,columns = newCols)
#do the math - add up the genes/cpds by higher level grouping
for i, group in s.groupby('pathwayGroup_A'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
dfHighest.loc[i,newCols] = out
In [358]:
# playing around with color palettes
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
dfHighest.plot(kind = 'bar',color=useColors)
Out[358]:
In [359]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot = dfHighest.loc[:,cpdCols]
toPlot.plot(kind = 'bar',color = useColors)
Out[359]:
In [360]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot = dfHighest.loc[:,geneCols]
toPlot.plot(kind = 'bar',color = useColors)
Out[360]:
In [361]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gatherCounts[gatherCounts.loc[:,'pathwayGroup_A']=='Metabolism']
In [362]:
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
dataToPlot = s.loc[:,newCols]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayGroup_B'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtab.loc[i,newCols] = out
In [363]:
s.head(5)
Out[363]:
In [ ]:
In [364]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot_cpds = newDFmtab.loc[:,cpdCols]
toPlot_cpds.plot(kind = 'barh',color=useColors,figsize=(8,8))
# toPlot_cpds.to_csv('compounds_byKmeans.csv')
Out[364]:
In [365]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot_cpds = newDFmtab.loc[:,geneCols]
toPlot_cpds.plot(kind = 'barh',color=useColors,figsize = (8,8))
Out[365]:
In [366]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
toPlot_cpds = newDFmtab.loc[:,cpdCols]
toPlot_cpds.T.plot(kind = 'barh',color=useColors,figsize = (12,12))
plt.legend(bbox_to_anchor=([-0.15, 0.5]))
Out[366]:
In [367]:
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
pGroup = pd.unique(plotMetabolism.pathwayInfo.ravel())
newDFmtabLower = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayInfo'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtabLower.loc[i,newCols] = out
In [368]:
testing = toPlot_cpds.loc[:,'Km2_cpd']
testing.plot(kind = 'barh',color = 'blue')
plt.legend(bbox_to_anchor=([-1, 0.5]))
Out[368]:
In [369]:
toPlot_cpds_proportion=toPlot_cpds.copy()
toPlot_cpds['sum']=toPlot_cpds.sum(axis=1)
for i in toPlot_cpds_proportion.columns:
toPlot_cpds_proportion[i]=toPlot_cpds[i]/toPlot_cpds['sum']
toPlot_cpds=toPlot_cpds.T.drop('sum').T
In [370]:
#what about the genes?
toPlot_genes = newDFmtab.loc[:,geneCols]
toPlot_genes_proportion=toPlot_genes.copy()
toPlot_genes['sum']=toPlot_genes.sum(axis=1)
for i in toPlot_genes_proportion.columns:
toPlot_genes_proportion[i]=toPlot_genes[i]/toPlot_genes['sum']
toPlot_genes=toPlot_genes.T.drop('sum').T
#toPlot_genes.to_csv('genes_byKmeans.csv')
In [371]:
#can play around with the colors
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
toPlot_cpds_proportion.plot(kind = 'barh',stacked=True,color=useColors)
plt.legend(bbox_to_anchor=([-1, 0.5]))
plt.title('proportions')
current_figure = plt.gcf()
# mpl.rcParams['pdf.fonttype'] = 42
#current_figure.savefig('cpd_proportions.pdf')
In [372]:
toPlot_genes_proportion.plot(kind = 'barh',stacked=True,color=useColors)
plt.legend(bbox_to_anchor=([-1, 0.5]))
plt.title('proportions')
current_figure = plt.gcf()
mpl.rcParams['pdf.fonttype'] = 42
current_figure.savefig('genes_proportions.pdf')
In [373]:
working = toPlot_genes.T
workingC = toPlot_cpds.T
In [374]:
working['sum'] = toPlot_genes.T.sum(axis = 1)
workingC['sum'] = toPlot_cpds.T.sum(axis=1)
In [375]:
for i in workingC.columns:
workingC[i] = workingC[i]/workingC['sum']
workingC = workingC.T.drop('sum').T
In [376]:
for i in working.columns:
working[i] = working[i]/working['sum']
working = working.T.drop('sum').T
In [377]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
working.plot(kind = 'barh',stacked=True,color = useColors,figsize=(12,12))
plt.legend(bbox_to_anchor=([-1, 0.5]))
Out[377]:
In [378]:
useColors = pal.colorbrewer.qualitative.Paired_12.hex_colors
workingC.plot(kind = 'barh',stacked=True,color = useColors,figsize=(12,12),xlim=(0,1))
plt.legend(bbox_to_anchor=([-1, 0.5]))
Out[378]:
In [379]:
# #exported this table to a CSV file for the paper (did one for compounds too)
# toPlot_genes.to_csv('genes.csv')
# toPlot_cpds.to_csv('cpds.csv')
In [380]:
# plot one compound or gene (for paper)
oneCO = 'C02666'
plotOne = forRelatedness[forRelatedness['KEGG']==oneCO]
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
kData.T.plot(color = 'r',ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
#pngName = 'pathway' + item + '_' + m.group(0) + '_' + oneCO + '.png'
# fig.savefig(pngName)
Out[380]:
In [381]:
shortList = ['C00020']
#note change to get values from list with multiple items
plotOne = forRelatedness.loc[forRelatedness['KEGG'].isin(shortList)]
In [382]:
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
# kData.T.plot(color = 'r',ax=ax, ylim = [0,5])
kData.T.plot(ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
ax.legend(handles, labels, bbox_to_anchor = ([-0.5, 0.5]))
# fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
fig.suptitle('plotting select items by hand')
# pngName = 'pathway' + item + '_' + m.group(0) + '_' + oneCO + '.png'
fig.savefig('AMP.pdf')
In [383]:
shortList = ['K01081']
plotOne = forRelatedness.loc[forRelatedness['KEGG'].isin(shortList)]
plotOne
Out[383]:
In [384]:
kData = plotOne.ix[:,dayList]
fig,ax = plt.subplots(1)
kData.T.plot(ax=ax, ylim = [0,5])
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
##kegg_list('C00020').read()
#will have annoying tabs, use this to find them
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
# ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
# fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
fig.suptitle('plotting select items by hand')
#pdfName = 'PhenlypropanoidPathway2' + '.pdf'
#mpl.rcParams['pdf.fonttype'] = 42
#fig.savefig(pdfName)
Out[384]:
In [385]:
#how many reactions does AMP appear in? pantothenic acid?
In [386]:
kegg_list('C00020').read()
Out[386]:
In [387]:
cpdList = kegg_link('reaction','C00020') #how many reactions does AMP participate in?
In [388]:
#set up a function to get the list of compounds for a given pathway (must be defined as ko00140 NOT map00140)
def countReactions_byCompound(c_id):
cpdList = kegg_link('reaction',c_id).read() # query and read the pathway
nReactions = 0
for line in cpdList.rstrip().split("\n"):
nReactions = nReactions + 1
return nReactions
countReactions_byCompound('C00864')
Out[388]:
In [389]:
#set up a function to get the list of compounds for a given pathway (must be defined as ko00140 NOT map00140)
def countReactions_byGene(k_id):
cpdList = kegg_link('reaction',k_id).read() # query and read the pathway
nReactions = 0
for line in cpdList.rstrip().split("\n"):
nReactions = nReactions + 1
return nReactions
countReactions_byGene('K00944')
Out[389]:
In [390]:
kegg_link('module','C00020').read()
Out[390]:
In [391]:
kegg_find('module','m00049').read()
Out[391]:
In [392]:
kegg_find('pathway','map00230').read()
Out[392]:
In [393]:
#this provides the complete list of pathways at KEGG:
test = kegg_list('pathway')
#can see the list with this command:
#test.read()
In [394]:
# kegg_list('reaction').read()
In [395]:
kegg_find('ko','K11752').read()
Out[395]:
In [396]:
##bingo. This will provide both the genes and the compounds within a given pathway
#print kegg_get('ko00230').read()
In [397]:
kegg_link('cpd','cpd:C00020').read()
Out[397]:
In [398]:
kegg_link('pathway','C00020').read()
Out[398]:
In [399]:
kegg_link('brite','K00926').read()
Out[399]:
In [400]:
kegg_link('brite','C00020').read()
Out[400]:
In [405]:
#this makes a numpy.ndarray
temp = toPlot_genes.as_matrix(columns = geneCols)
type(temp)
Out[405]:
In [406]:
#this will also make a numpy.ndarray
temp = toPlot_genes.values
type(temp)
Out[406]:
In [407]:
#pluck code from Harriet, trim down a bit, need a few pieces
import palettable.colorbrewer as b2m
from matplotlib.colors import LogNorm, NoNorm, BoundaryNorm
In [408]:
#this will make a pandas DataFrame ...which is what I need for the Heatmap code I have
# data = toPlot_genes
data = toPlot_cpds_proportion
type(data)
Out[408]:
In [409]:
# The global and overview maps will be empty, so get rid of it
data = data.drop(['Global and overview maps'])
In [410]:
def HeatMap(heatmapData, columns=None,colormap=b2m.sequential.GnBu_3.get_mpl_colormap(), m=1e-5):
#note - m in the above function is a small, but non-zero number, allows the LogNorm function to work
#clean up the data to make a pretty heatmap
heatmapData['mean']=heatmapData.mean(skipna = 1, axis=1) #calculate mean value for each class
heatmapData=heatmapData.sort_values(by='mean', ascending=False)#Sort by the mean value
heatmapData=heatmapData.drop('mean',1) #drop mean column
heatmapData=heatmapData.loc[heatmapData.sum(axis=1)!=0]
col_labels=list(heatmapData.index)
row_labels=list(heatmapData.columns.values)
fig3,ax3=plt.subplots()
fig3.set_figheight(len(col_labels)/2)
fig3.set_figwidth(len(row_labels)*2)
#this works, but looks like one color until I start increasing ncolors to bigger numbers...
bounds = np.linspace(heatmapData.min().min() ,heatmapData.max().max())
heatmap3 = ax3.pcolor(heatmapData, cmap=colormap, norm=BoundaryNorm(boundaries = bounds,ncolors=300))
ax3.set_xticks(np.arange(heatmapData.shape[1])+0.5, minor=False)
ax3.set_yticks(np.arange(heatmapData.shape[0])+0.5, minor=False)
ax3.invert_yaxis()
ax3.xaxis.tick_top()
ax3.margins(0,0)
ax3.set_xticklabels(row_labels, minor=False)
ax3.set_yticklabels(col_labels, minor=False)
plt.colorbar(heatmap3)
plt.show()
return fig3
Testing=HeatMap(data)
#Testing.savefig('Insitu_KEGG_Heatmap.pdf')
In [411]:
data = forRelatedness.loc[:,dayList].values
In [412]:
type(data)
Out[412]:
In [413]:
#the next plots work with numpy.ndarray
In [414]:
row_labels = list(dayList)
fig,ax = plt.subplots()
heatmap = ax.pcolor(data,cmap = 'YlOrRd')
# put the major ticks at the middle of each cell
ax.set_xticks(np.arange(data.shape[1])+0.5, minor=False)
ax.set_xticklabels(row_labels, minor=False)
cbar = fig.colorbar(heatmap,ticks = range(5))
plt.show()
In [415]:
plt.pcolor(data)
Out[415]:
In [421]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gatherCounts[gatherCounts.loc[:,'pathwayGroup_A']=='Metabolism']
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
dataToPlot = s.loc[:,newCols]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayGroup_C'):
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtab.loc[i,newCols] = out
#this next line is the part that selects compounds or genes
# toPlot_heatmap = newDFmtab.loc[:,cpdCols]
toPlot_heatmap = newDFmtab.loc[:,geneCols]
# The global and overview maps will be empty, so get rid of it
toPlot_heatmap = toPlot_heatmap.drop(['Global and overview maps'])
In [422]:
Testing=HeatMap(toPlot_heatmap)
In [425]:
#consider different options for the heat map...average value per row? normalize to maximum value?
In [457]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gatherCounts[gatherCounts.loc[:,'pathwayGroup_A']=='Metabolism']
s = plotMetabolism[(plotMetabolism.loc[:,newCols].values > 0).any(axis=1)]
dataToPlot = s.loc[:,newCols]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = newCols)
for i, group in s.groupby('pathwayGroup_B'): #pathwayGroup_C is the most detailed rendition
d2 = group.loc[:,newCols]
out = d2.sum(axis=0)
newDFmtab.loc[i,newCols] = out
#this next line is the part that selects compounds or genes
# toPlot_heatmap = newDFmtab.loc[:,cpdCols]
toPlot_heatmap = newDFmtab.loc[:,geneCols]
# The global and overview maps will be empty, so get rid of it
toPlot_heatmap = toPlot_heatmap.drop(['Global and overview maps'])
In [450]:
In [451]:
toPlot_heatmap.max()
Out[451]:
In [452]:
toPlot_heatmap.min()
Out[452]:
In [ ]:
#to normalize to mean, take advantage of the function Harriet already wrote...
out = NormalizeToMean(toPlot_heatmap)
In [458]:
def HeatMap(heatmapData, columns=None,colormap=b2m.sequential.GnBu_3.get_mpl_colormap()):
#use the data to sort the rows in the heatmap (not actually changing the data except to delete empty rows)
heatmapData['mean']=heatmapData.mean(skipna = 1, axis=1) #calculate mean value for each class
heatmapData=heatmapData.sort_values(by='mean', ascending=False)#Sort by the mean value
heatmapData=heatmapData.drop('mean',1) #drop mean column
heatmapData=heatmapData.loc[heatmapData.sum(axis=1)!=0] #get rid of rows with zeros
col_labels=list(heatmapData.index)
row_labels=list(heatmapData.columns.values)
fig3,ax3=plt.subplots()
fig3.set_figheight(len(col_labels)/2)
fig3.set_figwidth(len(row_labels)*2)
#this works, but I haven't quite figured out the best values for the colors (though I like the green/blue)
bounds = np.linspace(heatmapData.min().min() ,heatmapData.max().max())
heatmap3 = ax3.pcolor(heatmapData, cmap=colormap, norm=BoundaryNorm(boundaries = bounds,ncolors=300))
ax3.set_xticks(np.arange(heatmapData.shape[1])+0.5, minor=False)
ax3.set_yticks(np.arange(heatmapData.shape[0])+0.5, minor=False)
ax3.invert_yaxis()
ax3.xaxis.tick_top()
ax3.margins(0,0)
ax3.set_xticklabels(row_labels, minor=False)
ax3.set_yticklabels(col_labels, minor=False)
plt.colorbar(heatmap3)
plt.show()
return fig3
Testing=HeatMap(toPlot_heatmap)
Testing.savefig('testing.png', bbox_inches = 'tight')
# Testing.savefig('Testing_Heatmap.pdf')
In [ ]:
In [ ]:
In [ ]: