Notebook to visualize how the number of genes/compounds shrinks when we require them to be 'connected'

via a biochemical pathway.

KL 31 March 2016 ; 4 April 2016


In [409]:
import pandas as pd
import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
%matplotlib inline
import cPickle as cpk

In [410]:
# Load the pickled 'unconnected' per-pathway count table.
# A context manager is used so the file handle is closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by this project.
with open('gatherCounts_unconnected_norm2mean.2016.03.31.pickle', 'rb') as fh:
    unconnectedCounts = cpk.load(fh)
unconnectedCounts.head(2)


Out[410]:
nCpds nGenes Km0_cpd Km0_gene Km1_cpd Km1_gene Km2_cpd Km2_gene Km3_cpd Km3_gene Km4_cpd Km4_gene Km5_cpd Km5_gene Km6_cpd Km6_gene pathwayInfo pathwayGroup_A pathwayGroup_B pathwayGroup_C
ko00010 31 96 0 33 0 6 0 1 1 1 0 12 3 1 0 2 Glycolysis / Gluconeogenesis Metabolism Carbohydrate metabolism Glycolysis / Gluconeogenesis
ko00020 20 57 3 31 0 4 0 0 0 2 0 2 2 0 0 2 Citrate cycle (TCA cycle) Metabolism Carbohydrate metabolism Citrate cycle (TCA cycle)

In [411]:
# Load the pickled 'connected' per-pathway count table.
# A context manager is used so the file handle is closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by this project.
with open('gatherCounts_norm2mean.2016.03.31.pickle', 'rb') as fh:
    connectedCounts = cpk.load(fh)
connectedCounts.head(2)


Out[411]:
nCpds nGenes Km0_cpd Km0_gene Km1_cpd Km1_gene Km2_cpd Km2_gene Km3_cpd Km3_gene Km4_cpd Km4_gene Km5_cpd Km5_gene pathwayInfo pathwayGroup_A pathwayGroup_B pathwayGroup_C
ko00010 31 96 1 0 0 1 0 1 0 14 3 0 0 0 Glycolysis / Gluconeogenesis Metabolism Carbohydrate metabolism Glycolysis / Gluconeogenesis
ko00020 20 57 0 2 0 2 0 0 3 16 2 0 0 1 Citrate cycle (TCA cycle) Metabolism Carbohydrate metabolism Citrate cycle (TCA cycle)

In [412]:
#well, reading in was easy, now the question is how to plot these?
#first attempt is way, way too busy:

In [413]:
# Split the K-means count columns into compound ('_cpd') and gene ('_gene') columns.
# NOTE(review): the original cell sliced `colLabel`, a name that is never defined in
# this notebook (leftover kernel state), so it raised NameError on a fresh kernel.
# Assumes `colLabel` held the connectedCounts column labels -- confirm.
cpdCols = [col for col in connectedCounts.columns if col.endswith('_cpd')]
geneCols = [col for col in connectedCounts.columns if col.endswith('_gene')]

In [414]:
# First plotting attempt: horizontal bars of every gene-cluster column for each row
# of the connected table (this is the "way too busy" figure).
tData = connectedCounts[geneCols]
tData.plot.barh()


Out[414]:
<matplotlib.axes._subplots.AxesSubplot at 0x3360fac8>

In [415]:
# Show the previously saved combined K-means figure inline for reference.
from IPython.display import Image
Image(filename='CombinedKOandCO_Kmeans_allGenesAndCpds.png')


Out[415]:

In [416]:
# Shortcut: the connected and unconnected K-means figures were matched up by hand and
# their titles were typed into a table in Excel; read that hand-built table in here.
cf = pd.read_csv('SummarizeConnected_and_Unconnected.csv', header=0)

In [417]:
# Display the whole summary table (it is small); note the trailing all-NaN row.
cf


Out[417]:
description Kmeans type unconnected connected
0 S2 and some S5 higher Km0 cpd 1162 395
1 S2 and some S5 higher Km0 gene 1038 45
2 S1 highest Km1 cpd 130 17
3 S1 highest Km1 gene 578 50
4 even or S4 Km2 cpd 1190 414
5 even or S4 Km2 gene 4992 494
6 S3 highest Km3 cpd 133 39
7 S3 highest Km3 gene 559 51
8 S5 highest Km4 cpd 540 236
9 S5 highest Km4 gene 536 23
10 massive S2 Km5 cpd 1782 312
11 massive S2 Km5 gene 157 8
12 NaN NaN NaN NaN NaN

In [418]:
# The last row (position 12) came in empty -- look at it before removing it
cf.iloc[12]


Out[418]:
description    NaN
Kmeans         NaN
type           NaN
unconnected    NaN
connected      NaN
Name: 12, dtype: object

In [419]:
# Delete the empty row. Dropping rows in which every value is NaN (instead of the
# hard-coded label 12) gives the same table but is safe to re-run: drop([12]) raises
# KeyError the second time the cell is executed because the row is already gone.
cf = cf.dropna(how='all')

In [420]:
# Build the x-axis tick labels for the figure: one 'Km<i>' label per K-means cluster
makeNclusters = 6
bar_labels = ['Km' + str(idx) for idx in range(makeNclusters)]

In [421]:
# Sanity check: show the generated tick labels
bar_labels


Out[421]:
['Km0', 'Km1', 'Km2', 'Km3', 'Km4', 'Km5']

In [422]:
# Split the summary table by its 'type' column into gene rows and compound rows,
# and index each piece by the 'Kmeans' column.
justGenes = cf[cf['type'] == 'gene'].set_index('Kmeans')
justCpds = cf[cf['type'] == 'cpd'].set_index('Kmeans')

In [423]:
# Grouped bar chart of unconnected vs. connected counts for each K-means group.
# Gene counts are plotted as-is; compound counts are negated so they are drawn on the
# opposite side of the zero line.
pos = list(range(len(justGenes)))      # one x slot per row of justGenes
wid = 0.25                             # width of a single bar
shifted = [p - wid for p in pos]       # x positions for the 'unconnected' bars

colors = pal.colorbrewer.qualitative.Set1_4.hex_colors

fig, ax = plt.subplots(figsize=(10, 5))

# Genes: the 'unconnected' bar sits one bar-width to the left of the 'connected' bar
ax.bar(shifted, justGenes['unconnected'], wid, color=colors[0])
ax.bar(pos, justGenes['connected'], wid, color=colors[1])

# Compounds: same x positions as the genes, values made negative.
# (Genes and compounds are drawn with separate calls; the author did not find a way
# to plot both in a single call.)
ax.bar(shifted, -justCpds['unconnected'], width=wid, color=colors[2])
ax.bar(pos, -justCpds['connected'], width=wid, color=colors[3])

# Axis label and title
ax.set_ylabel('# of genes/compounds')
ax.set_title('neg #s are compounds')

# One tick per K-means group, labelled with the Km names
ax.set_xticks(range(len(justGenes)))
ax.set_xticklabels(bar_labels)

# Leave two bar-widths of padding on either side of the groups
ax.set_xlim(min(pos) - wid * 2, max(pos) + wid * 2)

# Legend entries are listed in the same order the bars were added
ax.legend(['unconnected gene', 'connected gene', 'unconnected cpd', 'connected cpd'],
          loc='upper left')

# Straight black line at zero to separate genes from compounds
ax.plot([-5, 6], [0, 0], color='#000000')

# plt.show() is deliberately not called before saving: the author observed that
# turning it on made every later attempt to save produce a blank figure.
fig.savefig('connectedVunconnected.png')



In [ ]: