In [22]:
import pandas as pd
# import urllib2
# from bs4 import BeautifulSoup
# import re
# from sklearn.cluster import KMeans
import palettable as pal
# from itertools import chain
import os
# import os.path #? not sure if I need both
# import glob
# import numpy as np
# from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()()
import matplotlib.pyplot as plt
# import matplotlib as mpl #KL moved this here 2/12/2016
#BUT...then messes with the plotting
# mpl.rcParams['pdf.fonttype'] = 42
%matplotlib inline
# from collections import Counter
import cPickle as cpk
# from stackedBarGraph import StackedBarGrapher
# SBG = StackedBarGrapher()
In [5]:
mtabFile = 'RImetabolites_isomers.2015.08.27.csv' #first column is RInumber
In [6]:
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')
In [20]:
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
COnumbers.to_csv('exportCOnumbers.csv',header=True)
In [23]:
if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
#just read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
#need to make the file
filename = "CreateHash_COtoKO.py"
%run $filename exportCOnumbers.csv
#then read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
In [24]:
def SplitCODict(WorkingFile):
CO_withoutKO={}
CO_withKO={}
for CO in WorkingFile.keys():
if WorkingFile[CO]['Related KO']==[]:
CO_withoutKO[CO]=WorkingFile[CO]
else:
CO_withKO[CO]=WorkingFile[CO]
return CO_withoutKO, CO_withKO
CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'
In [25]:
AllKO=[]
AllCO=[]
for key in CO_withKO:
AllKO.append(CO_withKO[key]['Related KO'])
AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))
# KO_limited_Norm2Mean=KO_Norm2Mean.loc[AllKO].dropna()
# CO_limited_Norm2Mean=CO_Norm2Mean.loc[AllCO].dropna()
In [26]:
#go through CO_RawData_all one row at a time (inefficient for sure, but I understand
#what is happening), then make a new column in CO_RawData_all that is True/False
CO_fromMATLAB['inList'] = ""
for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
fc = CO_fromMATLAB.ix[idx,'cNumber']
if fc in AllCO:
CO_fromMATLAB.ix[idx,'inList'] = True
else:
CO_fromMATLAB.ix[idx,'inList'] = False
In [27]:
#can't quite figure out how to do this in one step.
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]
#this list of days is useful, so define it up front.
dayList = ['S1','S2','S3','S4','S5'] #this makes a list (mutable, can be changed)
CO_RawData_pruned = m.loc[:,dayList]
del m
In [8]:
#Load PhytoKEGG Annotations
#from Harriet...doesn't work, complaining about header=False
#AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')
#try this instead (note double \\ at end)
pathToData = 'Z:\KLtemp\TranscriptomicsData_Feb2016\\'
AllPhytoKO_ann=pd.read_table((pathToData + 'AllPhytoKegg_annotated.tab'), delimiter='\t')
In [9]:
InsituCounts=pd.read_table((pathToData + 'AllInsitu_NoZero.tab'), index_col='gID')
In [10]:
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6
#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()
In [11]:
KO_RawData=InsituTPM.groupby('kID').sum()
In [28]:
cmap=pal.colorbrewer.sequential.YlOrRd_5.get_mpl_colormap()
fig, axs=plt.subplots(2,2)
fig.set_size_inches(12,12)
for ax in axs:
for a in ax:
a.set_ylim([0,1000])
CO_RawData_pruned.plot(kind='hist', bins=100,colormap=cmap, ax=axs[0][0])
axs[0][0].set_title('Histogram of CO "concentrations"', size='large')
KO_RawData.plot(kind='hist', bins=100,colormap=cmap,ax=axs[0][1])
axs[0][1].set_title('Histogram of KO TPM', size='large')
CO_RawData_pruned.plot(kind='hist', bins=100,colormap=cmap, range = [0,0.1],ax=axs[1][0])
KO_RawData.plot(kind='hist', bins=100,colormap=cmap, range = [0,50],ax=axs[1][1])
Out[28]:
In [29]:
def NormalizeToMean(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['mean']=DF.mean(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
DF_meanNorm=DF_meanNorm.T.drop('mean').T
return out
def NormalizeToMax(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['max']=DF.max(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['max']
DF_meanNorm=DF_meanNorm.T.drop('max').T
return out
def NormalizeToMean_CV(DF):
out=DF.copy()
out['mean']=DF.mean(axis=1)
out['SD']=DF.std(axis=1)
out['CV']=out['SD']/out['mean']
return out
In [30]:
#several options for normalizing the data
CO_Norm2Mean=NormalizeToMean(CO_RawData_pruned) #this is what gets used in the original code
KO_Norm2Mean=NormalizeToMean(KO_RawData) #this is what gets used in the original code
CO_Norm2Max=NormalizeToMax(CO_RawData_pruned)
KO_Norm2Max=NormalizeToMax(KO_RawData)
cmap=pal.colorbrewer.sequential.YlOrRd_5.get_mpl_colormap()
fig, axs=plt.subplots(1,2)
fig.set_size_inches(12,6)
kplt=KO_Norm2Mean.plot(kind='hist', bins=100, title='KO Mean Normalized', colormap=cmap, ax=axs[1])
cplt=CO_Norm2Mean.plot(kind='hist', bins=100, title='CO Mean Normalized', colormap=cmap, ax=axs[0])
fig, axs=plt.subplots(1,2)
fig.set_size_inches(12,6)
kplt=KO_Norm2Max.plot(kind='hist', bins=100, title='KO Max Normalized', colormap=cmap, ax=axs[1])
cplt=CO_Norm2Max.plot(kind='hist', bins=100, title='CO Max Normalized', colormap=cmap, ax=axs[0])
In [31]:
#could also try the normalize to mean, CV
cmap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap()
fig,ax=plt.subplots(1)
CO_CV=NormalizeToMean_CV(CO_RawData_pruned)
KO_CV=NormalizeToMean_CV(KO_RawData)
#KL note: by using KO_CV.CV, this will only plot the 'CV' variable in the data frame
KO_CV.CV.plot(kind='hist', ax=ax, bins=100, color='r')
CO_CV.CV.plot(kind='hist', ax=ax, bins=100, title='Coefficient of Variation', color='k')
#fig.savefig('Coefficent of Variation')
Out[31]:
In [32]:
#use _finalOption variable names to make life easier
KO_finalOption = KO_Norm2Mean.loc[AllKO].dropna()
CO_finalOption = CO_Norm2Mean.dropna() #already 'limited' this before the normalization