In [2]:
#%reset
In [3]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.cluster import KMeans
import cPickle as cpk
import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from Bio.KEGG.REST import *
%matplotlib inline
In [4]:
mtabFile = 'RImetabolites_isomers.2015.08.27.csv' #first column is RInumber
In [5]:
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')
In [6]:
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
del td
COnumbers.to_csv('exportCOnumbers.csv',header=True)
In [7]:
def findRInumber(dataIn,KEGGin):
#find possible RI numbers for a given KEGG number.
dataOut = []
for i,KEGG in enumerate(dataIn['KEGG']):
if KEGG == KEGGin:
t = dataIn.index[i]
dataOut.append(t)
return dataOut
##For example: this will give back one row, C18028 will be multiple
#m = findRInumber(forRelatedness,'C00078')
def convertRItoCO(dataIn,RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = dataIn.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value
#m = convertRItoCO(forRelatedness,'RI2')
#slight change, no need to send in a comparison file if it always the same thing
def convertRItoCO2(RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = CO_fromMATLAB.loc[RIin].loc['cNumber']
return dataOut
##This will always be a single value, also uses CO_fromMATLAB as input
In [8]:
#This grabs the CO/KO links from the KEGG website. The actual code is in the
#CreateHash_COtoKO.py that Harriet wrote. Note that since the exportCOnumbers.csv
#file is a unique list of C number we essentially already have a lookup table
#for all the metabolites of interest.
if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
#just read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
#need to make the file
filename = "CreateHash_COtoKO.py"
%run $filename exportCOnumbers.csv
#then read in the file
WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
In [9]:
def SplitCODict(WorkingFile):
CO_withoutKO={}
CO_withKO={}
for CO in WorkingFile.keys():
if WorkingFile[CO]['Related KO']==[]:
CO_withoutKO[CO]=WorkingFile[CO]
else:
CO_withKO[CO]=WorkingFile[CO]
return CO_withoutKO, CO_withKO
CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'
In [10]:
AllKO=[]
AllCO=[]
for key in CO_withKO:
AllKO.append(CO_withKO[key]['Related KO'])
AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))
In [11]:
#go through CO_RawData_all one row at a time (inefficient for sure, but I understand
#what is happening), then make a new column in CO_RawData_all that is True/False
CO_fromMATLAB['inList'] = ""
for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
fc = CO_fromMATLAB.ix[idx,'cNumber']
if fc in AllCO:
CO_fromMATLAB.ix[idx,'inList'] = True
else:
CO_fromMATLAB.ix[idx,'inList'] = False
In [12]:
#can't quite figure out how to do this in one step.
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]
#this list of days is useful, so define it up front.
dayList = ['S1','S2','S3','S4','S5']
CO_RawData_pruned = m.loc[:,dayList]
del m
In [13]:
#Load PhytoKEGG Annotations
#from Harriet...doesn't work, complaining about header=False
#AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')
#try this instead (note double \\ at end)
pathToData = 'Z:\KLtemp\TranscriptomicsData_Feb2016\\'
AllPhytoKO_ann=pd.read_table((pathToData + 'AllPhytoKegg_annotated.tab'), delimiter='\t')
InsituCounts=pd.read_table((pathToData + 'AllInsitu_NoZero.tab'), index_col='gID')
In [14]:
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6
#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()
KO_RawData=InsituTPM.groupby('kID').sum()
In [14]:
def NormalizeToMean(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['mean']=DF.mean(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
DF_meanNorm=DF_meanNorm.T.drop('mean').T
return out
def NormalizeToMax(DF):
DF_meanNorm=DF.copy()
out=DF_meanNorm.copy()
DF_meanNorm['max']=DF.max(axis=1)
for i in out.columns:
out[i]=DF_meanNorm[i]/DF_meanNorm['max']
DF_meanNorm=DF_meanNorm.T.drop('max').T
return out
def NormalizeToMean_CV(DF):
out=DF.copy()
out['mean']=DF.mean(axis=1)
out['SD']=DF.std(axis=1)
out['CV']=out['SD']/out['mean']
return out
In [15]:
if False:
#norm2Mean seems based (see plots in earlier notebooks)
KO_Norm2Mean=NormalizeToMean(KO_RawData)
##two versions of the CO data: (1) regular and (2) inverse
CO_Norm2Mean_regular=NormalizeToMean(CO_RawData_pruned)
CO_Norm2Mean_inverse=NormalizeToMean(1/CO_RawData_pruned)
In [16]:
#use _finalOption variable names to make life easier
KO_finalOption = KO_RawData.loc[AllKO].dropna()
CO_final_regular = CO_RawData_pruned.dropna() #already 'limited' this before the normalization
#CO_final_inverse = CO_Norm2Mean_inverse.dropna() #already 'limited' this before the normalization
In [17]:
##Combine the CO and the KO data in preparation for K means clustering
Combined_KO_CO_regular=KO_finalOption.append(CO_final_regular.loc[:,(dayList)])
#Combined_KO_CO_inverse=KO_finalOption.append(CO_final_inverse.loc[:,(dayList)])
In [18]:
def kmeanCluster(data,nc):
#kmeans=KMeans(n_clusters=nc)
kmeans = KMeans(n_clusters = nc, max_iter = 1000, n_init = 50, init = 'random')
kmeans.fit(data)
newData=data.copy()
newData['kmeans']=kmeans.labels_
return newData
def PlotKmeans(KmeansPD, kSize=10, figSizeX=1, figSizeY=5, color='k'):
KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color=color)
fig,axs=plt.subplots(figSizeX, figSizeY)
axs=[item for sublist in axs for item in sublist]
fig.set_size_inches(9,12)
for ax, y in zip(axs,range(kSize)):
pltData=KmeansPD[KmeansPD.kmeans==y].T.drop('kmeans')
pltData.plot(ax=ax, legend=False, grid=False, color=color)
Move forward with 'best' number of clusters --> use Determine_Optimal_Number_Kmeans_Groups.ipynb to determine this number
In [19]:
#setting # of clusters manually, also some good options with lower # of clusters I think
#this number will get used later when plotting up the BRITE categories and the Kmeans clusters
makeNclusters = 6
In [23]:
#do the K-means clustering with the final # of clusters
CcoClust_regular=kmeanCluster(Combined_KO_CO_regular, makeNclusters)
#CcoClust_inverse=kmeanCluster(Combined_KO_CO_inverse, makeNclusters)
In [25]:
CcoClust_regular.head(5)
Out[25]:
In [30]:
CcoClust_regular.kmeans.unique()
Out[30]:
In [31]:
CcoClust_regular['kmeans'].value_counts()
Out[31]:
In [ ]:
#import matplotlib as mpl
def PlotKmeansCombined(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
fig,axs=plt.subplots(figSizeX, figSizeY)
axs=[item for sublist in axs for item in sublist]
fig.set_size_inches(15,9)
i=KmeansPD.index
i=list(i)
Ks=re.compile('K.*')
#Cs=re.compile('C.*')
Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
C = filter(Cs.search, i)
K = filter(Ks.search, i)
Ksplit=KmeansPD.loc[K]
Csplit=KmeansPD.loc[C]
for ax, y in zip(axs,range(kSize)):
KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
KData.plot(ax=ax, legend=False, grid=False, color='b')
CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
CData.plot(ax=ax, legend=False, grid=False, color='r')
SumKC=len(KData.T)+len(CData.T)
KPct=(len(KData.T))
CPct=(len(CData.T))
ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
ax.set_ylim([0,5])
return fig
#fig.savefig(figureName)
In [34]:
#import matplotlib as mpl
def PlotKmeansCombined_2(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
fig,axs=plt.subplots(figSizeX, figSizeY)
axs=[item for sublist in axs for item in sublist]
fig.set_size_inches(15,9)
i=KmeansPD.index
i=list(i)
Ks=re.compile('K.*')
#Cs=re.compile('C.*')
Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
C = filter(Cs.search, i)
K = filter(Ks.search, i)
Ksplit=KmeansPD.loc[K]
Csplit=KmeansPD.loc[C]
for ax, y in zip(axs,range(kSize)):
KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
if CData.empty and KData.empty:
continue
elif CData.empty:
KData.plot(ax=ax, legend=False, grid=False, color='r')
elif KData.empty:
CData.plot(ax=ax, legend=False, grid=False, color='k')
else:
CData.plot(ax=ax, legend=False, grid=False, color='k')
KData.plot(ax=ax, legend=False, grid=False, color='r')
# KData.plot(ax=ax, legend=False, grid=False, color='b')
# CData.plot(ax=ax, legend=False, grid=False, color='r')
SumKC=len(KData.T)+len(CData.T)
KPct=(len(KData.T))
CPct=(len(CData.T))
ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
#ax.set_ylim([0,5])
return fig
#fig.savefig(figureName)
In [35]:
f = PlotKmeansCombined_2(CcoClust_regular,makeNclusters)
f.savefig('Kmeans_regular_beforeMean.png')
del f
In [24]:
f = PlotKmeansCombined(CcoClust_inverse,makeNclusters)
f.savefig('Kmeans_inverse.png')
del f
In [168]:
#need this cell to get the taxonomic information used in the species plots later on
#load in the species/group information
Group_Species=pd.read_table('GrpSpecies',delimiter=' ').T.drop(['MMETSP',
'MMETSP.1']).T.drop_duplicates().set_index('SName')
InsituTPMGrped=InsituTPM.groupby(['kID','sgID']).sum().reset_index().set_index('sgID')
Group_Species=Group_Species.reset_index()
#get the specific group combos
Dia=Group_Species[Group_Species['Grp']=='Bacillariophyta']
Din=Group_Species[Group_Species['Grp']=='Dinophyta']
Oth=Group_Species[((Group_Species['Grp']!='Dinophyta')&
(Group_Species['Grp']!='Bacillariophyta'))]
#get the insitu counts
Insitu_TPM_DIA=InsituTPMGrped.loc[Dia['SName']].groupby('kID').sum()
Insitu_TPM_DIN=InsituTPMGrped.loc[Din['SName']].groupby('kID').sum()
Insitu_TPM_Oth=InsituTPMGrped.loc[Oth['SName'].dropna()].groupby('kID').sum()
D=set(Insitu_TPM_DIA.index)
N=set(Insitu_TPM_DIN.index)
O=set(Insitu_TPM_Oth.index)
In [26]:
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_regular['kmeans']):
ClusterK = CcoClust_regular[CcoClust_regular['kmeans']==i]
ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])
Di=D.intersection(ClusterKind)
Ni=N.intersection(ClusterKind)
Oi=O.intersection(ClusterKind)
Dsum=Insitu_TPM_DIA.loc[Di].sum()
Nsum=Insitu_TPM_DIN.loc[Ni].sum()
Osum=Insitu_TPM_Oth.loc[Di].sum()
ax=axs[i]
ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
In [27]:
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_inverse['kmeans']):
ClusterK = CcoClust_inverse[CcoClust_inverse['kmeans']==i]
ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])
Di=D.intersection(ClusterKind)
Ni=N.intersection(ClusterKind)
Oi=O.intersection(ClusterKind)
Dsum=Insitu_TPM_DIA.loc[Di].sum()
Nsum=Insitu_TPM_DIN.loc[Ni].sum()
Osum=Insitu_TPM_Oth.loc[Di].sum()
ax=axs[i]
ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
In [28]:
#But...for the CheckRelatedness...do need to go back to the cNumber...
#for now, easiest to just make yet another matrix and put the cNumbers back in.
forRelatedness_regular = CcoClust_regular.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_regular.insert(0,'KEGG',"") #add a column called 'KEGG'
forRelatedness_inverse = CcoClust_inverse.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_inverse.insert(0,'KEGG',"") #add a column called 'KEGG'
In [29]:
for idx in range(0,len(forRelatedness_regular)):
t = forRelatedness_regular.iloc[idx,:].name
if t[0]=='R':
#go find the matching cNumber in CO_RawData_all
t2 = CO_fromMATLAB.loc[t,('cNumber')]
forRelatedness_regular.ix[idx,('KEGG')] = t2
elif t[0] == 'K':
#just copy the K number over
forRelatedness_regular.ix[idx,('KEGG')] = t
for idx in range(0,len(forRelatedness_inverse)):
t = forRelatedness_inverse.iloc[idx,:].name
if t[0]=='R':
#go find the matching cNumber in CO_RawData_all
t2 = CO_fromMATLAB.loc[t,('cNumber')]
forRelatedness_inverse.ix[idx,('KEGG')] = t2
elif t[0] == 'K':
#just copy the K number over
forRelatedness_inverse.ix[idx,('KEGG')] = t
In [30]:
def CheckRelatedness(inClust,nC):
df=pd.DataFrame(columns=['Common Rxn','No linked Rxn'], index=range(nC))
for n in range(nC):
kClust=inClust[inClust.kmeans==n]
#i=kClust.index
i = kClust.KEGG #change the new column I created with Cnumbers and Knumbers
i=list(i)
Csearc=re.compile('C.*') #re is regular expression...perl-like
Cs = filter(Csearc.search, i)
Ksearc=re.compile('K.*')
Kis = filter(Ksearc.search, i)
Kis=set(Kis)
Ks=[]
for c in Cs:
if c in CO_withKO.keys():
Ks.append(CO_withKO[c]['Related KO'])
Ks=set([item for sublist in Ks for item in sublist])
df.loc[n,'Common Rxn']=len(Kis.intersection(Ks))
df.loc[n, 'No linked Rxn']=len(Kis)-len(Kis.intersection(Ks))
df.plot(kind='bar', stacked=True, colormap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap(), grid=False)
In [31]:
CheckRelatedness(forRelatedness_regular, makeNclusters)
In [32]:
CheckRelatedness(forRelatedness_inverse, makeNclusters)
Grab the information from KEGG for plotting and bean counting
In [33]:
allPathways = kegg_list("pathway").read()
len(allPathways.split('\n'))
#number here is the # of pathways at KEGG, up to 486 by 4/13/2016
Out[33]:
In [34]:
trimPath = []
current_section = None
for line in allPathways.rstrip().split("\n"):
tp = line[8:13]
trimPath.append('ko' + tp)
#have some cases where KEGG will send back a pathway, but the pathway itself is not searchable...seems to
#be a KEGG bug, 'ko00351' was first, then realized there are many of these,
#did this list manually since I thought it would be short...
toDelete = ('ko00351', 'ko01010','ko01060', 'ko01061', 'ko01062', 'ko01063',
'ko01064', 'ko01065', 'ko01066', 'ko01070', 'ko07011', 'ko07012',
'ko07013', 'ko07014', 'ko07015', 'ko07016', 'ko07017', 'ko07018',
'ko07019', 'ko07020', 'ko07021', 'ko07023', 'ko07024', 'ko07025',
'ko07026', 'ko07027', 'ko07028', 'ko07029', 'ko07030', 'ko07031',
'ko07032', 'ko07033', 'ko07034', 'ko07035', 'ko07036', 'ko07037',
'ko07038', 'ko07039', 'ko07040', 'ko07041', 'ko07042', 'ko07043',
'ko07044', 'ko07045', 'ko07046', 'ko07047', 'ko07048', 'ko07049',
'ko07050', 'ko07051', 'ko07052', 'ko07053', 'ko07054', 'ko07055',
'ko07056', 'ko07057', 'ko07110', 'ko07112', 'ko07114', 'ko07117',
'ko07211', 'ko07212', 'ko07213', 'ko07214', 'ko07215', 'ko07216',
'ko07217', 'ko07218', 'ko07219', 'ko07220', 'ko07221', 'ko07222',
'ko07223', 'ko07224', 'ko07225', 'ko07226', 'ko07227', 'ko07228',
'ko07229', 'ko07230', 'ko07231', 'ko07232', 'ko07233', 'ko07234',
'ko07235', 'ko04933')
#probably a way to do this without the for loop, but this will work
for item in toDelete:
trimPath.remove(item)
In [35]:
shortList = ['ko03010','ko03013','ko00140'] #for testing
In [36]:
import fxn_gatherDetails
##if I make a change, have to reload the function:
#reload(fxn_gatherDetails)
In [37]:
#usePathway = shortList #for testing...use complete list later
usePathway = trimPath
KOandCOwithKmeans = forRelatedness_regular #linked KO & CO, data, with K means information
useCO = CO_fromMATLAB #full list of CO data, need to color pathway, and lookup table
useKO = KO_Norm2Mean # full list of KO data, need to color pathway
dia = Insitu_TPM_DIA #diatom TPM, for plotting species
din = Insitu_TPM_DIN #dino TPM, for plotting species
oth = Insitu_TPM_Oth #other TPM, for plotting species
folder = 'plots_regular_Km'
In [38]:
gc_regular = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)
In [39]:
#switch to inverse, repeat
KOandCOwithKmeans = forRelatedness_inverse #linked KO & CO, data, with K means information
folder = 'plots_inverse_Km'
gc_inverse = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)
In [40]:
#now...save all that so I don't have to do this everytime BUT be careful with re-assigning K means group numbers!
cpk.dump(gc_regular, open('gatherCounts_norm2mean_regular.2016.04.18.pickle', 'wb'))
cpk.dump(gc_inverse, open('gatherCounts_norm2mean_inverse.2016.04.18.pickle', 'wb'))
## this is the code to read in the file...can use this without having to go through the pain of rerunning gatherCounts
#gatherCounts_regular = cpk.load(open('gatherCounts_norm2mean_regular.2016.04.13.pickle','rb'))
In [41]:
def plotGenes_ratio(tg):
#if this is something in the pathway, plot up the species for the K number
if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
DaDn=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_DIN.loc[tg]
if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
DaOt=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_Oth.loc[tg]
if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
DnOt=Insitu_TPM_DIN.loc[tg]/Insitu_TPM_Oth.loc[tg]
useColors = pal.colorbrewer.qualitative.Set1_3.hex_colors
ms = 200 #set the size of the dots
fig,axs=plt.subplots(1,3)
fig.set_size_inches(10,4)
axs[0].scatter(range(5),DaDn,s = ms, color = useColors[0])
axs[0].set_title('DIA:DIN')
axs[0].set_xticks(range(5))
axs[0].set_ylabel('Ratios')
axs[1].scatter(range(5),DaOt,s = ms, color = useColors[1])
axs[1].set_title('DIA:Oth')
axs[1].set_xticks(range(5))
axs[2].scatter(range(5),DnOt,s = ms, color = useColors[2])
axs[2].set_title('DIN:Oth')
axs[2].set_xticks(range(5))
#fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
#plt.close()
In [42]:
#not sure this is helpful since it would be another set of figures to wade through, and the
#same information is already in the species plots...leave here to get a sense of what we
#are looking for though in considering the ratios of the various TPMs
r = plotGenes_ratio('K01914')
In [43]:
#make a function to calculate the ratios of the three TPM groups
#function expects a version of the forRelatedness_XXXX dataframe:
def calcTPMratios(dataIn):
#do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
df2 = dataIn.loc[:,('kmeans')]
df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
df2['inList'] = ''
for idx in range(0,len(df2)):
fc = df2.index.values[idx]
if fc[0] == 'C':
df2.ix[idx,'inList'] = False
elif fc[0] == 'K':
df2.ix[idx,'inList'] = True
df2 = df2[df2['inList']==True]
df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?
for tg,row in df2.iterrows():
if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
#use this to get the mean, ignoring any inf/NaN values
tv = Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_DIN.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanDaDn'] = t
else:
df2.loc[tg,'meanDaDn'] = np.nan
if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
tv = Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanDaOt'] = t
else:
df2.loc[tg,'meanDaOt'] = np.nan
if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
tv = Insitu_TPM_DIN.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanDnOt'] = t
else:
df2.loc[tg,'meanDnOt'] = np.nan
return df2
In [44]:
out_regular = calcTPMratios(forRelatedness_regular)
out_inverse = calcTPMratios(forRelatedness_inverse)
#save inverse for later, this is already complicated enough with the regular K means groups
In [45]:
#not fully convinced this is actually interesting...the ratios in the extremes can be the cases
#with small # if TPM in denominator/numerator driving to large numbers (ie. 300 / 0.05))
axs = out_regular.boxplot(column = 'meanDaDn',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaDn' + ' regular')
Out[45]:
In [46]:
axs = out_regular.boxplot(column = 'meanDaOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaOt' + ' regular')
Out[46]:
In [47]:
axs = out_regular.boxplot(column = 'meanDnOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDnOt' + ' regular')
Out[47]:
In [48]:
out_regular[out_regular.loc[:,'meanDaDn']>=1000]
Out[48]:
In [49]:
out_regular.mean()
Out[49]:
In [50]:
out_regular.groupby('kmeans').mean()
Out[50]:
In [51]:
#explore...plot one gene here rather than digging it up from the folders
kid='K01914'
allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
allK=allK.drop('kID')
allK=allK.astype('float')
allK
if kid in Insitu_TPM_DIA.index.tolist():
Dk=Insitu_TPM_DIA.loc[kid]
else:
Dk = 0/allK
if kid in Insitu_TPM_DIN.index.tolist():
Nk=Insitu_TPM_DIN.loc[kid]
else:
Nk = 0/allK
if kid in Insitu_TPM_Oth.index.tolist():
Ok=Insitu_TPM_Oth.loc[kid]
else:
Ok = 0/allK
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
plt.title(kid)
# ax.set_ylim(0,1)
#fig.savefig(kid +'_numbers_forTalk.pdf')
#there are very small values here for DIN and other, but can't see them on the figure
Out[51]:
In [52]:
#function expects a version of the forRelatedness_XXXX dataframe:
def averageTPM(dataIn):
#do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
df2 = dataIn.loc[:,('kmeans')]
df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
df2['inList'] = ''
for idx in range(0,len(df2)):
fc = df2.index.values[idx]
if fc[0] == 'C':
df2.ix[idx,'inList'] = False
elif fc[0] == 'K':
df2.ix[idx,'inList'] = True
df2 = df2[df2['inList']==True]
df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?
for tg,row in df2.iterrows():
if (tg in Insitu_TPM_DIA.index.tolist()) :
#use this to get the mean, ignoring any inf/NaN values
tv = Insitu_TPM_DIA.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanDia'] = t
else:
df2.loc[tg,'meanDia'] = np.nan
if (tg in Insitu_TPM_DIN.index.tolist()) :
tv = Insitu_TPM_DIN.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanDin'] = t
else:
df2.loc[tg,'meanDin'] = np.nan
if (tg in Insitu_TPM_Oth.index.tolist()):
tv = Insitu_TPM_Oth.loc[tg,:]
t = np.ma.masked_invalid(tv).mean()
df2.loc[tg,'meanOth'] = t
else:
df2.loc[tg,'meanOth'] = np.nan
return df2
In [53]:
testing = averageTPM(forRelatedness_regular)
In [54]:
testing.head(5)
Out[54]:
In [55]:
#but...ask Harriet whether she thinks 'average TPM' is really a valid concept here
#also, remember the different K means group have different number of genes...so, the dip
#in 'other' in K means group 4 is only considering 8 genes.
In [56]:
use = 'meanDia'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')
Out[56]:
In [57]:
use = 'meanDin'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')
Out[57]:
In [58]:
use = 'meanOth'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')
Out[58]:
In [59]:
colLabels = list(gc_regular.columns.values)
In [60]:
colLabels
Out[60]:
In [61]:
cpdLabels = colLabels[2:13:2] #odd Python syntax, from 2 to 13, in steps of 2)
cpdLabels
Out[61]:
In [62]:
geneLabels = colLabels[3:14:2] #odd Python syntax, from 2 to 13, in steps of 2)
geneLabels
Out[62]:
In [63]:
#I have put the bar graphs back in here since I think they are the simplest way to consider these data
In [64]:
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gc_regular[gc_regular.loc[:,'pathwayGroup_A']=='Metabolism']
In [65]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]
In [66]:
dataToPlot.head(5)
Out[66]:
In [67]:
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
pGroup
Out[67]:
In [68]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)
for i, group in s.groupby('pathwayGroup_B'):
d2 = group.loc[:,cpdLabels]
out = d2.sum(axis=0)
newDFmtab.loc[i,cpdLabels] = out
# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])
In [69]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))
Out[69]:
In [70]:
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)
for i, group in s.groupby('pathwayGroup_B'):
d2 = group.loc[:,geneLabels]
out = d2.sum(axis=0)
newDFmtab.loc[i,geneLabels] = out
# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])
In [71]:
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))
Out[71]:
In [72]:
#useColors = pal.colorbrewer.qualitative.Set3_11.hex_colors
useColors = pal.colorbrewer.diverging.RdYlBu_11.hex_colors
newDFmtab.T.plot(kind = 'barh',color = useColors, figsize = (12,12))
Out[72]:
In [73]:
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)
for i, group in s.groupby('pathwayGroup_C'):
d2 = group.loc[:,cpdLabels]
out = d2.sum(axis=0)
newDFmtab.loc[i,cpdLabels] = out
# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))
Out[73]:
In [74]:
s = plotMetabolism[(plotMetabolism.loc[:,geneLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,geneLabels]
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)
for i, group in s.groupby('pathwayGroup_C'):
d2 = group.loc[:,geneLabels]
out = d2.sum(axis=0)
newDFmtab.loc[i,geneLabels] = out
# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))
Out[74]:
Liz wants to think about ratio of genes at one time point versus another...I think this will be driven by the small numbers again
In [75]:
dtm = forRelatedness_regular.loc[:,'S2']/forRelatedness_regular.loc[:,'S3']
#how many cases do we have with genes/compounds 10x greater at S2?
r = dtm[dtm>10]
r.replace(np.inf,np.nan).dropna() #replace inf with NaN, then drop the NaN
Out[75]:
In [76]:
ax2 = dtm.plot.box()
ax2.set_yscale('log')
Consider diversity of other EC/genes (lots of examples follow)
In [77]:
def plotOneGene(kid):#explore...plot one gene here rather than digging it up from the folders
#kid='K03183'
allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
allK=allK.drop('kID')
allK=allK.astype('float')
allK
if kid in Insitu_TPM_DIA.index.tolist():
Dk=Insitu_TPM_DIA.loc[kid]
else:
Dk = 0/allK
if kid in Insitu_TPM_DIN.index.tolist():
Nk=Insitu_TPM_DIN.loc[kid]
else:
Nk = 0/allK
if kid in Insitu_TPM_Oth.index.tolist():
Ok=Insitu_TPM_Oth.loc[kid]
else:
Ok = 0/allK
#try not to plot things that are all NaN (though this will allow 'zeros', and
#definitely have cases with 0 and NaN for a given gene)
if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
plt.title(kid)
# ax.set_ylim(0,1)
#fig.savefig(kid +'_numbers_forTalk.pdf')
In [78]:
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC(kid):
getOne = kegg_list(kid).read()
geneList = []
for line in getOne.rstrip().split('\n'):
r1 = line.find("[EC:")
r2 = line.find("]")
r = line[(r1+4):r2]
r3 = r.split(' ')
#now that I have the EC numbers...get the genes for each one
for idx in r3:
getK = kegg_link('ko',idx).read()
for line3 in re.split('\n',getK):
if line3 is not '':
ge = line3.find('ko')
t = line3[ge+3:]
geneList.append(t)
geneList = set(geneList) #this will make the list of unique genes...can print that up
for idx in geneList:
plotOneGene(idx) #plotting function has already been defined above...
In [79]:
#in some cases, only get back the one gene we started with
plotMultipleEC('K01830')
In [80]:
##KL note for Harriet: this is an ADP-ribose pyrophosphate...in the upper left corner
##of the purine pathway
plotMultipleEC('K13988')
In [81]:
#example of something high at S2, but another version of the gene is steady throughout
#lipid: arachidonic acid metabolism
plotMultipleEC('K00509')
In [82]:
plotMultipleEC('K17690')
In [148]:
plotMultipleEC('K12343')
In [ ]:
In [ ]:
In [ ]:
In [85]:
#leave bits below for the moment...
In [90]:
type(InsituTPMGrped)
Out[90]:
In [92]:
InsituTPMGrped.head(5)
Out[92]:
In [94]:
dayList
Out[94]:
In [99]:
InsituTPMGrped['mean'] = InsituTPMGrped[dayList].mean(axis=1)
In [100]:
InsituTPMGrped.head(5)
Out[100]:
In [101]:
In [104]:
InsituTPMGrped.shape
Out[104]:
In [ ]:
In [103]:
InsituTPMGrped['mean'].max()
Out[103]:
In [106]:
Insitu_TPM_DIA.shape
Out[106]:
In [107]:
Insitu_TPM_DIN.shape
Out[107]:
In [108]:
Insitu_TPM_Oth.shape
Out[108]:
In [ ]:
In [102]:
plt.hist(InsituTPMGrped['mean'])
Out[102]:
In [ ]:
In [115]:
Insitu_TPM_DIA['mean'] = Insitu_TPM_DIA[dayList].mean(axis=1)
Insitu_TPM_DIN['mean'] = Insitu_TPM_DIN[dayList].mean(axis=1)
Insitu_TPM_Oth['mean'] = Insitu_TPM_Oth[dayList].mean(axis=1)
In [110]:
Insitu_TPM_DIA.head(5)
Out[110]:
In [120]:
fig= plt.hist(Insitu_TPM_DIA['mean'],bins = 100,histtype='stepfilled',color = 'r',label = 'DIA',log=True)
In [121]:
fig = plt.hist(Insitu_TPM_DIN['mean'],bins = 100,histtype='stepfilled',color = 'g',alpha = 0.25,label = 'DIN',log = True)
In [122]:
fig = plt.hist(Insitu_TPM_Oth['mean'],bins = 100,histtype='stepfilled',color = 'b',alpha = 0.75,label = 'Oth',log = True)
In [123]:
pruneDIA = Insitu_TPM_DIA.copy(deep=True)
In [125]:
pruneDIA.head(5)
Out[125]:
In [127]:
pruneDIA.drop('mean',axis=1,inplace=True)
In [129]:
k = pruneDIA<2
In [131]:
pruneDIA[k] = np.nan
In [137]:
pruneDIA['mean'] = pruneDIA[dayList].mean(axis=1)
In [140]:
#need dropna or else the histogram will barf
fig = plt.hist(pruneDIA['mean'].dropna(),log = True)
In [142]:
pruneDIN = Insitu_TPM_DIN[dayList].mean(axis=1)
In [143]:
fig = plt.hist(pruneDIN.dropna(),log=True)
In [157]:
fig = plt.hist(Insitu_TPM_Oth[dayList].mean(axis=1).dropna(),log=True)
In [ ]:
In [ ]:
In [171]:
def plotOneGene_pruned(kid):#explore...plot one gene here rather than digging it up from the folders
#kid='K03183'
allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
allK=allK.drop('kID')
allK=allK.astype('float')
allK
k = Insitu_TPM_DIA < 2
Insitu_TPM_DIA[k] = 0
k = Insitu_TPM_DIN < 2
Insitu_TPM_DIN[k] = 0
k = Insitu_TPM_Oth < 2
Insitu_TPM_Oth[k] = 0
if kid in Insitu_TPM_DIA.index.tolist():
Dk=Insitu_TPM_DIA.loc[kid]
else:
Dk = 0/allK
if kid in Insitu_TPM_DIN.index.tolist():
Nk=Insitu_TPM_DIN.loc[kid]
else:
Nk = 0/allK
if kid in Insitu_TPM_Oth.index.tolist():
Ok=Insitu_TPM_Oth.loc[kid]
else:
Ok = 0/allK
#try not to plot things that are all NaN (though this will allow 'zeros', and
#definitely have cases with 0 and NaN for a given gene)
if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
plt.title(kid)
# ax.set_ylim(0,1)
#fig.savefig(kid +'_numbers_forTalk.pdf')
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC_2(kid):
getOne = kegg_list(kid).read()
geneList = []
for line in getOne.rstrip().split('\n'):
r1 = line.find("[EC:")
r2 = line.find("]")
r = line[(r1+4):r2]
r3 = r.split(' ')
#now that I have the EC numbers...get the genes for each one
for idx in r3:
getK = kegg_link('ko',idx).read()
for line3 in re.split('\n',getK):
if line3 is not '':
ge = line3.find('ko')
t = line3[ge+3:]
geneList.append(t)
geneList = set(geneList) #this will make the list of unique genes...can print that up
for idx in geneList:
plotOneGene_pruned(idx) #plotting function has already been defined above...
In [172]:
plotMultipleEC_2('K13988')
In [166]:
Dk
Out[166]:
In [ ]:
In [ ]: