Linking CO and KO

housecleaning: pull out the pieces that can go somewhere else to get a clean version of the iPython notebook 4/12/2016 ; 4/15/2016

Cell 12 is the place to change the path for the transcriptomics data...too big for GitHub



In [2]:

    
#%reset



In [3]:

    
import pandas as pd
import numpy as np
import re
import os
from sklearn.cluster import KMeans
import cPickle as cpk

import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams['pdf.fonttype'] = 42

from Bio.KEGG.REST import *

%matplotlib inline



In [4]:

    
mtabFile = 'RImetabolites_isomers.2015.08.27.csv' #first column is RInumber



In [5]:

    
CO_fromMATLAB=pd.read_csv(mtabFile, index_col='RInumber')



In [6]:

    
#make a list of the unique CO numbers for the CreateHash_COtoKO.py. Export the list as CSV
td = CO_fromMATLAB.groupby('cNumber').count()
COnumbers = td.drop(list(td.columns.values),axis=1)
del td
COnumbers.to_csv('exportCOnumbers.csv',header=True)

Write a couple of functions to swap between CO and RInumbers



In [7]:

    
def findRInumber(dataIn,KEGGin):
    #find possible RI numbers for a given KEGG number. 
    dataOut = []
    for i,KEGG in enumerate(dataIn['KEGG']):
        if KEGG == KEGGin:
            t = dataIn.index[i]
            dataOut.append(t)
    return dataOut
##For example: this will give back one row, C18028 will be multiple
#m = findRInumber(forRelatedness,'C00078') 

def convertRItoCO(dataIn,RIin):
    #do the reverse, given an RInumber find the cNumber
    dataOut = dataIn.loc[RIin].loc['cNumber']
    return dataOut
##This will always be a single value
#m = convertRItoCO(forRelatedness,'RI2')

#slight change, no need to send in a comparison file if it always the same thing
def convertRItoCO2(RIin):
    #do the reverse, given an RInumber find the cNumber
    dataOut = CO_fromMATLAB.loc[RIin].loc['cNumber']
    return dataOut
##This will always be a single value, also uses CO_fromMATLAB as input



In [8]:

    
#This grabs the CO/KO links from the KEGG website. The actual code is in the 
#CreateHash_COtoKO.py that Harriet wrote. Note that since the exportCOnumbers.csv 
#file is a unique list of C number we essentially already have a lookup table 
#for all the metabolites of interest.

if os.path.isfile('exportCOnumbers.csv' + '.pickle'):
    #just read in the file
    WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))
else:
    #need to make the file
    filename = "CreateHash_COtoKO.py"
    %run $filename exportCOnumbers.csv 
    #then read in the file
    WorkingFile = cpk.load(open('exportCOnumbers.csv.pickle','r'))



In [9]:

    
def SplitCODict(WorkingFile):
    CO_withoutKO={}
    CO_withKO={}
    for CO in WorkingFile.keys():

        if WorkingFile[CO]['Related KO']==[]:
            CO_withoutKO[CO]=WorkingFile[CO]
        else:
            CO_withKO[CO]=WorkingFile[CO]
    return CO_withoutKO, CO_withKO

CO_withoutKO, CO_withKO=SplitCODict(WorkingFile)
print 'There are', len(CO_withKO), 'COs with an associated KO.', len(CO_withoutKO), 'are not associated with a KO.'









    



There are 404 COs with an associated KO. 1438 are not associated with a KO.



In [10]:

    
AllKO=[]
AllCO=[]
for key in CO_withKO:
    AllKO.append(CO_withKO[key]['Related KO'])
    AllCO.append(CO_withKO[key]['Related CO'])
AllKO=list(set([item for sublist in AllKO for item in sublist]))
AllCO=list(set([item for sublist in AllCO for item in sublist]))



In [11]:

    
#go through CO_RawData_all one row at a time (inefficient for sure, but I understand 
#what is happening), then make a new column in CO_RawData_all that is True/False
CO_fromMATLAB['inList'] = ""

for idx in range(0,len(CO_fromMATLAB)):
# for idx in range(0):
    fc = CO_fromMATLAB.ix[idx,'cNumber']
    if fc in AllCO:
        CO_fromMATLAB.ix[idx,'inList'] = True
    else:
        CO_fromMATLAB.ix[idx,'inList'] = False



In [12]:

    
#can't quite figure out how to do this in one step.
m = CO_fromMATLAB[CO_fromMATLAB['inList']==True]
CO_metadata_pruned = m.loc[:,['cNumber','ChargedMass','RT','ionMode']]

#this list of days is useful, so define it up front.
dayList = ['S1','S2','S3','S4','S5'] 
CO_RawData_pruned = m.loc[:,dayList]
del m



In [13]:

    
#Load PhytoKEGG Annotations
#from Harriet...doesn't work, complaining about header=False 
#AllPhytoKO_ann=pd.read_table('AllPhytoKegg_annotated.tab', header=False, delimiter='\t')

#try this instead (note double \\ at end)
pathToData = 'Z:\KLtemp\TranscriptomicsData_Feb2016\\'
AllPhytoKO_ann=pd.read_table((pathToData + 'AllPhytoKegg_annotated.tab'), delimiter='\t')

InsituCounts=pd.read_table((pathToData + 'AllInsitu_NoZero.tab'), index_col='gID')



In [14]:

    
#normalize to the library size
InsituTPM=InsituCounts.copy()
InsituTPM[['S1', 'S2', 'S3', 'S4', 'S5']]=(InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']]/InsituCounts[['S1', 'S2', 'S3', 'S4', 'S5']].sum())*10**6

#Add annotation information
InsituCounts=InsituCounts.join(AllPhytoKO_ann)
InsituTPM=InsituTPM.join(AllPhytoKO_ann)
InsituCounts=InsituCounts.dropna()
InsituTPM=InsituTPM.dropna()

KO_RawData=InsituTPM.groupby('kID').sum()



In [14]:

    
def NormalizeToMean(DF):
    DF_meanNorm=DF.copy()
    out=DF_meanNorm.copy()
    DF_meanNorm['mean']=DF.mean(axis=1)

    for i in out.columns:
        out[i]=DF_meanNorm[i]/DF_meanNorm['mean']
    DF_meanNorm=DF_meanNorm.T.drop('mean').T
    return out

def NormalizeToMax(DF):
    DF_meanNorm=DF.copy()
    out=DF_meanNorm.copy()
    DF_meanNorm['max']=DF.max(axis=1)
    for i in out.columns:
        out[i]=DF_meanNorm[i]/DF_meanNorm['max']
    DF_meanNorm=DF_meanNorm.T.drop('max').T
    return out

def NormalizeToMean_CV(DF):
    out=DF.copy()
    out['mean']=DF.mean(axis=1)
    out['SD']=DF.std(axis=1)
    
    out['CV']=out['SD']/out['mean']
    return out



In [15]:

    
if False: 
    #norm2Mean seems based (see plots in earlier notebooks)
    KO_Norm2Mean=NormalizeToMean(KO_RawData) 

    ##two versions of the CO data: (1) regular and (2) inverse
    CO_Norm2Mean_regular=NormalizeToMean(CO_RawData_pruned) 
    CO_Norm2Mean_inverse=NormalizeToMean(1/CO_RawData_pruned)



In [16]:

    
#use _finalOption variable names to make life easier
KO_finalOption = KO_RawData.loc[AllKO].dropna()
CO_final_regular = CO_RawData_pruned.dropna() #already 'limited' this before the normalization
#CO_final_inverse = CO_Norm2Mean_inverse.dropna() #already 'limited' this before the normalization



In [17]:

    
##Combine the CO and the KO data in preparation for K means clustering
Combined_KO_CO_regular=KO_finalOption.append(CO_final_regular.loc[:,(dayList)])
#Combined_KO_CO_inverse=KO_finalOption.append(CO_final_inverse.loc[:,(dayList)])



In [18]:

    
def kmeanCluster(data,nc):
    #kmeans=KMeans(n_clusters=nc)
    kmeans = KMeans(n_clusters = nc, max_iter = 1000, n_init = 50, init = 'random')
    kmeans.fit(data)
    newData=data.copy()
    newData['kmeans']=kmeans.labels_
    return newData
    
def PlotKmeans(KmeansPD, kSize=10, figSizeX=1, figSizeY=5, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color=color)
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(9,12)
    for ax, y in zip(axs,range(kSize)):
        pltData=KmeansPD[KmeansPD.kmeans==y].T.drop('kmeans')
        pltData.plot(ax=ax, legend=False, grid=False, color=color)

Move forward with 'best' number of clusters --> use Determine_Optimal_Number_Kmeans_Groups.ipynb to determine this number



In [19]:

    
#setting # of clusters manually, also some good options with lower # of clusters I think
#this number will get used later when plotting up the BRITE categories and the Kmeans clusters
makeNclusters = 6



In [23]:

    
#do the K-means clustering with the final # of clusters
CcoClust_regular=kmeanCluster(Combined_KO_CO_regular, makeNclusters) 

#CcoClust_inverse=kmeanCluster(Combined_KO_CO_inverse, makeNclusters)



In [25]:

    
CcoClust_regular.head(5)



In [30]:

    
CcoClust_regular.kmeans.unique()









    Out[30]:





array([0, 4, 3, 1, 5, 2], dtype=int64)



In [31]:

    
CcoClust_regular['kmeans'].value_counts()









    Out[31]:





0    1938
4      84
3      35
1      21
2       4
5       2
Name: kmeans, dtype: int64



In [ ]:

    
#import matplotlib as mpl
def PlotKmeansCombined(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(15,9)
    i=KmeansPD.index
    i=list(i)
    Ks=re.compile('K.*')
    #Cs=re.compile('C.*')
    Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
    C = filter(Cs.search, i)  
    K = filter(Ks.search, i)  
    Ksplit=KmeansPD.loc[K]
    Csplit=KmeansPD.loc[C]
    for ax, y in zip(axs,range(kSize)):
        KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
        KData.plot(ax=ax, legend=False, grid=False, color='b')
        CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
        CData.plot(ax=ax, legend=False, grid=False, color='r')
        SumKC=len(KData.T)+len(CData.T)
        KPct=(len(KData.T))
        CPct=(len(CData.T))
        ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
        ax.set_ylim([0,5])
    return fig
    #fig.savefig(figureName)



In [34]:

    
#import matplotlib as mpl
def PlotKmeansCombined_2(KmeansPD, kSize=10, figSizeX=2, figSizeY=3, color='k'):
    KmeansPD['kmeans'].plot(kind='hist', bins=kSize, color='k',range = (0,kSize),align = 'left')
    fig,axs=plt.subplots(figSizeX, figSizeY)
    axs=[item for sublist in axs for item in sublist]
    fig.set_size_inches(15,9)
    i=KmeansPD.index
    i=list(i)
    Ks=re.compile('K.*')
    #Cs=re.compile('C.*')
    Cs = re.compile('R.*') #this is the RInumber I created...for the moment, do not need the Cnumber
    C = filter(Cs.search, i)  
    K = filter(Ks.search, i)  
    Ksplit=KmeansPD.loc[K]
    Csplit=KmeansPD.loc[C]
    for ax, y in zip(axs,range(kSize)):
        KData=Ksplit[Ksplit.kmeans==y].T.drop('kmeans')
        CData=Csplit[Csplit.kmeans==y].T.drop('kmeans')
        
        if CData.empty and KData.empty:
            continue
        
        elif CData.empty:
            KData.plot(ax=ax, legend=False, grid=False, color='r')
        
        elif KData.empty:
            CData.plot(ax=ax, legend=False, grid=False, color='k')
                   
      
        else:
            CData.plot(ax=ax, legend=False, grid=False, color='k')
            KData.plot(ax=ax, legend=False, grid=False, color='r')
            
        
#         KData.plot(ax=ax, legend=False, grid=False, color='b')
#         CData.plot(ax=ax, legend=False, grid=False, color='r')
        
        
        SumKC=len(KData.T)+len(CData.T)
        KPct=(len(KData.T))
        CPct=(len(CData.T))
        ax.set_title('nGenes ' + str(KPct) + ', nCpds ' + str(CPct) + ', Cluster ' + str(y))
        #ax.set_ylim([0,5])
    return fig
    #fig.savefig(figureName)



In [35]:

    
f = PlotKmeansCombined_2(CcoClust_regular,makeNclusters) 
f.savefig('Kmeans_regular_beforeMean.png')
del f



In [24]:

    
f = PlotKmeansCombined(CcoClust_inverse,makeNclusters) 
f.savefig('Kmeans_inverse.png')
del f



In [168]:

    
#need this cell to get the taxonomic information used in the species plots later on

#load in the species/group information
Group_Species=pd.read_table('GrpSpecies',delimiter=' ').T.drop(['MMETSP', 
                                                  'MMETSP.1']).T.drop_duplicates().set_index('SName')

InsituTPMGrped=InsituTPM.groupby(['kID','sgID']).sum().reset_index().set_index('sgID')

Group_Species=Group_Species.reset_index()
#get the specific group combos
Dia=Group_Species[Group_Species['Grp']=='Bacillariophyta']
Din=Group_Species[Group_Species['Grp']=='Dinophyta']
Oth=Group_Species[((Group_Species['Grp']!='Dinophyta')&
                  (Group_Species['Grp']!='Bacillariophyta'))]
#get the insitu counts
Insitu_TPM_DIA=InsituTPMGrped.loc[Dia['SName']].groupby('kID').sum()
Insitu_TPM_DIN=InsituTPMGrped.loc[Din['SName']].groupby('kID').sum()
Insitu_TPM_Oth=InsituTPMGrped.loc[Oth['SName'].dropna()].groupby('kID').sum()

D=set(Insitu_TPM_DIA.index)
N=set(Insitu_TPM_DIN.index)
O=set(Insitu_TPM_Oth.index)



In [26]:

    
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_regular['kmeans']):
    ClusterK = CcoClust_regular[CcoClust_regular['kmeans']==i]
    ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])

    Di=D.intersection(ClusterKind)
    Ni=N.intersection(ClusterKind)
    Oi=O.intersection(ClusterKind)

    Dsum=Insitu_TPM_DIA.loc[Di].sum()
    Nsum=Insitu_TPM_DIN.loc[Ni].sum()
    Osum=Insitu_TPM_Oth.loc[Di].sum()

    ax=axs[i]
    ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
                 pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
    ax.set_xticks(range(5))
    ax.set_xticklabels([1,2,3,4,5])









    



C:\Anaconda\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The set_color_cycle attribute was deprecated in version 1.5. Use set_prop_cycle instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)



In [27]:

    
fig, axs=plt.subplots(2,3)
fig.set_size_inches(9,6)
axs=axs.flatten()
for i in set(CcoClust_inverse['kmeans']):
    ClusterK = CcoClust_inverse[CcoClust_inverse['kmeans']==i]
    ClusterKind=set([k for k in ClusterK.index if k.startswith('K')])

    Di=D.intersection(ClusterKind)
    Ni=N.intersection(ClusterKind)
    Oi=O.intersection(ClusterKind)

    Dsum=Insitu_TPM_DIA.loc[Di].sum()
    Nsum=Insitu_TPM_DIN.loc[Ni].sum()
    Osum=Insitu_TPM_Oth.loc[Di].sum()

    ax=axs[i]
    ax.stackplot(range(5), Dsum, Nsum, Osum, colors=
                 pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
    ax.set_xticks(range(5))
    ax.set_xticklabels([1,2,3,4,5])

focus on things with common linked reactions...



In [28]:

    
#But...for the CheckRelatedness...do need to go back to the cNumber...
#for now, easiest to just make yet another matrix and put the cNumbers back in.
forRelatedness_regular = CcoClust_regular.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_regular.insert(0,'KEGG',"") #add a column called 'KEGG'

forRelatedness_inverse = CcoClust_inverse.copy(deep=True) #make a copy of the CcoClust data frame
forRelatedness_inverse.insert(0,'KEGG',"") #add a column called 'KEGG'



In [29]:

    
for idx in range(0,len(forRelatedness_regular)):
    t = forRelatedness_regular.iloc[idx,:].name

    if t[0]=='R':
        #go find the matching cNumber in CO_RawData_all
        t2 = CO_fromMATLAB.loc[t,('cNumber')]
        forRelatedness_regular.ix[idx,('KEGG')] = t2
    elif t[0] == 'K':
        #just copy the K number over
        forRelatedness_regular.ix[idx,('KEGG')] = t
        
for idx in range(0,len(forRelatedness_inverse)):
    t = forRelatedness_inverse.iloc[idx,:].name

    if t[0]=='R':
        #go find the matching cNumber in CO_RawData_all
        t2 = CO_fromMATLAB.loc[t,('cNumber')]
        forRelatedness_inverse.ix[idx,('KEGG')] = t2
    elif t[0] == 'K':
        #just copy the K number over
        forRelatedness_inverse.ix[idx,('KEGG')] = t



In [30]:

    
def CheckRelatedness(inClust,nC):
    df=pd.DataFrame(columns=['Common Rxn','No linked Rxn'], index=range(nC))
    for n in range(nC):
        kClust=inClust[inClust.kmeans==n]
        #i=kClust.index
        i = kClust.KEGG #change the new column I created with Cnumbers and Knumbers
        i=list(i)
        Csearc=re.compile('C.*') #re is regular expression...perl-like 
        Cs = filter(Csearc.search, i)
        Ksearc=re.compile('K.*')
        Kis = filter(Ksearc.search, i)
        Kis=set(Kis)
        Ks=[]
        for c in Cs:
            if c in CO_withKO.keys():
                Ks.append(CO_withKO[c]['Related KO'])
        Ks=set([item for sublist in Ks for item in sublist])
        df.loc[n,'Common Rxn']=len(Kis.intersection(Ks))
        df.loc[n, 'No linked Rxn']=len(Kis)-len(Kis.intersection(Ks))
    df.plot(kind='bar', stacked=True, colormap=pal.colorbrewer.diverging.PRGn_5.get_mpl_colormap(), grid=False)



In [31]:

    
CheckRelatedness(forRelatedness_regular, makeNclusters)



In [32]:

    
CheckRelatedness(forRelatedness_inverse, makeNclusters)

Grab the information from KEGG for plotting and bean counting



In [33]:

    
allPathways = kegg_list("pathway").read()
len(allPathways.split('\n'))
#number here is the # of pathways at KEGG, up to 486 by 4/13/2016









    Out[33]:





487



In [34]:

    
trimPath = []
current_section = None
for line in allPathways.rstrip().split("\n"):
    tp = line[8:13]
    trimPath.append('ko' + tp)
    
#have some cases where KEGG will send back a pathway, but the pathway itself is not searchable...seems to 
#be a KEGG bug, 'ko00351' was first, then realized there are many of these,
#did this list manually since I thought it would be short...
toDelete = ('ko00351', 'ko01010','ko01060',  'ko01061', 'ko01062', 'ko01063', 
            'ko01064', 'ko01065', 'ko01066', 'ko01070', 'ko07011', 'ko07012', 
            'ko07013', 'ko07014', 'ko07015', 'ko07016', 'ko07017', 'ko07018', 
            'ko07019', 'ko07020', 'ko07021', 'ko07023', 'ko07024', 'ko07025', 
            'ko07026', 'ko07027', 'ko07028', 'ko07029', 'ko07030', 'ko07031', 
            'ko07032', 'ko07033', 'ko07034', 'ko07035', 'ko07036', 'ko07037', 
            'ko07038', 'ko07039', 'ko07040', 'ko07041', 'ko07042', 'ko07043', 
            'ko07044', 'ko07045', 'ko07046', 'ko07047', 'ko07048', 'ko07049', 
            'ko07050', 'ko07051', 'ko07052', 'ko07053', 'ko07054', 'ko07055', 
            'ko07056', 'ko07057', 'ko07110', 'ko07112', 'ko07114', 'ko07117', 
            'ko07211', 'ko07212', 'ko07213', 'ko07214', 'ko07215', 'ko07216', 
            'ko07217', 'ko07218', 'ko07219', 'ko07220', 'ko07221', 'ko07222', 
            'ko07223', 'ko07224', 'ko07225', 'ko07226', 'ko07227', 'ko07228', 
            'ko07229', 'ko07230', 'ko07231', 'ko07232', 'ko07233', 'ko07234', 
            'ko07235', 'ko04933')

#probably a way to do this without the for loop, but this will work
for item in toDelete:
    trimPath.remove(item)



In [35]:

    
shortList = ['ko03010','ko03013','ko00140'] #for testing



In [36]:

    
import fxn_gatherDetails
##if I make a change, have to reload the function:
#reload(fxn_gatherDetails)



In [37]:

    
#usePathway = shortList #for testing...use complete list later
usePathway = trimPath
KOandCOwithKmeans = forRelatedness_regular #linked KO & CO, data, with K means information
useCO = CO_fromMATLAB #full list of CO data, need to color pathway, and lookup table
useKO = KO_Norm2Mean # full list of KO data, need to color pathway
dia = Insitu_TPM_DIA #diatom TPM, for plotting species
din = Insitu_TPM_DIN #dino TPM, for plotting species
oth = Insitu_TPM_Oth #other TPM, for plotting species
folder = 'plots_regular_Km'



In [38]:

    
gc_regular = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)



In [39]:

    
#switch to inverse, repeat 
KOandCOwithKmeans = forRelatedness_inverse #linked KO & CO, data, with K means information
folder = 'plots_inverse_Km'

gc_inverse = fxn_gatherDetails.gatherDetails(makeNclusters,usePathway,KOandCOwithKmeans,folder,useCO,useKO,dia,din,oth)



In [40]:

    
#now...save all that so I don't have to do this everytime BUT be careful with re-assigning K means group numbers!
cpk.dump(gc_regular, open('gatherCounts_norm2mean_regular.2016.04.18.pickle', 'wb'))
cpk.dump(gc_inverse, open('gatherCounts_norm2mean_inverse.2016.04.18.pickle', 'wb'))

## this is the code to read in the file...can use this without having to go through the pain of rerunning gatherCounts
#gatherCounts_regular = cpk.load(open('gatherCounts_norm2mean_regular.2016.04.13.pickle','rb'))



In [41]:

    
def plotGenes_ratio(tg):
    #if this is something in the pathway, plot up the species for the K number
    if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
        DaDn=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_DIN.loc[tg]

    if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
        DaOt=Insitu_TPM_DIA.loc[tg]/Insitu_TPM_Oth.loc[tg]

    if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
        DnOt=Insitu_TPM_DIN.loc[tg]/Insitu_TPM_Oth.loc[tg]

    useColors = pal.colorbrewer.qualitative.Set1_3.hex_colors    
    ms = 200 #set the size of the dots
    fig,axs=plt.subplots(1,3)
    fig.set_size_inches(10,4)
    axs[0].scatter(range(5),DaDn,s = ms, color = useColors[0])
    axs[0].set_title('DIA:DIN')
    axs[0].set_xticks(range(5))
    axs[0].set_ylabel('Ratios')

    axs[1].scatter(range(5),DaOt,s = ms, color = useColors[1])
    axs[1].set_title('DIA:Oth')
    axs[1].set_xticks(range(5))

    axs[2].scatter(range(5),DnOt,s = ms, color = useColors[2])
    axs[2].set_title('DIN:Oth')
    axs[2].set_xticks(range(5))

    #fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
    #plt.close()



In [42]:

    
#not sure this is helpful since it would be another set of figures to wade through, and the 
#same information is already in the species plots...leave here to get a sense of what we 
#are looking for though in considering the ratios of the various TPMs
r = plotGenes_ratio('K01914')



In [43]:

    
#make a function to calculate the ratios of the three TPM groups
#function expects a version of the forRelatedness_XXXX dataframe:
def calcTPMratios(dataIn):

    #do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
    df2 = dataIn.loc[:,('kmeans')]
    
    df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
    df2['inList'] = ''

    for idx in range(0,len(df2)):
        fc = df2.index.values[idx]

        if fc[0] == 'C':
            df2.ix[idx,'inList'] = False
        elif fc[0] == 'K':
            df2.ix[idx,'inList'] = True  

    df2 = df2[df2['inList']==True]
    df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?

    for tg,row in df2.iterrows():

        if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_DIN.index.tolist()):
            #use this to get the mean, ignoring any inf/NaN values
            tv = Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_DIN.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDaDn'] = t 
        else:
            df2.loc[tg,'meanDaDn'] = np.nan

        if (tg in Insitu_TPM_DIA.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
            tv =  Insitu_TPM_DIA.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDaOt'] = t
        else:
            df2.loc[tg,'meanDaOt'] = np.nan

        if (tg in Insitu_TPM_DIN.index.tolist()) and (tg in Insitu_TPM_Oth.index.tolist()):
            tv = Insitu_TPM_DIN.loc[tg,:]/Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDnOt'] = t
        else:
            df2.loc[tg,'meanDnOt'] = np.nan
            
    return df2



In [44]:

    
out_regular = calcTPMratios(forRelatedness_regular)
out_inverse = calcTPMratios(forRelatedness_inverse)
#save inverse for later, this is already complicated enough with the regular K means groups



In [45]:

    
#not fully convinced this is actually interesting...the ratios in the extremes can be the cases
#with small # if TPM in denominator/numerator driving to large numbers (ie. 300 / 0.05))
axs = out_regular.boxplot(column = 'meanDaDn',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaDn' + ' regular')









    Out[45]:





<matplotlib.text.Text at 0x95453278>



In [46]:

    
axs = out_regular.boxplot(column = 'meanDaOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDaOt' + ' regular')









    Out[46]:





<matplotlib.text.Text at 0x96746550>



In [47]:

    
axs = out_regular.boxplot(column = 'meanDnOt',by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title('meanDnOt' + ' regular')









    Out[47]:





<matplotlib.text.Text at 0xbce789e8>



In [48]:

    
out_regular[out_regular.loc[:,'meanDaDn']>=1000]









    Out[48]:






  
    
      
      kmeans
      meanDaDn
      meanDaOt
      meanDnOt
    
  
  
    
      K01914
      3
      8761.666667
      879.138333
      0.110476



In [49]:

    
out_regular.mean()









    Out[49]:





kmeans       2.126677
meanDaDn    31.217918
meanDaOt    36.837100
meanDnOt    10.500923
dtype: float64



In [50]:

    
out_regular.groupby('kmeans').mean()



In [51]:

    
#explore...plot one gene here rather than digging it up from the folders
kid='K01914'
allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
allK=allK.drop('kID')
allK=allK.astype('float')
allK

if kid in Insitu_TPM_DIA.index.tolist():
    Dk=Insitu_TPM_DIA.loc[kid]
else: 
    Dk = 0/allK

if kid in Insitu_TPM_DIN.index.tolist():
    Nk=Insitu_TPM_DIN.loc[kid]
else:
    Nk = 0/allK
    
if kid in Insitu_TPM_Oth.index.tolist():
    Ok=Insitu_TPM_Oth.loc[kid]
else:
    Ok = 0/allK
    
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
plt.title(kid)
# ax.set_ylim(0,1)
#fig.savefig(kid +'_numbers_forTalk.pdf')

#there are very small values here for DIN and other, but can't see them on the figure









    Out[51]:





<matplotlib.text.Text at 0xa42fc128>



In [52]:

    
#function expects a version of the forRelatedness_XXXX dataframe:
def averageTPM(dataIn):

    #do NOT pull the KEGG numbers bc that causes issues downstream with the plotting: confusion of numbers/text
    df2 = dataIn.loc[:,('kmeans')]
    
    df2 = df2.to_frame() #this gets around the error about working on a copy of a slice
    df2['inList'] = ''

    for idx in range(0,len(df2)):
        fc = df2.index.values[idx]
        if fc[0] == 'C':
            df2.ix[idx,'inList'] = False
        elif fc[0] == 'K':
            df2.ix[idx,'inList'] = True  

    df2 = df2[df2['inList']==True]
    df2 = df2.T.drop('inList').T #this somehow gets around the warning about making a change on a copy...?

    for tg,row in df2.iterrows():

        if (tg in Insitu_TPM_DIA.index.tolist()) :
            #use this to get the mean, ignoring any inf/NaN values
            tv = Insitu_TPM_DIA.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDia'] = t 
        else:
            df2.loc[tg,'meanDia'] = np.nan

        if (tg in Insitu_TPM_DIN.index.tolist()) :
            tv =  Insitu_TPM_DIN.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanDin'] = t
        else:
            df2.loc[tg,'meanDin'] = np.nan

        if (tg in Insitu_TPM_Oth.index.tolist()):
            tv = Insitu_TPM_Oth.loc[tg,:]
            t = np.ma.masked_invalid(tv).mean()
            df2.loc[tg,'meanOth'] = t
        else:
            df2.loc[tg,'meanOth'] = np.nan
            
    return df2



In [53]:

    
testing = averageTPM(forRelatedness_regular)



In [54]:

    
testing.head(5)



In [55]:

    
#but...ask Harriet whether she thinks 'average TPM' is really a valid concept here
#also, remember the different K means group have different number of genes...so, the dip
#in 'other' in K means group 4 is only considering 8 genes.



In [56]:

    
use = 'meanDia'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')









    Out[56]:





<matplotlib.text.Text at 0xa4477400>



In [57]:

    
use = 'meanDin'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')









    Out[57]:





<matplotlib.text.Text at 0x931ab710>



In [58]:

    
use = 'meanOth'
axs = testing.boxplot(column = use,by='kmeans')
axs.set_yscale('log')
axs.grid('off')
axs.set_title(use + ' regular')









    Out[58]:





<matplotlib.text.Text at 0x1018515c0>



In [59]:

    
colLabels = list(gc_regular.columns.values)



In [60]:

    
colLabels









    Out[60]:





['nCpds',
 'nGenes',
 'Km0_cpd',
 'Km0_gene',
 'Km1_cpd',
 'Km1_gene',
 'Km2_cpd',
 'Km2_gene',
 'Km3_cpd',
 'Km3_gene',
 'Km4_cpd',
 'Km4_gene',
 'Km5_cpd',
 'Km5_gene',
 'pathwayInfo',
 'pathwayGroup_A',
 'pathwayGroup_B',
 'pathwayGroup_C']



In [61]:

    
cpdLabels = colLabels[2:13:2] #odd Python syntax, from 2 to 13, in steps of 2)
cpdLabels









    Out[61]:





['Km0_cpd', 'Km1_cpd', 'Km2_cpd', 'Km3_cpd', 'Km4_cpd', 'Km5_cpd']



In [62]:

    
geneLabels = colLabels[3:14:2] #odd Python syntax, from 2 to 13, in steps of 2)
geneLabels









    Out[62]:





['Km0_gene', 'Km1_gene', 'Km2_gene', 'Km3_gene', 'Km4_gene', 'Km5_gene']



In [63]:

    
#I have put the bar graphs back in here since I think they are the simplest way to consider these data



In [64]:

    
##let's narrow in on the metabolism group since that is the only one that is really interesting
plotMetabolism = gc_regular[gc_regular.loc[:,'pathwayGroup_A']=='Metabolism']



In [65]:

    
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]



In [66]:

    
dataToPlot.head(5)









    Out[66]:






  
    
      
      nCpds
      nGenes
      Km0_cpd
      Km0_gene
      Km1_cpd
      Km1_gene
      Km2_cpd
      Km2_gene
      Km3_cpd
      Km3_gene
      Km4_cpd
      Km4_gene
      Km5_cpd
      Km5_gene
      pathwayInfo
      pathwayGroup_A
      pathwayGroup_B
      pathwayGroup_C
    
  
  
    
      ko00010
      31
      96
      1
      0
      0
      1
      0
      14
      0
      1
      0
      0
      3
      0
      Glycolysis / Gluconeogenesis
      Metabolism
      Carbohydrate metabolism
      Glycolysis / Gluconeogenesis
    
    
      ko00020
      20
      57
      0
      2
      0
      2
      3
      16
      0
      0
      0
      1
      2
      0
      Citrate cycle (TCA cycle)
      Metabolism
      Carbohydrate metabolism
      Citrate cycle (TCA cycle)
    
    
      ko00030
      35
      76
      0
      0
      0
      0
      0
      2
      0
      1
      0
      2
      2
      0
      Pentose phosphate pathway
      Metabolism
      Carbohydrate metabolism
      Pentose phosphate pathway
    
    
      ko00040
      55
      63
      0
      0
      0
      0
      1
      7
      0
      1
      0
      0
      1
      0
      Pentose and glucuronate interconversions
      Metabolism
      Carbohydrate metabolism
      Pentose and glucuronate interconversions
    
    
      ko00051
      53
      98
      0
      1
      4
      0
      6
      11
      0
      0
      0
      0
      7
      0
      Fructose and mannose metabolism
      Metabolism
      Carbohydrate metabolism
      Fructose and mannose metabolism



In [67]:

    
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
pGroup









    Out[67]:





array(['Carbohydrate metabolism', 'Lipid metabolism',
       'Metabolism of cofactors and vitamins', 'Energy metabolism',
       'Amino acid metabolism', 'Nucleotide metabolism',
       'Biosynthesis of other secondary metabolites',
       'Metabolism of terpenoids and polyketides',
       'Xenobiotics biodegradation and metabolism',
       'Metabolism of other amino acids',
       'Glycan biosynthesis and metabolism', 'Global and overview maps'], dtype=object)



In [68]:

    
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)

for i, group in s.groupby('pathwayGroup_B'):
    d2 = group.loc[:,cpdLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,cpdLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])



In [69]:

    
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))









    Out[69]:





<matplotlib.axes._subplots.AxesSubplot at 0x10174b860>



In [70]:

    
pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)

for i, group in s.groupby('pathwayGroup_B'):
    d2 = group.loc[:,geneLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,geneLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])



In [71]:

    
useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors, figsize = (12,12))









    Out[71]:





<matplotlib.axes._subplots.AxesSubplot at 0xcc9992b0>



In [72]:

    
#useColors = pal.colorbrewer.qualitative.Set3_11.hex_colors
useColors = pal.colorbrewer.diverging.RdYlBu_11.hex_colors
newDFmtab.T.plot(kind = 'barh',color = useColors, figsize = (12,12))









    Out[72]:





<matplotlib.axes._subplots.AxesSubplot at 0xd0a9e668>



In [73]:

    
s = plotMetabolism[(plotMetabolism.loc[:,colLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,colLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = cpdLabels)

for i, group in s.groupby('pathwayGroup_C'):
    d2 = group.loc[:,cpdLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,cpdLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))









    Out[73]:





<matplotlib.axes._subplots.AxesSubplot at 0xe3df1e80>



In [74]:

    
s = plotMetabolism[(plotMetabolism.loc[:,geneLabels].values > 0).any(axis=1)]
dataToPlot = s.loc[:,geneLabels]

pGroup = pd.unique(plotMetabolism.pathwayGroup_B.ravel())
newDFmtab = pd.DataFrame(index = pGroup,columns = geneLabels)

for i, group in s.groupby('pathwayGroup_C'):
    d2 = group.loc[:,geneLabels]
    out = d2.sum(axis=0)
    newDFmtab.loc[i,geneLabels] = out

# The global and overview maps will be empty, so get rid of it
newDFmtab = newDFmtab.drop(['Global and overview maps'])

useColors = pal.colorbrewer.qualitative.Set1_6.hex_colors
newDFmtab.plot(kind = 'barh',color = useColors,figsize = (12,192))









    Out[74]:





<matplotlib.axes._subplots.AxesSubplot at 0x26200a90>

Liz wants to think about ratio of genes at one time point versus another...I think this will be driven by the small numbers again



In [75]:

    
dtm = forRelatedness_regular.loc[:,'S2']/forRelatedness_regular.loc[:,'S3']
#how many cases do we have with genes/compounds 10x greater at S2?
r = dtm[dtm>10]
r.replace(np.inf,np.nan).dropna() #replace inf with NaN, then drop the NaN









    Out[75]:





K00509     35.550889
K13481     10.873400
K01243     12.021647
K14264     40.887580
K00944     14.301025
K00790     17.838439
K09843     11.771332
K00454     12.696537
K17690     14.747486
RI67       44.783023
RI68       44.783023
RI182      63.015188
RI184      63.015188
RI198      63.015188
RI200      63.015188
RI257      26.824789
RI267      26.824789
RI369     142.486187
RI481      21.561016
RI483      21.561016
RI543     264.225210
RI544     264.225210
RI554     264.225210
RI556     264.225210
RI560     264.225210
RI573      11.433061
RI581      11.433061
RI650      54.058096
RI661      13.335896
RI662      13.335896
             ...    
RI4477     18.783764
RI4478     20.392295
RI4479     18.783764
RI4480     20.392295
RI4481     18.783764
RI4482     20.392295
RI4483     18.783764
RI4491     14.602895
RI4503     14.602895
RI4515     14.602895
RI4527     14.602895
RI4716     74.966762
RI4718     74.966762
RI4724     74.966762
RI4730     74.966762
RI4732     74.966762
RI4738     74.966762
RI4762     39.965927
RI4764     32.279442
RI4765     32.279442
RI4766     32.279442
RI4820     10.587128
RI4826     10.587128
RI4828     10.587128
RI4829     10.587128
RI4868     14.384067
RI4869     14.384067
RI4878     15.880987
RI4914     36.837767
RI4916     40.171429
dtype: float64



In [76]:

    
ax2 = dtm.plot.box()
ax2.set_yscale('log')

Consider diversity of other EC/genes (lots of examples follow)



In [77]:

    
def plotOneGene(kid):#explore...plot one gene here rather than digging it up from the folders
    #kid='K03183'
    allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
    allK=allK.drop('kID')
    allK=allK.astype('float')
    allK

    if kid in Insitu_TPM_DIA.index.tolist():
        Dk=Insitu_TPM_DIA.loc[kid]
    else: 
        Dk = 0/allK

    if kid in Insitu_TPM_DIN.index.tolist():
        Nk=Insitu_TPM_DIN.loc[kid]
    else:
        Nk = 0/allK

    if kid in Insitu_TPM_Oth.index.tolist():
        Ok=Insitu_TPM_Oth.loc[kid]
    else:
        Ok = 0/allK
        
    #try not to plot things that are all NaN (though this will allow 'zeros', and
    #definitely have cases with 0 and NaN for a given gene)    
    if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
        fig,ax=plt.subplots(1)
        ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
        ax.set_xticks(range(5))
        ax.set_xticklabels([1,2,3,4,5])
        fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
        plt.title(kid)
        # ax.set_ylim(0,1)
        #fig.savefig(kid +'_numbers_forTalk.pdf')



In [78]:

    
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC(kid):
    getOne = kegg_list(kid).read()
    geneList = []

    for line in getOne.rstrip().split('\n'):
        r1 = line.find("[EC:")
        r2 = line.find("]")
        r = line[(r1+4):r2]
        r3 = r.split(' ')
        #now that I have the EC numbers...get the genes for each one
        for idx in r3:
            getK = kegg_link('ko',idx).read()
            for line3 in re.split('\n',getK):
                if line3 is not '':
                    ge = line3.find('ko')
                    t = line3[ge+3:]
                    geneList.append(t)
    geneList = set(geneList) #this will make the list of unique genes...can print that up

    for idx in geneList:
        plotOneGene(idx) #plotting function has already been defined above...



In [79]:

    
#in some cases, only get back the one gene we started with
plotMultipleEC('K01830')



In [80]:

    
##KL note for Harriet: this is an ADP-ribose pyrophosphate...in the upper left corner 
##of the purine pathway
plotMultipleEC('K13988')



In [81]:

    
#example of something high at S2, but another version of the gene is steady throughout
#lipid: arachidonic acid metabolism
plotMultipleEC('K00509')



In [82]:

    
plotMultipleEC('K17690')



In [148]:

    
plotMultipleEC('K12343')



In [ ]:



In [ ]:



In [ ]:



In [85]:

    
#leave bits below for the moment...



In [90]:

    
type(InsituTPMGrped)









    Out[90]:





pandas.core.frame.DataFrame



In [92]:

    
InsituTPMGrped.head(5)



In [94]:

    
dayList









    Out[94]:





['S1', 'S2', 'S3', 'S4', 'S5']



In [99]:

    
InsituTPMGrped['mean'] = InsituTPMGrped[dayList].mean(axis=1)



In [100]:

    
InsituTPMGrped.head(5)



In [101]:



In [104]:

    
InsituTPMGrped.shape









    Out[104]:





(266871, 7)



In [ ]:



In [103]:

    
InsituTPMGrped['mean'].max()









    Out[103]:





984.59611900226923



In [106]:

    
Insitu_TPM_DIA.shape









    Out[106]:





(5846, 5)



In [107]:

    
Insitu_TPM_DIN.shape









    Out[107]:





(6608, 5)



In [108]:

    
Insitu_TPM_Oth.shape









    Out[108]:





(5214, 5)



In [ ]:



In [102]:

    
plt.hist(InsituTPMGrped['mean'])









    Out[102]:





(array([  2.66783000e+05,   6.70000000e+01,   1.30000000e+01,
          6.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([  2.24883660e-03,   9.84616359e+01,   1.96921023e+02,
          2.95380410e+02,   3.93839797e+02,   4.92299184e+02,
          5.90758571e+02,   6.89217958e+02,   7.87677345e+02,
          8.86136732e+02,   9.84596119e+02]),
 <a list of 10 Patch objects>)



In [ ]:



In [115]:

    
Insitu_TPM_DIA['mean'] = Insitu_TPM_DIA[dayList].mean(axis=1)
Insitu_TPM_DIN['mean'] = Insitu_TPM_DIN[dayList].mean(axis=1)
Insitu_TPM_Oth['mean'] = Insitu_TPM_Oth[dayList].mean(axis=1)



In [110]:

    
Insitu_TPM_DIA.head(5)



In [120]:

    
fig= plt.hist(Insitu_TPM_DIA['mean'],bins = 100,histtype='stepfilled',color = 'r',label = 'DIA',log=True)



In [121]:

    
fig = plt.hist(Insitu_TPM_DIN['mean'],bins = 100,histtype='stepfilled',color = 'g',alpha = 0.25,label = 'DIN',log = True)



In [122]:

    
fig = plt.hist(Insitu_TPM_Oth['mean'],bins = 100,histtype='stepfilled',color = 'b',alpha = 0.75,label = 'Oth',log = True)



In [123]:

    
pruneDIA = Insitu_TPM_DIA.copy(deep=True)



In [125]:

    
pruneDIA.head(5)



In [127]:

    
pruneDIA.drop('mean',axis=1,inplace=True)



In [129]:

    
k = pruneDIA<2



In [131]:

    
pruneDIA[k] = np.nan



In [137]:

    
pruneDIA['mean'] = pruneDIA[dayList].mean(axis=1)



In [140]:

    
#need dropna or else the histogram will barf
fig = plt.hist(pruneDIA['mean'].dropna(),log = True)



In [142]:

    
pruneDIN = Insitu_TPM_DIN[dayList].mean(axis=1)



In [143]:

    
fig = plt.hist(pruneDIN.dropna(),log=True)



In [157]:

    
fig = plt.hist(Insitu_TPM_Oth[dayList].mean(axis=1).dropna(),log=True)



In [ ]:



In [ ]:



In [171]:

    
def plotOneGene_pruned(kid):#explore...plot one gene here rather than digging it up from the folders
    #kid='K03183'
    allK=InsituTPMGrped[InsituTPMGrped['kID']==kid].sum()
    allK=allK.drop('kID')
    allK=allK.astype('float')
    allK

    k = Insitu_TPM_DIA < 2
    Insitu_TPM_DIA[k] = 0
    
    k = Insitu_TPM_DIN < 2
    Insitu_TPM_DIN[k] = 0
    
    k = Insitu_TPM_Oth < 2
    Insitu_TPM_Oth[k] = 0
    
    if kid in Insitu_TPM_DIA.index.tolist():
        Dk=Insitu_TPM_DIA.loc[kid]
    else: 
        Dk = 0/allK

    if kid in Insitu_TPM_DIN.index.tolist():
        Nk=Insitu_TPM_DIN.loc[kid]
    else:
        Nk = 0/allK

    if kid in Insitu_TPM_Oth.index.tolist():
        Ok=Insitu_TPM_Oth.loc[kid]
    else:
        Ok = 0/allK
        
    #try not to plot things that are all NaN (though this will allow 'zeros', and
    #definitely have cases with 0 and NaN for a given gene)    
    if not Dk.dropna().empty and not Nk.dropna().empty and not Ok.dropna().empty: #only plot with data!
        fig,ax=plt.subplots(1)
        ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
        ax.set_xticks(range(5))
        ax.set_xticklabels([1,2,3,4,5])
        fig.set_size_inches(2,2) #this is a property of the figure, not the axis...
        plt.title(kid)
        # ax.set_ylim(0,1)
        #fig.savefig(kid +'_numbers_forTalk.pdf')
        
#given a gene, go get all possible EC numbers, use them to search for other genes with that EC, and plot
#this will skip over genes that are not in the RI data
def plotMultipleEC_2(kid):
    getOne = kegg_list(kid).read()
    geneList = []

    for line in getOne.rstrip().split('\n'):
        r1 = line.find("[EC:")
        r2 = line.find("]")
        r = line[(r1+4):r2]
        r3 = r.split(' ')
        #now that I have the EC numbers...get the genes for each one
        for idx in r3:
            getK = kegg_link('ko',idx).read()
            for line3 in re.split('\n',getK):
                if line3 is not '':
                    ge = line3.find('ko')
                    t = line3[ge+3:]
                    geneList.append(t)
    geneList = set(geneList) #this will make the list of unique genes...can print that up

    for idx in geneList:
        plotOneGene_pruned(idx) #plotting function has already been defined above...



In [172]:

    
plotMultipleEC_2('K13988')



In [166]:

    
Dk









    Out[166]:





S1      2.878511
S2      2.718476
S3           NaN
S4      2.343015
S5      2.211123
mean    2.377372
Name: K00002, dtype: float64



In [ ]:



In [ ]:

	S1	S2	S3	S4	S5	kmeans
K00509	1.371790	39.039095	1.098119	0.985132	1.474082	0
K00503	1.933999	3.847460	6.281711	3.674274	5.449637	0
K00500	7.511114	32.829680	11.984844	8.493431	26.555812	4
K00505	8.230742	6.491661	8.477948	4.579530	10.921608	0
K01101	24.433610	37.583299	54.882318	142.045315	52.262910	3

	meanDaDn	meanDaOt	meanDnOt
kmeans
0	42.025429	13.088527	4.428997
1	31.495108	93.007387	4.939637
2	11.093790	26.453004	10.039916
3	293.437380	88.722557	3.311839
4	1.573144	11.403837	39.140383
5	13.063875	220.244444	9.833333

	kmeans	meanDia	meanDin	meanOth
K00509	5	8.773276	NaN	0.020368
K00503	2	1.600686	2.606638	0.030093
K00500	2	14.198886	3.270149	0.039838
K00505	2	4.244865	0.877827	2.639846
K01101	2	50.637024	8.752114	2.948806

	kID	S1	S2	S3	S4	S5
sgID
Aletam	K00001	0.000000	0.000000	0.011808	0.079876	0.000000
Ampcof	K00001	0.011244	0.044565	0.011808	0.000000	0.000000
Ampmas	K00001	0.011244	0.014855	0.035423	0.053250	0.000000
Astgla	K00001	0.000000	0.000000	0.011808	0.000000	0.000000
Attsep	K00001	0.112442	0.133696	0.047231	0.026625	0.178677

	kID	S1	S2	S3	S4	S5	mean
sgID
Aletam	K00001	0.000000	0.000000	0.011808	0.079876	0.000000	0.018337
Ampcof	K00001	0.011244	0.044565	0.011808	0.000000	0.000000	0.013523
Ampmas	K00001	0.011244	0.014855	0.035423	0.053250	0.000000	0.022955
Astgla	K00001	0.000000	0.000000	0.011808	0.000000	0.000000	0.002362
Attsep	K00001	0.112442	0.133696	0.047231	0.026625	0.178677	0.099734

	nCpds	nGenes	Km0_cpd	Km0_gene	Km1_cpd	Km1_gene	Km2_cpd	Km2_gene	Km3_gene	Km4_gene	Km5_cpd	pathwayInfo	pathwayGroup_A	pathwayGroup_B	pathwayGroup_C
ko00010	31	96	1	0	0	1	0	14	1	0	3	Glycolysis / Gluconeogenesis	Metabolism	Carbohydrate metabolism	Glycolysis / Gluconeogenesis
ko00020	20	57	0	2	0	2	3	16	0	1	2	Citrate cycle (TCA cycle)	Metabolism	Carbohydrate metabolism	Citrate cycle (TCA cycle)
ko00030	35	76	0	0	0	0	0	2	1	2	2	Pentose phosphate pathway	Metabolism	Carbohydrate metabolism	Pentose phosphate pathway
ko00040	55	63	0	0	0	0	1	7	1	0	1	Pentose and glucuronate interconversions	Metabolism	Carbohydrate metabolism	Pentose and glucuronate interconversions
ko00051	53	98	0	1	4	0	6	11	0	0	7	Fructose and mannose metabolism	Metabolism	Carbohydrate metabolism	Fructose and mannose metabolism

	S1	S2	S3	S4	S5	mean
kID
K00001	2.844778	3.758330	3.931973	4.952283	6.611035	4.419680
K00002	2.878511	2.718476	1.735736	2.343015	2.211123	2.377372
K00003	27.053504	16.593101	3.790281	2.795643	3.015168	10.649539
K00004	0.067465	0.148551	0.177116	0.292877	0.268015	0.190805
K00005	0.123686	0.103985	0.259770	0.319502	0.312684	0.223926