for some reason Janet's virtualenv is much happier with this TkAgg thing set.



In [1]:

    
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
import bacteriopop_utils
import feature_selection_utils
import load_data



In [3]:

    
loaded_data = data = load_data.load_data()



In [4]:

    
loaded_data.shape









    Out[4]:





(64755, 11)

make sure none of the phyla are NA (checking 160304 update to load_data.py



In [5]:

    
loaded_data[loaded_data['phylum'].isnull()].head(3)









    Out[5]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID



In [6]:

    
loaded_data.head()









    Out[6]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      Low
      1
      4
      0.228531
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      Methylotenera
      5066955
      Low
      1
      4
      0.220860
    
    
      1056013
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      Flavobacterium
      4654774
      Low
      1
      4
      0.054719
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      
      3046340
      Low
      1
      4
      0.047956
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      
      
      
      5620690
      Low
      1
      4
      0.040903

Test filter and reduce functions using a high threshold, which selects for genus==Methylobacter



In [7]:

    
bacteriopop_utils.filter_by_abundance(dataframe=loaded_data, low= 0.6).head()









    



first (up to) 5 phylo columns to keep: ['Methylobacter']






    Out[7]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      Low
      1
      4
      0.228531
    
    
      1056016
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      8256230
      Low
      2
      4
      0.183486
    
    
      1056019
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9550864
      Low
      3
      4
      0.207635
    
    
      1056022
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      8211298
      Low
      4
      4
      0.200109
    
    
      1056025
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      4802092
      High
      1
      4
      0.070167



In [8]:

    
bacteriopop_utils.reduce_data(dataframe=loaded_data, min_abundance= 0.6, 
                              phylo_column='genus', oxygen='high').head()









    



first (up to) 5 phylo columns to keep: ['Methylobacter']






    Out[8]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1056025
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      4802092
      High
      1
      4
      0.070167
    
    
      1056028
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      3885957
      High
      2
      4
      0.084614
    
    
      1056031
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      5692971
      High
      3
      4
      0.075084
    
    
      1056034
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      4933896
      High
      4
      4
      0.103972
    
    
      1056049
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      7618692
      High
      1
      5
      0.242867

Demo of DMD data prep



In [9]:

    
raw_dmd_data = bacteriopop_utils.reduce_data(
    dataframe=loaded_data, min_abundance= 0.01, 
    phylo_column='genus', oxygen='Low')









    



first (up to) 5 phylo columns to keep: ['Methylobacter' 'Methylotenera' 'Flavobacterium' '' 'Acidovorax']

Errors are thrown by functions below if you drop min_abunance below. I think it is hanging up on multiple "other" rows.



In [10]:

    
data_dict = bacteriopop_utils.break_apart_experiments(raw_dmd_data)









    



('Low', 1)
('Low', 2)
('Low', 3)
('Low', 4)
dictionary keys: [('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]



In [11]:

    
data_dict.keys()









    Out[11]:





[('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]



In [12]:

    
# Can't view generators very easily!!!
data_dict.itervalues()









    Out[12]:





<dictionary-valueiterator at 0x9e848b8>



In [13]:

    
# But we can make a list from them and grab the 0th item
first_df = list(data_dict.itervalues())[0]



In [14]:

    
first_df.head(3)









    Out[14]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      Low
      1
      4
      0.228531
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      Methylotenera
      5066955
      Low
      1
      4
      0.220860
    
    
      1056013
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      Flavobacterium
      4654774
      Low
      1
      4
      0.054719



In [15]:

    
first_df[first_df['genus'] == 'other'].head()









    Out[15]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID



In [16]:

    
first_df[first_df['genus'] != ''].pivot(index='genus', columns='week', values='abundance')









    Out[16]:






  
    
      week
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
    
    
      genus
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Acidovorax
      0.020355
      0.020589
      0.013148
      0.013226
      0.010375
      0.018167
      0.020856
      0.016379
      0.027526
      0.015568
      0.018448
    
    
      Bacteriovorax
      0.000008
      0.000101
      0.000015
      0.002874
      0.008906
      0.001774
      0.001367
      0.000004
      0.000004
      0.000002
      0.000234
    
    
      Bdellovibrio
      0.000150
      0.000199
      0.000027
      0.000391
      0.001179
      0.000179
      0.000124
      0.000017
      0.000023
      0.000008
      0.000021
    
    
      Dechloromonas
      0.020332
      0.002072
      0.003915
      0.001223
      0.000508
      0.001263
      0.000793
      0.000528
      0.000661
      0.001272
      0.000542
    
    
      Flavobacterium
      0.054719
      0.052710
      0.009651
      0.000718
      0.003886
      0.004191
      0.003909
      0.027589
      0.009506
      0.000864
      0.003408
    
    
      Hylemonella
      0.000195
      0.000420
      0.000764
      0.003060
      0.001922
      0.002836
      0.001870
      0.000403
      0.000476
      0.000245
      0.000293
    
    
      Methylobacter
      0.228531
      0.337567
      0.602610
      0.582960
      0.577079
      0.373659
      0.576840
      0.520927
      0.496885
      0.524244
      0.443803
    
    
      Methyloglobulus
      0.006307
      0.003741
      0.004548
      0.002640
      0.003330
      0.002085
      0.003678
      0.005605
      0.003896
      0.005911
      0.005425
    
    
      Methylomicrobium
      0.010910
      0.007253
      0.009028
      0.007626
      0.006712
      0.004788
      0.007907
      0.014478
      0.018061
      0.020332
      0.016700
    
    
      Methylomonas
      0.019269
      0.012871
      0.015736
      0.011606
      0.009817
      0.007646
      0.011084
      0.019213
      0.019743
      0.020395
      0.019126
    
    
      Methylophilus
      0.001383
      0.000820
      0.000669
      0.000737
      0.001022
      0.002151
      0.000689
      0.009797
      0.003772
      0.003104
      0.002799
    
    
      Methylosarcina
      0.014401
      0.011505
      0.010429
      0.006646
      0.005975
      0.004389
      0.006457
      0.008545
      0.009713
      0.010087
      0.009108
    
    
      Methylotenera
      0.220860
      0.131572
      0.064071
      0.125043
      0.170347
      0.373464
      0.150743
      0.004887
      0.079913
      0.110765
      0.195370
    
    
      Methylovulum
      0.009695
      0.004740
      0.004765
      0.004232
      0.004008
      0.003017
      0.003997
      0.008699
      0.009471
      0.010238
      0.013060
    
    
      Polaromonas
      0.002880
      0.003663
      0.001581
      0.001609
      0.001200
      0.001901
      0.001162
      0.002082
      0.002825
      0.001233
      0.001682
    
    
      Rheinheimera
      0.000746
      0.000054
      0.000048
      0.000063
      0.000071
      0.000149
      0.000082
      0.000442
      0.000497
      0.000913
      0.000881
    
    
      Sorangium
      0.004535
      0.005983
      0.000559
      0.001630
      0.000059
      0.000040
      0.000128
      0.000507
      0.000468
      0.000368
      0.000778



In [17]:

    
raw_dmd_data.columns









    Out[17]:





Index([u'kingdom', u'phylum', u'class', u'order', u'family', u'genus',
       u'length', u'oxygen', u'replicate', u'week', u'abundance'],
      dtype='object')



In [18]:

    
DMD_input_dict = \
    bacteriopop_utils.prepare_DMD_matrices(raw_dmd_data,
                                           groupby_level = "genus")









    



('Low', 1)
('Low', 2)
('Low', 3)
('Low', 4)
dictionary keys: [('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]
           kingdom          phylum                class            order  \
sampleID                                                                   
1056013   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056013   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056013   Methylococcaceae  Methylobacter  9948861    Low          1     4   
1056013   Methylophilaceae  Methylotenera  5066955    Low          1     4   

          abundance  
sampleID             
1056013    0.228531  
1056013    0.220860  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056022   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056022   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056022   Methylococcaceae  Methylobacter  8211298    Low          4     4   
1056022   Methylophilaceae  Methylotenera  3910112    Low          4     4   

          abundance  
sampleID             
1056022    0.200109  
1056022    0.165314  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056016   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   
1056016   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056016   Methylophilaceae  Methylotenera  5558102    Low          2     4   
1056016   Methylococcaceae  Methylobacter  8256230    Low          2     4   

          abundance  
sampleID             
1056016    0.240462  
1056016    0.183486  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056019   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056019   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056019   Methylococcaceae  Methylobacter  9550864    Low          3     4   
1056019   Methylophilaceae  Methylotenera  6085760    Low          3     4   

          abundance  
sampleID             
1056019    0.207635  
1056019    0.203387



In [19]:

    
type(DMD_input_dict)









    Out[19]:





dict

We can get each dataframe out like this:



In [20]:

    
DMD_input_dict[('Low', 1)]









    Out[20]:






  
    
      
      kingdom
      phylum
      class
      order
      family
      genus
      length
      oxygen
      replicate
      week
      abundance
    
    
      sampleID
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      Low
      1
      4
      2.285315e-01
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      Methylotenera
      5066955
      Low
      1
      4
      2.208597e-01
    
    
      1056013
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      Flavobacterium
      4654774
      Low
      1
      4
      5.471935e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      
      3046340
      Low
      1
      4
      4.795637e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      
      
      
      5620690
      Low
      1
      4
      4.090279e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      
      
      
      
      3930509
      Low
      1
      4
      3.926640e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Burkholderiales
      Comamonadaceae
      
      2023406
      Low
      1
      4
      3.031538e-02
    
    
      1056013
      Bacteria
      
      
      
      
      
      2759851
      Low
      1
      4
      2.537614e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Burkholderiales
      Comamonadaceae
      Acidovorax
      1257730
      Low
      1
      4
      2.035456e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Rhodocyclales
      Rhodocyclaceae
      Dechloromonas
      1805462
      Low
      1
      4
      2.033165e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylomonas
      1317343
      Low
      1
      4
      1.926908e-02
    
    
      1056013
      unassigned
      
      
      
      
      
      920784
      Low
      1
      4
      1.446088e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylosarcina
      892878
      Low
      1
      4
      1.440142e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      
      
      
      1140440
      Low
      1
      4
      1.411396e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylomicrobium
      672916
      Low
      1
      4
      1.091001e-02
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylovulum
      545587
      Low
      1
      4
      9.694810e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Burkholderiales
      
      
      631062
      Low
      1
      4
      7.537654e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methyloglobulus
      338946
      Low
      1
      4
      6.306720e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      
      517675
      Low
      1
      4
      6.231354e-03
    
    
      1056013
      Bacteria
      Cyanobacteria
      
      
      
      
      516781
      Low
      1
      4
      5.911939e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Deltaproteobacteria
      Myxococcales
      Polyangiaceae
      
      650466
      Low
      1
      4
      5.397583e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Deltaproteobacteria
      Myxococcales
      Polyangiaceae
      Sorangium
      527266
      Low
      1
      4
      4.535422e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Deltaproteobacteria
      Myxococcales
      
      
      522193
      Low
      1
      4
      4.054392e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Rhodocyclales
      
      
      258924
      Low
      1
      4
      3.578677e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Burkholderiales
      Comamonadaceae
      Polaromonas
      184581
      Low
      1
      4
      2.879513e-03
    
    
      1056013
      Bacteria
      Bacteroidetes
      
      
      
      
      335358
      Low
      1
      4
      2.595746e-03
    
    
      1056013
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      
      316010
      Low
      1
      4
      2.477658e-03
    
    
      1056013
      Bacteria
      Actinobacteria
      Actinobacteria
      Actinomycetales
      
      
      217492
      Low
      1
      4
      2.031323e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Rhodocyclales
      Rhodocyclaceae
      
      175689
      Low
      1
      4
      1.995820e-03
    
    
      1056013
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Neisseriales
      
      
      148127
      Low
      1
      4
      1.962090e-03
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1056253
      Bacteria
      Ignavibacteriae
      Ignavibacteria
      Ignavibacteriales
      
      
      1209
      Low
      1
      14
      2.340000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Deltaproteobacteria
      Desulfuromonadales
      Geobacteraceae
      
      658
      Low
      1
      14
      2.330000e-06
    
    
      1056253
      Bacteria
      Actinobacteria
      Actinobacteria
      Actinomycetales
      Propionibacteriaceae
      
      433
      Low
      1
      14
      2.240000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Acidithiobacillales
      
      
      849
      Low
      1
      14
      2.220000e-06
    
    
      1056253
      Bacteria
      Firmicutes
      Clostridia
      Clostridiales
      Clostridiaceae
      
      884
      Low
      1
      14
      2.100000e-06
    
    
      1056253
      Bacteria
      Firmicutes
      Bacilli
      Bacillales
      Alicyclobacillaceae
      
      816
      Low
      1
      14
      1.900000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Oceanospirillales
      Halomonadaceae
      
      567
      Low
      1
      14
      1.820000e-06
    
    
      1056253
      Bacteria
      Actinobacteria
      Actinobacteria
      Actinomycetales
      Intrasporangiaceae
      
      345
      Low
      1
      14
      1.770000e-06
    
    
      1056253
      Bacteria
      Bacteroidetes
      Sphingobacteriia
      Sphingobacteriales
      Sphingobacteriaceae
      
      680
      Low
      1
      14
      1.750000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Epsilonproteobacteria
      
      
      
      540
      Low
      1
      14
      1.700000e-06
    
    
      1056253
      Archaea
      Thaumarchaeota
      
      
      
      
      1215
      Low
      1
      14
      1.600000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Caulobacterales
      Caulobacteraceae
      
      269
      Low
      1
      14
      1.520000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Chromatiales
      Chromatiaceae
      
      443
      Low
      1
      14
      1.380000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Rhizobiales
      Methylocystaceae
      
      673
      Low
      1
      14
      1.360000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Deltaproteobacteria
      Desulfovibrionales
      Desulfohalobiaceae
      
      544
      Low
      1
      14
      1.310000e-06
    
    
      1056253
      Bacteria
      Verrucomicrobia
      Opitutae
      
      
      
      510
      Low
      1
      14
      1.200000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Rhodospirillales
      Acetobacteraceae
      
      255
      Low
      1
      14
      1.200000e-06
    
    
      1056253
      Bacteria
      Firmicutes
      Clostridia
      Clostridiales
      Peptococcaceae
      
      255
      Low
      1
      14
      1.200000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Rhodobacterales
      
      
      259
      Low
      1
      14
      1.180000e-06
    
    
      1056253
      Bacteria
      Thermotogae
      Thermotogae
      Thermotogales
      
      
      260
      Low
      1
      14
      1.180000e-06
    
    
      1056253
      Bacteria
      Bacteroidetes
      Bacteroidia
      Bacteroidales
      
      
      382
      Low
      1
      14
      1.070000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Rhizobiales
      Aurantimonadaceae
      
      1184
      Low
      1
      14
      1.030000e-06
    
    
      1056253
      Bacteria
      Bacteroidetes
      Cytophagia
      Cytophagales
      Cyclobacteriaceae
      
      253
      Low
      1
      14
      1.010000e-06
    
    
      1056253
      Bacteria
      Firmicutes
      Bacilli
      Bacillales
      Bacillaceae
      
      255
      Low
      1
      14
      1.000000e-06
    
    
      1056253
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      
      
      309
      Low
      1
      14
      9.910000e-07
    
    
      1056253
      Bacteria
      Bacteroidetes
      Sphingobacteriia
      Sphingobacteriales
      Saprospiraceae
      
      318
      Low
      1
      14
      9.630000e-07
    
    
      1056253
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Pseudomonadales
      Moraxellaceae
      
      440
      Low
      1
      14
      9.280000e-07
    
    
      1056253
      Bacteria
      Verrucomicrobia
      Verrucomicrobiae
      Verrucomicrobiales
      
      
      332
      Low
      1
      14
      7.680000e-07
    
    
      1056253
      Bacteria
      Actinobacteria
      Actinobacteria
      
      
      
      366
      Low
      1
      14
      5.580000e-07
    
    
      1056253
      Bacteria
      Proteobacteria
      Alphaproteobacteria
      Kordiimonadales
      
      
      249
      Low
      1
      14
      4.100000e-07
    
  

1558 rows × 11 columns



In [21]:

    
DMD_input_dict[('Low', 1)].shape









    Out[21]:





(1558, 11)



In [22]:

    
DMD_input_dict[('Low', 1)].groupby('week')['abundance'].sum()









    Out[22]:





week
4     0.890610
5     0.889791
6     0.925614
7     0.931329
8     0.943502
9     0.930810
10    0.929779
11    0.918814
12    0.918659
13    0.925245
14    0.925412
Name: abundance, dtype: float64

DMD

TODO: test DMD on this abundance marix.



In [23]:

    
DMD_test_matrix = DMD_input_dict[('Low', 1)]



In [24]:

    
# Who is in there?



In [25]:

    
DMD_test_matrix.reset_index()['genus'].unique()









    Out[25]:





array(['Methylobacter', 'Methylotenera', 'Flavobacterium', '',
       'Acidovorax', 'Dechloromonas', 'Methylomonas', 'Methylosarcina',
       'Methylomicrobium', 'Methylovulum', 'Methyloglobulus', 'Sorangium',
       'Polaromonas', 'Methylophilus', 'Rheinheimera', 'Hylemonella',
       'Bdellovibrio', 'Bacteriovorax'], dtype=object)

I'm stuck at the installation of modred :(



In [ ]:

    
# following example 1: https://pythonhosted.org/modred/tutorial_modaldecomp.html
import modred as MR



In [ ]:

    
num_modes = 1
modes, eig_vals = MR.compute_POD_matrices_snaps_method(DMD_test_matrix, range(num_modes))



In [ ]:

    
modes



In [ ]:

    
eig_vals

Feature extraction and PCA



In [ ]:

    
extracted_features = bacteriopop_utils.extract_features(
    dataframe = loaded_data,
    column_list = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'oxygen', 'abundance']
    # default list was: ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'length', 'abundance', 'project']
    )



In [ ]:

    
extracted_features.head()



In [ ]:

    
extracted_features.shape

Just do PCA on a tiny bit of the data as a demo



In [ ]:

    
pca_results = feature_selection_utils.pca_bacteria(
    data = extracted_features.head(100), n_components = 10)



In [ ]:

    
pca_results.components_

Do correlations for a tiny subset of the data.



In [ ]:

    
feature_selection_utils.calculate_features_target_correlation(
    data = extracted_features.head(100),
    features = extracted_features.columns.tolist(),
    target='abundance',
    method="Pearson")



In [ ]:



In [ ]:

	kingdom	phylum	class	order	family	genus	length	oxygen	replicate	week	abundance
sampleID
1056013	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	9948861	Low	1	4	0.228531
1056013	Bacteria	Proteobacteria	Betaproteobacteria	Methylophilales	Methylophilaceae	Methylotenera	5066955	Low	1	4	0.220860
1056013	Bacteria	Bacteroidetes	Flavobacteriia	Flavobacteriales	Flavobacteriaceae	Flavobacterium	4654774	Low	1	4	0.054719
1056013	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae		3046340	Low	1	4	0.047956
1056013	Bacteria	Proteobacteria	Gammaproteobacteria				5620690	Low	1	4	0.040903

	kingdom	phylum	class	order	family	genus	length	oxygen	replicate	week	abundance
sampleID
1056025	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	4802092	High	1	4	0.070167
1056028	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	3885957	High	2	4	0.084614
1056031	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	5692971	High	3	4	0.075084
1056034	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	4933896	High	4	4	0.103972
1056049	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	7618692	High	1	5	0.242867

week	4	5	6	7	8	9	10	11	12	13	14
genus
Acidovorax	0.020355	0.020589	0.013148	0.013226	0.010375	0.018167	0.020856	0.016379	0.027526	0.015568	0.018448
Bacteriovorax	0.000008	0.000101	0.000015	0.002874	0.008906	0.001774	0.001367	0.000004	0.000004	0.000002	0.000234
Bdellovibrio	0.000150	0.000199	0.000027	0.000391	0.001179	0.000179	0.000124	0.000017	0.000023	0.000008	0.000021
Dechloromonas	0.020332	0.002072	0.003915	0.001223	0.000508	0.001263	0.000793	0.000528	0.000661	0.001272	0.000542
Flavobacterium	0.054719	0.052710	0.009651	0.000718	0.003886	0.004191	0.003909	0.027589	0.009506	0.000864	0.003408
Hylemonella	0.000195	0.000420	0.000764	0.003060	0.001922	0.002836	0.001870	0.000403	0.000476	0.000245	0.000293
Methylobacter	0.228531	0.337567	0.602610	0.582960	0.577079	0.373659	0.576840	0.520927	0.496885	0.524244	0.443803
Methyloglobulus	0.006307	0.003741	0.004548	0.002640	0.003330	0.002085	0.003678	0.005605	0.003896	0.005911	0.005425
Methylomicrobium	0.010910	0.007253	0.009028	0.007626	0.006712	0.004788	0.007907	0.014478	0.018061	0.020332	0.016700
Methylomonas	0.019269	0.012871	0.015736	0.011606	0.009817	0.007646	0.011084	0.019213	0.019743	0.020395	0.019126
Methylophilus	0.001383	0.000820	0.000669	0.000737	0.001022	0.002151	0.000689	0.009797	0.003772	0.003104	0.002799
Methylosarcina	0.014401	0.011505	0.010429	0.006646	0.005975	0.004389	0.006457	0.008545	0.009713	0.010087	0.009108
Methylotenera	0.220860	0.131572	0.064071	0.125043	0.170347	0.373464	0.150743	0.004887	0.079913	0.110765	0.195370
Methylovulum	0.009695	0.004740	0.004765	0.004232	0.004008	0.003017	0.003997	0.008699	0.009471	0.010238	0.013060
Polaromonas	0.002880	0.003663	0.001581	0.001609	0.001200	0.001901	0.001162	0.002082	0.002825	0.001233	0.001682
Rheinheimera	0.000746	0.000054	0.000048	0.000063	0.000071	0.000149	0.000082	0.000442	0.000497	0.000913	0.000881
Sorangium	0.004535	0.005983	0.000559	0.001630	0.000059	0.000040	0.000128	0.000507	0.000468	0.000368	0.000778