for some reason Janet's virtualenv is much happier with this TkAgg thing set.


In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import bacteriopop_utils
import feature_selection_utils
import load_data

In [3]:
loaded_data = data = load_data.load_data()

In [4]:
loaded_data.shape


Out[4]:
(64755, 11)

make sure none of the phyla are NA (checking 160304 update to load_data.py


In [5]:
loaded_data[loaded_data['phylum'].isnull()].head(3)


Out[5]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID

In [6]:
loaded_data.head()


Out[6]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9948861 Low 1 4 0.228531
1056013 Bacteria Proteobacteria Betaproteobacteria Methylophilales Methylophilaceae Methylotenera 5066955 Low 1 4 0.220860
1056013 Bacteria Bacteroidetes Flavobacteriia Flavobacteriales Flavobacteriaceae Flavobacterium 4654774 Low 1 4 0.054719
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae 3046340 Low 1 4 0.047956
1056013 Bacteria Proteobacteria Gammaproteobacteria 5620690 Low 1 4 0.040903

Test filter and reduce functions using a high threshold, which selects for genus==Methylobacter


In [7]:
bacteriopop_utils.filter_by_abundance(dataframe=loaded_data, low= 0.6).head()


first (up to) 5 phylo columns to keep: ['Methylobacter']
Out[7]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9948861 Low 1 4 0.228531
1056016 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 8256230 Low 2 4 0.183486
1056019 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9550864 Low 3 4 0.207635
1056022 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 8211298 Low 4 4 0.200109
1056025 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 4802092 High 1 4 0.070167

In [8]:
bacteriopop_utils.reduce_data(dataframe=loaded_data, min_abundance= 0.6, 
                              phylo_column='genus', oxygen='high').head()


first (up to) 5 phylo columns to keep: ['Methylobacter']
Out[8]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056025 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 4802092 High 1 4 0.070167
1056028 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 3885957 High 2 4 0.084614
1056031 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 5692971 High 3 4 0.075084
1056034 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 4933896 High 4 4 0.103972
1056049 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 7618692 High 1 5 0.242867

Demo of DMD data prep


In [9]:
raw_dmd_data = bacteriopop_utils.reduce_data(
    dataframe=loaded_data, min_abundance= 0.01, 
    phylo_column='genus', oxygen='Low')


first (up to) 5 phylo columns to keep: ['Methylobacter' 'Methylotenera' 'Flavobacterium' '' 'Acidovorax']

Errors are thrown by functions below if you drop min_abunance below. I think it is hanging up on multiple "other" rows.


In [10]:
data_dict = bacteriopop_utils.break_apart_experiments(raw_dmd_data)


('Low', 1)
('Low', 2)
('Low', 3)
('Low', 4)
dictionary keys: [('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]

In [11]:
data_dict.keys()


Out[11]:
[('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]

In [12]:
# Can't view generators very easily!!!
data_dict.itervalues()


Out[12]:
<dictionary-valueiterator at 0x9e848b8>

In [13]:
# But we can make a list from them and grab the 0th item
first_df = list(data_dict.itervalues())[0]

In [14]:
first_df.head(3)


Out[14]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9948861 Low 1 4 0.228531
1056013 Bacteria Proteobacteria Betaproteobacteria Methylophilales Methylophilaceae Methylotenera 5066955 Low 1 4 0.220860
1056013 Bacteria Bacteroidetes Flavobacteriia Flavobacteriales Flavobacteriaceae Flavobacterium 4654774 Low 1 4 0.054719

In [15]:
first_df[first_df['genus'] == 'other'].head()


Out[15]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID

In [16]:
first_df[first_df['genus'] != ''].pivot(index='genus', columns='week', values='abundance')


Out[16]:
week 4 5 6 7 8 9 10 11 12 13 14
genus
Acidovorax 0.020355 0.020589 0.013148 0.013226 0.010375 0.018167 0.020856 0.016379 0.027526 0.015568 0.018448
Bacteriovorax 0.000008 0.000101 0.000015 0.002874 0.008906 0.001774 0.001367 0.000004 0.000004 0.000002 0.000234
Bdellovibrio 0.000150 0.000199 0.000027 0.000391 0.001179 0.000179 0.000124 0.000017 0.000023 0.000008 0.000021
Dechloromonas 0.020332 0.002072 0.003915 0.001223 0.000508 0.001263 0.000793 0.000528 0.000661 0.001272 0.000542
Flavobacterium 0.054719 0.052710 0.009651 0.000718 0.003886 0.004191 0.003909 0.027589 0.009506 0.000864 0.003408
Hylemonella 0.000195 0.000420 0.000764 0.003060 0.001922 0.002836 0.001870 0.000403 0.000476 0.000245 0.000293
Methylobacter 0.228531 0.337567 0.602610 0.582960 0.577079 0.373659 0.576840 0.520927 0.496885 0.524244 0.443803
Methyloglobulus 0.006307 0.003741 0.004548 0.002640 0.003330 0.002085 0.003678 0.005605 0.003896 0.005911 0.005425
Methylomicrobium 0.010910 0.007253 0.009028 0.007626 0.006712 0.004788 0.007907 0.014478 0.018061 0.020332 0.016700
Methylomonas 0.019269 0.012871 0.015736 0.011606 0.009817 0.007646 0.011084 0.019213 0.019743 0.020395 0.019126
Methylophilus 0.001383 0.000820 0.000669 0.000737 0.001022 0.002151 0.000689 0.009797 0.003772 0.003104 0.002799
Methylosarcina 0.014401 0.011505 0.010429 0.006646 0.005975 0.004389 0.006457 0.008545 0.009713 0.010087 0.009108
Methylotenera 0.220860 0.131572 0.064071 0.125043 0.170347 0.373464 0.150743 0.004887 0.079913 0.110765 0.195370
Methylovulum 0.009695 0.004740 0.004765 0.004232 0.004008 0.003017 0.003997 0.008699 0.009471 0.010238 0.013060
Polaromonas 0.002880 0.003663 0.001581 0.001609 0.001200 0.001901 0.001162 0.002082 0.002825 0.001233 0.001682
Rheinheimera 0.000746 0.000054 0.000048 0.000063 0.000071 0.000149 0.000082 0.000442 0.000497 0.000913 0.000881
Sorangium 0.004535 0.005983 0.000559 0.001630 0.000059 0.000040 0.000128 0.000507 0.000468 0.000368 0.000778

In [17]:
raw_dmd_data.columns


Out[17]:
Index([u'kingdom', u'phylum', u'class', u'order', u'family', u'genus',
       u'length', u'oxygen', u'replicate', u'week', u'abundance'],
      dtype='object')

In [18]:
DMD_input_dict = \
    bacteriopop_utils.prepare_DMD_matrices(raw_dmd_data,
                                           groupby_level = "genus")


('Low', 1)
('Low', 2)
('Low', 3)
('Low', 4)
dictionary keys: [('Low', 1), ('Low', 4), ('Low', 2), ('Low', 3)]
           kingdom          phylum                class            order  \
sampleID                                                                   
1056013   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056013   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056013   Methylococcaceae  Methylobacter  9948861    Low          1     4   
1056013   Methylophilaceae  Methylotenera  5066955    Low          1     4   

          abundance  
sampleID             
1056013    0.228531  
1056013    0.220860  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056022   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056022   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056022   Methylococcaceae  Methylobacter  8211298    Low          4     4   
1056022   Methylophilaceae  Methylotenera  3910112    Low          4     4   

          abundance  
sampleID             
1056022    0.200109  
1056022    0.165314  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056016   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   
1056016   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056016   Methylophilaceae  Methylotenera  5558102    Low          2     4   
1056016   Methylococcaceae  Methylobacter  8256230    Low          2     4   

          abundance  
sampleID             
1056016    0.240462  
1056016    0.183486  
           kingdom          phylum                class            order  \
sampleID                                                                   
1056019   Bacteria  Proteobacteria  Gammaproteobacteria  Methylococcales   
1056019   Bacteria  Proteobacteria   Betaproteobacteria  Methylophilales   

                    family          genus   length oxygen  replicate  week  \
sampleID                                                                     
1056019   Methylococcaceae  Methylobacter  9550864    Low          3     4   
1056019   Methylophilaceae  Methylotenera  6085760    Low          3     4   

          abundance  
sampleID             
1056019    0.207635  
1056019    0.203387  

In [19]:
type(DMD_input_dict)


Out[19]:
dict

We can get each dataframe out like this:


In [20]:
DMD_input_dict[('Low', 1)]


Out[20]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9948861 Low 1 4 2.285315e-01
1056013 Bacteria Proteobacteria Betaproteobacteria Methylophilales Methylophilaceae Methylotenera 5066955 Low 1 4 2.208597e-01
1056013 Bacteria Bacteroidetes Flavobacteriia Flavobacteriales Flavobacteriaceae Flavobacterium 4654774 Low 1 4 5.471935e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae 3046340 Low 1 4 4.795637e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria 5620690 Low 1 4 4.090279e-02
1056013 Bacteria Proteobacteria 3930509 Low 1 4 3.926640e-02
1056013 Bacteria Proteobacteria Betaproteobacteria Burkholderiales Comamonadaceae 2023406 Low 1 4 3.031538e-02
1056013 Bacteria 2759851 Low 1 4 2.537614e-02
1056013 Bacteria Proteobacteria Betaproteobacteria Burkholderiales Comamonadaceae Acidovorax 1257730 Low 1 4 2.035456e-02
1056013 Bacteria Proteobacteria Betaproteobacteria Rhodocyclales Rhodocyclaceae Dechloromonas 1805462 Low 1 4 2.033165e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylomonas 1317343 Low 1 4 1.926908e-02
1056013 unassigned 920784 Low 1 4 1.446088e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylosarcina 892878 Low 1 4 1.440142e-02
1056013 Bacteria Proteobacteria Betaproteobacteria 1140440 Low 1 4 1.411396e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylomicrobium 672916 Low 1 4 1.091001e-02
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylovulum 545587 Low 1 4 9.694810e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Burkholderiales 631062 Low 1 4 7.537654e-03
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methyloglobulus 338946 Low 1 4 6.306720e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Methylophilales Methylophilaceae 517675 Low 1 4 6.231354e-03
1056013 Bacteria Cyanobacteria 516781 Low 1 4 5.911939e-03
1056013 Bacteria Proteobacteria Deltaproteobacteria Myxococcales Polyangiaceae 650466 Low 1 4 5.397583e-03
1056013 Bacteria Proteobacteria Deltaproteobacteria Myxococcales Polyangiaceae Sorangium 527266 Low 1 4 4.535422e-03
1056013 Bacteria Proteobacteria Deltaproteobacteria Myxococcales 522193 Low 1 4 4.054392e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Rhodocyclales 258924 Low 1 4 3.578677e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Burkholderiales Comamonadaceae Polaromonas 184581 Low 1 4 2.879513e-03
1056013 Bacteria Bacteroidetes 335358 Low 1 4 2.595746e-03
1056013 Bacteria Bacteroidetes Flavobacteriia Flavobacteriales Flavobacteriaceae 316010 Low 1 4 2.477658e-03
1056013 Bacteria Actinobacteria Actinobacteria Actinomycetales 217492 Low 1 4 2.031323e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Rhodocyclales Rhodocyclaceae 175689 Low 1 4 1.995820e-03
1056013 Bacteria Proteobacteria Betaproteobacteria Neisseriales 148127 Low 1 4 1.962090e-03
... ... ... ... ... ... ... ... ... ... ... ...
1056253 Bacteria Ignavibacteriae Ignavibacteria Ignavibacteriales 1209 Low 1 14 2.340000e-06
1056253 Bacteria Proteobacteria Deltaproteobacteria Desulfuromonadales Geobacteraceae 658 Low 1 14 2.330000e-06
1056253 Bacteria Actinobacteria Actinobacteria Actinomycetales Propionibacteriaceae 433 Low 1 14 2.240000e-06
1056253 Bacteria Proteobacteria Gammaproteobacteria Acidithiobacillales 849 Low 1 14 2.220000e-06
1056253 Bacteria Firmicutes Clostridia Clostridiales Clostridiaceae 884 Low 1 14 2.100000e-06
1056253 Bacteria Firmicutes Bacilli Bacillales Alicyclobacillaceae 816 Low 1 14 1.900000e-06
1056253 Bacteria Proteobacteria Gammaproteobacteria Oceanospirillales Halomonadaceae 567 Low 1 14 1.820000e-06
1056253 Bacteria Actinobacteria Actinobacteria Actinomycetales Intrasporangiaceae 345 Low 1 14 1.770000e-06
1056253 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales Sphingobacteriaceae 680 Low 1 14 1.750000e-06
1056253 Bacteria Proteobacteria Epsilonproteobacteria 540 Low 1 14 1.700000e-06
1056253 Archaea Thaumarchaeota 1215 Low 1 14 1.600000e-06
1056253 Bacteria Proteobacteria Alphaproteobacteria Caulobacterales Caulobacteraceae 269 Low 1 14 1.520000e-06
1056253 Bacteria Proteobacteria Gammaproteobacteria Chromatiales Chromatiaceae 443 Low 1 14 1.380000e-06
1056253 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylocystaceae 673 Low 1 14 1.360000e-06
1056253 Bacteria Proteobacteria Deltaproteobacteria Desulfovibrionales Desulfohalobiaceae 544 Low 1 14 1.310000e-06
1056253 Bacteria Verrucomicrobia Opitutae 510 Low 1 14 1.200000e-06
1056253 Bacteria Proteobacteria Alphaproteobacteria Rhodospirillales Acetobacteraceae 255 Low 1 14 1.200000e-06
1056253 Bacteria Firmicutes Clostridia Clostridiales Peptococcaceae 255 Low 1 14 1.200000e-06
1056253 Bacteria Proteobacteria Alphaproteobacteria Rhodobacterales 259 Low 1 14 1.180000e-06
1056253 Bacteria Thermotogae Thermotogae Thermotogales 260 Low 1 14 1.180000e-06
1056253 Bacteria Bacteroidetes Bacteroidia Bacteroidales 382 Low 1 14 1.070000e-06
1056253 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Aurantimonadaceae 1184 Low 1 14 1.030000e-06
1056253 Bacteria Bacteroidetes Cytophagia Cytophagales Cyclobacteriaceae 253 Low 1 14 1.010000e-06
1056253 Bacteria Firmicutes Bacilli Bacillales Bacillaceae 255 Low 1 14 1.000000e-06
1056253 Bacteria Proteobacteria Betaproteobacteria Methylophilales 309 Low 1 14 9.910000e-07
1056253 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales Saprospiraceae 318 Low 1 14 9.630000e-07
1056253 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae 440 Low 1 14 9.280000e-07
1056253 Bacteria Verrucomicrobia Verrucomicrobiae Verrucomicrobiales 332 Low 1 14 7.680000e-07
1056253 Bacteria Actinobacteria Actinobacteria 366 Low 1 14 5.580000e-07
1056253 Bacteria Proteobacteria Alphaproteobacteria Kordiimonadales 249 Low 1 14 4.100000e-07

1558 rows × 11 columns


In [21]:
DMD_input_dict[('Low', 1)].shape


Out[21]:
(1558, 11)

In [22]:
DMD_input_dict[('Low', 1)].groupby('week')['abundance'].sum()


Out[22]:
week
4     0.890610
5     0.889791
6     0.925614
7     0.931329
8     0.943502
9     0.930810
10    0.929779
11    0.918814
12    0.918659
13    0.925245
14    0.925412
Name: abundance, dtype: float64

DMD

TODO: test DMD on this abundance marix.


In [23]:
DMD_test_matrix = DMD_input_dict[('Low', 1)]

In [24]:
# Who is in there?

In [25]:
DMD_test_matrix.reset_index()['genus'].unique()


Out[25]:
array(['Methylobacter', 'Methylotenera', 'Flavobacterium', '',
       'Acidovorax', 'Dechloromonas', 'Methylomonas', 'Methylosarcina',
       'Methylomicrobium', 'Methylovulum', 'Methyloglobulus', 'Sorangium',
       'Polaromonas', 'Methylophilus', 'Rheinheimera', 'Hylemonella',
       'Bdellovibrio', 'Bacteriovorax'], dtype=object)

I'm stuck at the installation of modred :(


In [ ]:
# following example 1: https://pythonhosted.org/modred/tutorial_modaldecomp.html
import modred as MR

In [ ]:
num_modes = 1
modes, eig_vals = MR.compute_POD_matrices_snaps_method(DMD_test_matrix, range(num_modes))

In [ ]:
modes

In [ ]:
eig_vals

Feature extraction and PCA


In [ ]:
extracted_features = bacteriopop_utils.extract_features(
    dataframe = loaded_data,
    column_list = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'oxygen', 'abundance']
    # default list was: ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'length', 'abundance', 'project']
    )

In [ ]:
extracted_features.head()

In [ ]:
extracted_features.shape

Just do PCA on a tiny bit of the data as a demo


In [ ]:
pca_results = feature_selection_utils.pca_bacteria(
    data = extracted_features.head(100), n_components = 10)

In [ ]:
pca_results.components_

Do correlations for a tiny subset of the data.


In [ ]:
feature_selection_utils.calculate_features_target_correlation(
    data = extracted_features.head(100),
    features = extracted_features.columns.tolist(),
    target='abundance',
    method="Pearson")

In [ ]:


In [ ]: