In [1]:
import bacteriopop_utils
import feature_selection_utils
import load_data
import dynamic_mode_decomposition as dmd
import network_construction as net
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


C:\Program Files\Anaconda\lib\site-packages\matplotlib\__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [2]:
loaded_data = data = load_data.load_data()

In [3]:
loaded_data.shape


Out[3]:
(64755, 11)

In [4]:
loaded_data.head()


Out[4]:
kingdom phylum class order family genus length oxygen replicate week abundance
sampleID
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae Methylobacter 9948861 Low 1 4 0.228531
1056013 Bacteria Proteobacteria Betaproteobacteria Methylophilales Methylophilaceae Methylotenera 5066955 Low 1 4 0.220860
1056013 Bacteria Bacteroidetes Flavobacteriia Flavobacteriales Flavobacteriaceae Flavobacterium 4654774 Low 1 4 0.054719
1056013 Bacteria Proteobacteria Gammaproteobacteria Methylococcales Methylococcaceae 3046340 Low 1 4 0.047956
1056013 Bacteria Proteobacteria Gammaproteobacteria 5620690 Low 1 4 0.040903

Demo extraction of categorical featues to binary columns:


In [5]:
extracted_features = bacteriopop_utils.extract_features(
    dataframe = loaded_data,
    column_list = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'oxygen', 'abundance']
    # default list was: ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'length', 'abundance', 'project']
    )

In [6]:
extracted_features.head()


Out[6]:
abundance class= class=Acidobacteriia class=Actinobacteria class=Alphaproteobacteria class=Anaerolineae class=Aquificae class=Archaeoglobi class=Bacilli class=Bacteroidia ... phylum=Planctomycetes phylum=Poribacteria phylum=Proteobacteria phylum=Spirochaetes phylum=Synergistetes phylum=Tenericutes phylum=Thaumarchaeota phylum=Thermodesulfobacteria phylum=Thermotogae phylum=Verrucomicrobia
0 0.228531 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
1 0.220860 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
2 0.054719 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0.047956 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
4 0.040903 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 1450 columns


In [7]:
extracted_features.shape


Out[7]:
(64755, 1450)

Just do PCA on a tiny bit of the data as a demo


In [8]:
pca_results = feature_selection_utils.pca_bacteria(
    data = extracted_features.head(100), n_components = 10)

In [9]:
pca_results.components_


Out[9]:
array([[-0.00272314, -0.03975207, -0.00449753, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00259397,  0.19156772,  0.01794534, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00975221, -0.07112615,  0.00182794, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.02491242, -0.00092397,  0.00278457, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00165731, -0.10002272,  0.03476809, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00781743,  0.30522996, -0.02791468, ...,  0.        ,
         0.        ,  0.        ]])

Do correlations for a tiny subset of the data.


In [10]:
feature_selection_utils.calculate_features_target_correlation(
    data = extracted_features.head(100),
    features = extracted_features.columns.tolist(),
    target='abundance',
    method="Pearson")


Out[10]:
[1.0,
 0.05712255379759805,
 -0.027142703312487231,
 -0.051814815473068342,
 -0.084902856481758493,
 nan,
 nan,
 nan,
 -0.027244216677764683,
 nan,
 0.026510345994575166,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.092416636286092216,
 nan,
 nan,
 nan,
 0.084502315899734545,
 nan,
 0.087133017969123955,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.03880740443904402,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.0024052351073851441,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027142703312487231,
 nan,
 nan,
 nan,
 nan,
 -0.027610089966615979,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026377266058169365,
 nan,
 nan,
 -0.05004869648853312,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.037698514516767999,
 nan,
 -0.027348151171924347,
 nan,
 nan,
 -0.055793184905792437,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.0044912362737061749,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027479852502327623,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026872686836197524,
 nan,
 nan,
 nan,
 nan,
 -0.027193012443702279,
 nan,
 nan,
 nan,
 nan,
 -0.024614743283835769,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.084502315899734545,
 nan,
 nan,
 nan,
 -0.03721545468813179,
 nan,
 -0.027499942588456866,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027567553933391213,
 nan,
 nan,
 nan,
 nan,
 -0.02451730139331533,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.025862731104755243,
 nan,
 nan,
 nan,
 -0.027635824173477076,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.27594589847917828,
 nan,
 0.26327793101199798,
 nan,
 nan,
 nan,
 -0.02697971067837893,
 nan,
 -0.026555279636934638,
 nan,
 -0.047619592608600714,
 nan,
 -0.0268083264550768,
 nan,
 nan,
 -0.02509693953889364,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.047397164823425215,
 -0.027244216677764683,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027824582094406459,
 nan,
 -0.027882247852148098,
 nan,
 nan,
 -0.019897681085334434,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.010070391165141194,
 nan,
 nan,
 nan,
 nan,
 -0.027691037722376532,
 nan,
 nan,
 -0.048327763593380008,
 -0.039559466993493365,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027981703724075067,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.024389351414081304,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.025765960541370314,
 nan,
 nan,
 nan,
 nan,
 -0.026054618777654328,
 nan,
 nan,
 -0.053850010263212955,
 -0.011784225376097848,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.033827469557944086,
 -0.02697971067837893,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027610089966615979,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027693269263502893,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.025529696901615832,
 nan,
 -0.028023947605676038,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026373489843032141,
 nan,
 nan,
 nan,
 nan,
 -0.027720545036378857,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026377266058169365,
 nan,
 nan,
 nan,
 -0.025019233422958715,
 nan,
 nan,
 nan,
 -0.017995031932285306,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.028024572312871577,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026514776233092383,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.028051176758612042,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026345315859312837,
 nan,
 -0.025778349013417214,
 nan,
 nan,
 -0.027479852502327623,
 nan,
 0.033756259153644386,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026872686836197524,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027193012443702279,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.14063309623234968,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026522484063166728,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027499942588456866,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.025862731104755243,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027567553933391213,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027570630849428956,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026707596304790776,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027176701680706256,
 nan,
 nan,
 nan,
 nan,
 -0.027635824173477076,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026437741444292957,
 -0.027941719355564429,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027824582094406459,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.023503768358479674,
 0.68084048197912372,
 nan,
 -0.020494768249281495,
 nan,
 -0.027390932736832924,
 nan,
 nan,
 -0.0098331547226394155,
 0.0044738583896854643,
 0.030453795846349718,
 -0.027882247852148098,
 -0.025135780164877112,
 nan,
 0.01532513737208825,
 nan,
 0.65699675408758262,
 -0.027324462026429745,
 -0.021598162772399166,
 0.00069701543726718908,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.026555279636934638,
 nan,
 nan,
 -0.025897730247685539,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027538792538428336,
 -0.02509693953889364,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.027244216677764683,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 ...]

Find the interaction network among bactria


In [11]:
mappings, nodes_list = dmd.find_fixed_adjacency_matrix(0,'order',False)


all groups of oxygen/week/replicate have abundances that sum to 1
columns after aggregating on phylo level: Index([u'oxygen', u'replicate', u'week', u'kingdom', u'phylum', u'class',
       u'order', u'abundance'],
      dtype='object')

Grpahs Visualization


In [12]:
for key in mappings.keys():
    Adj = mappings[key]
    nodes = nodes_list[key]
#     g = net.create_one_graph(Adj,nodes,edge_treshhold=1e-10)

Make, reduce, and Panda-ify the Adjacency matrix for each series:

Note that "mappings" is called "linear mappings" in the code.


In [13]:
#Calculation parameters:
p = 1  # percent abundance to consider significant. 
adjacency_magnitude = 1.5 # signal to trim by before plotting

In [14]:
# Only look for bacteria who are p% of the population in at least 1 sample. 
mappings, nodes = dmd.find_fixed_adjacency_matrix(p/100,'order',True)


all groups of oxygen/week/replicate have abundances that sum to 1
columns after aggregating on phylo level: Index([u'oxygen', u'replicate', u'week', u'kingdom', u'phylum', u'class',
       u'order', u'abundance'],
      dtype='object')

In [15]:
# Only pull out species that have interactions with another member with magnitude greater than 1.5

mappings, nodes = net.reduce_all_adjacency_matrixes_in_dict(mappings, nodes, 
                                                            adjacency_magnitude)

In [16]:
# Convert all of the dataframes to Pandas
mappings = dmd.DMD_results_dict_from_numpy_to_pandas(mappings,nodes)

# aggregate_adjacency_matrix_over_replicates is depreciated!
# std_mappings, avg_mappings, snr_mappings = dmd.aggregate_adjacency_matrix_over_replicates(mappings)

Loop over the individual series and plot.


In [17]:
mappings.keys()


Out[17]:
[('High', 4),
 ('Low', 1),
 ('High', 3),
 ('Low', 2),
 ('High', 2),
 ('Low', 3),
 ('High', 1),
 ('Low', 4)]

Low O2 replicates


In [18]:
k = 1
plt.figure(k)
for i in range(1,5):
    rep=('Low',i)
    net.plot_heatmap(mappings[rep], 
                         ('Low'+ ' Oxygen Replicate '+str(i)+': Most Significant Interactions'), 
                         './plots/'+'Low'+str(i), 
                         file_type='.png',
                         width=14, height=10) 

    k += 1   
    plt.figure(k)


<matplotlib.figure.Figure at 0x1b4af630>

High O2 replicates


In [19]:
k = 1
plt.figure(k)
for i in range(1,5):
    rep=('High',i)
    net.plot_heatmap(mappings[rep], 
                         ('High'+ ' Oxygen Replicate '+str(i)+': Most Significant Interactions'), 
                         './plots/'+'High'+str(i), 
                         file_type='.png',
                         width=14, height=10) 

    k += 1   
    plt.figure(k)


<matplotlib.figure.Figure at 0x1aca9400>

Aggregate the matrices for the 4 high-oxygen samples, then the 4 low-oxygen samples

Low O2 replicates

Find the standard deviation and mean of the replicates adjacency matrix:


In [20]:
# mappings, nodes
low_agg = net.aggregate_adjacency_matrices([
        mappings[('Low', 1)],
        mappings[('Low', 2)],
        mappings[('Low', 3)],
        mappings[('Low', 4)]
    ])

In [21]:
low_agg.keys()


Out[21]:
['standard deviation', 'signal to noise', 'mean']

In [22]:
p_low_mean = net.plot_heatmap(low_agg['mean'], 
                         'Low Oxygen: Most Significant Interactions', 
                         './plots/poster/low_avg--FROM_PANELS', 
                         file_type='.png',
                         width=14, height=10)



In [23]:
p_low_std = net.plot_heatmap(low_agg['standard deviation'], 
                         'Low Oxygen: Most Significant Interactions', 
                         './plots/poster/low_avg--FROM_PANELS', 
                         file_type='.png',
                         width=14, height=10)



In [28]:
p_low_snr = net.plot_heatmap(low_agg['signal to noise'], 
                         'Low Oxygen: Most Significant Interactions', 
                         './plots/poster/low_avg--FROM_PANELS', 
                         file_type='.png',
                         width=14, height=10)


High O2 replicates

Find the standard deviation and mean of the replicates adjacency matrix:


In [24]:
# mappings, nodes
high_agg = net.aggregate_adjacency_matrices([
        mappings[('High', 1)],
        mappings[('High', 2)],
        mappings[('High', 3)],
        mappings[('High', 4)]
    ])

In [25]:
high_agg.keys()


Out[25]:
['standard deviation', 'signal to noise', 'mean']

In [26]:
p_high_mean = net.plot_heatmap(high_agg['mean'], 
                          'High Oxygen: Most Significant Interactions', 
                          './plots/poster/high_avg--FROM_PANELS', 
                         file_type='.png',
                width=14, height=10)



In [27]:
p_high_std = net.plot_heatmap(high_agg['standard deviation'], 
                          'High Oxygen: Most Significant Interactions', 
                          './plots/poster/high_avg--FROM_PANELS', 
                         file_type='.png',
                width=14, height=10)



In [29]:
p_high_snr = net.plot_heatmap(high_agg['signal to noise'], 
                          'High Oxygen: Most Significant Interactions', 
                          './plots/poster/high_avg--FROM_PANELS', 
                         file_type='.png',
                width=14, height=10)



In [ ]: