Imports


In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import scipy
import pylab
import scipy.cluster.hierarchy as sch
import pickle 

sys.path.append("../src/")
import utils 
import bmu
import mi


/usr/local/lib/python2.7/site-packages/pandas-0.14.0-py2.7-linux-x86_64.egg/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

American Gut Project


In [2]:
data, samples_sex, features_sex = bmu.load_biom("/home/ditzler/Git/DataCollections/AmericanGut/AmericanGut-Gut-Sex.biom")
map_sex = bmu.load_map("/home/ditzler/Git/DataCollections/AmericanGut/AmericanGut-Gut-Sex.txt")
labels_sex_numeric, label_sex_map = utils.label_formatting(map_sex, samples_sex, "SEX")
data_sex = utils.normalize(1.+data)
data_sex_log = utils.normalize(1.+data, scale="log")

data, samples_diet, features_diet = bmu.load_biom("/home/ditzler/Git/DataCollections/AmericanGut/AmericanGut-Gut-Diet.biom")
map_diet = bmu.load_map("/home/ditzler/Git/DataCollections/AmericanGut/AmericanGut-Gut-Diet-OV.txt")
labels_diet_numeric, label_diet_map = utils.label_formatting(map_diet, samples_diet, "DIET_TYPE")
data_diet = utils.normalize(1.+data)
data_diet_log = utils.normalize(1.+data, scale="log")

Average Mutual Information


In [3]:
n_averages = 25
n_show = 2000
mutual_infos = np.zeros((len(features_sex),))
mutual_infos_log = np.zeros((len(features_sex),))
plt.figure()

for n in range(n_averages):
    # draw a bootstrap sample from the normalized data and compute the mutual 
    # information. 
    idx = np.random.randint(0, len(samples_sex), len(samples_sex))
    data_n = data_sex[idx]
    labels_n = labels_sex_numeric[idx]
    mutual_info = mi.calc_mi(data=data_n, labels=labels_n)
    mutual_infos += mutual_info
    
    # if this is the first time through, determine and fixed ordering based on 
    # the mutual information 
    if n == 0:
        idx_sorted = np.argsort(mutual_info)
    
    # draw a bootstrap sample from the normalized data that has a log transform
    # applied then compute the mutual information. 
    idx = np.random.randint(0, len(samples_sex), len(samples_sex))
    data_n_log = data_sex_log[idx]
    labels_n_log = labels_sex_numeric[idx]
    mutual_info_log = mi.calc_mi(data=data_n_log, labels=labels_n_log)
    mutual_infos_log += mutual_info_log
    
    plt.subplot(2,1,1)
    plt.plot(range(n_show), mutual_info[idx_sorted][::-1][:n_show], color='.7')
    
    plt.subplot(2,1,2)
    plt.plot(range(n_show), mutual_info_log[idx_sorted][::-1][:n_show], color='.7')
    

plt.subplot(2,1,1)
plt.plot(range(n_show), mutual_infos[idx_sorted][::-1][:n_show]/n_averages, color='0.')
plt.ylabel("I(X;Y)")

plt.subplot(2,1,2)
plt.plot(range(n_show), mutual_infos_log[idx_sorted][::-1][:n_show]/n_averages, color='0.')
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
plt.savefig("../files/americangut-sex-mi-transforms.pdf",format="pdf")

features_sex_sorted = np.array(features_sex)[idx_sorted][::-1]
for feature in features_sex_sorted[:10]:
    print feature.replace("[","").replace("]","").replace("\"","")


k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Bacteroidaceae, g__Bacteroides, s__
k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Bacteroidaceae, g__Bacteroides, s__uniformis
k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Bacteroidaceae, g__Bacteroides, s__ovatus
k__Bacteria, p__Firmicutes, c__Clostridia, o__Clostridiales, f__Ruminococcaceae, g__, s__
k__Bacteria, p__Firmicutes, c__Clostridia, o__Clostridiales, f__Lachnospiraceae, g__, s__
k__Bacteria, p__Firmicutes, c__Clostridia, o__Clostridiales, f__Ruminococcaceae, g__Faecalibacterium, s__prausnitzii
k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Porphyromonadaceae, g__Parabacteroides, s__distasonis
k__Bacteria, p__Firmicutes, c__Clostridia, o__Clostridiales, f__Ruminococcaceae, g__, s__
k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Bacteroidaceae, g__Bacteroides, s__
k__Bacteria, p__Bacteroidetes, c__Bacteroidia, o__Bacteroidales, f__Rikenellaceae, g__, s__
Computing MI Matrix

In [ ]:
# only use the features with the highest I(X;Y) even though we are computing MI unconditional in Y
si = np.array(sorted(range(len(mutual_info)), key=lambda k: mutual_infos[k]/n_averages)[::-1][:1000])
mi_mat = mi.mi_matrix(data_sex[:,si], par=True, cpus=10)
output_sex = {"mi_mat":mi_mat, "si":si, "mutual_info":mutual_infos/n_averages}
pickle.dump(output_sex, open( "../files/mi-mat-sex.pkl", "wb" ) )

In [ ]:
n_averages = 25
n_show = 2000
mutual_infos = np.zeros((len(features_diet),))
mutual_infos_log = np.zeros((len(features_diet),))
plt.figure()

for n in range(n_averages):
    # draw a bootstrap sample from the normalized data and compute the mutual 
    # information. 
    idx = np.random.randint(0, len(samples_diet), len(samples_diet))
    data_n = data_diet[idx]
    labels_n = labels_diet_numeric[idx]
    mutual_info = mi.calc_mi(data=data_n, labels=labels_n)
    mutual_infos += mutual_info
    
    # if this is the first time through, determine and fixed ordering based on 
    # the mutual information 
    if n == 0:
        idx_sorted = np.argsort(mutual_info)
    
    # draw a bootstrap sample from the normalized data that has a log transform
    # applied then compute the mutual information. 
    idx = np.random.randint(0, len(samples_diet), len(samples_diet))
    data_n_log = data_diet_log[idx]
    labels_n_log = labels_diet_numeric[idx]
    mutual_info_log = mi.calc_mi(data=data_n_log, labels=labels_n_log)
    mutual_infos_log += mutual_info_log
    
    plt.subplot(2,1,1)
    plt.plot(range(n_show), mutual_info[idx_sorted][::-1][:n_show], color='.7')
    
    plt.subplot(2,1,2)
    plt.plot(range(n_show), mutual_info_log[idx_sorted][::-1][:n_show], color='.7')
    

plt.subplot(2,1,1)
plt.plot(range(n_show), mutual_infos[idx_sorted][::-1][:n_show]/n_averages, color='0.')
plt.ylabel("I(X;Y)")

plt.subplot(2,1,2)
plt.plot(range(n_show), mutual_infos_log[idx_sorted][::-1][:n_show]/n_averages, color='0.')
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
plt.savefig("../files/americangut-diet-mi-transforms.pdf",format="pdf")

features_sex_sorted = np.array(features_sex)[idx_sorted][::-1]
for feature in features_sex_sorted[:10]:
    print feature.replace("[","").replace("]","").replace("\"","")

In [ ]:


In [ ]:
# only use the features with the highest I(X;Y) even though we are computing MI unconditional in Y
si = np.array(sorted(range(len(mutual_info)), key=lambda k: mutual_infos[k]/n_averages)[::-1][:1000])
mi_mat = mi.mi_matrix(data_diet[:,si], par=True, cpus=10)
output_diet = {"mi_mat":mi_mat, "si":si, "mutual_info":mutual_infos/n_averages}
pickle.dump(output_diet, open( "../files/mi-mat-diet.pkl", "wb" ))