About


In [1]:
import sys
sys.path.append("../src/")
import numpy as np 
import bmu
import utils
import matplotlib.pylab as plt
import mi
import feast
import time 
import scipy.cluster.hierarchy as sch
import scipy as sp 
import pickle

Loading the Data


In [2]:
tag = "ag-gut"
biom_fp = "../data/american-gut-mf.biom"
map_fp = "../data/american-gut-mf.txt"
n_select = 25

data, samples, features = bmu.load_biom(biom_fp)
map_data = bmu.load_map(map_fp)
labels, label_map = utils.label_formatting(map_data, samples, "SEX", signed=False)
samples = np.array(samples)
features = np.array(features)

print "# of Samples: "+str(len(data))
print "# of Features: "+str(len(features))
print "# of Male/Females: "+str(len(np.where(labels==1)[0]))+"/"+str(len(np.where(labels==0)[0]))

plt.figure()

plt.subplot(1, 2, 1)
plt.plot(range(len(features)),np.sort(np.sum(utils.normalize(data+1.,scale=None),axis=0))[::-1])
plt.xlabel("arbitrary index")
plt.ylabel("scaled total abundance")
plt.title("Scaled Normalization")
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

plt.subplot(1, 2, 2)
plt.plot(range(len(features)),np.sort(np.sum(utils.normalize(data+1.,scale="log"),axis=0))[::-1])
plt.xlabel("arbitrary index")
#plt.ylabel("scaled abundance")
plt.title("Log-Scaled Normalization")
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.savefig("../files/ag-mf-log-scaling.pdf",format="pdf")

data = utils.normalize(data+1.)


# of Samples: 469
# of Features: 25703
# of Male/Females: 231/238

Evaluating the Mutual Information


In [3]:
mutual_info = mi.calc_mi(data=data, labels=labels)

plt.figure()
plt.plot(range(len(features)), np.sort(mutual_info)[::-1], lw=2)
plt.title("Mutual Information")
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
a = axes([0.5, 0.5, .35, .35])
plt.plot(range(2000), np.sort(mutual_info)[::-1][:2000], lw=2)
plt.ylabel("I(X;Y)")
plt.savefig("../files/ag-mf-mi.pdf",format="pdf")

plt.figure()
plt.plot(range(len(features)), np.cumsum(np.sort(mutual_info)[::-1]), lw=2)
plt.title("CDF Mutual Information")
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
a = axes([0.5, 0.4, .35, .35])
plt.plot(range(2000), np.cumsum(np.sort(mutual_info)[::-1][:2000]), lw=2)
plt.savefig("../files/ag-mf-cdf-mi.pdf",format="pdf")



In [4]:
n_averages = 50
mutual_infos = np.zeros((len(features),))
plt.figure()
for n in range(n_averages):
    idx = np.random.randint(0, len(samples), len(samples))
    data_n = data[idx]
    labels_n = labels[idx]
    mutual_info = mi.calc_mi(data=data_n, labels=labels_n)
    if n == 0:
        idx_sorted = np.argsort(mutual_info)
    plt.plot(range(2000), mutual_info[idx_sorted][::-1][:2000], color='.7')
    mutual_infos += mutual_info

plt.plot(range(2000), mutual_infos[idx_sorted][::-1][:2000]/n_averages, color='0.')
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
plt.savefig("../files/ag-mf-mi-bootstraps.pdf",format="pdf")



In [ ]:
mutual_info = mi.calc_mi(data=data, labels=labels)
si = np.array(sorted(range(len(mutual_info)), key=lambda k: mutual_info[k])[::-1][:1000])
cmi_mat = mi.cmi_matrix(data[:,si], labels, par=True, cpus=10)

In [4]:
output = pickle.load( open( "../files/cmi-mat.pkl", "rb" ) ) 
cmi_mat = output["cmi_mat"]

D = sp.zeros(cmi_mat.shape)
D = 1. - cmi_mat

print "Creating Clusters + Generating the Plot"
fig = plt.figure()
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
Y = sch.linkage(D, method='centroid')
Z = sch.dendrogram(Y, orientation='right')
axdendro.set_xticks([])
axdendro.set_yticks([])

# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = Z['leaves']
D = D[index,:]
D = D[:,index]
im = axmatrix.matshow(D, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)

In [5]:
import scipy
import pylab
import scipy.cluster.hierarchy as sch
import pickle

# Generate random features and distance matrix.
"""
x = scipy.rand(40)
D = scipy.zeros([40,40])
for i in range(40):
    for j in range(40):
        D[i,j] = abs(x[i] - x[j])
"""
output = pickle.load( open( "../files/mi-mat.pkl", "rb" ) ) 
D = 1/output["mi_mat"]


# Compute and plot first dendrogram.
fig = figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Y = sch.linkage(D, method='centroid')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
Q = output["mi_mat"]
Q = Q[idx1,:]
Q = Q[:,idx2]

im = axmatrix.matshow(Q, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)
plt.savefig("../files/ag-t1000-mutual-information.pdf", bbox_inches="tight")