In [1]:
import sys
sys.path.append("../src/")
import numpy as np
import bmu
import utils
import matplotlib.pylab as plt
import mi
import feast
import time
import scipy.cluster.hierarchy as sch
import scipy as sp
import pickle
In [2]:
tag = "ag-gut"
biom_fp = "../data/american-gut-mf.biom"
map_fp = "../data/american-gut-mf.txt"
n_select = 25
data, samples, features = bmu.load_biom(biom_fp)
map_data = bmu.load_map(map_fp)
labels, label_map = utils.label_formatting(map_data, samples, "SEX", signed=False)
samples = np.array(samples)
features = np.array(features)
print "# of Samples: "+str(len(data))
print "# of Features: "+str(len(features))
print "# of Male/Females: "+str(len(np.where(labels==1)[0]))+"/"+str(len(np.where(labels==0)[0]))
plt.figure()
plt.subplot(1, 2, 1)
plt.plot(range(len(features)),np.sort(np.sum(utils.normalize(data+1.,scale=None),axis=0))[::-1])
plt.xlabel("arbitrary index")
plt.ylabel("scaled total abundance")
plt.title("Scaled Normalization")
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.subplot(1, 2, 2)
plt.plot(range(len(features)),np.sort(np.sum(utils.normalize(data+1.,scale="log"),axis=0))[::-1])
plt.xlabel("arbitrary index")
#plt.ylabel("scaled abundance")
plt.title("Log-Scaled Normalization")
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.savefig("../files/ag-mf-log-scaling.pdf",format="pdf")
data = utils.normalize(data+1.)
In [3]:
mutual_info = mi.calc_mi(data=data, labels=labels)
plt.figure()
plt.plot(range(len(features)), np.sort(mutual_info)[::-1], lw=2)
plt.title("Mutual Information")
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
a = axes([0.5, 0.5, .35, .35])
plt.plot(range(2000), np.sort(mutual_info)[::-1][:2000], lw=2)
plt.ylabel("I(X;Y)")
plt.savefig("../files/ag-mf-mi.pdf",format="pdf")
plt.figure()
plt.plot(range(len(features)), np.cumsum(np.sort(mutual_info)[::-1]), lw=2)
plt.title("CDF Mutual Information")
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
a = axes([0.5, 0.4, .35, .35])
plt.plot(range(2000), np.cumsum(np.sort(mutual_info)[::-1][:2000]), lw=2)
plt.savefig("../files/ag-mf-cdf-mi.pdf",format="pdf")
In [4]:
n_averages = 50
mutual_infos = np.zeros((len(features),))
plt.figure()
for n in range(n_averages):
idx = np.random.randint(0, len(samples), len(samples))
data_n = data[idx]
labels_n = labels[idx]
mutual_info = mi.calc_mi(data=data_n, labels=labels_n)
if n == 0:
idx_sorted = np.argsort(mutual_info)
plt.plot(range(2000), mutual_info[idx_sorted][::-1][:2000], color='.7')
mutual_infos += mutual_info
plt.plot(range(2000), mutual_infos[idx_sorted][::-1][:2000]/n_averages, color='0.')
plt.xlabel("arbitrary index")
plt.ylabel("I(X;Y)")
plt.savefig("../files/ag-mf-mi-bootstraps.pdf",format="pdf")
In [ ]:
mutual_info = mi.calc_mi(data=data, labels=labels)
si = np.array(sorted(range(len(mutual_info)), key=lambda k: mutual_info[k])[::-1][:1000])
cmi_mat = mi.cmi_matrix(data[:,si], labels, par=True, cpus=10)
In [4]:
output = pickle.load( open( "../files/cmi-mat.pkl", "rb" ) )
cmi_mat = output["cmi_mat"]
D = sp.zeros(cmi_mat.shape)
D = 1. - cmi_mat
print "Creating Clusters + Generating the Plot"
fig = plt.figure()
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
Y = sch.linkage(D, method='centroid')
Z = sch.dendrogram(Y, orientation='right')
axdendro.set_xticks([])
axdendro.set_yticks([])
# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = Z['leaves']
D = D[index,:]
D = D[:,index]
im = axmatrix.matshow(D, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])
# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)
In [5]:
import scipy
import pylab
import scipy.cluster.hierarchy as sch
import pickle
# Generate random features and distance matrix.
"""
x = scipy.rand(40)
D = scipy.zeros([40,40])
for i in range(40):
for j in range(40):
D[i,j] = abs(x[i] - x[j])
"""
output = pickle.load( open( "../files/mi-mat.pkl", "rb" ) )
D = 1/output["mi_mat"]
# Compute and plot first dendrogram.
fig = figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
Y = sch.linkage(D, method='centroid')
Z2 = sch.dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])
# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
Q = output["mi_mat"]
Q = Q[idx1,:]
Q = Q[:,idx2]
im = axmatrix.matshow(Q, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)
plt.savefig("../files/ag-t1000-mutual-information.pdf", bbox_inches="tight")