In [558]:
%pylab inline
In [559]:
import numpy as np
import random
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
# from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score
In [560]:
%%latex
Generated by:
$$x_i = c + \mathcal{N}(0,1)$$
where $c$ is the center of the bicluster.
In [597]:
def generate_data(shape, n_clusters, noise, shuffle=0, random_state=0):
n, m = shape
data, rows, columns = make_biclusters(
shape=shape,
n_clusters=2,
noise=1,
shuffle=False,
random_state=0
)
data = data + (-np.min(data))
return data
shape = (10,12)
n_clusters = 2
noise = 1
data = generate_data(shape, n_clusters, noise)
min_value = np.min(data)
max_value = np.max(data)
data
Out[597]:
In [598]:
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")
Out[598]:
In [599]:
data_shuffled, row_idx, col_idx = sg._shuffle(data, random_state=0)
data_shuffled
Out[599]:
In [600]:
plt.matshow(data_shuffled, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")
Out[600]:
In [601]:
plt.plot(data)
plt.title("parallel coordinates (rows)")
plt.show()
plt.plot(data.T)
plt.title("parallel coordinates (columns)")
plt.show()
In [602]:
%%latex
Define Mean Square Residue (MSR) $H(I,J)$:
$$H(I,J) = \frac{1}{|I||J|} \sum_{i \in I} \sum_{j \in J} (a_{ij} - a_{Ij} - a_{iJ} + a_{IJ})^2$$
where:
$$a_{iJ} = \frac{1}{|J|} \sum_{j \in J} a_{ij}$$
$$a_{Ij} = \frac{1}{|I|} \sum_{i \in I} a_{ij}$$
$$a_{IJ} = \frac{1}{|I||J|} \sum_{i \in I, j \in J} a_{ij}$$
In [603]:
class MSR(object):
def __init__(self, data):
self.data = data
# print "calculating aiJ, aIj and aIJ..."
self.n, self.m = data.shape
self.aiJ = np.mean(data, axis=1)
self.aIj = np.mean(data, axis=0)
self.aIJ = np.mean(data)
self._H = None
self._HiJ = None
self._HIj = None
@property
def H(self):
if self._H is None:
print "computing MSR..."
self._H = self._compute_H()
print "MSR value: %s" % self._H
return self._H
@property
def HiJ(self):
if self._HiJ is None:
self._HiJ = self._compute_HiJ()
return self._HiJ
@property
def HIj(self):
if self._HIj is None:
self._HIj = self._compute_HIj()
return self._HIj
def _compute_H(self):
# it could be changed to change the shape of aiJ and aIj to simplify the code
# by calculating H in a vectorized way, but it'd spend more memory
H = 0
for i in xrange(self.n):
for j in xrange(self.m):
H += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ ) ** 2
H *= 1.0/(self.n * self.m)
return H
def _compute_HiJ(self):
HiJ = np.zeros(self.n)
for i in xrange(self.n):
for j in xrange(self.m):
HiJ[i] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
HiJ *= 1.0/self.m
return HiJ
def _compute_HIj(self):
HIj = np.zeros(self.m)
for j in xrange(self.m):
for i in xrange(self.n):
HIj[j] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
HIj *= 1.0/self.n
return HIj
msr = MSR(data)
print "aIJ value: %s" % msr.aIJ
print "H(I,J) value: %s" % msr.H
print "aiJ value: %s" % msr.aiJ
print "H(i,J) value: %s" % msr.HiJ
print "aIj value: %s" % msr.aIj
print "H(I,j) value: %s" % msr.HIj
In [604]:
def remove_unique_nodes(data, delta, I=None, J=None):
it = 1
if I is None:
I = np.arange(len(data))
if J is None:
J = np.arange(len(data[0]))
while True:
print "%s iteration of unique node deletion..." % it
it += 1
msr = MSR(data[I][:,J])
if msr.H < delta:
break
if len(I) == 1:
break
if len(J) == 1:
break
row_idx_to_remove = np.argmax(msr.HiJ)
col_idx_to_remove = np.argmax(msr.HIj)
if msr.HiJ[row_idx_to_remove] > msr.HIj[col_idx_to_remove]:
print "removing %s row." % row_idx_to_remove
I = np.delete(I, row_idx_to_remove)
else:
print "removing %s column." % row_idx_to_remove
J = np.delete(J, col_idx_to_remove)
return (data[I][:,J],I,J)
In [605]:
def remove_multiple_nodes(data, delta, alpha, I=None, J=None):
it = 1
if I is None:
I = np.arange(len(data))
if J is None:
J = np.arange(len(data[0]))
removal_ocurred = True
while True:
print "%s iteration of multiple node deletion..." % it
it += 1
msr = MSR(data[I][:,J])
if msr.H < delta:
break
if len(I) == 1:
break
if len(J) == 1:
break
# remove lines first
rows_idxs_to_remove = np.nonzero(msr.HiJ > alpha*msr.H)[0]
if len(rows_idxs_to_remove) > 0:
print "removing %s rows..." % len(rows_idxs_to_remove)
I = np.delete(I, rows_idxs_to_remove)
else:
print "rows removal haven't ocurred..."
removal_ocurred = False
msr = MSR(data[I][:,J])
cols_idxs_to_remove = np.nonzero(msr.HIj > alpha*msr.H)[0]
if len(cols_idxs_to_remove) > 0:
print "removing %s columns..." % len(cols_idxs_to_remove)
J = np.delete(J, cols_idxs_to_remove)
else:
print "columns removal haven't ocurred..."
if not removal_ocurred:
return remove_unique_nodes(data[I][:,J], delta, I, J)
return (data[I][:,J],I,J)
In [606]:
def add_nodes(data, orig_data, alpha, I, J):
it = 1
orig_n, orig_m = orig_data.shape
orig_I = np.arange(orig_n)
orig_J = np.arange(orig_m)
I_not_in_bicluster = np.setdiff1d(orig_I, I)
J_not_in_bicluster = np.setdiff1d(orig_J, J)
addition_ocurred = True
while addition_ocurred:
print "%s iteration of node addition..." % it
it += 1
addition_ocurred = False
msr = MSR(orig_data[I][:,J])
# column addition
for j in J_not_in_bicluster:
data_with_j = np.column_stack((data, orig_data[I][:,j]))
msr_with_j = MSR(data_with_j)
if msr_with_j.HIj[-1] <= msr.H:
J = np.append(J, j)
J_not_in_bicluster = np.delete(J_not_in_bicluster, np.where(j == J_not_in_bicluster)[0])
data = data_with_j
addition_ocurred = True
msr = MSR(orig_data[I][:,J])
# row addition
for i in I_not_in_bicluster:
data_with_i = np.row_stack((data, orig_data[i,J]))
msr_with_i = MSR(data_with_i)
if msr_with_i.HIj[-1] <= msr.H:
I = np.append(I, i)
I_not_in_bicluster = np.delete(I_not_in_bicluster, np.where(i == I_not_in_bicluster)[0])
data = data_with_i
adition_ocurred = True
# My guess is that we don't need invertions, because we are not in genes expression domain
# inverted row addition
# msr = MSR(orig_data)
# rows_idxs_to_add_overlap = np.nonzero(msr.HIj <= alpha*msr.H)[0]
# rows_idxs_to_add = np.setdiff1d(rows_idxs_to_add_overlap, new_I)
# if len(rows_idxs_to_add) > 0:
# new_I = np.append(new_I, rows_idxs_to_add)
# inverted_rows = np.fliplr(orig_data[rows_idxs_to_add][:,J])
# data = np.row_stack((data, inverted_rows))
# else:
# addition_ocurred = False
return (orig_data[I][:,J],I,J)
In [607]:
def mask(data, I, J, min_v, max_v):
for i in I:
for j in J:
data[i,j] = random.uniform(min_v, max_v)
return data
In [608]:
# %debug
def find_biclusters(data, delta, alpha, num_biclusters, num_iterations=1):
# import pdb; pdb.set_trace()
biclusters = []
min_v = np.min(data)
max_v = np.max(data)
for i in range(num_biclusters):
n, m = data.shape
print data.shape
plt.matshow(data, cmap=plt.cm.Blues, vmin=min_value, vmax=max_value)
plt.show()
print "--- Looking for bicluster %s:" % (i+1)
# do not perform multiple node deletion
if len(data) < 100 or len(data[0]) < 100:
(C,I,J) = remove_unique_nodes(data, delta)
else:
(C,I,J) = remove_multiple_nodes(data, delta, alpha)
(D,I,J) = add_nodes(C, data, alpha, I, J)
biclusters.append(D.copy())
plt.matshow(D, cmap=plt.cm.Blues, vmin=min_v, vmax=max_v)
plt.show()
A_line = mask(data, I, J, min_v, max_v)
return biclusters
num_biclusters = 2
biclusters = find_biclusters(data, 1.0, 1.0, num_biclusters)
In [609]:
for i in range(num_biclusters):
print "biclustering %s: %s" % (i+1, biclusters[i].shape)
print "MSR Value of bicluster: %s" % MSR(biclusters[i]).H
plt.matshow(biclusters[i], cmap=plt.cm.Blues, vmin=min_value, vmax=max_value)
plt.show()
In [615]:
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Resulting original dataset")
Out[615]:
In [631]:
from sklearn import cluster, datasets
noisy_circles = datasets.make_circles(n_samples=30, factor=.5, noise=0.0001)
X, y = noisy_circles
plt.scatter(X[:, 0], X[:, 1])
plt.show()
plt.plot(X)
plt.show()
plt.plot(X.T)
plt.show()
In [630]:
In [ ]: