In [558]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [559]:
import numpy as np
import random
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)

from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
# from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

Synthetic Dataset


In [560]:
%%latex
Generated by:
$$x_i = c + \mathcal{N}(0,1)$$
where $c$ is the center of the bicluster.


Generated by: $$x_i = c + \mathcal{N}(0,1)$$ where $c$ is the center of the bicluster.
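
As a minimal illustration of this generating model (the 5x4 shape and the center c = 60 below are arbitrary example values, not taken from the dataset):

import numpy as np

c = 60.0                            # assumed bicluster center
block = c + np.random.randn(5, 4)   # each entry is x_ij = c + N(0, 1)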

In [597]:
def generate_data(shape, n_clusters, noise, shuffle=False, random_state=0):
    # generate a block-structured bicluster dataset and shift it so all values are non-negative
    data, rows, columns = make_biclusters(
        shape=shape,
        n_clusters=n_clusters,
        noise=noise,
        shuffle=shuffle,
        random_state=random_state
    )
    data = data - np.min(data)
    return data

shape = (10,12)
n_clusters = 2
noise = 1

data = generate_data(shape, n_clusters, noise)
min_value = np.min(data)
max_value = np.max(data)
data


Out[597]:
array([[ 62.98705595,  60.14222008,  62.06958637,  60.96814075,
         61.0162791 ,  61.53009646,   1.87032617,   3.18055611,
          2.48732033,   1.84795762,   2.17014584,   2.05995693],
       [ 62.61357703,  60.91433969,  61.43256566,  60.26540222,
         58.56650814,  61.77311655,   2.5907188 ,   0.98411758,
          3.99603723,   0.27191693,   1.77204112,   1.53909875],
       [ 62.65227717,  62.58885673,  61.27444538,  61.49766048,
         60.23171221,  59.13870149,   1.37837045,   1.88263157,
          2.95657328,   2.92866245,   1.33895578,   1.42397985],
       [ 60.07094499,  59.69948002,  59.41322777,  63.07027335,
         60.60984577,  60.68142365,   0.47348724,   2.50377296,
          0.11238475,   1.51354232,   0.83081604,   2.1131851 ],
       [ 60.60869282,  59.93886577,  61.09131573,  61.54782983,
         61.18601518,  61.42196985,   1.09196051,   1.36354144,
          1.05382215,   1.36672944,   0.91313632,   0.        ],
       [  1.90370874,   1.32450167,   0.09608426,   2.18906486,
          0.81898424,   1.778228  ,  76.82241614,  76.22230849,
         77.23272626,  74.85849976,  76.49566722,  75.40851548],
       [  0.85548545,   1.14743294,   1.41473007,   1.78244794,
          0.56113276,   2.62710909,  76.55898802,  74.55708189,
         77.58157777,  77.98921475,  77.27210515,  75.91340074],
       [  0.65552998,   2.78073433,   1.32310566,   2.94872767,
          1.93455758,   2.70292164,  76.44969197,  76.79989874,
         76.1038256 ,  77.87919607,  76.22023767,  76.49531494],
       [  3.6094333 ,   0.37852354,   0.4557976 ,   2.69567931,
          0.5531592 ,   3.66990379,  75.6797066 ,  75.34587076,
         78.0162676 ,  77.57384037,  77.96088454,  76.99937023],
       [  0.86505692,   3.63634756,   1.45827923,   2.528739  ,
          2.67353457,   1.57127251,  76.70740495,  77.01553225,
         76.46975111,  74.99392479,  76.39156375,  77.41971147]])

In [598]:
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")


Out[598]:
<matplotlib.text.Text at 0x122f979d0>

In [599]:
data_shuffled, row_idx, col_idx = sg._shuffle(data, random_state=0)
data_shuffled


Out[599]:
array([[  2.92866245,   1.33895578,  61.49766048,  59.13870149,
         62.58885673,  61.27444538,   1.42397985,  62.65227717,
          2.95657328,   1.37837045,   1.88263157,  60.23171221],
       [ 77.57384037,  77.96088454,   2.69567931,   3.66990379,
          0.37852354,   0.4557976 ,  76.99937023,   3.6094333 ,
         78.0162676 ,  75.6797066 ,  75.34587076,   0.5531592 ],
       [  1.36672944,   0.91313632,  61.54782983,  61.42196985,
         59.93886577,  61.09131573,   0.        ,  60.60869282,
          1.05382215,   1.09196051,   1.36354144,  61.18601518],
       [ 74.99392479,  76.39156375,   2.528739  ,   1.57127251,
          3.63634756,   1.45827923,  77.41971147,   0.86505692,
         76.46975111,  76.70740495,  77.01553225,   2.67353457],
       [  0.27191693,   1.77204112,  60.26540222,  61.77311655,
         60.91433969,  61.43256566,   1.53909875,  62.61357703,
          3.99603723,   2.5907188 ,   0.98411758,  58.56650814],
       [ 77.98921475,  77.27210515,   1.78244794,   2.62710909,
          1.14743294,   1.41473007,  75.91340074,   0.85548545,
         77.58157777,  76.55898802,  74.55708189,   0.56113276],
       [ 77.87919607,  76.22023767,   2.94872767,   2.70292164,
          2.78073433,   1.32310566,  76.49531494,   0.65552998,
         76.1038256 ,  76.44969197,  76.79989874,   1.93455758],
       [  1.51354232,   0.83081604,  63.07027335,  60.68142365,
         59.69948002,  59.41322777,   2.1131851 ,  60.07094499,
          0.11238475,   0.47348724,   2.50377296,  60.60984577],
       [  1.84795762,   2.17014584,  60.96814075,  61.53009646,
         60.14222008,  62.06958637,   2.05995693,  62.98705595,
          2.48732033,   1.87032617,   3.18055611,  61.0162791 ],
       [ 74.85849976,  76.49566722,   2.18906486,   1.778228  ,
          1.32450167,   0.09608426,  75.40851548,   1.90370874,
         77.23272626,  76.82241614,  76.22230849,   0.81898424]])

In [600]:
plt.matshow(data_shuffled, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")


Out[600]:
<matplotlib.text.Text at 0x11d9fe390>

In [601]:
plt.plot(data)
plt.title("parallel coordinates (rows)")
plt.show()

plt.plot(data.T)
plt.title("parallel coordinates (columns)")
plt.show()


Biclustering algorithm: δ-biclustering (Cheng & Church or CC)

ftp://ftp.sdsc.edu/pub/sdsc/biology/ISMB00/157.pdf
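
In outline, for each bicluster the algorithm (1) deletes rows and columns (single or multiple node deletion) until the mean squared residue H(I,J) drops below the threshold delta, (2) adds back rows and columns whose residue does not increase H, and (3) masks the found submatrix with random values so it is not rediscovered. A rough sketch of that loop, using the functions defined below (simplified; find_biclusters further down is the actual driver):

def cheng_church_sketch(data, delta, alpha, k):
    # simplified driver: always uses multiple node deletion, unlike find_biclusters,
    # which falls back to single node deletion for small matrices
    biclusters = []
    lo, hi = np.min(data), np.max(data)
    for _ in range(k):
        C, I, J = remove_multiple_nodes(data, delta, alpha)  # step 1: deletion
        D, I, J = add_nodes(C, data, alpha, I, J)            # step 2: addition
        biclusters.append(D.copy())
        data = mask(data, I, J, lo, hi)                      # step 3: mask and repeat
    return biclusters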


In [602]:
%%latex
Define Mean Square Residue (MSR) $H(I,J)$:
$$H(I,J) = \frac{1}{|I||J|} \sum_{i \in I} \sum_{j \in J} (a_{ij} - a_{Ij} - a_{iJ} + a_{IJ})^2$$
where:
$$a_{iJ} = \frac{1}{|J|} \sum_{j \in J} a_{ij}$$
$$a_{Ij} = \frac{1}{|I|} \sum_{i \in I} a_{ij}$$
$$a_{IJ} = \frac{1}{|I||J|} \sum_{i \in I, j \in J} a_{ij}$$


Define Mean Square Residue (MSR) $H(I,J)$: $$H(I,J) = \frac{1}{|I||J|} \sum_{i \in I} \sum_{j \in J} (a_{ij} - a_{Ij} - a_{iJ} + a_{IJ})^2$$ where: $$a_{iJ} = \frac{1}{|J|} \sum_{j \in J} a_{ij}$$ $$a_{Ij} = \frac{1}{|I|} \sum_{i \in I} a_{ij}$$ $$a_{IJ} = \frac{1}{|I||J|} \sum_{i \in I, j \in J} a_{ij}$$
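
The class below computes these quantities with explicit loops; as a sanity check, the same residue can be obtained with NumPy broadcasting (a sketch, independent of the MSR class defined next):

import numpy as np

def msr_vectorized(A):
    aiJ = A.mean(axis=1, keepdims=True)   # row means a_iJ, shape (n, 1)
    aIj = A.mean(axis=0, keepdims=True)   # column means a_Ij, shape (1, m)
    aIJ = A.mean()                        # overall mean a_IJ
    R = A - aIj - aiJ + aIJ               # residue matrix
    return (R ** 2).mean()                # H(I, J)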

In [603]:
class MSR(object):

    def __init__(self, data):
        self.data = data
        # print "calculating aiJ, aIj and aIJ..."
        self.n, self.m = data.shape
        self.aiJ = np.mean(data, axis=1)
        self.aIj = np.mean(data, axis=0)
        self.aIJ = np.mean(data)
        self._H = None
        self._HiJ = None
        self._HIj = None


    @property
    def H(self):
        if self._H is None:
            print "computing MSR..."
            self._H = self._compute_H()
            print "MSR value: %s" % self._H
        return self._H


    @property
    def HiJ(self):
        if self._HiJ is None:
            self._HiJ = self._compute_HiJ()
        return self._HiJ


    @property 
    def HIj(self):
        if self._HIj is None:
            self._HIj = self._compute_HIj()
        return self._HIj


    def _compute_H(self):
        # this could be vectorized by broadcasting aiJ and aIj over the full matrix,
        #   which would simplify the code at the cost of extra memory
        H = 0
        for i in xrange(self.n):
            for j in xrange(self.m):
                H += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ ) ** 2
        H *= 1.0/(self.n * self.m)
        return H


    def _compute_HiJ(self):
        HiJ = np.zeros(self.n)
        for i in xrange(self.n):
            for j in xrange(self.m):
                HiJ[i] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
        HiJ *= 1.0/self.m
        return HiJ


    def _compute_HIj(self):
        HIj = np.zeros(self.m)
        for j in xrange(self.m):
            for i in xrange(self.n):
                HIj[j] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
        HIj *= 1.0/self.n
        return HIj


msr = MSR(data)

print "aIJ value: %s" % msr.aIJ
print "H(I,J) value: %s" % msr.H

print "aiJ value: %s" % msr.aiJ
print "H(i,J) value: %s" % msr.HiJ

print "aIj value: %s" % msr.aIj
print "H(I,j) value: %s" % msr.HIj


aIJ value: 35.2616073308
computing MSR...
MSR value: 1126.10540977
H(I,J) value: 1126.10540977
aiJ value: [ 31.86080347  31.39328664  31.60773557  30.92436533  30.96532325
  38.76255876  39.02172555  39.35781182  39.4115364   39.31092651]
H(i,J) value: [ 1119.71152715  1116.54796945  1122.23994116  1125.21655639  1146.99020975
  1125.82683203  1140.50721106  1118.66021797  1133.75964174  1111.59399101]
aIj value: [ 31.68217623  31.25513023  31.00291377  31.94939654  30.81517288
  31.6894743   38.96230708  38.98553118  39.60102861  39.12234845
  39.13655534  38.93725335]
H(I,j) value: [ 1158.04216854  1110.8320204   1153.92622398  1118.41746003  1117.62719101
  1098.43791834  1127.28363089  1095.69933821  1127.82591989  1131.87368758
  1144.04991471  1129.24944368]

In [604]:
def remove_unique_nodes(data, delta, I=None, J=None):
    it = 1

    if I is None:
        I = np.arange(len(data))
    if J is None:
        J = np.arange(len(data[0]))

    while True:
        print "%s iteration of unique node deletion..." % it
        it += 1

        msr = MSR(data[I][:,J])

        if msr.H < delta:
            break

        if len(I) == 1:
            break

        if len(J) == 1:
            break

        row_idx_to_remove = np.argmax(msr.HiJ)
        col_idx_to_remove = np.argmax(msr.HIj)
        if msr.HiJ[row_idx_to_remove] > msr.HIj[col_idx_to_remove]:
            print "removing %s row." % row_idx_to_remove
            I = np.delete(I, row_idx_to_remove)
        else:
            print "removing %s column." % row_idx_to_remove
            J = np.delete(J, col_idx_to_remove)

    return (data[I][:,J],I,J)

In [605]:
def remove_multiple_nodes(data, delta, alpha, I=None, J=None):
    it = 1
    
    if I is None:
        I = np.arange(len(data))
    if J is None:
        J = np.arange(len(data[0]))

    removal_occurred = True

    while True:
        print "%s iteration of multiple node deletion..." % it
        it += 1

        msr = MSR(data[I][:,J])

        if msr.H < delta:
            break

        if len(I) == 1:
            break

        if len(J) == 1:
            break

        # remove rows first
        rows_idxs_to_remove = np.nonzero(msr.HiJ > alpha*msr.H)[0]
        if len(rows_idxs_to_remove) > 0:
            print "removing %s rows..." % len(rows_idxs_to_remove)
            I = np.delete(I, rows_idxs_to_remove)
        else:
            print "row removal did not occur..."
            removal_occurred = False

        msr = MSR(data[I][:,J])
        cols_idxs_to_remove = np.nonzero(msr.HIj > alpha*msr.H)[0]
        if len(cols_idxs_to_remove) > 0:
            print "removing %s columns..." % len(cols_idxs_to_remove)
            J = np.delete(J, cols_idxs_to_remove)
        else:
            print "column removal did not occur..."
            if not removal_occurred:
                return remove_unique_nodes(data[I][:,J], delta, I, J)

    return (data[I][:,J],I,J)

In [606]:
def add_nodes(data, orig_data, alpha, I, J):
    it = 1
    orig_n, orig_m = orig_data.shape
    orig_I = np.arange(orig_n)
    orig_J = np.arange(orig_m)
    I_not_in_bicluster = np.setdiff1d(orig_I, I)
    J_not_in_bicluster = np.setdiff1d(orig_J, J)
    addition_occurred = True
    while addition_occurred:
        print "%s iteration of node addition..." % it
        it += 1
        addition_occurred = False
        msr = MSR(orig_data[I][:,J])
        # column addition
        for j in J_not_in_bicluster:
            data_with_j = np.column_stack((data, orig_data[I][:,j]))
            msr_with_j = MSR(data_with_j)
            if msr_with_j.HIj[-1] <= msr.H:
                J = np.append(J, j)
                J_not_in_bicluster = np.delete(J_not_in_bicluster, np.where(j == J_not_in_bicluster)[0])
                data = data_with_j
                addition_occurred = True
        msr = MSR(orig_data[I][:,J])
        # row addition
        for i in I_not_in_bicluster:
            data_with_i = np.row_stack((data, orig_data[i,J]))
            msr_with_i = MSR(data_with_i)
            if msr_with_i.HiJ[-1] <= msr.H:
                I = np.append(I, i)
                I_not_in_bicluster = np.delete(I_not_in_bicluster, np.where(i == I_not_in_bicluster)[0])
                data = data_with_i
                addition_occurred = True
# My guess is that we don't need inversions, because we are not in the gene expression domain
        # inverted row addition
#         msr = MSR(orig_data)
#         rows_idxs_to_add_overlap = np.nonzero(msr.HIj <= alpha*msr.H)[0]
#         rows_idxs_to_add = np.setdiff1d(rows_idxs_to_add_overlap, new_I)
#         if len(rows_idxs_to_add) > 0:
#             new_I = np.append(new_I, rows_idxs_to_add)
#             inverted_rows = np.fliplr(orig_data[rows_idxs_to_add][:,J])
#             data = np.row_stack((data, inverted_rows))
#         else:
#             addition_ocurred = False
    return (orig_data[I][:,J],I,J)

In [607]:
def mask(data, I, J, min_v, max_v):
    for i in I:
        for j in J:
            data[i,j] = random.uniform(min_v, max_v)
    return data

In [608]:
def find_biclusters(data, delta, alpha, num_biclusters, num_iterations=1):
    biclusters = []

    min_v = np.min(data)
    max_v = np.max(data)

    for i in range(num_biclusters):
        n, m = data.shape
        print data.shape

        plt.matshow(data, cmap=plt.cm.Blues, vmin=min_value, vmax=max_value)
        plt.show()

        print "--- Looking for bicluster %s:" % (i+1)

        # for small matrices, skip multiple node deletion and go straight to single node deletion
        if len(data) < 100 or len(data[0]) < 100:
            (C,I,J) = remove_unique_nodes(data, delta)
        else:
            (C,I,J) = remove_multiple_nodes(data, delta, alpha)

        (D,I,J) = add_nodes(C, data, alpha, I, J)

        biclusters.append(D.copy())

        plt.matshow(D, cmap=plt.cm.Blues, vmin=min_v, vmax=max_v)
        plt.show()

        data = mask(data, I, J, min_v, max_v)  # mask the found bicluster so it is not rediscovered

    return biclusters

num_biclusters = 2
biclusters = find_biclusters(data, 1.0, 1.0, num_biclusters)


(10, 12)
--- Looking for bicluster 1:
1 iteration of unique node deletion...
computing MSR...
MSR value: 1126.10540977
removing 4 column.
2 iteration of unique node deletion...
computing MSR...
MSR value: 1113.63147163
removing 4 column.
3 iteration of unique node deletion...
computing MSR...
MSR value: 1073.90479437
removing 4 column.
4 iteration of unique node deletion...
computing MSR...
MSR value: 993.348777709
removing 4 column.
5 iteration of unique node deletion...
computing MSR...
MSR value: 836.594949534
removing 4 column.
6 iteration of unique node deletion...
computing MSR...
MSR value: 545.0403536
removing 4 column.
7 iteration of unique node deletion...
computing MSR...
MSR value: 0.659354830469
1 iteration of node addition...
computing MSR...
MSR value: 0.659354830469
(10, 12)
--- Looking for bicluster 2:
1 iteration of unique node deletion...
computing MSR...
MSR value: 406.194015495
removing 0 column.
2 iteration of unique node deletion...
computing MSR...
MSR value: 340.348421543
removing 7 column.
3 iteration of unique node deletion...
computing MSR...
MSR value: 256.9891031
removing 7 column.
4 iteration of unique node deletion...
computing MSR...
MSR value: 187.023377373
removing 7 column.
5 iteration of unique node deletion...
computing MSR...
MSR value: 138.608709062
removing 7 column.
6 iteration of unique node deletion...
computing MSR...
MSR value: 69.391819931
removing 0 column.
7 iteration of unique node deletion...
computing MSR...
MSR value: 0.876196051522
1 iteration of node addition...
computing MSR...
MSR value: 0.876196051522

In [609]:
for i in range(num_biclusters):
    print "biclustering %s: %s" % (i+1, biclusters[i].shape)
    print "MSR Value of bicluster: %s" % MSR(biclusters[i]).H
    plt.matshow(biclusters[i], cmap=plt.cm.Blues, vmin=min_value, vmax=max_value)
    plt.show()


biclustering 1: (10, 6)
computing MSR...
MSR value: 0.659354830469
MSR Value of bicluster: 0.659354830469
biclustering 2: (10, 6)
computing MSR...
MSR value: 0.876196051522
MSR Value of bicluster: 0.876196051522

In [615]:
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Resulting original dataset")


Out[615]:
<matplotlib.text.Text at 0x1261788d0>

In [631]:
from sklearn import cluster, datasets 
noisy_circles = datasets.make_circles(n_samples=30, factor=.5, noise=0.0001)
X, y = noisy_circles

plt.scatter(X[:, 0], X[:, 1])
plt.show()

plt.plot(X)
plt.show()

plt.plot(X.T)
plt.show()


