In [2]:
import numpy as np
import random
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)

from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
# from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score
from biclustering import DeltaBiclustering, MSR

In [26]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy

In [5]:
from IPython.core.display import Image 
Image(filename='synteticBiclusters.png')


Out[5]:
[image: synteticBiclusters.png]
In [71]:
def generate_dataset(option, noise=1, noise_background=True, shuffle=False):
    shape = (150, 150)
    n, m = shape

    # cluster mean values; they should not be too far apart
    centers = [20, 40, 60, 80, 100]

    y_row = np.zeros(n)
    y_col = np.zeros(m)

    if noise_background:
        data = np.random.rand(n, m) * 100
    else:
        data = np.zeros(shape)

    if option == 'a':
        data[60:110, 70:140] = np.random.rand(50, 70) * noise + centers[0]
        y_row[60:110] += 1
        y_col[70:140] += 1
    elif option == 'd':
        data[0:50, 0:70] = np.random.rand(50, 70) * noise + centers[0]
        y_row[0:50] += 1
        y_col[0:70] += 1

        data[50:100, 50:100] = np.random.rand(50, 50) * noise + centers[2]
        y_row[50:100] += 2
        y_col[50:100] += 2

        data[100:150, 80:150] = np.random.rand(50, 70) * noise + centers[1]
        y_row[100:150] += 3
        y_col[80:150] += 3
    elif option == 'e':
        data[0:70, 0:50] = np.random.rand(70, 50) * noise + centers[3]
        y_row[0:70] += 1
        y_col[0:50] += 1

        data[50:100, 50:100] = np.random.rand(50, 50) * noise + centers[1]
        y_row[50:100] += 2
        y_col[50:100] += 2

        data[80:150, 100:150] = np.random.rand(70, 50) * noise + centers[2]
        y_row[80:150] += 3
        y_col[100:150] += 3
    elif option == 'f':
        data[0:50, 0:40] = np.random.rand(50, 40) * noise + centers[4]
        y_row[0:50] += 1
        y_col[0:40] += 1

        data[50:150, 0:40] = np.random.rand(100, 40) * noise + centers[0]
        y_row[50:150] += 2

        data[110:150, 40:95] = np.random.rand(40, 55) * noise + centers[2]
        y_row[110:150] += 3
        y_col[40:95] += 2

        data[110:150, 95:150] = np.random.rand(40, 55) * noise + centers[1]
        y_row[110:150] += 3
        y_col[95:150] += 3
    elif option == 'g':
        data[0:110, 0:40] = np.random.rand(110, 40) * noise + centers[0]
        data[110:150, 0:110] = np.random.rand(40, 110) * noise + centers[2]
        data[40:150, 110:150] = np.random.rand(110, 40) * noise + centers[1]
        data[0:40, 40:150] = np.random.rand(40, 110) * noise + centers[3]
    elif option == 'h':
        data[0:90, 0:90] = np.random.rand(90, 90) * noise + centers[0]
        data[35:55, 35:55] += np.random.rand(20, 20) * noise + centers[1]
        data[110:140, 35:90] = np.random.rand(30, 55) * noise + centers[4]
        data[0:140, 110:150] = np.random.rand(140, 40) * noise + centers[2]
        data[0:55, 130:150] += np.random.rand(55, 20) * noise + centers[3]
    elif option == 'i':
        data[20:70, 20:70] = np.random.rand(50, 50) * noise + centers[0]
        data[20:70, 100:150] = np.random.rand(50, 50) * noise + centers[1]
        data[50:110, 50:120] = np.random.rand(60, 70) * noise + centers[2]
        data[120:150, 20:100] = np.random.rand(30, 80) * noise + centers[3]

    if shuffle:
        np.random.shuffle(data)    # shuffle rows in place
        np.random.shuffle(data.T)  # shuffle columns via the transposed view

    return data, y_row, y_col

Nonnegative Block Value Decomposition


In [59]:
%%latex
This is a coclustering algorithm called Block Value Decomposition (BVD), based on the Nonnegative Matrix Factorization (NMF)
technique.
The goal is to find a factorization of the data matrix $X \in \mathbb{R}_{+}^{n \times m}$, where $n$ is the number of objects
and $m$ is the number of features of these objects. The factorization takes the form $X \approx USV^T$, where
$U \in \mathbb{R}_{+}^{n \times k}$ is a matrix of row factors representing the row (object) clusters,
$S \in \mathbb{R}_{+}^{k \times l}$ is a block matrix representing how the row and column clusters are related, and
$V \in \mathbb{R}_{+}^{m \times l}$ is a matrix of column factors representing the column (feature) clusters.


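As a quick sanity check of the dimensions involved, here is a minimal sketch (using random, non-optimized factors, just to confirm the shapes compose):

In [ ]:
# shape check only: U (n x k), S (k x l) and V (m x l) compose to an
# n x m matrix via U S V^T; these are not optimized factors
n, m, k, l = 150, 150, 3, 3
U = np.random.rand(n, k)
S = np.random.rand(k, l)
V = np.random.rand(m, l)
print(U.dot(S).dot(V.T).shape)  # (150, 150)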

In [60]:
%%latex
This algorithm solves the following optimization problem:
$$\textit{min } ||X - USV^T||^2 \textit{ s.t. } U \geq 0, S \geq 0, V \geq 0$$


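The objective is the squared Frobenius norm of the reconstruction error; a minimal sketch of how it can be evaluated for any candidate factors (names here are illustrative):

In [ ]:
# squared Frobenius norm of the reconstruction error, ||X - U S V^T||^2,
# with V of shape (m, l) as in the formula above
def reconstruction_error(X, U, S, V):
    return np.linalg.norm(X - U.dot(S).dot(V.T)) ** 2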

In [61]:
%%latex
The optimization problem can be solved using Lagrange multipliers ($\lambda$), optimizing the following Lagrange function:
$${\cal L} = ||X - USV^T||^2 - tr(\lambda_1U^T) - tr(\lambda_2S^T) - tr(\lambda_3V^T)$$



In [62]:
%%latex
Then ${\cal L}$ must satisfy the K.K.T. conditions:
$$\frac{\partial {\cal L}}{\partial U} = 0$$
$$\frac{\partial {\cal L}}{\partial S} = 0$$
$$\frac{\partial {\cal L}}{\partial V} = 0$$
$$\lambda_1 \odot U = 0$$
$$\lambda_2 \odot S = 0$$
$$\lambda_3 \odot V = 0$$


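As a worked example of one of these steps (the derivations for $S$ and $V$ are analogous), differentiating ${\cal L}$ with respect to $U$ and combining the result with the condition $\lambda_1 \odot U = 0$:

In [ ]:
%%latex
$$\frac{\partial {\cal L}}{\partial U} = -2XVS^T + 2USV^TVS^T - \lambda_1 = 0
\implies \lambda_1 = 2USV^TVS^T - 2XVS^T$$
Substituting into $\lambda_1 \odot U = 0$ gives
$$(USV^TVS^T) \odot U = (XVS^T) \odot U,$$
which is the fixed-point condition that the multiplicative update rule for $U$ below satisfies.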

In [63]:
%%latex
Setting the derivatives equal to $0$, it is possible to solve the optimization problem by applying multiplicative
gradient descent on ${\cal L}$ with the following update rules:
$$U \gets U \odot \frac{XVS^T}{USV^TVS^T}$$
$$V \gets V \odot \frac{X^TUS}{VS^TU^TUS}$$
$$S \gets S \odot \frac{U^TXV}{U^TUSV^TV}$$


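One practical caveat (a common numerical safeguard, not part of the original derivation): the denominators may contain zeros, so a small constant $\epsilon$ is often added. A minimal sketch of a single guarded update step for $U$, with $V$ of shape $(m, l)$ as in the formulas above (note that the nmtf code below stores $V$ already transposed):

In [ ]:
# one epsilon-guarded multiplicative update step for U; eps avoids
# division by zero when a denominator entry vanishes
n, m, k, l = 150, 150, 2, 2
X = np.random.rand(n, m)
U, S, V = np.random.rand(n, k), np.random.rand(k, l), np.random.rand(m, l)
eps = 1e-9
U = U * (X.dot(V).dot(S.T)) / (U.dot(S).dot(V.T).dot(V).dot(S.T) + eps)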

In [64]:
def nmtf(X, k, l, num_iter=1):
    """
    Performs nonnegative block value decomposition

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The data matrix to factorize.
    k : int
        number of row clusters
    l : int
        number of column clusters
    num_iter : int, optional
        number of iterations of the multiplicative updates

    Returns
    -------
    U : array of shape (n_samples, k)
        matrix of row factors representing the row clusters
    S : array of shape (k, l)
        block matrix representing how the blocks are related
    V : array of shape (l, n_features)
        matrix of column factors, stored already transposed so that
        X is approximated by U.dot(S).dot(V)
    """
    def scale(A):
        return (A - A.min()) / (A.max() - A.min())

    n, m = X.shape

    U = np.random.rand(n, k)
    S = np.random.rand(k, l)
    V = np.random.rand(l, m)

    X_norm = X  # optionally scale(X)
    for i in range(num_iter):
        U_delta = (X_norm.dot(V.T).dot(S.T)) / (U.dot(S).dot(V).dot(V.T).dot(S.T))
        U_new = np.nan_to_num(np.multiply(U, U_delta))

        V_delta = (S.T.dot(U.T).dot(X_norm)) / (S.T.dot(U.T).dot(U).dot(S).dot(V))
        V_new = np.nan_to_num(np.multiply(V, V_delta))

        S_delta = (U.T.dot(X_norm).dot(V.T)) / (U.T.dot(U).dot(S).dot(V).dot(V.T))
        S_new = np.nan_to_num(np.multiply(S, S_delta))

        # reconstruction error, useful for monitoring convergence
        error = np.sum((X_norm - U_new.dot(S_new).dot(V_new)) ** 2)

        U = U_new
        V = V_new
        S = S_new

    # normalization
#     U_diag = np.diag(np.diag(np.ones(n).dot(U)))
#     V_diag = np.diag(np.diag(np.ones(m).dot(U.T).T))

#     U = np.multiply(U, np.diag(S.dot(V_diag)))
#     V = np.multiply(V, np.diag(U_diag.dot(S)))

    return U, S, V

Fast Nonnegative Matrix Tri-Factorization


In [1]:
%%latex
In this case, the goal is to optimize the following problem:
$$\textit{min } ||X - USV^T||^2 \textit{ s.t. } U \in \Psi^{n \times k}, S \in \mathbb{R}_{+}^{k \times l}, V \in \Psi^{m \times l}$$
where $U$ and $V$ become cluster indicator matrices, with row vectors $\vec{u_i}$ and $\vec{v_j}$ that contain $1$ in
only one position, indicating the cluster to which that row or column belongs, and $0$ in all the others.


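A minimal sketch of what such a cluster indicator matrix looks like (the assignment here is arbitrary, purely for illustration):

In [ ]:
# an indicator matrix in Psi^{5 x 2}: each row contains exactly one 1,
# marking the cluster the corresponding object belongs to
labels = np.array([0, 0, 1, 1, 0])  # arbitrary example assignment
U = np.zeros((5, 2))
U[np.arange(5), labels] = 1
print(U)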

In [23]:
%%latex
Unlike the previous algorithm, this one updates $S$ in closed form (a least-squares solution) and assigns rows and
columns to clusters by solving the following subproblems:
$$S \gets (U^TU)^{-1}U^TXV(V^TV)^{-1}$$
$$v_{ij} = \left\{
\begin{array}{ll}
        1 & j = \textit{argmin}_l ||\vec{x_i} - \vec{\tilde{u_l}}||^2 \\
        0 & \textit{otherwise}
    \end{array}
\right.$$
$$u_{ij} = \left\{
\begin{array}{ll}
        1 & j = \textit{argmin}_k ||\vec{x_i} - \vec{\tilde{v_k}}||^2 \\
        0 & \textit{otherwise}
    \end{array}
\right.$$
where $\tilde{U} = US$ and $\tilde{V} = SV^T$; in the update of $V$, $\vec{x_i}$ denotes the $i$-th column of $X$
(compared against the columns of $\tilde{U}$), while in the update of $U$ it denotes the $i$-th row of $X$
(compared against the rows of $\tilde{V}$).



In [ ]:
def fnmtf(X, k, l, num_iter=1):
    n, m = X.shape

    # random initialization: U and V start as valid cluster indicator
    # matrices (exactly one 1 per row)
    U = np.zeros((n, k))
    U[np.arange(n), np.random.randint(k, size=n)] = 1
    V = np.zeros((m, l))
    V[np.arange(m), np.random.randint(l, size=m)] = 1

    for _ in range(num_iter):
        # closed-form (least-squares) update of the block matrix S
        S = np.linalg.pinv(U.T.dot(U)).dot(U.T).dot(X).dot(V).dot(np.linalg.pinv(V.T.dot(V)))

        # solve subproblem to update V: assign each column of X to the
        # closest column of U_tilde = U S
        U_tilde = U.dot(S)
        V_new = np.zeros((m, l))
        for j in range(m):
            distances = [np.linalg.norm(X[:, j] - U_tilde[:, c]) for c in range(l)]
            V_new[j, np.argmin(distances)] = 1

        # solve subproblem to update U: assign each row of X to the
        # closest row of V_tilde = S V^T
        V_tilde = S.dot(V.T)
        U_new = np.zeros((n, k))
        for i in range(n):
            distances = [np.linalg.norm(X[i, :] - V_tilde[c, :]) for c in range(k)]
            U_new[i, np.argmin(distances)] = 1

        U = U_new
        V = V_new

    # recompute S for the final assignments
    S = np.linalg.pinv(U.T.dot(U)).dot(U.T).dot(X).dot(V).dot(np.linalg.pinv(V.T.dot(V)))

    return U, S, V
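
Since $U$ and $V$ are indicator matrices, the cluster assignments can be read off directly with argmax; a minimal usage sketch (the dataset option and cluster counts here are arbitrary choices):

In [ ]:
# recover row/column cluster labels from the indicator matrices
X_demo, _, _ = generate_dataset(option='d', noise=1.0, noise_background=False)
U, S, V = fnmtf(X_demo, 4, 4, num_iter=10)
row_labels = U.argmax(axis=1)
col_labels = V.argmax(axis=1)
print(row_labels[:10], col_labels[:10])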

In [74]:
X, _, _ = generate_dataset(option='a', noise=1.0, noise_background=False)

plt.matshow(X, cmap=plt.cm.Blues)
plt.title("Original Data")
plt.show()

U, S, V = nmtf(X,2,2,1)

plt.matshow(U, cmap=plt.cm.Blues)
plt.matshow(S, cmap=plt.cm.Blues)
plt.matshow(V, cmap=plt.cm.Blues)

plt.matshow(U.dot(S).dot(V), cmap=plt.cm.Blues)


Out[74]:
<matplotlib.image.AxesImage at 0x11363b6d0>

In [22]:
X = np.zeros(10*10).reshape((10,10))
X[3:8][:,3:8] = X[3:8][:,3:8] + 20

plt.matshow(X, cmap=plt.cm.Blues)
plt.title("Original Data")
plt.show()

U, S, V = nmtf(X,2,2,1)

plt.matshow(U, cmap=plt.cm.Blues)
plt.matshow(S, cmap=plt.cm.Blues)
plt.matshow(V, cmap=plt.cm.Blues)

plt.matshow(U.dot(S).dot(V), cmap=plt.cm.Blues)


Out[22]:
<matplotlib.image.AxesImage at 0x11266da50>

In [1]:
X = np.zeros(10*10).reshape((10,10))
X[0:4][:,0:4] = X[0:4][:,0:4] + 20
X[2:8][:,4:6] = X[2:8][:,4:6] + 40
X[6:10][:,6:10] = X[6:10][:,6:10] + 60

plt.matshow(X, cmap=plt.cm.Blues)
plt.title("Original Data")
plt.show()

U, S, V = nmtf(X,4,4,9)

plt.matshow(U, cmap=plt.cm.Blues)
plt.matshow(S, cmap=plt.cm.Blues)
plt.matshow(V, cmap=plt.cm.Blues)

plt.matshow(U.dot(S).dot(V), cmap=plt.cm.Blues)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-5c5f31752c60> in <module>()
----> 1 X = np.zeros(10*10).reshape((10,10))
      2 X[0:4][:,0:4] = X[0:4][:,0:4] + 20
      3 X[2:8][:,4:6] = X[2:8][:,4:6] + 40
      4 X[6:10][:,6:10] = X[6:10][:,6:10] + 60
      5 

NameError: name 'np' is not defined

In [14]:
X, _, _ = generate_dataset(option='g', noise=1.0, noise_background=False, shuffle=False)

plt.matshow(X, cmap=plt.cm.Blues)
plt.title("Original Data")
plt.show()

U, S, V = nmtf(X,5,5,num_iter=1)

plt.matshow(U, cmap=plt.cm.Blues)
plt.matshow(S, cmap=plt.cm.Blues)
plt.matshow(V, cmap=plt.cm.Blues)

plt.matshow(U.dot(S).dot(V), cmap=plt.cm.Blues)


Out[14]:
<matplotlib.image.AxesImage at 0x111de8b90>

In [ ]: