In [1]:
import numpy as np

In [2]:
import os

In [3]:
import pandas as pd

In [4]:
from sklearn.cluster import bicluster

2015 December 4-6

Loading and exploring UTSW RNAi dataset...


In [5]:
# Path to the siRNA table: one directory above the working directory, under data/.
dfFile = os.path.join(
    "..",
    "data",
    "siRNA_dataframe.csv",
)

In [6]:
# Read the CSV at dfFile into a DataFrame; the file's first column becomes the row index.
RNAiDf = pd.read_csv(filepath_or_buffer=dfFile, index_col=0)

In [9]:
# Show the last five rows of the frame as a quick sanity check of the load.
RNAiDf.tail(n=5)


Out[9]:
HBEC30 H1155 HCC366 H1819 HCC44 HCC4017 H1993 H460 H2073 H2009 H2122 H1395 HCC95
54553 NaN 0.789991 -0.520375 1.868076 0.383222 -0.741300 0.027237 1.527195 0.151016 -0.070423 0.092664 0.280136 0.91
54729 NaN -1.022032 -3.813564 0.266868 -7.329748 -0.474432 -8.524133 -2.004592 -2.934604 -1.527374 -0.520543 -0.540677 -0.70
54753 NaN -0.363565 -0.320215 1.334340 0.652826 1.037820 -0.397525 1.052426 -0.236109 -0.043509 0.639026 -0.181569 0.76
54906 NaN 0.485187 -0.234777 0.118608 0.152439 1.230558 0.271865 0.848898 1.423527 -0.002306 1.346933 0.865910 -0.19
54944 NaN 1.102957 0.397106 0.252042 3.988061 1.215732 3.433279 1.975440 1.119274 1.999617 0.728126 -1.167717 -0.17

Impute NaN values by replacing each one with the median of its column. The column median is chosen (rather than the row median) because a column contains more data points, which gives better statistics, and because filling with the median is an attempt to express that the gene deletion in question has little effect on the cell line.


In [7]:
# One replacement value per column: the median down the rows (axis=0), with NaNs ignored.
rplVals = np.nanmedian(a=RNAiDf.values, axis=0)

In [8]:
# Fill every NaN with the median of its own column. rplVals holds one value per
# column in RNAiDf.columns order, so wrap it in a Series keyed by column name and
# let fillna match values to columns.
#
# The result is assigned back rather than calling
# RNAiDf[col].replace(np.nan, ..., inplace=True) column by column: that form
# mutates a column object selected from the frame (chained assignment), which
# pandas warns about and which, under Copy-on-Write, never writes back to RNAiDf,
# silently leaving the NaNs in place for the clustering steps that follow.
RNAiDf = RNAiDf.fillna(pd.Series(rplVals, index=RNAiDf.columns))

Spectral Co-Clustering


In [9]:
# Spectral co-clustering estimator with library-default settings.
# random_state is pinned (the value 0 is arbitrary) because the algorithm has a
# randomised step; without a fixed seed the cluster assignments can differ
# between runs, so the notebook would not reproduce under Restart & Run All.
modelSC = bicluster.SpectralCoclustering(random_state=0)

In [10]:
# Fit the co-clustering model on the raw value matrix of the frame
# (row/column labels are dropped by .values).
modelSC.fit(X=RNAiDf.values)

Spectral Biclustering


In [11]:
# Spectral biclustering estimator with library-default settings.
# random_state is pinned (the value 0 is arbitrary) because the algorithm has a
# randomised step; without a fixed seed the cluster assignments can differ
# between runs, so the notebook would not reproduce under Restart & Run All.
modelSB = bicluster.SpectralBiclustering(random_state=0)

In [12]:
# Fit the biclustering model on the raw value matrix of the frame
# (row/column labels are dropped by .values).
modelSB.fit(X=RNAiDf.values)

In [ ]: