In [1]:
import os
import common

# Assign notebook and folder names
notebook_name = '05_pca_vs_ica'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')

csv = os.path.join(input_folder, 'expression_table1_subset.csv')

table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()


(300, 259)
Out[3]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns


In [4]:
input_folder = os.path.join(common.DATA_FOLDER, '002_robust_pca')

csv = os.path.join(input_folder, 'lowrank.csv')

lowrank = pd.read_csv(csv, index_col=0)
print(lowrank.shape)
lowrank.head()


(300, 259)
Out[4]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 2.107094 1.557394 0.807730 1.053738 2.056140 0.772676 1.148585 2.274124 0.939969 0.769930 ... 0.386077 0.616134 0.590651 0.551323 0.581623 0.549251 0.498720 0.622783 0.980489 0.597849
r1_TGGAGATACTCT 3.564449 2.631513 1.362509 1.778671 3.479139 1.303118 1.941908 3.850221 1.586342 1.299321 ... 0.662193 1.049206 1.001763 0.943025 0.995895 0.933101 0.854436 1.058849 1.666429 1.015786
r1_CGTCTACATCCG 2.829169 2.088554 1.081298 1.411538 2.761437 1.034065 1.541298 3.056093 1.258905 1.031110 ... 0.526003 0.832973 0.795049 0.749016 0.791121 0.740852 0.678725 0.840589 1.323006 0.806466
r1_CAAGCTTGGCGC 6.462507 4.735729 2.429696 3.185960 6.309599 2.324588 3.504922 7.006445 2.838329 2.324829 ... 1.293263 1.975855 1.850447 1.812795 1.933828 1.724248 1.668428 1.969213 3.102769 1.887946
r1_ACTCACATAGAG 2.097722 1.549264 0.802647 1.047497 2.047217 0.767650 1.143032 2.265178 0.934300 0.765252 ... 0.387953 0.616088 0.588936 0.552997 0.583952 0.548329 0.500862 0.621944 0.979270 0.596974

5 rows × 259 columns

Assign colors based on clusters


In [5]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[5]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [6]:
cluster_identities_lowrank = cluster_identities.loc[lowrank.index]
cluster_identities_lowrank.head()


Out[6]:
barcode
r1_TTCCTGCTAGGC    24
r1_TGGAGATACTCT    24
r1_CGTCTACATCCG    24
r1_CAAGCTTGGCGC    24
r1_ACTCACATAGAG    24
Name: cluster_id, dtype: int64

In [7]:
cluster_ids = cluster_identities_lowrank.unique()
cluster_ids


Out[7]:
array([24, 25, 26, 27, 33, 34])

In [8]:
cluster_id_to_name = {24: 'Rods', 25: 'Cones', 26: 'Bipolar cells (group1)', 27: 'Bipolar cells (group2)', 
                      33: 'Bipolar cells (group3)', 34: 'Muller glia'}

In [9]:
cluster_names_lowrank = cluster_identities_lowrank.map(cluster_id_to_name)
cluster_names_lowrank.head()


Out[9]:
barcode
r1_TTCCTGCTAGGC    Rods
r1_TGGAGATACTCT    Rods
r1_CGTCTACATCCG    Rods
r1_CAAGCTTGGCGC    Rods
r1_ACTCACATAGAG    Rods
Name: cluster_id, dtype: object

In [11]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))

color_labels = [id_to_color[i] for i in cluster_identities_lowrank]
color_labels[:4]


Out[11]:
[(0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)]

In [32]:
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i]) for i in cluster_ids)
cluster_names_to_color


Out[32]:
{'Bipolar cells (group1)': (0.55294117647058827,
  0.62745098039215685,
  0.79607843137254897),
 'Bipolar cells (group2)': (0.90588235294117647,
  0.54117647058823526,
  0.76470588235294112),
 'Bipolar cells (group3)': (0.65098039215686276,
  0.84705882352941175,
  0.32941176470588235),
 'Cones': (0.9882352941176471, 0.55294117647058827, 0.3843137254901961),
 'Muller glia': (1.0, 0.85098039215686272, 0.18431372549019609),
 'Rods': (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)}

Plot the low rank data


In [12]:
sns.set(style='whitegrid')

In [84]:
fig, ax = plt.subplots()
sns.heatmap(lowrank, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')


Out[84]:
[<matplotlib.text.Text at 0x12923bda0>, <matplotlib.text.Text at 0x128e0a898>]

Maybe this is small enough for a clustered heatmap


In [16]:
clustergrid = sns.clustermap(lowrank, mask=mask, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [13]:
from sklearn.decomposition import FastICA

ica = FastICA(n_components=6)

reduced = pd.DataFrame(ica.fit_transform(lowrank), index=lowrank.index)
reduced.head()


Out[13]:
0 1 2 3 4 5
barcode
r1_TTCCTGCTAGGC -0.017281 0.047082 -0.046493 0.044163 0.026868 0.025426
r1_TGGAGATACTCT -0.017581 0.051352 -0.096292 0.024597 0.018292 0.013639
r1_CGTCTACATCCG -0.025346 0.045580 -0.067072 0.017563 0.019891 0.019199
r1_CAAGCTTGGCGC -0.021076 -0.064964 -0.130531 -0.035461 -0.000332 -0.008107
r1_ACTCACATAGAG -0.024881 0.042757 -0.041752 0.027513 0.024378 0.025131

In [14]:
component_norms = reduced.apply(np.linalg.norm).sort_values(ascending=False)
component_norms


Out[14]:
2    1.0
1    1.0
5    1.0
0    1.0
4    1.0
3    1.0
dtype: float64

In [15]:
plt.plot(np.arange(len(component_norms)), component_norms, 'o')


Out[15]:
[<matplotlib.lines.Line2D at 0x11a2c06a0>]

In [16]:
reduced = reduced[component_norms.index]
reduced.head()


Out[16]:
2 1 5 0 4 3
barcode
r1_TTCCTGCTAGGC -0.046493 0.047082 0.025426 -0.017281 0.026868 0.044163
r1_TGGAGATACTCT -0.096292 0.051352 0.013639 -0.017581 0.018292 0.024597
r1_CGTCTACATCCG -0.067072 0.045580 0.019199 -0.025346 0.019891 0.017563
r1_CAAGCTTGGCGC -0.130531 -0.064964 -0.008107 -0.021076 -0.000332 -0.035461
r1_ACTCACATAGAG -0.041752 0.042757 0.025131 -0.024881 0.024378 0.027513

In [17]:
reduced_names = reduced.join(cluster_names_lowrank)
reduced_names.head()


Out[17]:
2 1 5 0 4 3 cluster_id
barcode
r1_TTCCTGCTAGGC -0.046493 0.047082 0.025426 -0.017281 0.026868 0.044163 Rods
r1_TGGAGATACTCT -0.096292 0.051352 0.013639 -0.017581 0.018292 0.024597 Rods
r1_CGTCTACATCCG -0.067072 0.045580 0.019199 -0.025346 0.019891 0.017563 Rods
r1_CAAGCTTGGCGC -0.130531 -0.064964 -0.008107 -0.021076 -0.000332 -0.035461 Rods
r1_ACTCACATAGAG -0.041752 0.042757 0.025131 -0.024881 0.024378 0.027513 Rods

In [19]:
sns.heatmap(reduced_names.groupby('cluster_id').median())


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a41d978>

In [101]:
sns.pairplot(reduced_names, hue='cluster_id', palette=cluster_names_to_color)


Out[101]:
<seaborn.axisgrid.PairGrid at 0x12a9d06d8>

In [89]:
ica_components = pd.DataFrame(ica.components_, columns=lowrank.columns)
print(ica_components.shape)
ica_components.head()


(6, 259)
Out[89]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
0 0.050505 0.048688 0.051475 0.032922 -0.001720 0.027235 0.027126 -0.001834 0.035298 0.024644 ... -0.042106 -0.050606 -0.046519 -0.035074 0.008691 -0.017919 -0.006807 -0.067759 -0.011596 -0.000858
1 -0.010744 -0.011213 -0.008328 -0.019160 -0.013628 -0.020105 -0.004159 -0.008439 -0.016243 -0.015508 ... 0.015240 -0.004480 -0.022567 0.024237 0.032721 0.013698 0.022497 -0.000798 0.006728 0.007369
2 0.056020 0.049924 0.056653 -0.007357 -0.028842 -0.022705 0.039446 -0.013762 0.005357 -0.009711 ... -0.021280 -0.108222 -0.153687 0.023846 0.111879 0.030194 0.060665 -0.094987 -0.011357 0.022221
3 -0.018848 -0.016249 -0.017150 0.003296 0.006208 0.008822 -0.013986 0.001988 -0.000734 0.003916 ... 0.007825 0.039505 0.051376 -0.008005 -0.034071 -0.013715 -0.020567 0.025608 0.009547 -0.006761
4 -0.005825 -0.005053 -0.005657 -0.001225 0.001956 -0.000173 -0.003313 0.000882 -0.002049 -0.000688 ... 0.002521 0.007171 0.010216 -0.000166 -0.007556 -0.000389 -0.003378 0.009009 -0.000332 -0.001525

5 rows × 259 columns


In [91]:
sns.distplot(ica_components.values.flat)


Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0x12a33f3c8>

What about plain old PCA?


In [22]:
from sklearn.decomposition import PCA
pca = PCA()
pca_reduced = pd.DataFrame(pca.fit_transform(lowrank), index=lowrank.index)
pca_reduced.head()


Out[22]:
0 1 2 3 4 5 6 7 8 9 ... 249 250 251 252 253 254 255 256 257 258
barcode
r1_TTCCTGCTAGGC 2.016365 -3.082168 -2.008168 -0.197335 -0.132597 -0.008867 3.950665e-14 1.818672e-14 -1.025019e-14 4.136603e-15 ... 8.028969e-17 -1.490587e-16 -2.518648e-16 -4.396792e-17 -9.250631e-17 -5.428418e-17 -6.933617e-17 1.060000e-15 2.228168e-16 3.487098e-16
r1_TGGAGATACTCT 6.926507 -2.421480 -0.808254 -0.247768 -0.191300 -0.004699 -2.852697e-14 1.705282e-14 2.040679e-15 -4.399704e-15 ... -3.546638e-17 -5.397391e-17 4.788619e-17 8.364647e-18 9.656126e-17 -1.009183e-16 6.690166e-17 1.235923e-15 -3.243360e-16 -3.199985e-16
r1_CGTCTACATCCG 4.453695 -2.744673 -1.408121 -0.211491 -0.146644 -0.002034 -6.070872e-16 -4.277527e-15 -2.317508e-15 -8.725150e-15 ... -1.374832e-16 9.535285e-17 -2.410900e-16 9.143571e-18 -1.875402e-16 8.684627e-17 -8.596237e-17 8.657653e-16 1.409920e-15 -6.613824e-17
r1_CAAGCTTGGCGC 16.780411 -0.735881 1.754018 -0.054682 0.013814 0.006292 -1.176347e-14 9.579850e-15 -4.885728e-15 1.696122e-14 ... 7.998233e-17 2.215946e-17 -1.039295e-16 4.281491e-17 -3.677119e-17 1.014838e-16 6.715735e-17 1.346684e-15 2.863966e-16 3.439058e-17
r1_ACTCACATAGAG 1.990009 -3.074045 -2.009732 -0.183536 -0.115751 -0.004209 1.468397e-15 -2.317685e-15 6.186075e-16 1.090401e-15 ... -4.065115e-17 3.152358e-16 7.185775e-17 9.565203e-17 2.943767e-16 -4.617675e-17 -2.513610e-16 -7.147581e-19 2.212935e-16 -1.636018e-16

5 rows × 259 columns


In [23]:
plt.plot(pca.explained_variance_ratio_[:10], 'o-')


Out[23]:
[<matplotlib.lines.Line2D at 0x11a8b94a8>]

In [24]:
pca_reduced_subset = pca_reduced.loc[:, :5]

In [25]:
pca_reduced_names = pca_reduced_subset.join(cluster_names_lowrank)
pca_reduced_names.head()


Out[25]:
0 1 2 3 4 5 cluster_id
barcode
r1_TTCCTGCTAGGC 2.016365 -3.082168 -2.008168 -0.197335 -0.132597 -0.008867 Rods
r1_TGGAGATACTCT 6.926507 -2.421480 -0.808254 -0.247768 -0.191300 -0.004699 Rods
r1_CGTCTACATCCG 4.453695 -2.744673 -1.408121 -0.211491 -0.146644 -0.002034 Rods
r1_CAAGCTTGGCGC 16.780411 -0.735881 1.754018 -0.054682 0.013814 0.006292 Rods
r1_ACTCACATAGAG 1.990009 -3.074045 -2.009732 -0.183536 -0.115751 -0.004209 Rods

In [26]:
sns.heatmap(pca_reduced_names.groupby('cluster_id').mean())


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a8e2be0>

In [27]:
sns.pairplot(pca_reduced_names, hue='cluster_id', 
             palette=cluster_names_to_color)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-402fa88bc438> in <module>()
      1 sns.pairplot(pca_reduced_names, hue='cluster_id', 
----> 2              palette=cluster_names_to_color)

NameError: name 'cluster_names_to_color' is not defined

In [53]:
matrix = np.array([[1, 1], [0, 2]]).astype(float).T
matrix


Out[53]:
array([[ 1.,  0.],
       [ 1.,  2.]])

In [54]:
np.linalg.svd(matrix)


Out[54]:
(array([[-0.22975292, -0.97324899],
        [-0.97324899,  0.22975292]]),
 array([ 2.28824561,  0.87403205]),
 array([[-0.52573111, -0.85065081],
        [-0.85065081,  0.52573111]]))

In [55]:
pca0 = PCA()
ica0 = FastICA()

In [59]:
pcad = pca0.fit_transform(matrix)
pcad


Out[59]:
array([[ 1.,  0.],
       [-1.,  0.]])

In [47]:
pca0.explained_variance_ratio_


Out[47]:
array([  1.00000000e+00,   2.81351049e-34])

In [48]:
pca0.components_


Out[48]:
array([[-0.70710678,  0.70710678],
       [-0.70710678, -0.70710678]])

In [49]:
matrix


Out[49]:
array([[ 1.,  1.],
       [ 0.,  2.]])

In [50]:
%pdb


Automatic pdb calling has been turned OFF

In [51]:
icad = ica0.fit_transform(matrix)
icad


Out[51]:
array([[-0.703125,  0.      ],
       [ 0.703125,  0.      ]])

TSNE


In [128]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0, perplexity=120)

smushed = pd.DataFrame(tsne.fit_transform(reduced), index=lowrank.index)

# smushed = pd.DataFrame(tsne.fit_transform(pca_reduced.loc[:, :2]), index=lowrank.index)
smushed.head()


Out[128]:
0 1
barcode
r1_TTCCTGCTAGGC 9.451248 -0.661464
r1_TGGAGATACTCT 8.999652 2.190746
r1_CGTCTACATCCG 5.145081 -0.198609
r1_CAAGCTTGGCGC 11.408211 8.913246
r1_ACTCACATAGAG 0.572704 4.590956

In [129]:
smushed_names = smushed.join(cluster_names_lowrank)
smushed_names.head()


Out[129]:
0 1 cluster_id
barcode
r1_TTCCTGCTAGGC 9.451248 -0.661464 Rods
r1_TGGAGATACTCT 8.999652 2.190746 Rods
r1_CGTCTACATCCG 5.145081 -0.198609 Rods
r1_CAAGCTTGGCGC 11.408211 8.913246 Rods
r1_ACTCACATAGAG 0.572704 4.590956 Rods

In [130]:
sns.pairplot(smushed_names, hue='cluster_id', palette=cluster_names_to_color)


Out[130]:
<seaborn.axisgrid.PairGrid at 0x133a4f588>

In [ ]:


In [14]:
rpca_alm.lmbda


Out[14]:
0.057735026918962568

In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)

In [16]:
U


Out[16]:
array([[-0.04272573, -0.02831748, -0.00597595, ...,  0.00333803,
         0.00177218,  0.01580328],
       [-0.07864946, -0.04817703, -0.01001383, ..., -0.0114292 ,
         0.00301747,  0.00501156],
       [-0.05291033, -0.03174095, -0.00638981, ..., -0.00100707,
        -0.00933078,  0.01188982],
       ..., 
       [-0.01026694,  0.02322763, -0.03528178, ...,  0.03940001,
         0.00723089, -0.05019267],
       [-0.04240066,  0.0962479 , -0.13841497, ..., -0.00552324,
         0.00943978, -0.00371454],
       [-0.00583065,  0.02502047, -0.03610277, ..., -0.0189998 ,
        -0.03432167,  0.05701413]])

In [17]:
sns.distplot(s[s > 0.1], kde=False)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f209b70>

In [59]:
diff = rpca_alm.L - lowrank

In [60]:
datasets = {'Original': lowrank, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff}

common.heatmaps(datasets)



In [61]:
L = pd.DataFrame(rpca_alm.L, index=lowrank.index, columns=lowrank.columns)
L.head()


Out[61]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 7.272295 3.048168 1.562439 2.198954 4.326513 1.385031 1.989380 6.541110 1.689744 1.345555 ... 0.313114 0.767164 0.864139 0.388544 0.150222 0.752207 0.272070 0.832861 1.258661 0.856469
r1_TGGAGATACTCT 13.174154 5.480442 2.635686 3.713715 7.521109 2.310180 3.257731 11.804131 2.741139 2.363235 ... 0.815807 1.031292 1.597556 0.999699 0.884529 1.033574 0.724387 1.236772 2.351665 1.286273
r1_CGTCTACATCCG 8.820288 3.785870 1.735920 2.396476 5.171789 1.561243 2.092837 7.836292 1.876383 1.537487 ... 0.382253 0.972003 0.828423 0.656586 0.391978 0.916053 0.429003 0.958758 1.373776 0.703660
r1_CAAGCTTGGCGC 25.982230 11.021147 5.608453 7.951609 15.774301 4.953723 7.427482 24.229816 6.077871 4.769225 ... 1.644721 3.139924 3.271503 2.906614 1.999058 2.751189 2.020806 3.190714 5.678790 3.812966
r1_ACTCACATAGAG 7.614892 3.096781 1.282597 1.838656 4.131712 1.165996 1.602792 6.662531 1.363853 1.237918 ... 0.687691 0.804546 1.079846 0.874430 0.684653 0.541885 0.754388 0.499074 1.349995 0.391946

5 rows × 259 columns


In [63]:
sns.distplot(lowrank.values.flat)


Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x132164ac8>

In [62]:
sns.distplot(L.values.flat)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x12180e0f0>

In [74]:
lowrank_tidy = lowrank.unstack().reset_index()
lowrank_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'

tidy = pd.concat([lowrank_tidy, L_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()


Out[74]:
level_0 barcode molecules dataset
0 RHO r1_TTCCTGCTAGGC 14.0 Original
1 RHO r1_TGGAGATACTCT 23.0 Original
2 RHO r1_CGTCTACATCCG 14.0 Original
3 RHO r1_CAAGCTTGGCGC 62.0 Original
4 RHO r1_ACTCACATAGAG 10.0 Original

In [75]:
sns.violinplot(x='dataset', y='molecules', data=tidy)


Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0x124e40da0>

In [76]:
sns.boxplot(x='dataset', y='molecules', data=tidy)


Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f390a20>

In [37]:
S = pd.DataFrame(rpca_alm.S, index=lowrank.index, columns=lowrank.columns)
S.head()


Out[37]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [21]:
diff.head()


Out[21]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC -6.727705 0.048168 0.562439 -0.801046 -7.673487 1.385031 0.989380 -0.458890 -0.310256 -0.654445 ... -0.686886 -0.232836 -1.135861 0.388544 0.150222 0.752207 0.272070 -0.167139 1.258661 0.856469
r1_TGGAGATACTCT -9.825846 -2.519558 -3.364314 -0.286285 -5.478891 -6.689820 1.257731 -7.195869 1.741139 1.363235 ... -2.184193 1.031292 -0.402444 -0.000301 0.884529 0.033574 0.724387 -0.763228 2.351665 0.286273
r1_CGTCTACATCCG -5.179712 -0.214130 -5.264080 1.396476 -0.828211 -1.438757 2.092837 -5.163708 -0.123617 -0.462513 ... 0.382253 -0.027997 0.828423 -2.343414 0.391978 -0.083947 0.429003 -1.041242 1.373776 0.703660
r1_CAAGCTTGGCGC -36.017770 -6.978853 -4.391547 -12.048391 -13.225699 2.953723 -0.572518 -6.770184 -2.922129 2.769225 ... 1.644721 -1.860076 -3.728497 -0.093386 -0.000942 -3.248811 0.020806 0.190714 -1.321210 -7.187034
r1_ACTCACATAGAG -2.385108 2.096781 1.282597 0.838656 -0.868288 -0.834004 0.602792 -0.337469 -1.636147 0.237918 ... -0.312309 -0.195454 -0.920154 -2.125570 -0.315347 -1.458115 -0.245612 0.499074 -1.650005 0.391946

5 rows × 259 columns


In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = lowrank - gr0

datasets = {'Original': lowrank, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff_gr0}

common.heatmaps(datasets)



In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [24]:
g_original = sns.clustermap(lowrank.T.corr(method='spearman'), xticklabels=[], yticklabels=[], 
                             col_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [39]:
S.head()


Out[39]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [49]:
sns.distplot(S.values.flat)


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x1248a71d0>

In [52]:
np.median(S.values)


Out[52]:
0.0

In [56]:
high_in_sparse = (S > 10).any()
print(high_in_sparse.sum())
S.loc[:, high_in_sparse]


88
Out[56]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... TTYH1 PAX6 MGARP HSP90AA1 SLC6A6 MAP1B TMA7 SYT1 SNAP25 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... -0.000000 -0.000000 -0.302098 -0.081445 0.686886 0.232836 1.135861 -0.150222 -0.272070 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 1.331571 0.998086 -1.093359 -1.256187 2.184193 -1.031292 0.402444 -0.884529 -0.724387 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.341183 0.999751 1.902183 3.171176 -0.382253 0.027997 -0.828423 -0.391978 -0.429003 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... 0.814278 0.000000 -1.179403 -0.254800 -1.644721 1.860076 3.728497 -0.000000 -0.020806 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... -0.082806 -0.000000 -0.509653 -0.000000 0.312309 0.195454 0.920154 0.315347 0.245612 -0.391946
r1_TAACGGACACGC 21.380107 1.663735 -0.021046 -0.222488 12.299926 4.287691 -0.672176 0.000000 -2.237709 9.267142 ... 4.740199 0.000000 0.000000 -0.000000 0.952529 -1.441017 -0.406451 1.892411 -1.088772 0.354429
r1_CGCATGGGATAC 13.959024 3.970865 -1.348141 -0.892688 2.954439 -0.215551 1.402263 -0.000000 -0.442402 5.729592 ... 2.569047 0.000000 -0.719147 1.672199 0.435023 1.349910 -0.755655 -0.504931 0.403079 -0.428073
r1_TAACGACGCTTG 3.180794 -0.447518 0.109158 -0.257555 -2.279500 0.203602 3.582963 2.171476 -0.986389 0.327993 ... -0.133089 -0.000000 -0.236654 0.158758 -0.000000 0.434394 -0.410368 -0.045431 0.000000 2.149970
r1_TCGGCAGCCTCT 0.000000 6.456903 -1.145033 1.899016 0.735434 1.092624 0.135634 9.117565 0.692410 2.080789 ... -0.266130 0.000000 0.000000 -0.887233 -0.409884 -1.027818 -0.134590 -0.516927 -0.389400 -1.188493
r1_TAGGATGCAAAC 0.973777 -2.880712 2.373638 0.301869 -3.082936 -2.280995 5.395161 0.074149 8.163804 0.806921 ... -0.570322 -0.000000 -1.104568 -2.087004 -0.265266 0.564875 1.717807 -0.707985 -0.499107 5.146910
r1_CGGTTACAGTAG 12.505315 1.261856 2.428380 -1.565133 -1.709170 -1.225406 3.767564 3.748447 0.259539 -1.144302 ... -0.334235 0.000000 -0.000000 -0.807533 2.114339 0.758462 1.419423 1.893210 0.000000 -0.454211
r1_AATCGGATACGT 11.933689 -0.000000 2.991893 1.107978 -1.669472 1.209005 5.214369 0.000000 -0.186109 1.227780 ... -0.452205 0.000000 -1.065484 1.154736 0.482219 0.946174 -0.982452 1.195778 2.396707 1.631832
r1_AGTGGGCTTGAG 7.947213 4.715814 -1.638802 -1.356734 0.401653 -0.493621 0.793865 -0.000000 5.179635 0.523270 ... -0.212292 0.000000 0.039120 1.593622 0.396195 -0.750549 -0.893586 1.234860 0.221824 1.878540
r1_TTCACCTACCGC 10.781769 1.930491 0.537446 -2.004599 0.901076 -0.239981 -1.714882 -0.079131 2.512117 -0.276403 ... 0.165085 -0.000000 0.017907 0.238652 -0.090347 -0.579233 1.459530 -0.516233 -0.232060 -0.625630
r1_TTATGTCGTCCT 2.270350 3.943689 2.119016 1.400406 3.516772 1.362922 -2.202215 1.637913 -1.913613 1.275680 ... -0.533194 0.000000 -0.000000 2.163319 -0.309056 3.075198 0.000000 0.209408 -0.560234 -0.595371
r1_ATCAGCGCAGTC 0.000000 -0.565269 0.021651 0.567229 -2.430373 0.126985 4.493267 2.215334 0.862813 -0.714553 ... -0.000000 0.000000 0.537775 0.056646 -0.073409 0.411675 0.550113 -0.144370 0.780021 7.097770
r1_CTTTATGGTGAC 7.572463 -0.903545 -0.000000 2.227700 5.103486 3.413487 1.485007 10.665482 0.584033 -0.507655 ... 1.319728 0.000000 0.206072 -1.192753 2.364657 -1.385998 -0.295737 0.000000 1.001619 1.978113
r1_GAATCGGGAACA 0.130061 -2.574404 0.207521 -0.441166 -2.970246 -0.523037 3.736323 3.206105 -0.825140 -0.521023 ... 1.157655 -0.000000 -0.024933 0.125958 -0.152853 -0.815796 1.130103 0.207391 -0.320568 -0.858338
r1_GAAGTGATCACC 7.198676 1.907994 -0.363277 -0.764196 8.988425 0.000000 -1.520747 9.571634 -0.658129 -1.910792 ... 0.000000 0.000000 -1.730510 3.680007 -0.000000 0.058526 0.000000 -0.000000 -0.994082 -1.859172
r1_AGTGGGCGGCCG 8.848392 0.895194 -0.163581 3.340794 0.000000 0.958213 -0.538988 -1.387186 5.711675 0.000000 ... -0.115035 0.000000 0.269487 1.013680 -0.113011 -0.443464 -0.403637 0.567080 1.632395 5.046891
r1_ACTGATGATTAA -0.166009 0.258877 0.679583 0.564598 1.074236 -0.259644 -0.234408 -0.000000 -0.281426 0.673068 ... 0.000000 -0.000000 0.771423 -0.207019 0.917991 1.837832 -0.114600 -0.030735 -0.216464 -0.000000
r1_CGCCCGTCTGTA 11.020429 0.000000 0.081976 -2.722703 4.451928 0.291661 0.406819 2.286490 0.938283 0.300258 ... -0.292419 0.000000 0.727774 0.068704 -0.214730 -0.000000 -0.923698 1.250572 -0.505095 -0.963568
r1_GGGCTTGGGAAG -0.000000 2.448623 -0.000000 0.493179 -0.228614 -0.881305 -1.331266 0.641512 -0.062537 1.000179 ... -0.278898 -0.000000 3.872065 1.717896 -0.459817 -0.628542 0.258535 4.167146 1.243851 0.345875
r1_TTAATGACTACA -0.000000 0.866470 -1.088180 3.399241 0.000000 1.020583 6.455940 -0.432606 0.780824 -0.900402 ... -0.071543 0.000000 2.491098 2.013177 -0.132724 -0.505365 0.455063 -0.155937 0.761623 3.373602
r1_CGGCTGTCTGCT 19.308350 5.889286 0.084709 4.388549 -0.419955 -0.645631 -2.231481 3.700417 -0.926757 -1.735463 ... 0.000000 -0.000000 1.313213 0.745053 -0.448072 -0.966678 0.017107 0.784523 0.099009 -0.836805
r1_ATGATTATGGTT 4.632167 3.751768 -0.967775 -0.363413 4.001080 0.158328 0.758410 -2.679809 -0.982320 -0.870134 ... -0.059819 -0.000000 -0.253736 1.133169 -0.000000 -0.453014 2.358229 -0.074023 -0.050509 -0.206752
r1_TTTACTTCAAGG -1.347545 0.221071 0.604238 0.056137 0.059580 -0.206978 9.143060 0.000000 -1.473954 -0.173521 ... -0.016236 -0.000000 -0.419281 -0.088003 -0.000000 0.197948 0.362459 -0.000000 -0.000000 0.143271
r1_ATGGCTCGCAAA 6.387332 -0.915061 1.178258 0.446398 2.697413 0.388423 -2.239412 2.655880 -1.887992 -1.644699 ... -0.250518 -0.000000 0.000000 2.312154 -0.363505 -0.821810 -0.000000 -0.645316 0.432266 -0.786629
r1_CGATGGCTGGAC 17.101099 3.885594 -0.803805 -0.557101 3.556439 -0.605906 -1.171681 2.418840 1.088428 -0.678596 ... -0.257935 0.000000 -0.000000 -1.433706 2.095816 0.051474 -1.320981 0.017615 0.947546 -0.662259
r1_GCGTGCTACTAC 2.224974 0.063991 -1.916026 -0.721554 -1.373457 0.313343 -0.444382 4.467587 -2.033359 1.282551 ... 0.485015 0.000000 -0.935773 0.270485 0.478233 -0.788953 -0.000000 0.252681 -0.563404 0.925160
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
r1_CGAAACTATCGC 2.195243 0.011369 0.000000 -0.000000 -0.107809 0.000000 -0.000000 -0.534394 -0.000000 -0.000000 ... -1.361469 1.173269 0.000000 -0.291303 0.000000 0.775780 -0.000000 1.852239 -0.000000 0.000000
r1_CCCCTCTCTGGC 0.000000 -0.000000 0.000000 -0.000000 -0.114969 -0.000000 0.932384 0.363043 -0.009413 -0.000000 ... 5.316246 0.353796 0.000000 -0.077829 -0.000000 -0.146776 0.000000 0.000000 0.015977 -0.000000
r1_ATCAATATTCTC -0.000000 -0.000000 0.000000 0.000000 0.817163 0.000000 -0.000000 0.000000 -0.000000 0.000000 ... 1.092017 0.044050 4.603456 -0.867112 0.023728 -0.000000 -0.000000 1.872482 -0.000000 -0.019444
r1_TCTCTGTGACGC 1.528452 -0.012077 0.011134 1.023459 0.917759 1.983044 -0.000000 0.586865 -0.000000 0.000000 ... 2.050997 -0.538215 0.736698 0.470899 0.000000 -0.000000 0.014210 0.000000 0.000000 -0.069790
r1_GGCGGACTGCGT 1.628706 -0.096092 -0.000000 -0.016540 3.745887 -0.029418 -0.050897 0.000000 -0.017742 0.935019 ... -2.207297 -0.633798 2.238783 -0.073710 1.017559 -0.080742 -0.000000 0.037455 0.000000 0.000000
r1_GAAGAGTATCTT -0.165410 0.000000 -0.000000 0.000000 0.855833 0.994807 -0.000000 -0.000000 0.013863 0.000000 ... -0.000000 0.480201 4.156182 2.273838 0.000000 -0.000000 0.000000 1.541622 -0.000000 0.000000
r1_GTTACACGAGTC 1.943480 -0.000000 0.957764 -0.000000 -0.047855 -0.000000 0.000000 -0.000000 -0.000000 0.000000 ... 5.790545 0.630836 -0.187083 1.666287 -0.000000 -0.000000 0.000000 0.000000 -0.000000 0.000000
r1_CATTGGTCTCAC 3.070188 -0.174132 -1.029044 -0.515659 0.703719 0.066136 -1.424556 0.614585 0.795668 -0.903366 ... 5.600525 -0.425672 10.884224 -2.120829 0.643494 2.271292 -0.588402 -0.182806 -0.312266 0.047062
r1_TATACTAAGTTT 2.770301 -0.445298 0.776546 -0.158671 0.332642 0.835105 -0.002061 1.015680 -0.066518 0.703998 ... 2.984812 1.252434 -0.264520 -0.000000 -0.000000 -0.000000 -0.105667 3.254264 0.579125 0.817419
r1_AGATCATCGTCC -0.000000 -0.048309 0.000000 -0.000000 0.866396 -0.000000 -0.000000 0.000000 0.999137 0.000000 ... -1.932864 0.409695 1.500891 3.290392 0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000
r1_CGGATTTACACT 1.784631 -0.000000 0.000000 -0.000000 -0.045397 -0.000000 0.938590 -0.002960 0.007074 0.000000 ... 0.000000 2.048196 4.595213 -0.458753 0.000000 0.885203 -0.000000 -0.000000 0.000000 -0.000000
r1_TATAGGAACAAA 0.438695 0.032053 0.000000 0.000000 0.000000 0.000000 -0.002251 0.562090 0.000000 -0.000000 ... 3.084729 0.793422 0.798523 -0.000000 0.974947 0.893590 0.000000 0.883523 -0.000000 -0.000000
r1_ACCATGTTGGGA 1.041601 0.219584 -0.108511 -0.156109 0.000000 -0.084239 -0.019090 -1.420963 -0.089953 -0.222936 ... 9.790146 -0.121851 7.362139 -0.428579 -0.000000 -0.102834 -0.036901 -0.136796 -0.000000 0.000000
r1_TCAAAGATAGGG -0.000000 0.974528 0.000000 1.011693 0.945346 -0.000000 -0.000000 -0.000000 0.000000 -0.000000 ... -0.000000 0.622028 1.443698 -0.204755 0.000000 -0.008213 0.000000 0.000000 -0.000000 -0.000000
r1_TTTATATTTGGG 0.659228 -0.000000 0.007339 -0.000000 -0.045322 -0.000000 0.918379 0.659567 0.986917 0.000000 ... 2.566947 -0.151579 -0.455806 0.884180 0.000000 0.000000 0.000000 0.000000 0.000000 -0.000000
r1_CACACCGCGTAG 1.186377 0.307160 0.821878 -0.101919 -0.506890 -0.041569 0.955889 -0.016220 -0.000000 -0.149948 ... -0.673384 0.136749 7.032807 1.076661 -0.075586 -0.022994 -0.000000 1.865617 -0.325853 0.000000
r1_GCTCGGTTAGTT -0.000000 -0.235687 -0.026272 -0.074866 3.449860 -0.061053 -0.000000 -0.000000 -0.039245 0.872183 ... 10.688130 0.627932 -0.508217 1.689609 1.875097 -0.000000 0.927727 -0.000000 -0.000000 -0.000000
r1_TAGAGGCCTATA -0.000000 -0.000000 0.000000 0.015248 -0.031060 0.000000 0.000000 -0.000000 0.010458 0.000000 ... 5.591531 0.283414 4.287004 5.259208 0.000000 -0.000000 0.018663 -0.000000 -0.000000 -0.009140
r1_TATAAAAAATTT -0.000000 -0.007146 0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.000000 0.000000 ... -0.670504 0.829290 0.232661 0.875470 0.000000 0.000000 0.015976 0.000000 0.000000 0.000000
r1_TCTAATATTCGC -0.331777 0.000000 1.008258 0.019039 -0.000000 0.000000 2.973341 0.683886 -0.000000 -0.000000 ... 3.736617 -0.362063 -2.351407 -0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 0.943395
r1_AGGGTGGGTACA -0.000000 0.000000 -0.020779 0.997030 -0.000000 0.000000 0.000000 0.969479 0.000000 0.000000 ... 0.599839 1.621370 -0.534722 -0.000000 -0.000000 -0.000000 -0.021030 0.070620 0.040233 -0.000000
r1_AATGCTGCAAGA -0.000000 0.007975 -0.000000 0.006630 -0.008046 0.000000 0.000000 -0.000000 0.000000 0.000000 ... 2.606610 -0.746857 -1.296138 0.232468 -0.000000 0.000000 0.000000 -0.000000 -0.000000 0.000000
r1_GTCGGGCCTTTC -0.213827 -0.000000 -0.009420 0.000000 -0.104125 0.000000 -0.000000 0.559870 0.012508 0.000000 ... -2.811222 0.000000 16.412101 0.950820 -0.000000 0.000000 1.889713 -0.000000 -0.000000 -0.000000
r1_GGGTCAGCGGCG 0.920600 1.260236 -0.105873 -0.157246 -0.929874 -0.083545 -0.074611 -0.552968 0.880902 0.779157 ... 4.037112 -1.204179 -4.443643 -0.000000 -0.000000 -0.000000 1.804412 -0.000000 -0.000000 0.000000
r1_CTGGACCTGCCC 0.000000 -0.191608 -0.026919 -0.016774 -0.234621 -0.024769 -0.000000 -0.661173 -0.000000 0.927448 ... 3.241622 -0.367363 6.982276 -0.138741 -0.000000 -0.000000 0.000000 -0.006284 -0.000000 0.000000
r1_AAGATATTGCTG -0.185538 0.960408 1.035136 0.000000 0.801181 0.000000 0.000000 0.712134 -0.000000 -0.000000 ... 4.298957 1.678050 4.851130 1.807929 0.016362 -0.030886 0.000000 0.000000 -0.028761 -0.057232
r1_GAGACCTCATGG 0.000000 -0.713697 -0.271694 0.508802 0.860829 -0.239491 0.488049 -0.363523 1.646615 -0.267568 ... 3.518335 -0.540237 1.018306 0.000000 -0.000000 0.666421 -0.003371 -0.136384 0.778941 1.797944
r1_CGGAGCGCGACA 1.052581 -0.099141 -0.000000 -0.000000 -0.136871 0.984826 -0.030601 0.970290 -0.000000 -0.000000 ... -1.453753 -0.163724 2.973280 0.522468 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.007794
r1_AAGGACAGATCC 0.000000 1.739662 2.596491 -0.543682 -0.524763 -0.284546 0.579656 -1.319067 -0.308557 1.550640 ... 8.426755 -0.829454 3.722234 0.369819 0.000000 -0.061672 -0.120301 -0.437313 -0.014598 -0.000000
r1_ATATGCACCCTA 0.864573 -0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000 -0.000000 -0.000000 -0.000000 ... -1.270910 -0.296835 4.645703 -0.000000 -0.000000 -0.003225 0.000000 0.000000 -0.000000 0.000000

300 rows × 88 columns


In [47]:
data = S[S > 0]
data = data.fillna(0)

g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[], row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [38]:
data = S.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [44]:
data = S.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [25]:
data = L.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

So this seemed to have flipped some of the cells into different types, and made the within-cluster distances smaller


In [45]:
reconstructed = L + S

data = reconstructed.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [81]:
csv = os.path.join(data_folder, 'sparse.csv')

S.to_csv(csv)

In [82]:
csv = os.path.join(data_folder, 'lowrank.csv')

L.to_csv(csv)

Try ICA before and after


In [79]:
L.shape


Out[79]:
(300, 259)

In [ ]:
from sklearn.decomposition import ICA

ica = ICA(n_components=)

ADMM implementation


In [36]:
reduced = rpcaADMM.rpcaADMM(lowrank)
# print(reduced.shape)
# reduced.head()


iter	    r norm	   eps pri	    s norm	  eps dual	 objective
   1	  386.9989	    5.6518	  560.3478	    5.5259	  46003.39
  10	  126.6648	    7.4377	   57.8995	    7.7838	 274951.69
  20	   19.5261	    7.9579	   26.0279	    7.5066	 297640.42
  30	   10.9950	    8.0745	   10.9821	    7.3901	 300438.68

In [62]:
rpcaADMM.rpcaADMM()

In [38]:
reduced.keys()


Out[38]:
dict_keys(['objval', 'r_norm', 's_norm', 'eps_pri', 'eps_dual', 'addm_toc', 'admm_iter', 'X1_admm', 'X2_admm', 'X3_admm'])

In [76]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
data = lowrank
mask = data == 0
sns.heatmap(lowrank, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    data = reduced[key]
    mask = data == 0
    
    vmin = data.min().min()
    vmax = data.max().max()
    center = 0
    sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
    ax.set(title=key)



In [79]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize * 1.25
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
common.heatmap(lowrank, ax=ax)
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    common.heatmap(reduced[key], ax=ax)
    ax.set(title=key)



In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])

In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]


Out[61]:
array([  0.06196856,   6.6911438 ,  13.40747184,   0.27285075,
        48.22056903,   2.24481196,   2.75026673,   0.05984601,   6.10384102])

In [80]:
ax


[autoreload of common failed: Traceback (most recent call last):
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 246, in check
    superreload(m, reload, self.old_objects)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 369, in superreload
    module = reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 608, in _exec
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 781, in get_code
  File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
  File "<frozen importlib._bootstrap>", line 205, in _call_with_frames_removed
  File "/Users/olgabot/code/cshl-singlecell-2017/notebooks/02_tissue_subpopulations/common.py", line 34
    for ax, (key, data) in zip(key, datas.items()):
                                                  ^
SyntaxError: unexpected EOF while parsing
]
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x124f07048>

In [81]:
sns.heatmap??

In [ ]: