In [2]:
import os
import common

# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [3]:
%pdb


Automatic pdb calling has been turned ON

In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [5]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')

csv = os.path.join(input_folder, 'expression_table1_subset.csv')

table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()


(300, 259)
Out[5]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns

Assign colors basd on clusters


In [6]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[6]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [7]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()


Out[7]:
barcode
r1_TTCCTGCTAGGC    24
r1_TGGAGATACTCT    24
r1_CGTCTACATCCG    24
r1_CAAGCTTGGCGC    24
r1_ACTCACATAGAG    24
Name: cluster_id, dtype: int64

In [8]:
cluster_ids = cluster_identities_table1.unique()
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))

color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]


Out[8]:
[(0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)]

Plot the original, dropout'd data


In [9]:
sns.set(style='whitegrid')

In [10]:
mask = table1 == 0

fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')


Out[10]:
[<matplotlib.text.Text at 0x11a9df860>, <matplotlib.text.Text at 0x11a9cb5c0>]

Maybe this is small enough for a clustered heatmap


In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Add Robust PCA implementations to path


In [12]:
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM

In [104]:
r_pca.R_pca??

In [13]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()


iteration: 1, error: 56422.70929985199
iteration: 100, error: 0.6880109089868683
iteration: 200, error: 0.14702804887281945
iteration: 251, error: 0.08979496489742976
CPU times: user 15.1 s, sys: 145 ms, total: 15.3 s
Wall time: 3.9 s

In [14]:
rpca_alm.lmbda


Out[14]:
0.057735026918962568

In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)

In [16]:
U


Out[16]:
array([[-0.04272573, -0.02831748, -0.00597595, ...,  0.00333803,
         0.00177218,  0.01580328],
       [-0.07864946, -0.04817703, -0.01001383, ..., -0.0114292 ,
         0.00301747,  0.00501156],
       [-0.05291033, -0.03174095, -0.00638981, ..., -0.00100707,
        -0.00933078,  0.01188982],
       ..., 
       [-0.01026694,  0.02322763, -0.03528178, ...,  0.03940001,
         0.00723089, -0.05019267],
       [-0.04240066,  0.0962479 , -0.13841497, ..., -0.00552324,
         0.00943978, -0.00371454],
       [-0.00583065,  0.02502047, -0.03610277, ..., -0.0189998 ,
        -0.03432167,  0.05701413]])

In [17]:
sns.distplot(s[s > 0.1], kde=False)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f209b70>

In [59]:
diff = rpca_alm.L - table1

In [60]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff}

common.heatmaps(datasets)



In [61]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()


Out[61]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 7.272295 3.048168 1.562439 2.198954 4.326513 1.385031 1.989380 6.541110 1.689744 1.345555 ... 0.313114 0.767164 0.864139 0.388544 0.150222 0.752207 0.272070 0.832861 1.258661 0.856469
r1_TGGAGATACTCT 13.174154 5.480442 2.635686 3.713715 7.521109 2.310180 3.257731 11.804131 2.741139 2.363235 ... 0.815807 1.031292 1.597556 0.999699 0.884529 1.033574 0.724387 1.236772 2.351665 1.286273
r1_CGTCTACATCCG 8.820288 3.785870 1.735920 2.396476 5.171789 1.561243 2.092837 7.836292 1.876383 1.537487 ... 0.382253 0.972003 0.828423 0.656586 0.391978 0.916053 0.429003 0.958758 1.373776 0.703660
r1_CAAGCTTGGCGC 25.982230 11.021147 5.608453 7.951609 15.774301 4.953723 7.427482 24.229816 6.077871 4.769225 ... 1.644721 3.139924 3.271503 2.906614 1.999058 2.751189 2.020806 3.190714 5.678790 3.812966
r1_ACTCACATAGAG 7.614892 3.096781 1.282597 1.838656 4.131712 1.165996 1.602792 6.662531 1.363853 1.237918 ... 0.687691 0.804546 1.079846 0.874430 0.684653 0.541885 0.754388 0.499074 1.349995 0.391946

5 rows × 259 columns


In [63]:
sns.distplot(table1.values.flat)


Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x132164ac8>

In [62]:
sns.distplot(L.values.flat)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x12180e0f0>

In [101]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'

table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'

tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()

sns.violinplot(x='dataset', y='molecules', data=tidy)


Out[101]:
level_0 barcode molecules dataset
0 RHO r1_TTCCTGCTAGGC 14.0 Original
1 RHO r1_TGGAGATACTCT 23.0 Original
2 RHO r1_CGTCTACATCCG 14.0 Original
3 RHO r1_CAAGCTTGGCGC 62.0 Original
4 RHO r1_ACTCACATAGAG 10.0 Original

In [103]:
sns.boxplot(x='dataset', y='molecules', data=tidy)


Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f08da58>

In [37]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()


Out[37]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [21]:
diff.head()


Out[21]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC -6.727705 0.048168 0.562439 -0.801046 -7.673487 1.385031 0.989380 -0.458890 -0.310256 -0.654445 ... -0.686886 -0.232836 -1.135861 0.388544 0.150222 0.752207 0.272070 -0.167139 1.258661 0.856469
r1_TGGAGATACTCT -9.825846 -2.519558 -3.364314 -0.286285 -5.478891 -6.689820 1.257731 -7.195869 1.741139 1.363235 ... -2.184193 1.031292 -0.402444 -0.000301 0.884529 0.033574 0.724387 -0.763228 2.351665 0.286273
r1_CGTCTACATCCG -5.179712 -0.214130 -5.264080 1.396476 -0.828211 -1.438757 2.092837 -5.163708 -0.123617 -0.462513 ... 0.382253 -0.027997 0.828423 -2.343414 0.391978 -0.083947 0.429003 -1.041242 1.373776 0.703660
r1_CAAGCTTGGCGC -36.017770 -6.978853 -4.391547 -12.048391 -13.225699 2.953723 -0.572518 -6.770184 -2.922129 2.769225 ... 1.644721 -1.860076 -3.728497 -0.093386 -0.000942 -3.248811 0.020806 0.190714 -1.321210 -7.187034
r1_ACTCACATAGAG -2.385108 2.096781 1.282597 0.838656 -0.868288 -0.834004 0.602792 -0.337469 -1.636147 0.237918 ... -0.312309 -0.195454 -0.920154 -2.125570 -0.315347 -1.458115 -0.245612 0.499074 -1.650005 0.391946

5 rows × 259 columns


In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0

datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff_gr0}

common.heatmaps(datasets)



In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[], 
                             col_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [88]:
import fastcluster

In [89]:
fastcluster.pdist?

In [95]:
table1_clustergrid = common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)
table1_clustergrid.savefig(os.path.join(figure_folder, 'expression_table1_clustermap.pdf'))


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [39]:
S.head()


Out[39]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [49]:
sns.distplot(S.values.flat)


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x1248a71d0>

In [52]:
np.median(S.values)


Out[52]:
0.0

In [56]:
high_in_sparse = (S > 10).any()
print(high_in_sparse.sum())
S.loc[:, high_in_sparse]


88
Out[56]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... TTYH1 PAX6 MGARP HSP90AA1 SLC6A6 MAP1B TMA7 SYT1 SNAP25 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... -0.000000 -0.000000 -0.302098 -0.081445 0.686886 0.232836 1.135861 -0.150222 -0.272070 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 1.331571 0.998086 -1.093359 -1.256187 2.184193 -1.031292 0.402444 -0.884529 -0.724387 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.341183 0.999751 1.902183 3.171176 -0.382253 0.027997 -0.828423 -0.391978 -0.429003 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... 0.814278 0.000000 -1.179403 -0.254800 -1.644721 1.860076 3.728497 -0.000000 -0.020806 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... -0.082806 -0.000000 -0.509653 -0.000000 0.312309 0.195454 0.920154 0.315347 0.245612 -0.391946
r1_TAACGGACACGC 21.380107 1.663735 -0.021046 -0.222488 12.299926 4.287691 -0.672176 0.000000 -2.237709 9.267142 ... 4.740199 0.000000 0.000000 -0.000000 0.952529 -1.441017 -0.406451 1.892411 -1.088772 0.354429
r1_CGCATGGGATAC 13.959024 3.970865 -1.348141 -0.892688 2.954439 -0.215551 1.402263 -0.000000 -0.442402 5.729592 ... 2.569047 0.000000 -0.719147 1.672199 0.435023 1.349910 -0.755655 -0.504931 0.403079 -0.428073
r1_TAACGACGCTTG 3.180794 -0.447518 0.109158 -0.257555 -2.279500 0.203602 3.582963 2.171476 -0.986389 0.327993 ... -0.133089 -0.000000 -0.236654 0.158758 -0.000000 0.434394 -0.410368 -0.045431 0.000000 2.149970
r1_TCGGCAGCCTCT 0.000000 6.456903 -1.145033 1.899016 0.735434 1.092624 0.135634 9.117565 0.692410 2.080789 ... -0.266130 0.000000 0.000000 -0.887233 -0.409884 -1.027818 -0.134590 -0.516927 -0.389400 -1.188493
r1_TAGGATGCAAAC 0.973777 -2.880712 2.373638 0.301869 -3.082936 -2.280995 5.395161 0.074149 8.163804 0.806921 ... -0.570322 -0.000000 -1.104568 -2.087004 -0.265266 0.564875 1.717807 -0.707985 -0.499107 5.146910
r1_CGGTTACAGTAG 12.505315 1.261856 2.428380 -1.565133 -1.709170 -1.225406 3.767564 3.748447 0.259539 -1.144302 ... -0.334235 0.000000 -0.000000 -0.807533 2.114339 0.758462 1.419423 1.893210 0.000000 -0.454211
r1_AATCGGATACGT 11.933689 -0.000000 2.991893 1.107978 -1.669472 1.209005 5.214369 0.000000 -0.186109 1.227780 ... -0.452205 0.000000 -1.065484 1.154736 0.482219 0.946174 -0.982452 1.195778 2.396707 1.631832
r1_AGTGGGCTTGAG 7.947213 4.715814 -1.638802 -1.356734 0.401653 -0.493621 0.793865 -0.000000 5.179635 0.523270 ... -0.212292 0.000000 0.039120 1.593622 0.396195 -0.750549 -0.893586 1.234860 0.221824 1.878540
r1_TTCACCTACCGC 10.781769 1.930491 0.537446 -2.004599 0.901076 -0.239981 -1.714882 -0.079131 2.512117 -0.276403 ... 0.165085 -0.000000 0.017907 0.238652 -0.090347 -0.579233 1.459530 -0.516233 -0.232060 -0.625630
r1_TTATGTCGTCCT 2.270350 3.943689 2.119016 1.400406 3.516772 1.362922 -2.202215 1.637913 -1.913613 1.275680 ... -0.533194 0.000000 -0.000000 2.163319 -0.309056 3.075198 0.000000 0.209408 -0.560234 -0.595371
r1_ATCAGCGCAGTC 0.000000 -0.565269 0.021651 0.567229 -2.430373 0.126985 4.493267 2.215334 0.862813 -0.714553 ... -0.000000 0.000000 0.537775 0.056646 -0.073409 0.411675 0.550113 -0.144370 0.780021 7.097770
r1_CTTTATGGTGAC 7.572463 -0.903545 -0.000000 2.227700 5.103486 3.413487 1.485007 10.665482 0.584033 -0.507655 ... 1.319728 0.000000 0.206072 -1.192753 2.364657 -1.385998 -0.295737 0.000000 1.001619 1.978113
r1_GAATCGGGAACA 0.130061 -2.574404 0.207521 -0.441166 -2.970246 -0.523037 3.736323 3.206105 -0.825140 -0.521023 ... 1.157655 -0.000000 -0.024933 0.125958 -0.152853 -0.815796 1.130103 0.207391 -0.320568 -0.858338
r1_GAAGTGATCACC 7.198676 1.907994 -0.363277 -0.764196 8.988425 0.000000 -1.520747 9.571634 -0.658129 -1.910792 ... 0.000000 0.000000 -1.730510 3.680007 -0.000000 0.058526 0.000000 -0.000000 -0.994082 -1.859172
r1_AGTGGGCGGCCG 8.848392 0.895194 -0.163581 3.340794 0.000000 0.958213 -0.538988 -1.387186 5.711675 0.000000 ... -0.115035 0.000000 0.269487 1.013680 -0.113011 -0.443464 -0.403637 0.567080 1.632395 5.046891
r1_ACTGATGATTAA -0.166009 0.258877 0.679583 0.564598 1.074236 -0.259644 -0.234408 -0.000000 -0.281426 0.673068 ... 0.000000 -0.000000 0.771423 -0.207019 0.917991 1.837832 -0.114600 -0.030735 -0.216464 -0.000000
r1_CGCCCGTCTGTA 11.020429 0.000000 0.081976 -2.722703 4.451928 0.291661 0.406819 2.286490 0.938283 0.300258 ... -0.292419 0.000000 0.727774 0.068704 -0.214730 -0.000000 -0.923698 1.250572 -0.505095 -0.963568
r1_GGGCTTGGGAAG -0.000000 2.448623 -0.000000 0.493179 -0.228614 -0.881305 -1.331266 0.641512 -0.062537 1.000179 ... -0.278898 -0.000000 3.872065 1.717896 -0.459817 -0.628542 0.258535 4.167146 1.243851 0.345875
r1_TTAATGACTACA -0.000000 0.866470 -1.088180 3.399241 0.000000 1.020583 6.455940 -0.432606 0.780824 -0.900402 ... -0.071543 0.000000 2.491098 2.013177 -0.132724 -0.505365 0.455063 -0.155937 0.761623 3.373602
r1_CGGCTGTCTGCT 19.308350 5.889286 0.084709 4.388549 -0.419955 -0.645631 -2.231481 3.700417 -0.926757 -1.735463 ... 0.000000 -0.000000 1.313213 0.745053 -0.448072 -0.966678 0.017107 0.784523 0.099009 -0.836805
r1_ATGATTATGGTT 4.632167 3.751768 -0.967775 -0.363413 4.001080 0.158328 0.758410 -2.679809 -0.982320 -0.870134 ... -0.059819 -0.000000 -0.253736 1.133169 -0.000000 -0.453014 2.358229 -0.074023 -0.050509 -0.206752
r1_TTTACTTCAAGG -1.347545 0.221071 0.604238 0.056137 0.059580 -0.206978 9.143060 0.000000 -1.473954 -0.173521 ... -0.016236 -0.000000 -0.419281 -0.088003 -0.000000 0.197948 0.362459 -0.000000 -0.000000 0.143271
r1_ATGGCTCGCAAA 6.387332 -0.915061 1.178258 0.446398 2.697413 0.388423 -2.239412 2.655880 -1.887992 -1.644699 ... -0.250518 -0.000000 0.000000 2.312154 -0.363505 -0.821810 -0.000000 -0.645316 0.432266 -0.786629
r1_CGATGGCTGGAC 17.101099 3.885594 -0.803805 -0.557101 3.556439 -0.605906 -1.171681 2.418840 1.088428 -0.678596 ... -0.257935 0.000000 -0.000000 -1.433706 2.095816 0.051474 -1.320981 0.017615 0.947546 -0.662259
r1_GCGTGCTACTAC 2.224974 0.063991 -1.916026 -0.721554 -1.373457 0.313343 -0.444382 4.467587 -2.033359 1.282551 ... 0.485015 0.000000 -0.935773 0.270485 0.478233 -0.788953 -0.000000 0.252681 -0.563404 0.925160
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
r1_CGAAACTATCGC 2.195243 0.011369 0.000000 -0.000000 -0.107809 0.000000 -0.000000 -0.534394 -0.000000 -0.000000 ... -1.361469 1.173269 0.000000 -0.291303 0.000000 0.775780 -0.000000 1.852239 -0.000000 0.000000
r1_CCCCTCTCTGGC 0.000000 -0.000000 0.000000 -0.000000 -0.114969 -0.000000 0.932384 0.363043 -0.009413 -0.000000 ... 5.316246 0.353796 0.000000 -0.077829 -0.000000 -0.146776 0.000000 0.000000 0.015977 -0.000000
r1_ATCAATATTCTC -0.000000 -0.000000 0.000000 0.000000 0.817163 0.000000 -0.000000 0.000000 -0.000000 0.000000 ... 1.092017 0.044050 4.603456 -0.867112 0.023728 -0.000000 -0.000000 1.872482 -0.000000 -0.019444
r1_TCTCTGTGACGC 1.528452 -0.012077 0.011134 1.023459 0.917759 1.983044 -0.000000 0.586865 -0.000000 0.000000 ... 2.050997 -0.538215 0.736698 0.470899 0.000000 -0.000000 0.014210 0.000000 0.000000 -0.069790
r1_GGCGGACTGCGT 1.628706 -0.096092 -0.000000 -0.016540 3.745887 -0.029418 -0.050897 0.000000 -0.017742 0.935019 ... -2.207297 -0.633798 2.238783 -0.073710 1.017559 -0.080742 -0.000000 0.037455 0.000000 0.000000
r1_GAAGAGTATCTT -0.165410 0.000000 -0.000000 0.000000 0.855833 0.994807 -0.000000 -0.000000 0.013863 0.000000 ... -0.000000 0.480201 4.156182 2.273838 0.000000 -0.000000 0.000000 1.541622 -0.000000 0.000000
r1_GTTACACGAGTC 1.943480 -0.000000 0.957764 -0.000000 -0.047855 -0.000000 0.000000 -0.000000 -0.000000 0.000000 ... 5.790545 0.630836 -0.187083 1.666287 -0.000000 -0.000000 0.000000 0.000000 -0.000000 0.000000
r1_CATTGGTCTCAC 3.070188 -0.174132 -1.029044 -0.515659 0.703719 0.066136 -1.424556 0.614585 0.795668 -0.903366 ... 5.600525 -0.425672 10.884224 -2.120829 0.643494 2.271292 -0.588402 -0.182806 -0.312266 0.047062
r1_TATACTAAGTTT 2.770301 -0.445298 0.776546 -0.158671 0.332642 0.835105 -0.002061 1.015680 -0.066518 0.703998 ... 2.984812 1.252434 -0.264520 -0.000000 -0.000000 -0.000000 -0.105667 3.254264 0.579125 0.817419
r1_AGATCATCGTCC -0.000000 -0.048309 0.000000 -0.000000 0.866396 -0.000000 -0.000000 0.000000 0.999137 0.000000 ... -1.932864 0.409695 1.500891 3.290392 0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000
r1_CGGATTTACACT 1.784631 -0.000000 0.000000 -0.000000 -0.045397 -0.000000 0.938590 -0.002960 0.007074 0.000000 ... 0.000000 2.048196 4.595213 -0.458753 0.000000 0.885203 -0.000000 -0.000000 0.000000 -0.000000
r1_TATAGGAACAAA 0.438695 0.032053 0.000000 0.000000 0.000000 0.000000 -0.002251 0.562090 0.000000 -0.000000 ... 3.084729 0.793422 0.798523 -0.000000 0.974947 0.893590 0.000000 0.883523 -0.000000 -0.000000
r1_ACCATGTTGGGA 1.041601 0.219584 -0.108511 -0.156109 0.000000 -0.084239 -0.019090 -1.420963 -0.089953 -0.222936 ... 9.790146 -0.121851 7.362139 -0.428579 -0.000000 -0.102834 -0.036901 -0.136796 -0.000000 0.000000
r1_TCAAAGATAGGG -0.000000 0.974528 0.000000 1.011693 0.945346 -0.000000 -0.000000 -0.000000 0.000000 -0.000000 ... -0.000000 0.622028 1.443698 -0.204755 0.000000 -0.008213 0.000000 0.000000 -0.000000 -0.000000
r1_TTTATATTTGGG 0.659228 -0.000000 0.007339 -0.000000 -0.045322 -0.000000 0.918379 0.659567 0.986917 0.000000 ... 2.566947 -0.151579 -0.455806 0.884180 0.000000 0.000000 0.000000 0.000000 0.000000 -0.000000
r1_CACACCGCGTAG 1.186377 0.307160 0.821878 -0.101919 -0.506890 -0.041569 0.955889 -0.016220 -0.000000 -0.149948 ... -0.673384 0.136749 7.032807 1.076661 -0.075586 -0.022994 -0.000000 1.865617 -0.325853 0.000000
r1_GCTCGGTTAGTT -0.000000 -0.235687 -0.026272 -0.074866 3.449860 -0.061053 -0.000000 -0.000000 -0.039245 0.872183 ... 10.688130 0.627932 -0.508217 1.689609 1.875097 -0.000000 0.927727 -0.000000 -0.000000 -0.000000
r1_TAGAGGCCTATA -0.000000 -0.000000 0.000000 0.015248 -0.031060 0.000000 0.000000 -0.000000 0.010458 0.000000 ... 5.591531 0.283414 4.287004 5.259208 0.000000 -0.000000 0.018663 -0.000000 -0.000000 -0.009140
r1_TATAAAAAATTT -0.000000 -0.007146 0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.000000 0.000000 ... -0.670504 0.829290 0.232661 0.875470 0.000000 0.000000 0.015976 0.000000 0.000000 0.000000
r1_TCTAATATTCGC -0.331777 0.000000 1.008258 0.019039 -0.000000 0.000000 2.973341 0.683886 -0.000000 -0.000000 ... 3.736617 -0.362063 -2.351407 -0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 0.943395
r1_AGGGTGGGTACA -0.000000 0.000000 -0.020779 0.997030 -0.000000 0.000000 0.000000 0.969479 0.000000 0.000000 ... 0.599839 1.621370 -0.534722 -0.000000 -0.000000 -0.000000 -0.021030 0.070620 0.040233 -0.000000
r1_AATGCTGCAAGA -0.000000 0.007975 -0.000000 0.006630 -0.008046 0.000000 0.000000 -0.000000 0.000000 0.000000 ... 2.606610 -0.746857 -1.296138 0.232468 -0.000000 0.000000 0.000000 -0.000000 -0.000000 0.000000
r1_GTCGGGCCTTTC -0.213827 -0.000000 -0.009420 0.000000 -0.104125 0.000000 -0.000000 0.559870 0.012508 0.000000 ... -2.811222 0.000000 16.412101 0.950820 -0.000000 0.000000 1.889713 -0.000000 -0.000000 -0.000000
r1_GGGTCAGCGGCG 0.920600 1.260236 -0.105873 -0.157246 -0.929874 -0.083545 -0.074611 -0.552968 0.880902 0.779157 ... 4.037112 -1.204179 -4.443643 -0.000000 -0.000000 -0.000000 1.804412 -0.000000 -0.000000 0.000000
r1_CTGGACCTGCCC 0.000000 -0.191608 -0.026919 -0.016774 -0.234621 -0.024769 -0.000000 -0.661173 -0.000000 0.927448 ... 3.241622 -0.367363 6.982276 -0.138741 -0.000000 -0.000000 0.000000 -0.006284 -0.000000 0.000000
r1_AAGATATTGCTG -0.185538 0.960408 1.035136 0.000000 0.801181 0.000000 0.000000 0.712134 -0.000000 -0.000000 ... 4.298957 1.678050 4.851130 1.807929 0.016362 -0.030886 0.000000 0.000000 -0.028761 -0.057232
r1_GAGACCTCATGG 0.000000 -0.713697 -0.271694 0.508802 0.860829 -0.239491 0.488049 -0.363523 1.646615 -0.267568 ... 3.518335 -0.540237 1.018306 0.000000 -0.000000 0.666421 -0.003371 -0.136384 0.778941 1.797944
r1_CGGAGCGCGACA 1.052581 -0.099141 -0.000000 -0.000000 -0.136871 0.984826 -0.030601 0.970290 -0.000000 -0.000000 ... -1.453753 -0.163724 2.973280 0.522468 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.007794
r1_AAGGACAGATCC 0.000000 1.739662 2.596491 -0.543682 -0.524763 -0.284546 0.579656 -1.319067 -0.308557 1.550640 ... 8.426755 -0.829454 3.722234 0.369819 0.000000 -0.061672 -0.120301 -0.437313 -0.014598 -0.000000
r1_ATATGCACCCTA 0.864573 -0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000 -0.000000 -0.000000 -0.000000 ... -1.270910 -0.296835 4.645703 -0.000000 -0.000000 -0.003225 0.000000 0.000000 -0.000000 0.000000

300 rows × 88 columns


In [47]:
data = S[S > 0]
data = data.fillna(0)

g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[], row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [38]:
data = S.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [44]:
data = S.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [96]:
data = L.T.corr(method='spearman')
g_rpca = common.clustermap(data, col_colors=color_labels)
g_rpca.savefig(os.path.join(figure_folder, 'low_rank_clustermap.pdf'))


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [99]:
U, s, V = np.linalg.svd(L)
plt.plot(s[:10], 'o-')


Out[99]:
[<matplotlib.lines.Line2D at 0x13e2cbcf8>]

In [100]:
U, s, V = np.linalg.svd(table1)
plt.plot(s[:10], 'o-')


Out[100]:
[<matplotlib.lines.Line2D at 0x132a45cf8>]

So this seemed to have flipped some of the cells into different types, and made the within-cluster distances smaller


In [45]:
reconstructed = L + S

data = reconstructed.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [81]:
csv = os.path.join(data_folder, 'sparse.csv')

S.to_csv(csv)

In [83]:
data_folder


Out[83]:
'data/002_robust_pca'

In [82]:
csv = os.path.join(data_folder, 'lowrank.csv')

L.to_csv(csv)

Try ICA before and after


In [79]:
L.shape


Out[79]:
(300, 259)

In [ ]:
from sklearn.decomposition import ICA

ica = ICA(n_components=)

ADMM implementation


In [36]:
reduced = rpcaADMM.rpcaADMM(table1)
# print(reduced.shape)
# reduced.head()


iter	    r norm	   eps pri	    s norm	  eps dual	 objective
   1	  386.9989	    5.6518	  560.3478	    5.5259	  46003.39
  10	  126.6648	    7.4377	   57.8995	    7.7838	 274951.69
  20	   19.5261	    7.9579	   26.0279	    7.5066	 297640.42
  30	   10.9950	    8.0745	   10.9821	    7.3901	 300438.68

In [62]:
rpcaADMM.rpcaADMM()

In [38]:
reduced.keys()


Out[38]:
dict_keys(['objval', 'r_norm', 's_norm', 'eps_pri', 'eps_dual', 'addm_toc', 'admm_iter', 'X1_admm', 'X2_admm', 'X3_admm'])

In [76]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
data = table1
mask = data == 0
sns.heatmap(table1, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    data = reduced[key]
    mask = data == 0
    
    vmin = data.min().min()
    vmax = data.max().max()
    center = 0
    sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
    ax.set(title=key)



In [79]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize * 1.25
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
common.heatmap(table1, ax=ax)
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    common.heatmap(reduced[key], ax=ax)
    ax.set(title=key)



In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])

In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]


Out[61]:
array([  0.06196856,   6.6911438 ,  13.40747184,   0.27285075,
        48.22056903,   2.24481196,   2.75026673,   0.05984601,   6.10384102])

In [80]:
ax


[autoreload of common failed: Traceback (most recent call last):
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 246, in check
    superreload(m, reload, self.old_objects)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 369, in superreload
    module = reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 608, in _exec
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 781, in get_code
  File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
  File "<frozen importlib._bootstrap>", line 205, in _call_with_frames_removed
  File "/Users/olgabot/code/cshl-singlecell-2017/notebooks/02_tissue_subpopulations/common.py", line 34
    for ax, (key, data) in zip(key, datas.items()):
                                                  ^
SyntaxError: unexpected EOF while parsing
]
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x124f07048>

In [81]:
sns.heatmap??

In [ ]: