In [2]:
import os
import common

# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [3]:
%pdb


Automatic pdb calling has been turned ON

In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [5]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')

csv = os.path.join(input_folder, 'expression_table1_subset.csv')

table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()


(300, 259)
Out[5]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns

Assign colors basd on clusters


In [6]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[6]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [7]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()


Out[7]:
barcode
r1_TTCCTGCTAGGC    24
r1_TGGAGATACTCT    24
r1_CGTCTACATCCG    24
r1_CAAGCTTGGCGC    24
r1_ACTCACATAGAG    24
Name: cluster_id, dtype: int64

In [8]:
cluster_ids = cluster_identities_table1.unique()
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))

color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]


Out[8]:
[(0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)]

Plot the original, dropout'd data


In [9]:
sns.set(style='whitegrid')

In [10]:
mask = table1 == 0

fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')


Out[10]:
[<matplotlib.text.Text at 0x11a9df860>, <matplotlib.text.Text at 0x11a9cb5c0>]

Maybe this is small enough for a clustered heatmap


In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Add Robust PCA implementations to path


In [12]:
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM

In [104]:
r_pca.R_pca??

In [13]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()


iteration: 1, error: 56422.70929985199
iteration: 100, error: 0.6880109089868683
iteration: 200, error: 0.14702804887281945
iteration: 251, error: 0.08979496489742976
CPU times: user 15.1 s, sys: 145 ms, total: 15.3 s
Wall time: 3.9 s

In [14]:
rpca_alm.lmbda


Out[14]:
0.057735026918962568

In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)

In [16]:
U


Out[16]:
array([[-0.04272573, -0.02831748, -0.00597595, ...,  0.00333803,
         0.00177218,  0.01580328],
       [-0.07864946, -0.04817703, -0.01001383, ..., -0.0114292 ,
         0.00301747,  0.00501156],
       [-0.05291033, -0.03174095, -0.00638981, ..., -0.00100707,
        -0.00933078,  0.01188982],
       ..., 
       [-0.01026694,  0.02322763, -0.03528178, ...,  0.03940001,
         0.00723089, -0.05019267],
       [-0.04240066,  0.0962479 , -0.13841497, ..., -0.00552324,
         0.00943978, -0.00371454],
       [-0.00583065,  0.02502047, -0.03610277, ..., -0.0189998 ,
        -0.03432167,  0.05701413]])

In [17]:
sns.distplot(s[s > 0.1], kde=False)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f209b70>

In [59]:
diff = rpca_alm.L - table1

In [60]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff}

common.heatmaps(datasets)



In [61]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()


Out[61]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 7.272295 3.048168 1.562439 2.198954 4.326513 1.385031 1.989380 6.541110 1.689744 1.345555 ... 0.313114 0.767164 0.864139 0.388544 0.150222 0.752207 0.272070 0.832861 1.258661 0.856469
r1_TGGAGATACTCT 13.174154 5.480442 2.635686 3.713715 7.521109 2.310180 3.257731 11.804131 2.741139 2.363235 ... 0.815807 1.031292 1.597556 0.999699 0.884529 1.033574 0.724387 1.236772 2.351665 1.286273
r1_CGTCTACATCCG 8.820288 3.785870 1.735920 2.396476 5.171789 1.561243 2.092837 7.836292 1.876383 1.537487 ... 0.382253 0.972003 0.828423 0.656586 0.391978 0.916053 0.429003 0.958758 1.373776 0.703660
r1_CAAGCTTGGCGC 25.982230 11.021147 5.608453 7.951609 15.774301 4.953723 7.427482 24.229816 6.077871 4.769225 ... 1.644721 3.139924 3.271503 2.906614 1.999058 2.751189 2.020806 3.190714 5.678790 3.812966
r1_ACTCACATAGAG 7.614892 3.096781 1.282597 1.838656 4.131712 1.165996 1.602792 6.662531 1.363853 1.237918 ... 0.687691 0.804546 1.079846 0.874430 0.684653 0.541885 0.754388 0.499074 1.349995 0.391946

5 rows × 259 columns


In [63]:
sns.distplot(table1.values.flat)


Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x132164ac8>

In [62]:
sns.distplot(L.values.flat)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x12180e0f0>

In [101]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'

table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'

tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()

sns.violinplot(x='dataset', y='molecules', data=tidy)


Out[101]:
level_0 barcode molecules dataset
0 RHO r1_TTCCTGCTAGGC 14.0 Original
1 RHO r1_TGGAGATACTCT 23.0 Original
2 RHO r1_CGTCTACATCCG 14.0 Original
3 RHO r1_CAAGCTTGGCGC 62.0 Original
4 RHO r1_ACTCACATAGAG 10.0 Original

In [103]:
sns.boxplot(x='dataset', y='molecules', data=tidy)


Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f08da58>

In [37]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()


Out[37]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [21]:
diff.head()


Out[21]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC -6.727705 0.048168 0.562439 -0.801046 -7.673487 1.385031 0.989380 -0.458890 -0.310256 -0.654445 ... -0.686886 -0.232836 -1.135861 0.388544 0.150222 0.752207 0.272070 -0.167139 1.258661 0.856469
r1_TGGAGATACTCT -9.825846 -2.519558 -3.364314 -0.286285 -5.478891 -6.689820 1.257731 -7.195869 1.741139 1.363235 ... -2.184193 1.031292 -0.402444 -0.000301 0.884529 0.033574 0.724387 -0.763228 2.351665 0.286273
r1_CGTCTACATCCG -5.179712 -0.214130 -5.264080 1.396476 -0.828211 -1.438757 2.092837 -5.163708 -0.123617 -0.462513 ... 0.382253 -0.027997 0.828423 -2.343414 0.391978 -0.083947 0.429003 -1.041242 1.373776 0.703660
r1_CAAGCTTGGCGC -36.017770 -6.978853 -4.391547 -12.048391 -13.225699 2.953723 -0.572518 -6.770184 -2.922129 2.769225 ... 1.644721 -1.860076 -3.728497 -0.093386 -0.000942 -3.248811 0.020806 0.190714 -1.321210 -7.187034
r1_ACTCACATAGAG -2.385108 2.096781 1.282597 0.838656 -0.868288 -0.834004 0.602792 -0.337469 -1.636147 0.237918 ... -0.312309 -0.195454 -0.920154 -2.125570 -0.315347 -1.458115 -0.245612 0.499074 -1.650005 0.391946

5 rows × 259 columns


In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0

datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff_gr0}

common.heatmaps(datasets)



In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[], 
                             col_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [88]:
import fastcluster

In [89]:
fastcluster.pdist?

In [95]:
table1_clustergrid = common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)
table1_clustergrid.savefig(os.path.join(figure_folder, 'expression_table1_clustermap.pdf'))


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)