In [16]:
import os
import common

# Assign notebook and folder names
notebook_name = '06_exploring_with_josh'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [17]:
%pdb


Automatic pdb calling has been turned OFF

In [39]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [24]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')

csv = os.path.join(input_folder, 'expression_table1_subset.csv')

table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()


(300, 259)
Out[24]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns

Assign colors basd on clusters


In [ ]:
cluster_name_to_id = {'Horizontal cells': [1], 'Retinal Ganglion cells': [2], 
                      'Amacrine cells': np.arange(3, 24),
                      'Rods', [24], 'Cones': [25], 
                      'Bipolar cells': np.arange(26, 34),
                      ''}

In [5]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[5]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [6]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()


Out[6]:
barcode
r1_TTCCTGCTAGGC    24
r1_TGGAGATACTCT    24
r1_CGTCTACATCCG    24
r1_CAAGCTTGGCGC    24
r1_ACTCACATAGAG    24
Name: cluster_id, dtype: int64

In [15]:
cluster_ids = cluster_identities_table1.unique()
cluster_ids


Out[15]:
array([24, 25, 26, 27, 33, 34])

In [32]:
cluster_names = cluster_identities_table1.map(common.cluster_id_to_name)
cluster_names.head()


Out[32]:
barcode
r1_TTCCTGCTAGGC    Rods
r1_TGGAGATACTCT    Rods
r1_CGTCTACATCCG    Rods
r1_CAAGCTTGGCGC    Rods
r1_ACTCACATAGAG    Rods
Name: cluster_id, dtype: object

In [7]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))

color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]


Out[7]:
[(0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)]

Spot check some genes


In [25]:
genes_of_interest = ['RHO', 'PAX6', 'GNAT1', 'SLC24A1']

In [56]:
subset = table1[genes_of_interest]
subset.head()


Out[56]:
RHO PAX6 GNAT1 SLC24A1
barcode
r1_TTCCTGCTAGGC 14 0 3 1
r1_TGGAGATACTCT 23 1 8 6
r1_CGTCTACATCCG 14 1 4 7
r1_CAAGCTTGGCGC 62 0 18 10
r1_ACTCACATAGAG 10 0 1 0

In [52]:
# subset_log = np.log(subset+1)
# subset_log.head()


Out[52]:
RHO PAX6 GNAT1 SLC24A1
barcode
r1_TTCCTGCTAGGC 2.708050 0.000000 1.386294 0.693147
r1_TGGAGATACTCT 3.178054 0.693147 2.197225 1.945910
r1_CGTCTACATCCG 2.708050 0.693147 1.609438 2.079442
r1_CAAGCTTGGCGC 4.143135 0.000000 2.944439 2.397895
r1_ACTCACATAGAG 2.397895 0.000000 0.693147 0.000000

In [57]:
subset_names = subset.join(cluster_names)
subset_names.head()


Out[57]:
RHO PAX6 GNAT1 SLC24A1 cluster_id
barcode
r1_TTCCTGCTAGGC 14 0 3 1 Rods
r1_TGGAGATACTCT 23 1 8 6 Rods
r1_CGTCTACATCCG 14 1 4 7 Rods
r1_CAAGCTTGGCGC 62 0 18 10 Rods
r1_ACTCACATAGAG 10 0 1 0 Rods

In [58]:
sns.pairplot(subset_names, hue='cluster_id')


Out[58]:
<seaborn.axisgrid.PairGrid at 0x1320d9ef0>

In [46]:
np


Out[46]:
<module 'numpy' from '/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/numpy/__init__.py'>

In [47]:
sns.pairplot(subset.apply(np.log), hue='cluster_id')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-0ab0021a6378> in <module>()
----> 1 sns.pairplot(subset.apply(np.log), hue='cluster_id')

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4243         if isinstance(f, np.ufunc):
   4244             with np.errstate(all='ignore'):
-> 4245                 results = f(self.values)
   4246             return self._constructor(data=results, index=self.index,
   4247                                      columns=self.columns, copy=False)

AttributeError: 'int' object has no attribute 'log'

Plot the original, dropout'd data


In [8]:
sns.set(style='whitegrid')

In [9]:
mask = table1 == 0

fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')


Out[9]:
[<matplotlib.text.Text at 0x10c690e80>, <matplotlib.text.Text at 0x10c8c2ba8>]

Maybe this is small enough for a clustered heatmap


In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Add Robust PCA implementations to path


In [62]:
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM

In [63]:
r_pca.R_pca??

In [64]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()


iteration: 1, error: 56422.70929985199
iteration: 100, error: 0.6880109089868683
iteration: 200, error: 0.14702804887281945
iteration: 251, error: 0.08979496489742976
CPU times: user 15.4 s, sys: 127 ms, total: 15.6 s
Wall time: 3.92 s

In [71]:
sns.distplot(s[s > 0.1], kde=False)


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x116624e10>

In [72]:
diff = rpca_alm.L - table1

In [73]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff}

common.heatmaps(datasets)



In [74]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()


Out[74]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 7.272295 3.048168 1.562439 2.198954 4.326513 1.385031 1.989380 6.541110 1.689744 1.345555 ... 0.313114 0.767164 0.864139 0.388544 0.150222 0.752207 0.272070 0.832861 1.258661 0.856469
r1_TGGAGATACTCT 13.174154 5.480442 2.635686 3.713715 7.521109 2.310180 3.257731 11.804131 2.741139 2.363235 ... 0.815807 1.031292 1.597556 0.999699 0.884529 1.033574 0.724387 1.236772 2.351665 1.286273
r1_CGTCTACATCCG 8.820288 3.785870 1.735920 2.396476 5.171789 1.561243 2.092837 7.836292 1.876383 1.537487 ... 0.382253 0.972003 0.828423 0.656586 0.391978 0.916053 0.429003 0.958758 1.373776 0.703660
r1_CAAGCTTGGCGC 25.982230 11.021147 5.608453 7.951609 15.774301 4.953723 7.427482 24.229816 6.077871 4.769225 ... 1.644721 3.139924 3.271503 2.906614 1.999058 2.751189 2.020806 3.190714 5.678790 3.812966
r1_ACTCACATAGAG 7.614892 3.096781 1.282597 1.838656 4.131712 1.165996 1.602792 6.662531 1.363853 1.237918 ... 0.687691 0.804546 1.079846 0.874430 0.684653 0.541885 0.754388 0.499074 1.349995 0.391946

5 rows × 259 columns


In [75]:
L_subset = L[genes_of_interest]
L_names = L_subset.join(cluster_names)

sns.pairplot(L_names, hue='cluster_id')


Out[75]:
<seaborn.axisgrid.PairGrid at 0x117764860>

In [76]:
sns.distplot(table1.values.flat)


Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x11648ff28>

In [78]:
sns.distplot(L.values.flat)


Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x117523780>

In [79]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'

table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'

tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()

sns.violinplot(x='dataset', y='molecules', data=tidy)


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x11596ab38>

In [80]:
sns.boxplot(x='dataset', y='molecules', data=tidy)


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x115c07278>

In [81]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()


Out[81]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [88]:
sns.boxplot(table1[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x11abaf0b8>

In [86]:
sns.boxplot(L[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a63b4a8>

In [87]:
sns.boxplot(S[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x11aa49470>

In [21]:
diff.head()


Out[21]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC -6.727705 0.048168 0.562439 -0.801046 -7.673487 1.385031 0.989380 -0.458890 -0.310256 -0.654445 ... -0.686886 -0.232836 -1.135861 0.388544 0.150222 0.752207 0.272070 -0.167139 1.258661 0.856469
r1_TGGAGATACTCT -9.825846 -2.519558 -3.364314 -0.286285 -5.478891 -6.689820 1.257731 -7.195869 1.741139 1.363235 ... -2.184193 1.031292 -0.402444 -0.000301 0.884529 0.033574 0.724387 -0.763228 2.351665 0.286273
r1_CGTCTACATCCG -5.179712 -0.214130 -5.264080 1.396476 -0.828211 -1.438757 2.092837 -5.163708 -0.123617 -0.462513 ... 0.382253 -0.027997 0.828423 -2.343414 0.391978 -0.083947 0.429003 -1.041242 1.373776 0.703660
r1_CAAGCTTGGCGC -36.017770 -6.978853 -4.391547 -12.048391 -13.225699 2.953723 -0.572518 -6.770184 -2.922129 2.769225 ... 1.644721 -1.860076 -3.728497 -0.093386 -0.000942 -3.248811 0.020806 0.190714 -1.321210 -7.187034
r1_ACTCACATAGAG -2.385108 2.096781 1.282597 0.838656 -0.868288 -0.834004 0.602792 -0.337469 -1.636147 0.237918 ... -0.312309 -0.195454 -0.920154 -2.125570 -0.315347 -1.458115 -0.245612 0.499074 -1.650005 0.391946

5 rows × 259 columns


In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0

datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff_gr0}

common.heatmaps(datasets)



In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[], 
                             col_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [88]:
import fastcluster

In [89]:
fastcluster.pdist?

In [95]:
table1_clustergrid = common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)
table1_clustergrid.savefig(os.path.join(figure_folder, 'expression_table1_clustermap.pdf'))


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [39]:
S.head()


Out[39]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [49]:
sns.distplot(S.values.flat)


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x1248a71d0>

In [52]:
np.median(S.values)


Out[52]:
0.0

In [56]:
high_in_sparse = (S > 10).any()
print(high_in_sparse.sum())
S.loc[:, high_in_sparse]


88
Out[56]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... TTYH1 PAX6 MGARP HSP90AA1 SLC6A6 MAP1B TMA7 SYT1 SNAP25 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... -0.000000 -0.000000 -0.302098 -0.081445 0.686886 0.232836 1.135861 -0.150222 -0.272070 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 1.331571 0.998086 -1.093359 -1.256187 2.184193 -1.031292 0.402444 -0.884529 -0.724387 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.341183 0.999751 1.902183 3.171176 -0.382253 0.027997 -0.828423 -0.391978 -0.429003 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... 0.814278 0.000000 -1.179403 -0.254800 -1.644721 1.860076 3.728497 -0.000000 -0.020806 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... -0.082806 -0.000000 -0.509653 -0.000000 0.312309 0.195454 0.920154 0.315347 0.245612 -0.391946
r1_TAACGGACACGC 21.380107 1.663735 -0.021046 -0.222488 12.299926 4.287691 -0.672176 0.000000 -2.237709 9.267142 ... 4.740199 0.000000 0.000000 -0.000000 0.952529 -1.441017 -0.406451 1.892411 -1.088772 0.354429
r1_CGCATGGGATAC 13.959024 3.970865 -1.348141 -0.892688 2.954439 -0.215551 1.402263 -0.000000 -0.442402 5.729592 ... 2.569047 0.000000 -0.719147 1.672199 0.435023 1.349910 -0.755655 -0.504931 0.403079 -0.428073
r1_TAACGACGCTTG 3.180794 -0.447518 0.109158 -0.257555 -2.279500 0.203602 3.582963 2.171476 -0.986389 0.327993 ... -0.133089 -0.000000 -0.236654 0.158758 -0.000000 0.434394 -0.410368 -0.045431 0.000000 2.149970
r1_TCGGCAGCCTCT 0.000000 6.456903 -1.145033 1.899016 0.735434 1.092624 0.135634 9.117565 0.692410 2.080789 ... -0.266130 0.000000 0.000000 -0.887233 -0.409884 -1.027818 -0.134590 -0.516927 -0.389400 -1.188493
r1_TAGGATGCAAAC 0.973777 -2.880712 2.373638 0.301869 -3.082936 -2.280995 5.395161 0.074149 8.163804 0.806921 ... -0.570322 -0.000000 -1.104568 -2.087004 -0.265266 0.564875 1.717807 -0.707985 -0.499107 5.146910
r1_CGGTTACAGTAG 12.505315 1.261856 2.428380 -1.565133 -1.709170 -1.225406 3.767564 3.748447 0.259539 -1.144302 ... -0.334235 0.000000 -0.000000 -0.807533 2.114339 0.758462 1.419423 1.893210 0.000000 -0.454211
r1_AATCGGATACGT 11.933689 -0.000000 2.991893 1.107978 -1.669472 1.209005 5.214369 0.000000 -0.186109 1.227780 ... -0.452205 0.000000 -1.065484 1.154736 0.482219 0.946174 -0.982452 1.195778 2.396707 1.631832
r1_AGTGGGCTTGAG 7.947213 4.715814 -1.638802 -1.356734 0.401653 -0.493621 0.793865 -0.000000 5.179635 0.523270 ... -0.212292 0.000000 0.039120 1.593622 0.396195 -0.750549 -0.893586 1.234860 0.221824 1.878540
r1_TTCACCTACCGC 10.781769 1.930491 0.537446 -2.004599 0.901076 -0.239981 -1.714882 -0.079131 2.512117 -0.276403 ... 0.165085 -0.000000 0.017907 0.238652 -0.090347 -0.579233 1.459530 -0.516233 -0.232060 -0.625630
r1_TTATGTCGTCCT 2.270350 3.943689 2.119016 1.400406 3.516772 1.362922 -2.202215 1.637913 -1.913613 1.275680 ... -0.533194 0.000000 -0.000000 2.163319 -0.309056 3.075198 0.000000 0.209408 -0.560234 -0.595371
r1_ATCAGCGCAGTC 0.000000 -0.565269 0.021651 0.567229 -2.430373 0.126985 4.493267 2.215334 0.862813 -0.714553 ... -0.000000 0.000000 0.537775 0.056646 -0.073409 0.411675 0.550113 -0.144370 0.780021 7.097770
r1_CTTTATGGTGAC 7.572463 -0.903545 -0.000000 2.227700 5.103486 3.413487 1.485007 10.665482 0.584033 -0.507655 ... 1.319728 0.000000 0.206072 -1.192753 2.364657 -1.385998 -0.295737 0.000000 1.001619 1.978113
r1_GAATCGGGAACA 0.130061 -2.574404 0.207521 -0.441166 -2.970246 -0.523037 3.736323 3.206105 -0.825140 -0.521023 ... 1.157655 -0.000000 -0.024933 0.125958 -0.152853 -0.815796 1.130103 0.207391 -0.320568 -0.858338
r1_GAAGTGATCACC 7.198676 1.907994 -0.363277 -0.764196 8.988425 0.000000 -1.520747 9.571634 -0.658129 -1.910792 ... 0.000000 0.000000 -1.730510 3.680007 -0.000000 0.058526 0.000000 -0.000000 -0.994082 -1.859172
r1_AGTGGGCGGCCG 8.848392 0.895194 -0.163581 3.340794 0.000000 0.958213 -0.538988 -1.387186 5.711675 0.000000 ... -0.115035 0.000000 0.269487 1.013680 -0.113011 -0.443464 -0.403637 0.567080 1.632395 5.046891
r1_ACTGATGATTAA -0.166009 0.258877 0.679583 0.564598 1.074236 -0.259644 -0.234408 -0.000000 -0.281426 0.673068 ... 0.000000 -0.000000 0.771423 -0.207019 0.917991 1.837832 -0.114600 -0.030735 -0.216464 -0.000000
r1_CGCCCGTCTGTA 11.020429 0.000000 0.081976 -2.722703 4.451928 0.291661 0.406819 2.286490 0.938283 0.300258 ... -0.292419 0.000000 0.727774 0.068704 -0.214730 -0.000000 -0.923698 1.250572 -0.505095 -0.963568
r1_GGGCTTGGGAAG -0.000000 2.448623 -0.000000 0.493179 -0.228614 -0.881305 -1.331266 0.641512 -0.062537 1.000179 ... -0.278898 -0.000000 3.872065 1.717896 -0.459817 -0.628542 0.258535 4.167146 1.243851 0.345875
r1_TTAATGACTACA -0.000000 0.866470 -1.088180 3.399241 0.000000 1.020583 6.455940 -0.432606 0.780824 -0.900402 ... -0.071543 0.000000 2.491098 2.013177 -0.132724 -0.505365 0.455063 -0.155937 0.761623 3.373602
r1_CGGCTGTCTGCT 19.308350 5.889286 0.084709 4.388549 -0.419955 -0.645631 -2.231481 3.700417 -0.926757 -1.735463 ... 0.000000 -0.000000 1.313213 0.745053 -0.448072 -0.966678 0.017107 0.784523 0.099009 -0.836805
r1_ATGATTATGGTT 4.632167 3.751768 -0.967775 -0.363413 4.001080 0.158328 0.758410 -2.679809 -0.982320 -0.870134 ... -0.059819 -0.000000 -0.253736 1.133169 -0.000000 -0.453014 2.358229 -0.074023 -0.050509 -0.206752
r1_TTTACTTCAAGG -1.347545 0.221071 0.604238 0.056137 0.059580 -0.206978 9.143060 0.000000 -1.473954 -0.173521 ... -0.016236 -0.000000 -0.419281 -0.088003 -0.000000 0.197948 0.362459 -0.000000 -0.000000 0.143271
r1_ATGGCTCGCAAA 6.387332 -0.915061 1.178258 0.446398 2.697413 0.388423 -2.239412 2.655880 -1.887992 -1.644699 ... -0.250518 -0.000000 0.000000 2.312154 -0.363505 -0.821810 -0.000000 -0.645316 0.432266 -0.786629
r1_CGATGGCTGGAC 17.101099 3.885594 -0.803805 -0.557101 3.556439 -0.605906 -1.171681 2.418840 1.088428 -0.678596 ... -0.257935 0.000000 -0.000000 -1.433706 2.095816 0.051474 -1.320981 0.017615 0.947546 -0.662259
r1_GCGTGCTACTAC 2.224974 0.063991 -1.916026 -0.721554 -1.373457 0.313343 -0.444382 4.467587 -2.033359 1.282551 ... 0.485015 0.000000 -0.935773 0.270485 0.478233 -0.788953 -0.000000 0.252681 -0.563404 0.925160
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
r1_CGAAACTATCGC 2.195243 0.011369 0.000000 -0.000000 -0.107809 0.000000 -0.000000 -0.534394 -0.000000 -0.000000 ... -1.361469 1.173269 0.000000 -0.291303 0.000000 0.775780 -0.000000 1.852239 -0.000000 0.000000
r1_CCCCTCTCTGGC 0.000000 -0.000000 0.000000 -0.000000 -0.114969 -0.000000 0.932384 0.363043 -0.009413 -0.000000 ... 5.316246 0.353796 0.000000 -0.077829 -0.000000 -0.146776 0.000000 0.000000 0.015977 -0.000000
r1_ATCAATATTCTC -0.000000 -0.000000 0.000000 0.000000 0.817163 0.000000 -0.000000 0.000000 -0.000000 0.000000 ... 1.092017 0.044050 4.603456 -0.867112 0.023728 -0.000000 -0.000000 1.872482 -0.000000 -0.019444
r1_TCTCTGTGACGC 1.528452 -0.012077 0.011134 1.023459 0.917759 1.983044 -0.000000 0.586865 -0.000000 0.000000 ... 2.050997 -0.538215 0.736698 0.470899 0.000000 -0.000000 0.014210 0.000000 0.000000 -0.069790
r1_GGCGGACTGCGT 1.628706 -0.096092 -0.000000 -0.016540 3.745887 -0.029418 -0.050897 0.000000 -0.017742 0.935019 ... -2.207297 -0.633798 2.238783 -0.073710 1.017559 -0.080742 -0.000000 0.037455 0.000000 0.000000
r1_GAAGAGTATCTT -0.165410 0.000000 -0.000000 0.000000 0.855833 0.994807 -0.000000 -0.000000 0.013863 0.000000 ... -0.000000 0.480201 4.156182 2.273838 0.000000 -0.000000 0.000000 1.541622 -0.000000 0.000000
r1_GTTACACGAGTC 1.943480 -0.000000 0.957764 -0.000000 -0.047855 -0.000000 0.000000 -0.000000 -0.000000 0.000000 ... 5.790545 0.630836 -0.187083 1.666287 -0.000000 -0.000000 0.000000 0.000000 -0.000000 0.000000
r1_CATTGGTCTCAC 3.070188 -0.174132 -1.029044 -0.515659 0.703719 0.066136 -1.424556 0.614585 0.795668 -0.903366 ... 5.600525 -0.425672 10.884224 -2.120829 0.643494 2.271292 -0.588402 -0.182806 -0.312266 0.047062
r1_TATACTAAGTTT 2.770301 -0.445298 0.776546 -0.158671 0.332642 0.835105 -0.002061 1.015680 -0.066518 0.703998 ... 2.984812 1.252434 -0.264520 -0.000000 -0.000000 -0.000000 -0.105667 3.254264 0.579125 0.817419
r1_AGATCATCGTCC -0.000000 -0.048309 0.000000 -0.000000 0.866396 -0.000000 -0.000000 0.000000 0.999137 0.000000 ... -1.932864 0.409695 1.500891 3.290392 0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000
r1_CGGATTTACACT 1.784631 -0.000000 0.000000 -0.000000 -0.045397 -0.000000 0.938590 -0.002960 0.007074 0.000000 ... 0.000000 2.048196 4.595213 -0.458753 0.000000 0.885203 -0.000000 -0.000000 0.000000 -0.000000
r1_TATAGGAACAAA 0.438695 0.032053 0.000000 0.000000 0.000000 0.000000 -0.002251 0.562090 0.000000 -0.000000 ... 3.084729 0.793422 0.798523 -0.000000 0.974947 0.893590 0.000000 0.883523 -0.000000 -0.000000
r1_ACCATGTTGGGA 1.041601 0.219584 -0.108511 -0.156109 0.000000 -0.084239 -0.019090 -1.420963 -0.089953 -0.222936 ... 9.790146 -0.121851 7.362139 -0.428579 -0.000000 -0.102834 -0.036901 -0.136796 -0.000000 0.000000
r1_TCAAAGATAGGG -0.000000 0.974528 0.000000 1.011693 0.945346 -0.000000 -0.000000 -0.000000 0.000000 -0.000000 ... -0.000000 0.622028 1.443698 -0.204755 0.000000 -0.008213 0.000000 0.000000 -0.000000 -0.000000
r1_TTTATATTTGGG 0.659228 -0.000000 0.007339 -0.000000 -0.045322 -0.000000 0.918379 0.659567 0.986917 0.000000 ... 2.566947 -0.151579 -0.455806 0.884180 0.000000 0.000000 0.000000 0.000000 0.000000 -0.000000
r1_CACACCGCGTAG 1.186377 0.307160 0.821878 -0.101919 -0.506890 -0.041569 0.955889 -0.016220 -0.000000 -0.149948 ... -0.673384 0.136749 7.032807 1.076661 -0.075586 -0.022994 -0.000000 1.865617 -0.325853 0.000000
r1_GCTCGGTTAGTT -0.000000 -0.235687 -0.026272 -0.074866 3.449860 -0.061053 -0.000000 -0.000000 -0.039245 0.872183 ... 10.688130 0.627932 -0.508217 1.689609 1.875097 -0.000000 0.927727 -0.000000 -0.000000 -0.000000
r1_TAGAGGCCTATA -0.000000 -0.000000 0.000000 0.015248 -0.031060 0.000000 0.000000 -0.000000 0.010458 0.000000 ... 5.591531 0.283414 4.287004 5.259208 0.000000 -0.000000 0.018663 -0.000000 -0.000000 -0.009140
r1_TATAAAAAATTT -0.000000 -0.007146 0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.000000 0.000000 ... -0.670504 0.829290 0.232661 0.875470 0.000000 0.000000 0.015976 0.000000 0.000000 0.000000
r1_TCTAATATTCGC -0.331777 0.000000 1.008258 0.019039 -0.000000 0.000000 2.973341 0.683886 -0.000000 -0.000000 ... 3.736617 -0.362063 -2.351407 -0.000000 -0.000000 0.000000 -0.000000 -0.000000 0.000000 0.943395
r1_AGGGTGGGTACA -0.000000 0.000000 -0.020779 0.997030 -0.000000 0.000000 0.000000 0.969479 0.000000 0.000000 ... 0.599839 1.621370 -0.534722 -0.000000 -0.000000 -0.000000 -0.021030 0.070620 0.040233 -0.000000
r1_AATGCTGCAAGA -0.000000 0.007975 -0.000000 0.006630 -0.008046 0.000000 0.000000 -0.000000 0.000000 0.000000 ... 2.606610 -0.746857 -1.296138 0.232468 -0.000000 0.000000 0.000000 -0.000000 -0.000000 0.000000
r1_GTCGGGCCTTTC -0.213827 -0.000000 -0.009420 0.000000 -0.104125 0.000000 -0.000000 0.559870 0.012508 0.000000 ... -2.811222 0.000000 16.412101 0.950820 -0.000000 0.000000 1.889713 -0.000000 -0.000000 -0.000000
r1_GGGTCAGCGGCG 0.920600 1.260236 -0.105873 -0.157246 -0.929874 -0.083545 -0.074611 -0.552968 0.880902 0.779157 ... 4.037112 -1.204179 -4.443643 -0.000000 -0.000000 -0.000000 1.804412 -0.000000 -0.000000 0.000000
r1_CTGGACCTGCCC 0.000000 -0.191608 -0.026919 -0.016774 -0.234621 -0.024769 -0.000000 -0.661173 -0.000000 0.927448 ... 3.241622 -0.367363 6.982276 -0.138741 -0.000000 -0.000000 0.000000 -0.006284 -0.000000 0.000000
r1_AAGATATTGCTG -0.185538 0.960408 1.035136 0.000000 0.801181 0.000000 0.000000 0.712134 -0.000000 -0.000000 ... 4.298957 1.678050 4.851130 1.807929 0.016362 -0.030886 0.000000 0.000000 -0.028761 -0.057232
r1_GAGACCTCATGG 0.000000 -0.713697 -0.271694 0.508802 0.860829 -0.239491 0.488049 -0.363523 1.646615 -0.267568 ... 3.518335 -0.540237 1.018306 0.000000 -0.000000 0.666421 -0.003371 -0.136384 0.778941 1.797944
r1_CGGAGCGCGACA 1.052581 -0.099141 -0.000000 -0.000000 -0.136871 0.984826 -0.030601 0.970290 -0.000000 -0.000000 ... -1.453753 -0.163724 2.973280 0.522468 -0.000000 0.000000 -0.000000 -0.000000 0.000000 -0.007794
r1_AAGGACAGATCC 0.000000 1.739662 2.596491 -0.543682 -0.524763 -0.284546 0.579656 -1.319067 -0.308557 1.550640 ... 8.426755 -0.829454 3.722234 0.369819 0.000000 -0.061672 -0.120301 -0.437313 -0.014598 -0.000000
r1_ATATGCACCCTA 0.864573 -0.000000 -0.000000 -0.000000 0.000000 0.000000 0.000000 -0.000000 -0.000000 -0.000000 ... -1.270910 -0.296835 4.645703 -0.000000 -0.000000 -0.003225 0.000000 0.000000 -0.000000 0.000000

300 rows × 88 columns


In [47]:
data = S[S > 0]
data = data.fillna(0)

g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[], row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [38]:
data = S.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [44]:
data = S.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [96]:
data = L.T.corr(method='spearman')
g_rpca = common.clustermap(data, col_colors=color_labels)
g_rpca.savefig(os.path.join(figure_folder, 'low_rank_clustermap.pdf'))


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [99]:
U, s, V = np.linalg.svd(L)
plt.plot(s[:10], 'o-')


Out[99]:
[<matplotlib.lines.Line2D at 0x13e2cbcf8>]

In [100]:
U, s, V = np.linalg.svd(table1)
plt.plot(s[:10], 'o-')


Out[100]:
[<matplotlib.lines.Line2D at 0x132a45cf8>]

So this seemed to have flipped some of the cells into different types, and made the within-cluster distances smaller


In [45]:
reconstructed = L + S

data = reconstructed.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
                             col_colors=color_labels, row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [81]:
csv = os.path.join(data_folder, 'sparse.csv')

S.to_csv(csv)

In [83]:
data_folder


Out[83]:
'data/002_robust_pca'

In [82]:
csv = os.path.join(data_folder, 'lowrank.csv')

L.to_csv(csv)

Try ICA before and after


In [79]:
L.shape


Out[79]:
(300, 259)

In [ ]:
from sklearn.decomposition import ICA

ica = ICA(n_components=)

ADMM implementation


In [36]:
reduced = rpcaADMM.rpcaADMM(table1)
# print(reduced.shape)
# reduced.head()


iter	    r norm	   eps pri	    s norm	  eps dual	 objective
   1	  386.9989	    5.6518	  560.3478	    5.5259	  46003.39
  10	  126.6648	    7.4377	   57.8995	    7.7838	 274951.69
  20	   19.5261	    7.9579	   26.0279	    7.5066	 297640.42
  30	   10.9950	    8.0745	   10.9821	    7.3901	 300438.68

In [62]:
rpcaADMM.rpcaADMM()

In [38]:
reduced.keys()


Out[38]:
dict_keys(['objval', 'r_norm', 's_norm', 'eps_pri', 'eps_dual', 'addm_toc', 'admm_iter', 'X1_admm', 'X2_admm', 'X3_admm'])

In [76]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
data = table1
mask = data == 0
sns.heatmap(table1, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    data = reduced[key]
    mask = data == 0
    
    vmin = data.min().min()
    vmax = data.max().max()
    center = 0
    sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
    ax.set(title=key)



In [79]:
ncols = 4
nrows = 1

axsize = 3

width = ncols * axsize * 1.25
height = nrows * axsize

fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat

x_keys = [key for key in reduced if key.startswith('X')]

ax = next(axes_iter)
common.heatmap(table1, ax=ax)
ax.set(title='Original')

for ax, key in zip(axes_iter, x_keys):
    common.heatmap(reduced[key], ax=ax)
    ax.set(title=key)



In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])

In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]


Out[61]:
array([  0.06196856,   6.6911438 ,  13.40747184,   0.27285075,
        48.22056903,   2.24481196,   2.75026673,   0.05984601,   6.10384102])

In [80]:
ax


[autoreload of common failed: Traceback (most recent call last):
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 246, in check
    superreload(m, reload, self.old_objects)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 369, in superreload
    module = reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 608, in _exec
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 781, in get_code
  File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
  File "<frozen importlib._bootstrap>", line 205, in _call_with_frames_removed
  File "/Users/olgabot/code/cshl-singlecell-2017/notebooks/02_tissue_subpopulations/common.py", line 34
    for ax, (key, data) in zip(key, datas.items()):
                                                  ^
SyntaxError: unexpected EOF while parsing
]
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x124f07048>

In [81]:
sns.heatmap??

In [ ]: