In [16]:
import os
import common

# Assign notebook and folder names
notebook_name = '06_exploring_with_josh'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [17]:
%pdb


Automatic pdb calling has been turned OFF

In [39]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [24]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')

csv = os.path.join(input_folder, 'expression_table1_subset.csv')

table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()


(300, 259)
Out[24]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns

Assign colors basd on clusters


In [ ]:
cluster_name_to_id = {'Horizontal cells': [1], 'Retinal Ganglion cells': [2], 
                      'Amacrine cells': np.arange(3, 24),
                      'Rods', [24], 'Cones': [25], 
                      'Bipolar cells': np.arange(26, 34),
                      ''}

In [5]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[5]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [6]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()


Out[6]:
barcode
r1_TTCCTGCTAGGC    24
r1_TGGAGATACTCT    24
r1_CGTCTACATCCG    24
r1_CAAGCTTGGCGC    24
r1_ACTCACATAGAG    24
Name: cluster_id, dtype: int64

In [15]:
cluster_ids = cluster_identities_table1.unique()
cluster_ids


Out[15]:
array([24, 25, 26, 27, 33, 34])

In [32]:
cluster_names = cluster_identities_table1.map(common.cluster_id_to_name)
cluster_names.head()


Out[32]:
barcode
r1_TTCCTGCTAGGC    Rods
r1_TGGAGATACTCT    Rods
r1_CGTCTACATCCG    Rods
r1_CAAGCTTGGCGC    Rods
r1_ACTCACATAGAG    Rods
Name: cluster_id, dtype: object

In [7]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))

color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]


Out[7]:
[(0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 (0.40000000000000002, 0.76078431372549016, 0.6470588235294118)]

Spot check some genes


In [25]:
genes_of_interest = ['RHO', 'PAX6', 'GNAT1', 'SLC24A1']

In [56]:
subset = table1[genes_of_interest]
subset.head()


Out[56]:
RHO PAX6 GNAT1 SLC24A1
barcode
r1_TTCCTGCTAGGC 14 0 3 1
r1_TGGAGATACTCT 23 1 8 6
r1_CGTCTACATCCG 14 1 4 7
r1_CAAGCTTGGCGC 62 0 18 10
r1_ACTCACATAGAG 10 0 1 0

In [52]:
# subset_log = np.log(subset+1)
# subset_log.head()


Out[52]:
RHO PAX6 GNAT1 SLC24A1
barcode
r1_TTCCTGCTAGGC 2.708050 0.000000 1.386294 0.693147
r1_TGGAGATACTCT 3.178054 0.693147 2.197225 1.945910
r1_CGTCTACATCCG 2.708050 0.693147 1.609438 2.079442
r1_CAAGCTTGGCGC 4.143135 0.000000 2.944439 2.397895
r1_ACTCACATAGAG 2.397895 0.000000 0.693147 0.000000

In [57]:
subset_names = subset.join(cluster_names)
subset_names.head()


Out[57]:
RHO PAX6 GNAT1 SLC24A1 cluster_id
barcode
r1_TTCCTGCTAGGC 14 0 3 1 Rods
r1_TGGAGATACTCT 23 1 8 6 Rods
r1_CGTCTACATCCG 14 1 4 7 Rods
r1_CAAGCTTGGCGC 62 0 18 10 Rods
r1_ACTCACATAGAG 10 0 1 0 Rods

In [58]:
sns.pairplot(subset_names, hue='cluster_id')


Out[58]:
<seaborn.axisgrid.PairGrid at 0x1320d9ef0>

In [46]:
np


Out[46]:
<module 'numpy' from '/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/numpy/__init__.py'>

In [47]:
sns.pairplot(subset.apply(np.log), hue='cluster_id')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-0ab0021a6378> in <module>()
----> 1 sns.pairplot(subset.apply(np.log), hue='cluster_id')

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4243         if isinstance(f, np.ufunc):
   4244             with np.errstate(all='ignore'):
-> 4245                 results = f(self.values)
   4246             return self._constructor(data=results, index=self.index,
   4247                                      columns=self.columns, copy=False)

AttributeError: 'int' object has no attribute 'log'

Plot the original, dropout'd data


In [8]:
sns.set(style='whitegrid')

In [9]:
mask = table1 == 0

fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')


Out[9]:
[<matplotlib.text.Text at 0x10c690e80>, <matplotlib.text.Text at 0x10c8c2ba8>]

Maybe this is small enough for a clustered heatmap


In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Add Robust PCA implementations to path


In [62]:
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM

In [63]:
r_pca.R_pca??

In [64]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()


iteration: 1, error: 56422.70929985199
iteration: 100, error: 0.6880109089868683
iteration: 200, error: 0.14702804887281945
iteration: 251, error: 0.08979496489742976
CPU times: user 15.4 s, sys: 127 ms, total: 15.6 s
Wall time: 3.92 s

In [71]:
sns.distplot(s[s > 0.1], kde=False)


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x116624e10>

In [72]:
diff = rpca_alm.L - table1

In [73]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff}

common.heatmaps(datasets)



In [74]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()


Out[74]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 7.272295 3.048168 1.562439 2.198954 4.326513 1.385031 1.989380 6.541110 1.689744 1.345555 ... 0.313114 0.767164 0.864139 0.388544 0.150222 0.752207 0.272070 0.832861 1.258661 0.856469
r1_TGGAGATACTCT 13.174154 5.480442 2.635686 3.713715 7.521109 2.310180 3.257731 11.804131 2.741139 2.363235 ... 0.815807 1.031292 1.597556 0.999699 0.884529 1.033574 0.724387 1.236772 2.351665 1.286273
r1_CGTCTACATCCG 8.820288 3.785870 1.735920 2.396476 5.171789 1.561243 2.092837 7.836292 1.876383 1.537487 ... 0.382253 0.972003 0.828423 0.656586 0.391978 0.916053 0.429003 0.958758 1.373776 0.703660
r1_CAAGCTTGGCGC 25.982230 11.021147 5.608453 7.951609 15.774301 4.953723 7.427482 24.229816 6.077871 4.769225 ... 1.644721 3.139924 3.271503 2.906614 1.999058 2.751189 2.020806 3.190714 5.678790 3.812966
r1_ACTCACATAGAG 7.614892 3.096781 1.282597 1.838656 4.131712 1.165996 1.602792 6.662531 1.363853 1.237918 ... 0.687691 0.804546 1.079846 0.874430 0.684653 0.541885 0.754388 0.499074 1.349995 0.391946

5 rows × 259 columns


In [75]:
L_subset = L[genes_of_interest]
L_names = L_subset.join(cluster_names)

sns.pairplot(L_names, hue='cluster_id')


Out[75]:
<seaborn.axisgrid.PairGrid at 0x117764860>

In [76]:
sns.distplot(table1.values.flat)


Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x11648ff28>

In [78]:
sns.distplot(L.values.flat)


Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x117523780>

In [79]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'

table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'

tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()

sns.violinplot(x='dataset', y='molecules', data=tidy)


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x11596ab38>

In [80]:
sns.boxplot(x='dataset', y='molecules', data=tidy)


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x115c07278>

In [81]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()


Out[81]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 6.727705 -0.048168 -0.562439 0.801046 7.673487 -1.385031 -0.989380 0.458890 0.310256 0.654445 ... 0.686886 0.232836 1.135861 -0.388544 -0.150222 -0.752207 -0.272070 0.167139 -1.258661 -0.856469
r1_TGGAGATACTCT 9.825846 2.519558 3.364314 0.286285 5.478891 6.689820 -1.257731 7.195869 -1.741139 -1.363235 ... 2.184193 -1.031292 0.402444 -0.000000 -0.884529 -0.033574 -0.724387 0.763228 -2.351665 -0.286273
r1_CGTCTACATCCG 5.179712 0.214130 5.264080 -1.396476 0.828211 1.438757 -2.092837 5.163708 0.123617 0.462513 ... -0.382253 0.027997 -0.828423 2.343414 -0.391978 0.083947 -0.429003 1.041242 -1.373776 -0.703660
r1_CAAGCTTGGCGC 36.017770 6.978853 4.391547 12.048391 13.225699 -2.953723 0.572518 6.770184 2.922129 -2.769225 ... -1.644721 1.860076 3.728497 0.093386 -0.000000 3.248811 -0.020806 -0.190714 1.321210 7.187034
r1_ACTCACATAGAG 2.385108 -2.096781 -1.282597 -0.838656 0.868288 0.834004 -0.602792 0.337469 1.636147 -0.237918 ... 0.312309 0.195454 0.920154 2.125570 0.315347 1.458115 0.245612 -0.499074 1.650005 -0.391946

5 rows × 259 columns


In [88]:
sns.boxplot(table1[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x11abaf0b8>

In [86]:
sns.boxplot(L[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a63b4a8>

In [87]:
sns.boxplot(S[genes_of_interest])


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/seaborn/categorical.py:2171: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x11aa49470>

In [21]:
diff.head()


Out[21]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC -6.727705 0.048168 0.562439 -0.801046 -7.673487 1.385031 0.989380 -0.458890 -0.310256 -0.654445 ... -0.686886 -0.232836 -1.135861 0.388544 0.150222 0.752207 0.272070 -0.167139 1.258661 0.856469
r1_TGGAGATACTCT -9.825846 -2.519558 -3.364314 -0.286285 -5.478891 -6.689820 1.257731 -7.195869 1.741139 1.363235 ... -2.184193 1.031292 -0.402444 -0.000301 0.884529 0.033574 0.724387 -0.763228 2.351665 0.286273
r1_CGTCTACATCCG -5.179712 -0.214130 -5.264080 1.396476 -0.828211 -1.438757 2.092837 -5.163708 -0.123617 -0.462513 ... 0.382253 -0.027997 0.828423 -2.343414 0.391978 -0.083947 0.429003 -1.041242 1.373776 0.703660
r1_CAAGCTTGGCGC -36.017770 -6.978853 -4.391547 -12.048391 -13.225699 2.953723 -0.572518 -6.770184 -2.922129 2.769225 ... 1.644721 -1.860076 -3.728497 -0.093386 -0.000942 -3.248811 0.020806 0.190714 -1.321210 -7.187034
r1_ACTCACATAGAG -2.385108 2.096781 1.282597 0.838656 -0.868288 -0.834004 0.602792 -0.337469 -1.636147 0.237918 ... -0.312309 -0.195454 -0.920154 -2.125570 -0.315347 -1.458115 -0.245612 0.499074 -1.650005 0.391946

5 rows × 259 columns


In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0

datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S, 
            'Difference: Original - Low-Rank': diff_gr0}

common.heatmaps(datasets)



In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[], 
                             row_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)

In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[], 
                             col_colors=color_labels)


/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)