In [ ]:
import common

In [41]:
common.plot_color_legend()



In [44]:
common.plot_dropout_interactive()



In [39]:




In [2]:
import macosko2015
import seaborn as sns

In [3]:
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()

In [4]:
cluster_ids_unique = cell_metadata['cluster_id'].unique()
cluster_ids_unique


Out[4]:
array(['cluster_24', 'cluster_25', 'cluster_26', 'cluster_27',
       'cluster_33', 'cluster_34'], dtype=object)

In [6]:
import pandas as pd

%matplotlib inline

In [13]:
cluster_n_to_name = {24: 'Rods', 25: 'Cones',
                      26: 'Bipolar cells (group1)',
                      27: 'Bipolar cells (group2)',
                      33: 'Bipolar cells (group3)',
                      34: 'Muller glia'}
cluster_id_to_name = dict(('cluster_{}'.format(str(i).zfill(2)), name)
                          for i, name in cluster_n_to_name.items())

colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))
id_to_color = dict(zip(cluster_ids_unique, colors))
id_to_color


Out[13]:
{'cluster_24': (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 'cluster_25': (0.9882352941176471, 0.55294117647058827, 0.3843137254901961),
 'cluster_26': (0.55294117647058827, 0.62745098039215685, 0.79607843137254897),
 'cluster_27': (0.90588235294117647, 0.54117647058823526, 0.76470588235294112),
 'cluster_33': (0.65098039215686276, 0.84705882352941175, 0.32941176470588235),
 'cluster_34': (1.0, 0.85098039215686272, 0.18431372549019609)}

In [17]:
color_labels = pd.Series[id_to_color[i] for i in cell_metadata.loc[expression.index, 'cluster_id']]
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i])
                              for i in cluster_ids_unique)
cluster_names_to_color = pd.Series(cluster_names_to_color)
cluster_names_to_color


Out[17]:
Bipolar cells (group1)    (0.552941176471, 0.627450980392, 0.796078431373)
Bipolar cells (group2)    (0.905882352941, 0.541176470588, 0.764705882353)
Bipolar cells (group3)    (0.650980392157, 0.847058823529, 0.329411764706)
Cones                      (0.988235294118, 0.552941176471, 0.38431372549)
Muller glia                           (1.0, 0.850980392157, 0.18431372549)
Rods                                 (0.4, 0.760784313725, 0.647058823529)
dtype: object

In [20]:
corr = expression.corr()
corr.head()


Out[20]:
2010107E04RIK 4930447C04RIK A930011O12RIK ABCA8A ABLIM1 ACSL3 AIPL1 ALDOC ANK3 APLP2 ... VEGFA VIM VSTM2B VSX1 VSX2 WIPI1 YWHAB ZBTB20 ZFP365 ZFP36L1
2010107E04RIK 1.000000 -0.040650 0.253482 -0.057395 0.421055 -0.056094 0.183179 0.018484 0.280896 0.492910 ... -0.071227 -0.078278 0.404747 0.169892 0.413562 0.002738 0.479560 0.409382 0.337859 -0.027455
4930447C04RIK -0.040650 1.000000 0.088354 0.097449 -0.054471 0.021556 0.132774 0.075016 0.018536 0.020248 ... 0.102227 0.051832 -0.047962 -0.086311 -0.008172 0.094253 0.085201 -0.005951 -0.002913 0.021031
A930011O12RIK 0.253482 0.088354 1.000000 -0.080432 -0.022657 -0.082728 0.523890 -0.070553 -0.030090 0.214442 ... -0.079095 -0.105954 -0.040201 0.049992 0.017344 -0.034580 0.183478 0.052615 0.172772 -0.067916
ABCA8A -0.057395 0.097449 -0.080432 1.000000 0.005267 0.658647 -0.103829 0.590935 0.121986 0.014062 ... 0.489624 0.667542 0.001470 -0.080088 0.132304 0.613612 0.005695 0.102000 -0.056141 0.266058
ABLIM1 0.421055 -0.054471 -0.022657 0.005267 1.000000 0.052236 -0.145026 0.019399 0.412484 0.387326 ... 0.047961 -0.010137 0.542600 -0.103937 0.680343 0.044357 0.266791 0.585079 0.393497 0.051459

5 rows × 259 columns


In [21]:
common.clustermap(expression.T.corr(), row_colors=color_labels, col_colors=color_labels)
# plt.show()


Out[21]:
<seaborn.matrix.ClusterGrid at 0x11be937b8>

In [19]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))
name_to_color = dict(zip(unique_cluster_names, colors))
name_to_color


Out[19]:
{'cluster_24': (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 'cluster_25': (0.9882352941176471, 0.55294117647058827, 0.3843137254901961),
 'cluster_26': (0.55294117647058827, 0.62745098039215685, 0.79607843137254897),
 'cluster_27': (0.90588235294117647, 0.54117647058823526, 0.76470588235294112),
 'cluster_33': (0.65098039215686276, 0.84705882352941175, 0.32941176470588235),
 'cluster_34': (1.0, 0.85098039215686272, 0.18431372549019609)}

In [20]:
color_labels = [name_to_color[name] for name in cell_metadata.loc[
    expression.index, 'cluster_id']]

In [11]:
expression.head()


Out[11]:
2010107E04RIK 4930447C04RIK A930011O12RIK ABCA8A ABLIM1 ACSL3 AIPL1 ALDOC ANK3 APLP2 ... VEGFA VIM VSTM2B VSX1 VSX2 WIPI1 YWHAB ZBTB20 ZFP365 ZFP36L1
r1_TTCCTGCTAGGC 2 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_TGGAGATACTCT 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
r1_CGTCTACATCCG 2 0 0 0 0 0 2 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
r1_CAAGCTTGGCGC 0 0 11 0 1 0 6 0 0 2 ... 0 0 0 0 0 0 0 0 1 0
r1_ACTCACATAGAG 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 2 0 0

5 rows × 259 columns


In [9]:
cell_metadata.head()


Out[9]:
cluster_id cluster_name
r1_TTCCTGCTAGGC cluster_24 Rods
r1_TGGAGATACTCT cluster_24 Rods
r1_CGTCTACATCCG cluster_24 Rods
r1_CAAGCTTGGCGC cluster_24 Rods
r1_ACTCACATAGAG cluster_24 Rods

In [ ]:
import os
import common

# Assign notebook and folder names
notebook_name = '04_dropout_widget'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [3]:
common.expression.multiply?

In [2]:




In [ ]:
common.

In [14]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# %matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [3]:
import macosko2015

expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
expression.head()


Out[3]:
2010107E04RIK 4930447C04RIK A930011O12RIK ABCA8A ABLIM1 ACSL3 AIPL1 ALDOC ANK3 APLP2 ... VEGFA VIM VSTM2B VSX1 VSX2 WIPI1 YWHAB ZBTB20 ZFP365 ZFP36L1
r1_TTCCTGCTAGGC 2 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_TGGAGATACTCT 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
r1_CGTCTACATCCG 2 0 0 0 0 0 2 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
r1_CAAGCTTGGCGC 0 0 11 0 1 0 6 0 0 2 ... 0 0 0 0 0 0 0 0 1 0
r1_ACTCACATAGAG 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 2 0 0

5 rows × 259 columns


In [5]:
cell_metadata.head()


Out[5]:
cluster_id cluster_name
r1_TTCCTGCTAGGC cluster_24 Rods
r1_TGGAGATACTCT cluster_24 Rods
r1_CGTCTACATCCG cluster_24 Rods
r1_CAAGCTTGGCGC cluster_24 Rods
r1_ACTCACATAGAG cluster_24 Rods

In [ ]:
expression = pd.read_csv()

In [3]:
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
print(expression.shape)
expression.head()


(300, 259)
Out[3]:
2010107E04RIK 4930447C04RIK A930011O12RIK ABCA8A ABLIM1 ACSL3 AIPL1 ALDOC ANK3 APLP2 ... VEGFA VIM VSTM2B VSX1 VSX2 WIPI1 YWHAB ZBTB20 ZFP365 ZFP36L1
r1_TTCCTGCTAGGC 2 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_TGGAGATACTCT 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
r1_CGTCTACATCCG 2 0 0 0 0 0 2 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
r1_CAAGCTTGGCGC 0 0 11 0 1 0 6 0 0 2 ... 0 0 0 0 0 0 0 0 1 0
r1_ACTCACATAGAG 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 2 0 0

5 rows × 259 columns


In [4]:
print(cell_metadata.shape)
cell_metadata.head()


(300, 2)
Out[4]:
cluster_id cluster_name
r1_TTCCTGCTAGGC cluster_24 Rods
r1_TGGAGATACTCT cluster_24 Rods
r1_CGTCTACATCCG cluster_24 Rods
r1_CAAGCTTGGCGC cluster_24 Rods
r1_ACTCACATAGAG cluster_24 Rods

In [5]:
print(gene_metadata.shape)
gene_metadata.head()


(259, 6)
Out[5]:
cluster_24 cluster_25 cluster_26 cluster_27 cluster_33 cluster_34
2010107E04RIK False False True False False False
4930447C04RIK False True False False False False
A930011O12RIK False False False False False True
ABCA8A False False False False False True
ABLIM1 False False True False False False

In [7]:
# %matplotlib notebook

In [8]:
import itertools

In [9]:
# cluster_name_to_ids = {'Horizontal cells': 1, 'Retinal ganglion cells': 2,
#                        'Amacrine cells': range(3, 24), "Rods": 24,
#                        'Cones': 25, 'Bipolar cells': range(26, 34),
#                        'Muller glia': 34, 'Astrocytes': 35,
#                        'Fibroblasts': 36, 'Vascular endothelium': 37,
#                        'Pericytes': 38, 'Microglia': 39}

# [zip(itertools.repeat(name), i) 
#  for name, i in cluster_name_to_ids.items()]

In [10]:
# input_folder = os.path.join(common.DATA_FOLDER, '002_robust_pca')

# csv = os.path.join(input_folder, 'lowrank.csv')

# lowrank = pd.read_csv(csv, index_col=0)
# print(lowrank.shape)
# lowrank.head()

Assign colors based on clusters


In [11]:
cluster_ids = np.unique(ds.cell_metadata.sel(cell_feature='cluster_id'))
cluster_ids


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-392dba1a8ceb> in <module>()
----> 1 cluster_ids = np.unique(ds.cell_metadata.sel(cell_feature='cluster_id'))
      2 cluster_ids

NameError: name 'ds' is not defined

In [ ]:
cluster_n_to_name = {24: 'Rods', 25: 'Cones', 
                      26: 'Bipolar cells (group1)', 
                      27: 'Bipolar cells (group2)', 
                      33: 'Bipolar cells (group3)', 
                      34: 'Muller glia'}
cluster_id_to_name = dict(('cluster_{}'.format(str(i).zfill(2)), name) 
                          for i, name in cluster_n_to_name.items())
cluster_id_to_name

In [ ]:
import matplotlib as mpl

In [ ]:
ds.expression.indexes['cell']

In [ ]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
# print(colors)
id_to_color = dict(zip(cluster_ids, map(mpl.colors.rgb2hex, colors)))
id_to_color

In [ ]:
ds.cell_metadata['cell_feature']

In [ ]:
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i]) for i in cluster_ids)
cluster_names_to_color

Remove some genes


In [ ]:
dropmask = np.random.randn(*ds.expression.shape) > -1
dropmask.shape

In [ ]:
dropped = pd.DataFrame(ds.expression.values * dropmask)
print(dropped.shape)
dropped.head()
# dropped

In [ ]:
common.clustermap(dropped.T.corr(method='spearman'), col_colors=color_labels)

In [ ]:
plt.show()

In [ ]:
from ipywidgets import interact

from ipywidgets import IntRangeSlider


# table1_t = table1.T
# lowrank_t = lowrank.T

from ipywidgets  import IntSlider

expression = ds.expression.to_pandas()
print(expression.shape)
expression.head()

In [ ]:
cluster_ids_in_data = ds.cell_metadata.sel(cell_feature='cluster_id').values
# cluster_ids_in_data

In [ ]:
color_labels = [id_to_color[i] for i in cluster_ids_in_data]
color_labels[:4]

In [12]:
def plot_dropout(percent_gene_dropout=50,
                 correlation='pearson', linkage_method='ward',
                 distance_metric='euclidean', #dataset='original'
                ):
#     if dataset == 'original':
#         data = expression
#     elif dataset == 'low-rank':
#         data = lowrank_t
    
    
    title = '{}%, {}, {}, {}'.format(percent_gene_dropout, 
                                         correlation, linkage_method, 
                                         distance_metric, )
    
    threshold = percent_gene_dropout / 100.
    print('threshold', threshold)
    mask = np.random.uniform(size=expression.shape) > threshold
    print(mask.shape)
    data = expression * mask
    print(data.head())
    g = common.clustermap(data.corr(method=correlation), 
                         col_colors=color_labels, 
                         row_colors=color_labels, 
                         metric=distance_metric, 
                          method=linkage_method, 
                          figsize=(4, 4))
    g.fig.suptitle(title)
    plt.show()


def plot_dropout_interactive():
    interact(plot_dropout, 
         percent_gene_dropout=IntSlider(value=0, min=0, max=100, step=10), 
         correlation=['pearson', 'spearman'], 
         linkage_method=['ward', 'average', 'single', "complete"],
         distance_metric=['euclidean', "cityblock"], 
#          dataset=['original', 'low-rank']
            )

plot_dropout_interactive()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-bb8fe48bb5b7> in <module>()
     38             )
     39 
---> 40 plot_dropout_interactive()

<ipython-input-12-bb8fe48bb5b7> in plot_dropout_interactive()
     30 
     31 def plot_dropout_interactive():
---> 32     interact(plot_dropout, 
     33          percent_gene_dropout=IntSlider(value=0, min=0, max=100, step=10),
     34          correlation=['pearson', 'spearman'],

NameError: name 'interact' is not defined

In [ ]:
warnings.onceregistry

In [ ]:
common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)

Add Robust PCA implementations to path


In [ ]:
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM

In [ ]:
%%time
rpca_alm = r_pca.R_pca(dropped.as_matrix(), lmbda=0.1)
rpca_alm.fit()

In [ ]:
rpca_alm.lmbda

In [ ]:
sns.heatmap(dropped)

In [ ]:
sns.heatmap(rpca_alm.L)

In [ ]:
sns.heatmap(rpca_alm.S)

In [ ]:
L = pd.DataFrame(rpca_alm.L, index=dropped.index, columns=dropped.columns)
print(L.shape)
L.head()

In [ ]:
common.clustermap(L.T.corr(method='spearman'), col_colors=color_labels)

In [ ]: