notebook.community

Edit and run



In [ ]:

    
import common



In [41]:

    
common.plot_color_legend()



In [44]:

    
common.plot_dropout_interactive()



In [39]:



In [2]:

    
import macosko2015
import seaborn as sns



In [3]:

    
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()



In [4]:

    
cluster_ids_unique = cell_metadata['cluster_id'].unique()
cluster_ids_unique









    Out[4]:





array(['cluster_24', 'cluster_25', 'cluster_26', 'cluster_27',
       'cluster_33', 'cluster_34'], dtype=object)



In [6]:

    
import pandas as pd

%matplotlib inline



In [13]:

    
cluster_n_to_name = {24: 'Rods', 25: 'Cones',
                      26: 'Bipolar cells (group1)',
                      27: 'Bipolar cells (group2)',
                      33: 'Bipolar cells (group3)',
                      34: 'Muller glia'}
cluster_id_to_name = dict(('cluster_{}'.format(str(i).zfill(2)), name)
                          for i, name in cluster_n_to_name.items())

colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))
id_to_color = dict(zip(cluster_ids_unique, colors))
id_to_color









    Out[13]:





{'cluster_24': (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 'cluster_25': (0.9882352941176471, 0.55294117647058827, 0.3843137254901961),
 'cluster_26': (0.55294117647058827, 0.62745098039215685, 0.79607843137254897),
 'cluster_27': (0.90588235294117647, 0.54117647058823526, 0.76470588235294112),
 'cluster_33': (0.65098039215686276, 0.84705882352941175, 0.32941176470588235),
 'cluster_34': (1.0, 0.85098039215686272, 0.18431372549019609)}



In [17]:

    
color_labels = pd.Series[id_to_color[i] for i in cell_metadata.loc[expression.index, 'cluster_id']]
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i])
                              for i in cluster_ids_unique)
cluster_names_to_color = pd.Series(cluster_names_to_color)
cluster_names_to_color









    Out[17]:





Bipolar cells (group1)    (0.552941176471, 0.627450980392, 0.796078431373)
Bipolar cells (group2)    (0.905882352941, 0.541176470588, 0.764705882353)
Bipolar cells (group3)    (0.650980392157, 0.847058823529, 0.329411764706)
Cones                      (0.988235294118, 0.552941176471, 0.38431372549)
Muller glia                           (1.0, 0.850980392157, 0.18431372549)
Rods                                 (0.4, 0.760784313725, 0.647058823529)
dtype: object



In [20]:

    
corr = expression.corr()
corr.head()









    Out[20]:







  
    
      
      2010107E04RIK
      4930447C04RIK
      A930011O12RIK
      ABCA8A
      ABLIM1
      ACSL3
      AIPL1
      ALDOC
      ANK3
      APLP2
      ...
      VEGFA
      VIM
      VSTM2B
      VSX1
      VSX2
      WIPI1
      YWHAB
      ZBTB20
      ZFP365
      ZFP36L1
    
  
  
    
      2010107E04RIK
      1.000000
      -0.040650
      0.253482
      -0.057395
      0.421055
      -0.056094
      0.183179
      0.018484
      0.280896
      0.492910
      ...
      -0.071227
      -0.078278
      0.404747
      0.169892
      0.413562
      0.002738
      0.479560
      0.409382
      0.337859
      -0.027455
    
    
      4930447C04RIK
      -0.040650
      1.000000
      0.088354
      0.097449
      -0.054471
      0.021556
      0.132774
      0.075016
      0.018536
      0.020248
      ...
      0.102227
      0.051832
      -0.047962
      -0.086311
      -0.008172
      0.094253
      0.085201
      -0.005951
      -0.002913
      0.021031
    
    
      A930011O12RIK
      0.253482
      0.088354
      1.000000
      -0.080432
      -0.022657
      -0.082728
      0.523890
      -0.070553
      -0.030090
      0.214442
      ...
      -0.079095
      -0.105954
      -0.040201
      0.049992
      0.017344
      -0.034580
      0.183478
      0.052615
      0.172772
      -0.067916
    
    
      ABCA8A
      -0.057395
      0.097449
      -0.080432
      1.000000
      0.005267
      0.658647
      -0.103829
      0.590935
      0.121986
      0.014062
      ...
      0.489624
      0.667542
      0.001470
      -0.080088
      0.132304
      0.613612
      0.005695
      0.102000
      -0.056141
      0.266058
    
    
      ABLIM1
      0.421055
      -0.054471
      -0.022657
      0.005267
      1.000000
      0.052236
      -0.145026
      0.019399
      0.412484
      0.387326
      ...
      0.047961
      -0.010137
      0.542600
      -0.103937
      0.680343
      0.044357
      0.266791
      0.585079
      0.393497
      0.051459
    
  

5 rows × 259 columns



In [21]:

    
common.clustermap(expression.T.corr(), row_colors=color_labels, col_colors=color_labels)
# plt.show()









    Out[21]:





<seaborn.matrix.ClusterGrid at 0x11be937b8>



In [19]:

    
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))
name_to_color = dict(zip(unique_cluster_names, colors))
name_to_color









    Out[19]:





{'cluster_24': (0.40000000000000002, 0.76078431372549016, 0.6470588235294118),
 'cluster_25': (0.9882352941176471, 0.55294117647058827, 0.3843137254901961),
 'cluster_26': (0.55294117647058827, 0.62745098039215685, 0.79607843137254897),
 'cluster_27': (0.90588235294117647, 0.54117647058823526, 0.76470588235294112),
 'cluster_33': (0.65098039215686276, 0.84705882352941175, 0.32941176470588235),
 'cluster_34': (1.0, 0.85098039215686272, 0.18431372549019609)}



In [20]:

    
color_labels = [name_to_color[name] for name in cell_metadata.loc[
    expression.index, 'cluster_id']]



In [11]:

    
expression.head()









    Out[11]:







  
    
      
      2010107E04RIK
      4930447C04RIK
      A930011O12RIK
      ABCA8A
      ABLIM1
      ACSL3
      AIPL1
      ALDOC
      ANK3
      APLP2
      ...
      VEGFA
      VIM
      VSTM2B
      VSX1
      VSX2
      WIPI1
      YWHAB
      ZBTB20
      ZFP365
      ZFP36L1
    
  
  
    
      r1_TTCCTGCTAGGC
      2
      0
      0
      0
      1
      0
      1
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_TGGAGATACTCT
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      r1_CGTCTACATCCG
      2
      0
      0
      0
      0
      0
      2
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CAAGCTTGGCGC
      0
      0
      11
      0
      1
      0
      6
      0
      0
      2
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      r1_ACTCACATAGAG
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      0
      0
    
  

5 rows × 259 columns



In [9]:

    
cell_metadata.head()









    Out[9]:







  
    
      
      cluster_id
      cluster_name
    
  
  
    
      r1_TTCCTGCTAGGC
      cluster_24
      Rods
    
    
      r1_TGGAGATACTCT
      cluster_24
      Rods
    
    
      r1_CGTCTACATCCG
      cluster_24
      Rods
    
    
      r1_CAAGCTTGGCGC
      cluster_24
      Rods
    
    
      r1_ACTCACATAGAG
      cluster_24
      Rods



In [ ]:

    
import os
import common

# Assign notebook and folder names
notebook_name = '04_dropout_widget'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder



In [3]:

    
common.expression.multiply?



In [2]:



In [ ]:

    
common.



In [14]:

    
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# %matplotlib inline









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [3]:

    
import macosko2015

expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
expression.head()









    Out[3]:







  
    
      
      2010107E04RIK
      4930447C04RIK
      A930011O12RIK
      ABCA8A
      ABLIM1
      ACSL3
      AIPL1
      ALDOC
      ANK3
      APLP2
      ...
      VEGFA
      VIM
      VSTM2B
      VSX1
      VSX2
      WIPI1
      YWHAB
      ZBTB20
      ZFP365
      ZFP36L1
    
  
  
    
      r1_TTCCTGCTAGGC
      2
      0
      0
      0
      1
      0
      1
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_TGGAGATACTCT
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      r1_CGTCTACATCCG
      2
      0
      0
      0
      0
      0
      2
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CAAGCTTGGCGC
      0
      0
      11
      0
      1
      0
      6
      0
      0
      2
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      r1_ACTCACATAGAG
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      0
      0
    
  

5 rows × 259 columns



In [5]:

    
cell_metadata.head()









    Out[5]:







  
    
      
      cluster_id
      cluster_name
    
  
  
    
      r1_TTCCTGCTAGGC
      cluster_24
      Rods
    
    
      r1_TGGAGATACTCT
      cluster_24
      Rods
    
    
      r1_CGTCTACATCCG
      cluster_24
      Rods
    
    
      r1_CAAGCTTGGCGC
      cluster_24
      Rods
    
    
      r1_ACTCACATAGAG
      cluster_24
      Rods



In [ ]:

    
expression = pd.read_csv()



In [3]:

    
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
print(expression.shape)
expression.head()









    



(300, 259)






    Out[3]:







  
    
      
      2010107E04RIK
      4930447C04RIK
      A930011O12RIK
      ABCA8A
      ABLIM1
      ACSL3
      AIPL1
      ALDOC
      ANK3
      APLP2
      ...
      VEGFA
      VIM
      VSTM2B
      VSX1
      VSX2
      WIPI1
      YWHAB
      ZBTB20
      ZFP365
      ZFP36L1
    
  
  
    
      r1_TTCCTGCTAGGC
      2
      0
      0
      0
      1
      0
      1
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_TGGAGATACTCT
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      r1_CGTCTACATCCG
      2
      0
      0
      0
      0
      0
      2
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CAAGCTTGGCGC
      0
      0
      11
      0
      1
      0
      6
      0
      0
      2
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      r1_ACTCACATAGAG
      1
      0
      0
      0
      0
      0
      0
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      2
      0
      0
    
  

5 rows × 259 columns



In [4]:

    
print(cell_metadata.shape)
cell_metadata.head()









    



(300, 2)






    Out[4]:







  
    
      
      cluster_id
      cluster_name
    
  
  
    
      r1_TTCCTGCTAGGC
      cluster_24
      Rods
    
    
      r1_TGGAGATACTCT
      cluster_24
      Rods
    
    
      r1_CGTCTACATCCG
      cluster_24
      Rods
    
    
      r1_CAAGCTTGGCGC
      cluster_24
      Rods
    
    
      r1_ACTCACATAGAG
      cluster_24
      Rods



In [5]:

    
print(gene_metadata.shape)
gene_metadata.head()









    



(259, 6)






    Out[5]:







  
    
      
      cluster_24
      cluster_25
      cluster_26
      cluster_27
      cluster_33
      cluster_34
    
  
  
    
      2010107E04RIK
      False
      False
      True
      False
      False
      False
    
    
      4930447C04RIK
      False
      True
      False
      False
      False
      False
    
    
      A930011O12RIK
      False
      False
      False
      False
      False
      True
    
    
      ABCA8A
      False
      False
      False
      False
      False
      True
    
    
      ABLIM1
      False
      False
      True
      False
      False
      False



In [7]:

    
# %matplotlib notebook



In [8]:

    
import itertools



In [9]:

    
# cluster_name_to_ids = {'Horizontal cells': 1, 'Retinal ganglion cells': 2,
#                        'Amacrine cells': range(3, 24), "Rods": 24,
#                        'Cones': 25, 'Bipolar cells': range(26, 34),
#                        'Muller glia': 34, 'Astrocytes': 35,
#                        'Fibroblasts': 36, 'Vascular endothelium': 37,
#                        'Pericytes': 38, 'Microglia': 39}

# [zip(itertools.repeat(name), i) 
#  for name, i in cluster_name_to_ids.items()]



In [10]:

    
# input_folder = os.path.join(common.DATA_FOLDER, '002_robust_pca')

# csv = os.path.join(input_folder, 'lowrank.csv')

# lowrank = pd.read_csv(csv, index_col=0)
# print(lowrank.shape)
# lowrank.head()

Assign colors based on clusters



In [11]:

    
cluster_ids = np.unique(ds.cell_metadata.sel(cell_feature='cluster_id'))
cluster_ids









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-392dba1a8ceb> in <module>()
----> 1 cluster_ids = np.unique(ds.cell_metadata.sel(cell_feature='cluster_id'))
      2 cluster_ids

NameError: name 'ds' is not defined



In [ ]:

    
cluster_n_to_name = {24: 'Rods', 25: 'Cones', 
                      26: 'Bipolar cells (group1)', 
                      27: 'Bipolar cells (group2)', 
                      33: 'Bipolar cells (group3)', 
                      34: 'Muller glia'}
cluster_id_to_name = dict(('cluster_{}'.format(str(i).zfill(2)), name) 
                          for i, name in cluster_n_to_name.items())
cluster_id_to_name



In [ ]:

    
import matplotlib as mpl



In [ ]:

    
ds.expression.indexes['cell']



In [ ]:

    
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
# print(colors)
id_to_color = dict(zip(cluster_ids, map(mpl.colors.rgb2hex, colors)))
id_to_color



In [ ]:

    
ds.cell_metadata['cell_feature']



In [ ]:

    
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i]) for i in cluster_ids)
cluster_names_to_color

Remove some genes



In [ ]:

    
dropmask = np.random.randn(*ds.expression.shape) > -1
dropmask.shape



In [ ]:

    
dropped = pd.DataFrame(ds.expression.values * dropmask)
print(dropped.shape)
dropped.head()
# dropped



In [ ]:

    
common.clustermap(dropped.T.corr(method='spearman'), col_colors=color_labels)



In [ ]:

    
plt.show()



In [ ]:

    
from ipywidgets import interact

from ipywidgets import IntRangeSlider


# table1_t = table1.T
# lowrank_t = lowrank.T

from ipywidgets  import IntSlider

expression = ds.expression.to_pandas()
print(expression.shape)
expression.head()



In [ ]:

    
cluster_ids_in_data = ds.cell_metadata.sel(cell_feature='cluster_id').values
# cluster_ids_in_data



In [ ]:

    
color_labels = [id_to_color[i] for i in cluster_ids_in_data]
color_labels[:4]



In [12]:

    
def plot_dropout(percent_gene_dropout=50,
                 correlation='pearson', linkage_method='ward',
                 distance_metric='euclidean', #dataset='original'
                ):
#     if dataset == 'original':
#         data = expression
#     elif dataset == 'low-rank':
#         data = lowrank_t
    
    
    title = '{}%, {}, {}, {}'.format(percent_gene_dropout, 
                                         correlation, linkage_method, 
                                         distance_metric, )
    
    threshold = percent_gene_dropout / 100.
    print('threshold', threshold)
    mask = np.random.uniform(size=expression.shape) > threshold
    print(mask.shape)
    data = expression * mask
    print(data.head())
    g = common.clustermap(data.corr(method=correlation), 
                         col_colors=color_labels, 
                         row_colors=color_labels, 
                         metric=distance_metric, 
                          method=linkage_method, 
                          figsize=(4, 4))
    g.fig.suptitle(title)
    plt.show()


def plot_dropout_interactive():
    interact(plot_dropout, 
         percent_gene_dropout=IntSlider(value=0, min=0, max=100, step=10), 
         correlation=['pearson', 'spearman'], 
         linkage_method=['ward', 'average', 'single', "complete"],
         distance_metric=['euclidean', "cityblock"], 
#          dataset=['original', 'low-rank']
            )

plot_dropout_interactive()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-bb8fe48bb5b7> in <module>()
     38             )
     39 
---> 40 plot_dropout_interactive()

<ipython-input-12-bb8fe48bb5b7> in plot_dropout_interactive()
     30 
     31 def plot_dropout_interactive():
---> 32     interact(plot_dropout, 
     33          percent_gene_dropout=IntSlider(value=0, min=0, max=100, step=10),
     34          correlation=['pearson', 'spearman'],

NameError: name 'interact' is not defined



In [ ]:

    
warnings.onceregistry



In [ ]:

    
common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)

Add Robust PCA implementations to path



In [ ]:

    
import sys

sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])

import r_pca
import rpcaADMM



In [ ]:

    
%%time
rpca_alm = r_pca.R_pca(dropped.as_matrix(), lmbda=0.1)
rpca_alm.fit()



In [ ]:

    
rpca_alm.lmbda



In [ ]:

    
sns.heatmap(dropped)



In [ ]:

    
sns.heatmap(rpca_alm.L)



In [ ]:

    
sns.heatmap(rpca_alm.S)



In [ ]:

    
L = pd.DataFrame(rpca_alm.L, index=dropped.index, columns=dropped.columns)
print(L.shape)
L.head()



In [ ]:

    
common.clustermap(L.T.corr(method='spearman'), col_colors=color_labels)



In [ ]:

	2010107E04RIK	4930447C04RIK	A930011O12RIK	ABCA8A	ABLIM1	ACSL3	AIPL1	ALDOC	ANK3	APLP2	...	VEGFA	VIM	VSTM2B	VSX1	VSX2	WIPI1	YWHAB	ZBTB20	ZFP365	ZFP36L1
2010107E04RIK	1.000000	-0.040650	0.253482	-0.057395	0.421055	-0.056094	0.183179	0.018484	0.280896	0.492910	...	-0.071227	-0.078278	0.404747	0.169892	0.413562	0.002738	0.479560	0.409382	0.337859	-0.027455
4930447C04RIK	-0.040650	1.000000	0.088354	0.097449	-0.054471	0.021556	0.132774	0.075016	0.018536	0.020248	...	0.102227	0.051832	-0.047962	-0.086311	-0.008172	0.094253	0.085201	-0.005951	-0.002913	0.021031
A930011O12RIK	0.253482	0.088354	1.000000	-0.080432	-0.022657	-0.082728	0.523890	-0.070553	-0.030090	0.214442	...	-0.079095	-0.105954	-0.040201	0.049992	0.017344	-0.034580	0.183478	0.052615	0.172772	-0.067916
ABCA8A	-0.057395	0.097449	-0.080432	1.000000	0.005267	0.658647	-0.103829	0.590935	0.121986	0.014062	...	0.489624	0.667542	0.001470	-0.080088	0.132304	0.613612	0.005695	0.102000	-0.056141	0.266058
ABLIM1	0.421055	-0.054471	-0.022657	0.005267	1.000000	0.052236	-0.145026	0.019399	0.412484	0.387326	...	0.047961	-0.010137	0.542600	-0.103937	0.680343	0.044357	0.266791	0.585079	0.393497	0.051459

	2010107E04RIK	A930011O12RIK	ABLIM1	AIPL1	APLP2	...	ZBTB20	ZFP365
r1_TTCCTGCTAGGC	2	0	1	1	0	...	0	0
r1_TGGAGATACTCT	0	1	0	0	0	...	1	0
r1_CGTCTACATCCG	2	0	0	2	1	...	0	0
r1_CAAGCTTGGCGC	0	11	1	6	2	...	0	1
r1_ACTCACATAGAG	1	0	0	0	1	...	2	0

	cluster_id	cluster_name
r1_TTCCTGCTAGGC	cluster_24	Rods
r1_TGGAGATACTCT	cluster_24	Rods
r1_CGTCTACATCCG	cluster_24	Rods
r1_CAAGCTTGGCGC	cluster_24	Rods
r1_ACTCACATAGAG	cluster_24	Rods

	cluster_24	cluster_25	cluster_26	cluster_27	cluster_33	cluster_34
2010107E04RIK	False	False	True	False	False	False
4930447C04RIK	False	True	False	False	False	False
A930011O12RIK	False	False	False	False	False	True
ABCA8A	False	False	False	False	False	True
ABLIM1	False	False	True	False	False	False