In [96]:

    
import os
import common

# Assign notebook and folder names
notebook_name = '01_downsample_macosko_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder



In [2]:

    
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline



In [4]:

    
table1 = pd.read_csv('expression_table1.csv', index_col=0)
print(table1.shape)
table1.head()









    



(6600, 20478)






    Out[4]:







  
    
      
      KITL
      TMTC3
      CEP290
      4930430F08RIK
      1700017N19RIK
      MGAT4C
      RASSF9
      LRRIQ1
      ADGB
      SLC6A15
      ...
      EFNB1
      PJA1
      TMEM28
      EDA
      EIF2S3Y
      GM20775
      UTY
      DDX3Y
      KDM5D
      N-R5S1
    
    
      barcode
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      r1_GGCCGCAGTCCG
      0
      3
      1
      2
      0
      0
      0
      0
      0
      4
      ...
      0
      8
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CTTGTGCGGGAA
      0
      0
      3
      1
      0
      0
      0
      0
      0
      1
      ...
      0
      9
      0
      0
      2
      0
      1
      5
      0
      0
    
    
      r1_GCGCAACTGCTC
      1
      0
      0
      2
      0
      4
      0
      0
      0
      3
      ...
      1
      11
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_GATTGGGAGGCA
      0
      0
      2
      0
      0
      1
      0
      0
      0
      2
      ...
      1
      2
      0
      0
      3
      0
      1
      0
      0
      0
    
    
      r1_CCTCCTAGTTGG
      0
      2
      1
      1
      0
      2
      0
      0
      0
      1
      ...
      0
      3
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 20478 columns

Subset cells based on cluster identies



In [16]:

    
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()









    



(44808,)






    Out[16]:





barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64



In [17]:

    
cluster_identities.groupby(cluster_identities).size()









    Out[17]:





cluster_id
1       252
2       432
3       289
4        73
5        77
6       211
7       326
8       159
9       350
10      191
11      214
12      274
13       50
14      111
15       73
16      262
17      375
18       83
19      127
20      389
21      254
22      274
23      264
24    29400
25     1868
26     2217
27      664
28      496
29      591
30      636
31      512
32      320
33      849
34     1624
35       54
36       85
37      252
38       63
39       67
Name: cluster_id, dtype: int64

What is the cluster representation in this file?



In [19]:

    
cluster_sizes_table1 = table1.groupby(cluster_identities, axis=0).size()
cluster_sizes_table1









    Out[19]:





cluster_id
1.0       27
2.0       70
3.0       44
4.0       13
5.0       18
6.0       42
7.0       54
8.0       23
9.0       42
10.0      29
11.0      31
12.0      47
13.0      11
14.0      10
15.0      13
16.0      36
17.0      72
18.0      14
19.0      19
20.0      64
21.0      47
22.0      46
23.0      39
24.0    3746
25.0     241
26.0     317
27.0     126
28.0      56
29.0      85
30.0      87
31.0      80
32.0      54
33.0     114
34.0     244
35.0       4
36.0      13
37.0      24
38.0       9
39.0       9
dtype: int64



In [21]:

    
del cluster_markers_dropna



In [22]:

    
big_clusters = cluster_sizes_table1[cluster_sizes_table1 > 100]
big_clusters









    Out[22]:





cluster_id
24.0    3746
25.0     241
26.0     317
27.0     126
33.0     114
34.0     244
dtype: int64



In [26]:

    
big_clusters.index = big_clusters.index.astype(int)
big_clusters









    Out[26]:





cluster_id
24    3746
25     241
26     317
27     126
33     114
34     244
dtype: int64



In [28]:

    
cells_in_big_clusters = cluster_identities.isin(big_clusters.index)
cells_in_big_clusters = cells_in_big_clusters[cells_in_big_clusters]
cells_in_big_clusters.sum()









    Out[28]:





36622



In [29]:

    
cells_in_big_clusters.head()









    Out[29]:





barcode
r1_GTGCCGCCTCTC    True
r1_TGCGAGAGCTTG    True
r1_TAGATTATTCAT    True
r1_TTCTTTTTTCAA    True
r1_TGCGTGCCGGTC    True
Name: cluster_id, dtype: bool

Perform the subset!



In [107]:

    
table1_big_clusters, y = table1.align(cells_in_big_clusters, axis=0, join='inner')
print(table1_big_clusters.shape)
print(y.shape)









    



(4788, 20478)
(4788,)

Grab random genes and make sure this has decent cluster representation



In [103]:

    
# np.random.seed(2017)

# n_cells = 200
# random_cells = np.random.choice(x.index, size=n_cells, replace=False)

# #  Perform the subset
# table1_subset_cells = x.loc[random_cells, :]
# print(table1_subset_cells.shape)
# table1_subset_cells.groupby(cluster_identities).size()

Just kidding... this gets too lopsided so we're going to take 50 cells from each cluster



In [112]:

    
np.random.seed(2017)

n_cells = 50
table1_subset_cells = table1_big_clusters.groupby(
    cluster_identities, as_index=False, group_keys=False).apply(
        lambda x: x.loc[np.random.choice(x.index, size=n_cells, replace=False)])
print(table1_subset_cells.shape)
table1_subset_cells.head()









    



(300, 20478)






    Out[112]:







  
    
      
      KITL
      TMTC3
      CEP290
      4930430F08RIK
      1700017N19RIK
      MGAT4C
      RASSF9
      LRRIQ1
      ADGB
      SLC6A15
      ...
      EFNB1
      PJA1
      TMEM28
      EDA
      EIF2S3Y
      GM20775
      UTY
      DDX3Y
      KDM5D
      N-R5S1
    
    
      barcode
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      r1_TTCCTGCTAGGC
      0
      3
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_TGGAGATACTCT
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CGTCTACATCCG
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CAAGCTTGGCGC
      0
      0
      3
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_ACTCACATAGAG
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 20478 columns

Subset genes by differential expression

Use Supplementary table 4, which has the information of which genes were differentially expressed in different clusters



In [113]:

    
cluster_markers = pd.read_excel('macosko2015/mmc4.xlsx', 
                                sheetname='FINAL_MARKERS_FOR_EACH_CLUSTER.',
                                skiprows=3)
print(cluster_markers.shape)
cluster_markers.head()



In [114]:

    
cluster_markers.dtypes









    Out[114]:





Unnamed: 0    object
myAUC         object
myDiff        object
power         object
cluster #     object
dtype: object



In [115]:

    
cluster_markers.tail()



In [116]:

    
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
cluster_markers.tail()



In [117]:

    
big_clusters.index.values.dtype









    Out[117]:





dtype('int64')



In [118]:

    
cluster_markers.groupby('cluster #').size()









    Out[118]:





cluster #
1         190
10        120
11        111
12         68
13        163
14        127
15         69
16         97
17         99
18         75
18 D3       1
19        115
2         174
22         51
23         67
25         14
26         87
28         48
29         39
3         179
3    A      1
3    C      1
3    F      1
3    G      4
3    H      1
3    L      2
3    N      2
3    P      5
3    R      1
3    T      2
         ... 
R          11
RAB         1
RCV         4
RO          4
RPGRI       3
RRB         1
RT          1
S           4
SAMS        2
SC          2
SLC24       4
SN          1
SO          1
SPHK        3
ST          1
SUL         1
SY          2
TC          3
THSD        1
TMEM2       1
TT          2
TUBB        1
TUL         4
UNC1        2
VIP         1
VS          1
ZFP80       2
ge          4
t           3
ust         1
Length: 190, dtype: int64

Okay this file is corrupted..... have to go in and manually change the file so the cluster numbers are in the right column

Use manually created file



In [119]:

    
cluster_markers = pd.read_excel('macosko2015/mmc4_v2.xlsx', skiprows=3)
cluster_markers = cluster_markers.rename(columns={"Unnamed: 0": 'gene_symbol'})

# Remove any rows with NA because those are all header rows
print(cluster_markers.shape)
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
cluster_markers['cluster #'] = cluster_markers['cluster #'].astype(int)
cluster_markers.head()









    



(4292, 5)
(3994, 5)






    Out[119]:







  
    
      
      gene_symbol
      myAUC
      myDiff
      power
      cluster #
    
  
  
    
      0
      CALB1
      0.966
      3.615047
      0.466
      1
    
    
      1
      SLC4A3
      0.963
      3.448571
      0.463
      1
    
    
      2
      TPM3
      0.965
      3.151521
      0.465
      1
    
    
      3
      SEPT4
      0.964
      2.939258
      0.464
      1
    
    
      4
      VIM
      0.944
      2.937992
      0.444
      1



In [120]:

    
cluster_markers.dtypes









    Out[120]:





gene_symbol    object
myAUC          object
myDiff         object
power          object
cluster #       int64
dtype: object



In [1]:

    
cluster_markers.groupby('cluster #').size()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8ebc328bd3fd> in <module>()
----> 1 cluster_markers.groupby('cluster #').size()

NameError: name 'cluster_markers' is not defined



In [ ]:

    
table1.groupby()

Still have to clean the gene symbols because the AUC ended up in some of them

There should be fewer unique genes so let's check how many we have now



In [122]:

    
len(cluster_markers['gene_symbol'].unique())









    Out[122]:





1574



In [123]:

    
cleaned_symbols = cluster_markers['gene_symbol'].str.split().str[0]
len(cleaned_symbols.unique())









    Out[123]:





1331

Assign the symbols to the column



In [124]:

    
cluster_markers['gene_symbol'] = cleaned_symbols

Save to a csv



In [125]:

    
cluster_markers.to_csv(os.path.join(data_folder, 'cluster_markers_v2.csv'))

Use only genes found in the clusters with lots of cells ("big clusters")



In [126]:

    
rows = cluster_markers['cluster #'].isin(big_clusters.index.values)
print(rows.sum())
cluster_markers_big_clusters = cluster_markers.loc[rows]
print(cluster_markers_big_clusters.shape)
cluster_markers_big_clusters.head()









    



371
(371, 5)






    Out[126]:







  
    
      
      gene_symbol
      myAUC
      myDiff
      power
      cluster #
    
  
  
    
      2722
      RHO
      1.8
      57266 0.44
      5    2
      24
    
    
      2723
      GNAT1
      1.7
      80155 0.38
      9    2
      24
    
    
      2724
      SLC24A1
      1.7
      43717 0.30
      2    2
      24
    
    
      2725
      PDE6B
      1.7
      43134 0.35
      5    2
      24
    
    
      2726
      PDC
      1.7
      00660 0.41
      9    2
      24



In [127]:

    
gene_subset = cluster_markers_big_clusters['gene_symbol'].unique()
len(gene_subset)









    Out[127]:





259

Perform the subset



In [128]:

    
table1_subset_cells_genes = table1_subset_cells.loc[:, gene_subset]
print(table1_subset_cells_genes.shape)
table1_subset_cells_genes.head()









    



(300, 259)






    Out[128]:







  
    
      
      RHO
      GNAT1
      SLC24A1
      PDE6B
      PDC
      CNGA1
      RP1
      SAG
      NR2E3
      NRL
      ...
      SLC6A6
      MAP1B
      TMA7
      STX3
      SYT1
      CRX
      SNAP25
      MPP4
      NEUROD1
      A930011O12RIK
    
    
      barcode
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      r1_TTCCTGCTAGGC
      14
      3
      1
      3
      12
      0
      1
      7
      2
      2
      ...
      1
      1
      2
      0
      0
      0
      0
      1
      0
      0
    
    
      r1_TGGAGATACTCT
      23
      8
      6
      4
      13
      9
      2
      19
      1
      1
      ...
      3
      0
      2
      1
      0
      1
      0
      2
      0
      1
    
    
      r1_CGTCTACATCCG
      14
      4
      7
      1
      6
      3
      0
      13
      2
      2
      ...
      0
      1
      0
      3
      0
      1
      0
      2
      0
      0
    
    
      r1_CAAGCTTGGCGC
      62
      18
      10
      20
      29
      2
      8
      31
      9
      2
      ...
      0
      5
      7
      3
      2
      6
      2
      3
      7
      11
    
    
      r1_ACTCACATAGAG
      10
      1
      0
      1
      5
      2
      1
      7
      3
      1
      ...
      1
      1
      2
      3
      1
      2
      1
      0
      3
      0
    
  

5 rows × 259 columns



In [129]:

    
table1_subset_cells_genes.to_csv(
    os.path.join(data_folder, 'expression_table1_subset.csv'))



In [2]:









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-e095c1d5fdab> in <module>()
----> 1 table2

NameError: name 'table2' is not defined



In [ ]:

	Unnamed: 0	myAUC	myDiff	power	cluster #
4287	PDC	0.163	-2.033191	0.337	39
4288	PDE6B	0.205	-2.102541	0.295	39
4289	RPGRIP1	0.221	-2.105338	0.279	39
4290	SAG	0.154	-2.197799	0.346	39
4291	RP1	0.197	-2.359118	0.303	39

	Unnamed: 0	myAUC	myDiff	power	cluster #
4287	PDC	0.163	-2.033191	0.337	39
4288	PDE6B	0.205	-2.102541	0.295	39
4289	RPGRIP1	0.221	-2.105338	0.279	39
4290	SAG	0.154	-2.197799	0.346	39
4291	RP1	0.197	-2.359118	0.303	39

	KITL	TMTC3	CEP290	4930430F08RIK	1700017N19RIK	MGAT4C	RASSF9	LRRIQ1	ADGB	SLC6A15	...	EFNB1	PJA1	TMEM28	EDA	EIF2S3Y	GM20775	UTY	DDX3Y	KDM5D	N-R5S1
barcode
r1_GGCCGCAGTCCG	0	3	1	2	0	0	0	0	0	4	...	0	8	1	0	0	0	0	0	0	0
r1_CTTGTGCGGGAA	0	0	3	1	0	0	0	0	0	1	...	0	9	0	0	2	0	1	5	0	0
r1_GCGCAACTGCTC	1	0	0	2	0	4	0	0	0	3	...	1	11	0	0	0	0	0	0	0	0
r1_GATTGGGAGGCA	0	0	2	0	0	1	0	0	0	2	...	1	2	0	0	3	0	1	0	0	0
r1_CCTCCTAGTTGG	0	2	1	1	0	2	0	0	0	1	...	0	3	0	0	0	0	0	0	0	0

	KITL	TMTC3	CEP290	4930430F08RIK	1700017N19RIK	MGAT4C	RASSF9	LRRIQ1	ADGB	SLC6A15	...	EFNB1	PJA1	TMEM28	EDA	EIF2S3Y	GM20775	UTY	DDX3Y	KDM5D	N-R5S1
barcode
r1_TTCCTGCTAGGC	0	3	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
r1_TGGAGATACTCT	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
r1_CGTCTACATCCG	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
r1_CAAGCTTGGCGC	0	0	3	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
r1_ACTCACATAGAG	0	0	1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	Unnamed: 0	myAUC	myDiff	power	cluster #
0	CALB1	0.966	3.615047	0.466	1
1	SLC4A3	0.963	3.448571	0.463	1
2	TPM3	0.965	3.151521	0.465	1
3	SEPT4	0.964	2.939258	0.464	1
4	VIM	0.944	2.937992	0.444	1

	gene_symbol	myAUC	myDiff	power	cluster #
2722	RHO	1.8	57266 0.44	5 2	24
2723	GNAT1	1.7	80155 0.38	9 2	24
2724	SLC24A1	1.7	43717 0.30	2 2	24
2725	PDE6B	1.7	43134 0.35	5 2	24
2726	PDC	1.7	00660 0.41	9 2	24