In [96]:
import os
import common

# Assign notebook and folder names
notebook_name = '01_downsample_macosko_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [4]:
table1 = pd.read_csv('expression_table1.csv', index_col=0)
print(table1.shape)
table1.head()


(6600, 20478)
Out[4]:
KITL TMTC3 CEP290 4930430F08RIK 1700017N19RIK MGAT4C RASSF9 LRRIQ1 ADGB SLC6A15 ... EFNB1 PJA1 TMEM28 EDA EIF2S3Y GM20775 UTY DDX3Y KDM5D N-R5S1
barcode
r1_GGCCGCAGTCCG 0 3 1 2 0 0 0 0 0 4 ... 0 8 1 0 0 0 0 0 0 0
r1_CTTGTGCGGGAA 0 0 3 1 0 0 0 0 0 1 ... 0 9 0 0 2 0 1 5 0 0
r1_GCGCAACTGCTC 1 0 0 2 0 4 0 0 0 3 ... 1 11 0 0 0 0 0 0 0 0
r1_GATTGGGAGGCA 0 0 2 0 0 1 0 0 0 2 ... 1 2 0 0 3 0 1 0 0 0
r1_CCTCCTAGTTGG 0 2 1 1 0 2 0 0 0 1 ... 0 3 0 0 0 0 0 0 0 0

5 rows × 20478 columns

Subset cells based on cluster identies


In [16]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
                                   names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()


(44808,)
Out[16]:
barcode
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [17]:
cluster_identities.groupby(cluster_identities).size()


Out[17]:
cluster_id
1       252
2       432
3       289
4        73
5        77
6       211
7       326
8       159
9       350
10      191
11      214
12      274
13       50
14      111
15       73
16      262
17      375
18       83
19      127
20      389
21      254
22      274
23      264
24    29400
25     1868
26     2217
27      664
28      496
29      591
30      636
31      512
32      320
33      849
34     1624
35       54
36       85
37      252
38       63
39       67
Name: cluster_id, dtype: int64

What is the cluster representation in this file?


In [19]:
cluster_sizes_table1 = table1.groupby(cluster_identities, axis=0).size()
cluster_sizes_table1


Out[19]:
cluster_id
1.0       27
2.0       70
3.0       44
4.0       13
5.0       18
6.0       42
7.0       54
8.0       23
9.0       42
10.0      29
11.0      31
12.0      47
13.0      11
14.0      10
15.0      13
16.0      36
17.0      72
18.0      14
19.0      19
20.0      64
21.0      47
22.0      46
23.0      39
24.0    3746
25.0     241
26.0     317
27.0     126
28.0      56
29.0      85
30.0      87
31.0      80
32.0      54
33.0     114
34.0     244
35.0       4
36.0      13
37.0      24
38.0       9
39.0       9
dtype: int64

In [21]:
del cluster_markers_dropna

In [22]:
big_clusters = cluster_sizes_table1[cluster_sizes_table1 > 100]
big_clusters


Out[22]:
cluster_id
24.0    3746
25.0     241
26.0     317
27.0     126
33.0     114
34.0     244
dtype: int64

In [26]:
big_clusters.index = big_clusters.index.astype(int)
big_clusters


Out[26]:
cluster_id
24    3746
25     241
26     317
27     126
33     114
34     244
dtype: int64

In [28]:
cells_in_big_clusters = cluster_identities.isin(big_clusters.index)
cells_in_big_clusters = cells_in_big_clusters[cells_in_big_clusters]
cells_in_big_clusters.sum()


Out[28]:
36622

In [29]:
cells_in_big_clusters.head()


Out[29]:
barcode
r1_GTGCCGCCTCTC    True
r1_TGCGAGAGCTTG    True
r1_TAGATTATTCAT    True
r1_TTCTTTTTTCAA    True
r1_TGCGTGCCGGTC    True
Name: cluster_id, dtype: bool

Perform the subset!


In [107]:
table1_big_clusters, y = table1.align(cells_in_big_clusters, axis=0, join='inner')
print(table1_big_clusters.shape)
print(y.shape)


(4788, 20478)
(4788,)

Grab random genes and make sure this has decent cluster representation


In [103]:
# np.random.seed(2017)

# n_cells = 200
# random_cells = np.random.choice(x.index, size=n_cells, replace=False)

# #  Perform the subset
# table1_subset_cells = x.loc[random_cells, :]
# print(table1_subset_cells.shape)
# table1_subset_cells.groupby(cluster_identities).size()

Just kidding... this gets too lopsided so we're going to take 50 cells from each cluster


In [112]:
np.random.seed(2017)

n_cells = 50
table1_subset_cells = table1_big_clusters.groupby(
    cluster_identities, as_index=False, group_keys=False).apply(
        lambda x: x.loc[np.random.choice(x.index, size=n_cells, replace=False)])
print(table1_subset_cells.shape)
table1_subset_cells.head()


(300, 20478)
Out[112]:
KITL TMTC3 CEP290 4930430F08RIK 1700017N19RIK MGAT4C RASSF9 LRRIQ1 ADGB SLC6A15 ... EFNB1 PJA1 TMEM28 EDA EIF2S3Y GM20775 UTY DDX3Y KDM5D N-R5S1
barcode
r1_TTCCTGCTAGGC 0 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_TGGAGATACTCT 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_CGTCTACATCCG 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_CAAGCTTGGCGC 0 0 3 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
r1_ACTCACATAGAG 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 20478 columns

Subset genes by differential expression

Use Supplementary table 4, which has the information of which genes were differentially expressed in different clusters


In [113]:
cluster_markers = pd.read_excel('macosko2015/mmc4.xlsx', 
                                sheetname='FINAL_MARKERS_FOR_EACH_CLUSTER.',
                                skiprows=3)
print(cluster_markers.shape)
cluster_markers.head()


(4292, 5)
Out[113]:
Unnamed: 0 myAUC myDiff power cluster #
0 CALB1 0.966 3.615047 0.466 1
1 SLC4A3 0.963 3.448571 0.463 1
2 TPM3 0.965 3.151521 0.465 1
3 SEPT4 0.964 2.939258 0.464 1
4 VIM 0.944 2.937992 0.444 1

In [114]:
cluster_markers.dtypes


Out[114]:
Unnamed: 0    object
myAUC         object
myDiff        object
power         object
cluster #     object
dtype: object

In [115]:
cluster_markers.tail()


Out[115]:
Unnamed: 0 myAUC myDiff power cluster #
4287 PDC 0.163 -2.033191 0.337 39
4288 PDE6B 0.205 -2.102541 0.295 39
4289 RPGRIP1 0.221 -2.105338 0.279 39
4290 SAG 0.154 -2.197799 0.346 39
4291 RP1 0.197 -2.359118 0.303 39

In [116]:
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
cluster_markers.tail()


(4148, 5)
Out[116]:
Unnamed: 0 myAUC myDiff power cluster #
4287 PDC 0.163 -2.033191 0.337 39
4288 PDE6B 0.205 -2.102541 0.295 39
4289 RPGRIP1 0.221 -2.105338 0.279 39
4290 SAG 0.154 -2.197799 0.346 39
4291 RP1 0.197 -2.359118 0.303 39

In [117]:
big_clusters.index.values.dtype


Out[117]:
dtype('int64')

In [118]:
cluster_markers.groupby('cluster #').size()


Out[118]:
cluster #
1         190
10        120
11        111
12         68
13        163
14        127
15         69
16         97
17         99
18         75
18 D3       1
19        115
2         174
22         51
23         67
25         14
26         87
28         48
29         39
3         179
3    A      1
3    C      1
3    F      1
3    G      4
3    H      1
3    L      2
3    N      2
3    P      5
3    R      1
3    T      2
         ... 
R          11
RAB         1
RCV         4
RO          4
RPGRI       3
RRB         1
RT          1
S           4
SAMS        2
SC          2
SLC24       4
SN          1
SO          1
SPHK        3
ST          1
SUL         1
SY          2
TC          3
THSD        1
TMEM2       1
TT          2
TUBB        1
TUL         4
UNC1        2
VIP         1
VS          1
ZFP80       2
ge          4
t           3
ust         1
Length: 190, dtype: int64

Okay this file is corrupted..... have to go in and manually change the file so the cluster numbers are in the right column

Use manually created file


In [119]:
cluster_markers = pd.read_excel('macosko2015/mmc4_v2.xlsx', skiprows=3)
cluster_markers = cluster_markers.rename(columns={"Unnamed: 0": 'gene_symbol'})

# Remove any rows with NA because those are all header rows
print(cluster_markers.shape)
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
cluster_markers['cluster #'] = cluster_markers['cluster #'].astype(int)
cluster_markers.head()


(4292, 5)
(3994, 5)
Out[119]:
gene_symbol myAUC myDiff power cluster #
0 CALB1 0.966 3.615047 0.466 1
1 SLC4A3 0.963 3.448571 0.463 1
2 TPM3 0.965 3.151521 0.465 1
3 SEPT4 0.964 2.939258 0.464 1
4 VIM 0.944 2.937992 0.444 1

In [120]:
cluster_markers.dtypes


Out[120]:
gene_symbol    object
myAUC          object
myDiff         object
power          object
cluster #       int64
dtype: object

In [1]:
cluster_markers.groupby('cluster #').size()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8ebc328bd3fd> in <module>()
----> 1 cluster_markers.groupby('cluster #').size()

NameError: name 'cluster_markers' is not defined

In [ ]:
table1.groupby()

Still have to clean the gene symbols because the AUC ended up in some of them

There should be fewer unique genes so let's check how many we have now


In [122]:
len(cluster_markers['gene_symbol'].unique())


Out[122]:
1574

In [123]:
cleaned_symbols = cluster_markers['gene_symbol'].str.split().str[0]
len(cleaned_symbols.unique())


Out[123]:
1331

Assign the symbols to the column


In [124]:
cluster_markers['gene_symbol'] = cleaned_symbols

Save to a csv


In [125]:
cluster_markers.to_csv(os.path.join(data_folder, 'cluster_markers_v2.csv'))

Use only genes found in the clusters with lots of cells ("big clusters")


In [126]:
rows = cluster_markers['cluster #'].isin(big_clusters.index.values)
print(rows.sum())
cluster_markers_big_clusters = cluster_markers.loc[rows]
print(cluster_markers_big_clusters.shape)
cluster_markers_big_clusters.head()


371
(371, 5)
Out[126]:
gene_symbol myAUC myDiff power cluster #
2722 RHO 1.8 57266 0.44 5 2 24
2723 GNAT1 1.7 80155 0.38 9 2 24
2724 SLC24A1 1.7 43717 0.30 2 2 24
2725 PDE6B 1.7 43134 0.35 5 2 24
2726 PDC 1.7 00660 0.41 9 2 24

In [127]:
gene_subset = cluster_markers_big_clusters['gene_symbol'].unique()
len(gene_subset)


Out[127]:
259

Perform the subset


In [128]:
table1_subset_cells_genes = table1_subset_cells.loc[:, gene_subset]
print(table1_subset_cells_genes.shape)
table1_subset_cells_genes.head()


(300, 259)
Out[128]:
RHO GNAT1 SLC24A1 PDE6B PDC CNGA1 RP1 SAG NR2E3 NRL ... SLC6A6 MAP1B TMA7 STX3 SYT1 CRX SNAP25 MPP4 NEUROD1 A930011O12RIK
barcode
r1_TTCCTGCTAGGC 14 3 1 3 12 0 1 7 2 2 ... 1 1 2 0 0 0 0 1 0 0
r1_TGGAGATACTCT 23 8 6 4 13 9 2 19 1 1 ... 3 0 2 1 0 1 0 2 0 1
r1_CGTCTACATCCG 14 4 7 1 6 3 0 13 2 2 ... 0 1 0 3 0 1 0 2 0 0
r1_CAAGCTTGGCGC 62 18 10 20 29 2 8 31 9 2 ... 0 5 7 3 2 6 2 3 7 11
r1_ACTCACATAGAG 10 1 0 1 5 2 1 7 3 1 ... 1 1 2 3 1 2 1 0 3 0

5 rows × 259 columns


In [129]:
table1_subset_cells_genes.to_csv(
    os.path.join(data_folder, 'expression_table1_subset.csv'))

In [2]:



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-e095c1d5fdab> in <module>()
----> 1 table2

NameError: name 'table2' is not defined

In [ ]: