In [4]:
import macosko2015
macosko2015.__version__


Out[4]:
'1.0.0'

In [3]:
macosko2015.load_amacrine()


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-50a2e486a6a4> in <module>()
----> 1 macosko2015.load_amacrine()

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/macosko2015/main.py in load_amacrine(format)
     57     if format == 'pandas':
     58         expression = pd.read_csv(
---> 59             os.path.join(folder, 'amacrine_expression.csv'))
     60         cell_metadata = pd.read_csv(
     61             os.path.join(folder, 'amacrine_cell_metadata.csv'))

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/macosko2015/data/05_make_rentina_subsets_for_teaching/amacrine_expression.csv' does not exist

In [ ]:
import os
import common

# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

In [6]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [7]:
table1 = pd.read_table('/Users/olgabot/Downloads/GSE63473_RAW/GSM1626793_P14Retina_1.digital_expression.txt.gz', 
                       compression='gzip', index_col=0)
print(table1.shape)
table1.head()


(20478, 6600)
Out[7]:
GGCCGCAGTCCG CTTGTGCGGGAA GCGCAACTGCTC GATTGGGAGGCA CCTCCTAGTTGG AGTCAAGCCCTC GTGCCGCCTCTC CCTGTGACACAC AATCTCGTTAAT GATTTCCTCTGA ... GATTTAATGGTA TGTAAGGATCCG GAGTGGCTTGAT GCATCTTTCAGG ACACGAGTTTGG CACCCAGTTTCG CCTGGAGAGTTT TCTTCACTCTTA GCCGTCTTACTA GACCAAACTAAT
gene
10:100015630-100100413:Kitl 0 0 1 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
10:100443902-100487350:Tmtc3 3 0 0 0 2 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
10:100488289-100573655:Cep290 1 3 0 2 1 18 10 3 4 3 ... 4 0 3 0 0 0 0 2 0 1
10:100572274-100589259:4930430F08Rik 2 1 2 0 1 1 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
10:100592386-100618391:1700017N19Rik 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 6600 columns


In [8]:
data = table1.values
mask = data > 5

sns.distplot(data[mask].flat, kde=False)


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x116137438>

In [9]:
mask = table1 == 0
sns.heatmap(table1, xticklabels=[], yticklabels=[], mask=mask)
fig = plt.gcf()
fig.savefig('table1_heatmap.png')



In [10]:
n_transcripts_per_cell = table1.sum()
n_transcripts_per_cell.head()


Out[10]:
GGCCGCAGTCCG    37519
CTTGTGCGGGAA    32077
GCGCAACTGCTC    28162
GATTGGGAGGCA    20384
CCTCCTAGTTGG    19571
dtype: int64

In [11]:
sns.distplot(n_transcripts_per_cell)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c13bd68>

In [12]:
n_transcripts_per_cell.describe()


Out[12]:
count     6600.000000
mean      1332.358939
std       1572.921791
min        466.000000
25%        578.000000
50%        821.000000
75%       1452.250000
max      37519.000000
dtype: float64

In [13]:
n_expressed_genes_per_cell = (table1 > 0).sum()
n_expressed_genes_per_cell.head()


Out[13]:
GGCCGCAGTCCG    7254
CTTGTGCGGGAA    6945
GCGCAACTGCTC    6404
GATTGGGAGGCA    5751
CCTCCTAGTTGG    5785
dtype: int64

In [14]:
sns.distplot(n_expressed_genes_per_cell)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x120d0add8>

In [15]:
greater500 = (table1 > 100).sum(axis=1) > 1
greater500.sum()


Out[15]:
23

In [16]:
table1_t = table1.T
print(table1_t.shape)
table1_t.head()


(6600, 20478)
Out[16]:
gene 10:100015630-100100413:Kitl 10:100443902-100487350:Tmtc3 10:100488289-100573655:Cep290 10:100572274-100589259:4930430F08Rik 10:100592386-100618391:1700017N19Rik 10:101681487-102391469:Mgat4c 10:102512222-102546560:Rassf9 10:103063198-103236322:Lrriq1 10:10335703-10472326:Adgb 10:103367808-103419378:Slc6a15 ... X:99136130-99148991:Efnb1 X:99465734-99471273:Pja1 X:99821021-99848790:Tmem28 X:99975606-100400762:Eda Y:1010543-1028847:Eif2s3y Y:10640942-10643315:Gm20775 Y:1096861-1245759:Uty Y:1260715-1286613:Ddx3y Y:897788-943811:Kdm5d Y:991630-991748:n-R5s1
GGCCGCAGTCCG 0 3 1 2 0 0 0 0 0 4 ... 0 8 1 0 0 0 0 0 0 0
CTTGTGCGGGAA 0 0 3 1 0 0 0 0 0 1 ... 0 9 0 0 2 0 1 5 0 0
GCGCAACTGCTC 1 0 0 2 0 4 0 0 0 3 ... 1 11 0 0 0 0 0 0 0 0
GATTGGGAGGCA 0 0 2 0 0 1 0 0 0 2 ... 1 2 0 0 3 0 1 0 0 0
CCTCCTAGTTGG 0 2 1 1 0 2 0 0 0 1 ... 0 3 0 0 0 0 0 0 0 0

5 rows × 20478 columns


In [17]:
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()


Out[17]:
gene
10:100015630-100100413:Kitl              131
10:100443902-100487350:Tmtc3             434
10:100488289-100573655:Cep290           4389
10:100572274-100589259:4930430F08Rik     527
10:100592386-100618391:1700017N19Rik      16
dtype: int64

In [18]:
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()


Out[18]:
gene
10:100015630-100100413:Kitl              131
10:100443902-100487350:Tmtc3             434
10:100488289-100573655:Cep290           4389
10:100572274-100589259:4930430F08Rik     527
10:100592386-100618391:1700017N19Rik      16
dtype: int64

In [19]:
sns.distplot(n_transcripts_per_gene)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d1e0cf8>

Subset the genes based on their total number of transcripts


In [20]:
(n_transcripts_per_gene > 1e3).sum()


Out[20]:
2041

In [21]:
n_transcripts_per_gene[n_transcripts_per_gene > 1e4]


Out[21]:
gene
10:108497650-109010982:Syt1            17227
10:86690209-86705509:Hsp90b1           12175
10:98915152-99026143:Atp2b1            10137
11:106780355-106789185:Ddx5            16403
11:116021912-116027962:H3f3b           13245
11:120447605-120453500:Pde6g           28298
11:62551171-62553213:Ubb               11151
11:67695326-67703333:Rcvrn             22902
11:78343482-78349164:Unc119            13936
11:94287890-94321988:Luc7l3            11121
12:100199435-100209806:Calm1           34885
12:109541001-109571726:Meg3            18699
12:110690605-110702728:Hsp90aa1        17151
12:111669355-111672338:Ckb             12471
14:52110704-52163546:Rpgrip1           22525
14:55518978-55524981:Nrl               10515
14:64588115-64593959:A930011O12Rik     12209
16:91647506-91679221:Son               12407
16:94370618-94469222:Ttc3              10199
16:96120618-96127729:Hmgn1             19479
17:28351515-28365182:Tulp1             20794
17:45567775-45573271:Hsp90ab1          11576
17:46910459-46924933:Prph2             22544
19:5795690-5802672:Malat1             158364
19:8927382-8929356:Rom1                14097
19:9982703-9985092:Fth1                10009
1:150319417-150333905:Pdc              37632
1:178323074-178337797:Hnrnpu           11532
1:4343507-4360314:Rp1                  22067
1:87803680-87845158:Sag                53013
                                       ...  
2:79452521-79456751:Neurod1            12530
3:51388415-51396738:Mgarp              18432
4:134923592-134927671:D4Wsu53e         18037
4:155491361-155559269:Gnb1             22795
4:21847583-21876475:Sfrs18             12185
5:108388391-108432397:Pde6b            17839
5:142903115-142906754:Actb             10541
5:72603696-72642752:Cnga1              10725
6:115931927-115938829:Rho              80682
6:11900373-11907446:Ndufa4             11599
6:3993797-3997436:Gngt1                64556
6:51460434-51469894:Hnrnpa2b1          14264
6:52713729-52766780:Tax1bp1            12541
7:126795234-126800751:Aldoa            12031
8:46206797-46211284:Slc25a4            15352
8:64592558-64693040:Cpe                11149
9:107674477-107679592:Gnat1            25722
9:31149557-31211815:Aplp2              12596
9:59656368-59679375:Pkm                11147
9:59942771-59960659:Nr2e3              11118
9:62341293-62378812:Anp32a             12359
9:64922861-64951607:Slc24a1            13368
MT:10167-11544:mt-Nd4                  12406
MT:1094-2675:mt-Rnr2                  105402
MT:11742-13565:mt-Nd5                  19337
MT:14145-15288:mt-Cytb                 37963
MT:2751-3707:mt-Nd1                    20787
MT:3914-4951:mt-Nd2                    14016
MT:70-1024:mt-Rnr1                     10925
X:160768013-160799663:Rs1              15217
Length: 64, dtype: int64

Look at gene's median transcript count


In [22]:
median_transcripts_per_gene = table1_t.median()
median_transcripts_per_gene.head()


Out[22]:
gene
10:100015630-100100413:Kitl             0.0
10:100443902-100487350:Tmtc3            0.0
10:100488289-100573655:Cep290           0.0
10:100572274-100589259:4930430F08Rik    0.0
10:100592386-100618391:1700017N19Rik    0.0
dtype: float64

In [23]:
sns.distplot(median_transcripts_per_gene)
fig = plt.gcf()
fig.savefig('median_transcripts_per_gene.png')



In [24]:
data = median_transcripts_per_gene
mask = data > 0

sns.distplot(data[mask])
fig = plt.gcf()
fig.savefig('median_transcripts_per_gene_greater0.png')


Clean data matrix to be compatible with the cluster labels and identities

Currently, cells are labeled by their barcode, e.g. GCGCAACTGCTC, and genes are labeled by their chrom:start-end:symbol, e.g. 6:51460434-51469894:Hnrnpa2b1. But, in the supplementary data, the genes are all uppercase, e.g. HNRNPA2B1 (which is incorrect since this is mouse data.. ) and the barcodes have r1_ prepended before the id, e.g. r1_GCGCAACTGCTC.

So we need to clean the data to be compatible with this


In [29]:
gene_symbols = table1_t.columns.map(lambda x: x.split(':')[-1].upper())
gene_symbols.name = 'symbol'
table1_t.columnsmns = gene_symbols
table1_t.head()


Out[29]:
symbol KITL TMTC3 CEP290 4930430F08RIK 1700017N19RIK MGAT4C RASSF9 LRRIQ1 ADGB SLC6A15 ... EFNB1 PJA1 TMEM28 EDA EIF2S3Y GM20775 UTY DDX3Y KDM5D N-R5S1
symbol
GGCCGCAGTCCG 0 3 1 2 0 0 0 0 0 4 ... 0 8 1 0 0 0 0 0 0 0
CTTGTGCGGGAA 0 0 3 1 0 0 0 0 0 1 ... 0 9 0 0 2 0 1 5 0 0
GCGCAACTGCTC 1 0 0 2 0 4 0 0 0 3 ... 1 11 0 0 0 0 0 0 0 0
GATTGGGAGGCA 0 0 2 0 0 1 0 0 0 2 ... 1 2 0 0 3 0 1 0 0 0
CCTCCTAGTTGG 0 2 1 1 0 2 0 0 0 1 ... 0 3 0 0 0 0 0 0 0 0

5 rows × 20478 columns


In [30]:
barcodes = 'r1_' + table1_t.index
barcodes.name = 'barcode'
table1_t.index = barcodes
table1_t.head()


Out[30]:
symbol KITL TMTC3 CEP290 4930430F08RIK 1700017N19RIK MGAT4C RASSF9 LRRIQ1 ADGB SLC6A15 ... EFNB1 PJA1 TMEM28 EDA EIF2S3Y GM20775 UTY DDX3Y KDM5D N-R5S1
barcode
r1_GGCCGCAGTCCG 0 3 1 2 0 0 0 0 0 4 ... 0 8 1 0 0 0 0 0 0 0
r1_CTTGTGCGGGAA 0 0 3 1 0 0 0 0 0 1 ... 0 9 0 0 2 0 1 5 0 0
r1_GCGCAACTGCTC 1 0 0 2 0 4 0 0 0 3 ... 1 11 0 0 0 0 0 0 0 0
r1_GATTGGGAGGCA 0 0 2 0 0 1 0 0 0 2 ... 1 2 0 0 3 0 1 0 0 0
r1_CCTCCTAGTTGG 0 2 1 1 0 2 0 0 0 1 ... 0 3 0 0 0 0 0 0 0 0

5 rows × 20478 columns


In [32]:
table1_t.to_csv('expression_table1.csv')

In [ ]: