In [4]:

    
import macosko2015
macosko2015.__version__









    Out[4]:





'1.0.0'



In [3]:

    
macosko2015.load_amacrine()









    



---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-50a2e486a6a4> in <module>()
----> 1 macosko2015.load_amacrine()

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/macosko2015/main.py in load_amacrine(format)
     57     if format == 'pandas':
     58         expression = pd.read_csv(
---> 59             os.path.join(folder, 'amacrine_expression.csv'))
     60         cell_metadata = pd.read_csv(
     61             os.path.join(folder, 'amacrine_cell_metadata.csv'))

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'/Users/olgabot/anaconda3/envs/cshl-sca-2017/lib/python3.6/site-packages/macosko2015/data/05_make_rentina_subsets_for_teaching/amacrine_expression.csv' does not exist



In [ ]:

    
import os
import common

# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder



In [6]:

    
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [7]:

    
table1 = pd.read_table('/Users/olgabot/Downloads/GSE63473_RAW/GSM1626793_P14Retina_1.digital_expression.txt.gz', 
                       compression='gzip', index_col=0)
print(table1.shape)
table1.head()









    



(20478, 6600)






    Out[7]:







  
    
      
      GGCCGCAGTCCG
      CTTGTGCGGGAA
      GCGCAACTGCTC
      GATTGGGAGGCA
      CCTCCTAGTTGG
      AGTCAAGCCCTC
      GTGCCGCCTCTC
      CCTGTGACACAC
      AATCTCGTTAAT
      GATTTCCTCTGA
      ...
      GATTTAATGGTA
      TGTAAGGATCCG
      GAGTGGCTTGAT
      GCATCTTTCAGG
      ACACGAGTTTGG
      CACCCAGTTTCG
      CCTGGAGAGTTT
      TCTTCACTCTTA
      GCCGTCTTACTA
      GACCAAACTAAT
    
    
      gene
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      10:100015630-100100413:Kitl
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10:100443902-100487350:Tmtc3
      3
      0
      0
      0
      2
      0
      1
      1
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10:100488289-100573655:Cep290
      1
      3
      0
      2
      1
      18
      10
      3
      4
      3
      ...
      4
      0
      3
      0
      0
      0
      0
      2
      0
      1
    
    
      10:100572274-100589259:4930430F08Rik
      2
      1
      2
      0
      1
      1
      0
      1
      1
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10:100592386-100618391:1700017N19Rik
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 6600 columns



In [8]:

    
data = table1.values
mask = data > 5

sns.distplot(data[mask].flat, kde=False)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x116137438>



In [9]:

    
mask = table1 == 0
sns.heatmap(table1, xticklabels=[], yticklabels=[], mask=mask)
fig = plt.gcf()
fig.savefig('table1_heatmap.png')



In [10]:

    
n_transcripts_per_cell = table1.sum()
n_transcripts_per_cell.head()









    Out[10]:





GGCCGCAGTCCG    37519
CTTGTGCGGGAA    32077
GCGCAACTGCTC    28162
GATTGGGAGGCA    20384
CCTCCTAGTTGG    19571
dtype: int64



In [11]:

    
sns.distplot(n_transcripts_per_cell)









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x11c13bd68>



In [12]:

    
n_transcripts_per_cell.describe()









    Out[12]:





count     6600.000000
mean      1332.358939
std       1572.921791
min        466.000000
25%        578.000000
50%        821.000000
75%       1452.250000
max      37519.000000
dtype: float64



In [13]:

    
n_expressed_genes_per_cell = (table1 > 0).sum()
n_expressed_genes_per_cell.head()









    Out[13]:





GGCCGCAGTCCG    7254
CTTGTGCGGGAA    6945
GCGCAACTGCTC    6404
GATTGGGAGGCA    5751
CCTCCTAGTTGG    5785
dtype: int64



In [14]:

    
sns.distplot(n_expressed_genes_per_cell)









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x120d0add8>



In [15]:

    
greater500 = (table1 > 100).sum(axis=1) > 1
greater500.sum()









    Out[15]:





23



In [16]:

    
table1_t = table1.T
print(table1_t.shape)
table1_t.head()









    



(6600, 20478)






    Out[16]:







  
    
      gene
      10:100015630-100100413:Kitl
      10:100443902-100487350:Tmtc3
      10:100488289-100573655:Cep290
      10:100572274-100589259:4930430F08Rik
      10:100592386-100618391:1700017N19Rik
      10:101681487-102391469:Mgat4c
      10:102512222-102546560:Rassf9
      10:103063198-103236322:Lrriq1
      10:10335703-10472326:Adgb
      10:103367808-103419378:Slc6a15
      ...
      X:99136130-99148991:Efnb1
      X:99465734-99471273:Pja1
      X:99821021-99848790:Tmem28
      X:99975606-100400762:Eda
      Y:1010543-1028847:Eif2s3y
      Y:10640942-10643315:Gm20775
      Y:1096861-1245759:Uty
      Y:1260715-1286613:Ddx3y
      Y:897788-943811:Kdm5d
      Y:991630-991748:n-R5s1
    
  
  
    
      GGCCGCAGTCCG
      0
      3
      1
      2
      0
      0
      0
      0
      0
      4
      ...
      0
      8
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      CTTGTGCGGGAA
      0
      0
      3
      1
      0
      0
      0
      0
      0
      1
      ...
      0
      9
      0
      0
      2
      0
      1
      5
      0
      0
    
    
      GCGCAACTGCTC
      1
      0
      0
      2
      0
      4
      0
      0
      0
      3
      ...
      1
      11
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      GATTGGGAGGCA
      0
      0
      2
      0
      0
      1
      0
      0
      0
      2
      ...
      1
      2
      0
      0
      3
      0
      1
      0
      0
      0
    
    
      CCTCCTAGTTGG
      0
      2
      1
      1
      0
      2
      0
      0
      0
      1
      ...
      0
      3
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 20478 columns



In [17]:

    
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()









    Out[17]:





gene
10:100015630-100100413:Kitl              131
10:100443902-100487350:Tmtc3             434
10:100488289-100573655:Cep290           4389
10:100572274-100589259:4930430F08Rik     527
10:100592386-100618391:1700017N19Rik      16
dtype: int64



In [18]:

    
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()









    Out[18]:





gene
10:100015630-100100413:Kitl              131
10:100443902-100487350:Tmtc3             434
10:100488289-100573655:Cep290           4389
10:100572274-100589259:4930430F08Rik     527
10:100592386-100618391:1700017N19Rik      16
dtype: int64



In [19]:

    
sns.distplot(n_transcripts_per_gene)









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x11d1e0cf8>

Subset the genes based on their total number of transcripts



In [20]:

    
(n_transcripts_per_gene > 1e3).sum()









    Out[20]:





2041



In [21]:

    
n_transcripts_per_gene[n_transcripts_per_gene > 1e4]









    Out[21]:





gene
10:108497650-109010982:Syt1            17227
10:86690209-86705509:Hsp90b1           12175
10:98915152-99026143:Atp2b1            10137
11:106780355-106789185:Ddx5            16403
11:116021912-116027962:H3f3b           13245
11:120447605-120453500:Pde6g           28298
11:62551171-62553213:Ubb               11151
11:67695326-67703333:Rcvrn             22902
11:78343482-78349164:Unc119            13936
11:94287890-94321988:Luc7l3            11121
12:100199435-100209806:Calm1           34885
12:109541001-109571726:Meg3            18699
12:110690605-110702728:Hsp90aa1        17151
12:111669355-111672338:Ckb             12471
14:52110704-52163546:Rpgrip1           22525
14:55518978-55524981:Nrl               10515
14:64588115-64593959:A930011O12Rik     12209
16:91647506-91679221:Son               12407
16:94370618-94469222:Ttc3              10199
16:96120618-96127729:Hmgn1             19479
17:28351515-28365182:Tulp1             20794
17:45567775-45573271:Hsp90ab1          11576
17:46910459-46924933:Prph2             22544
19:5795690-5802672:Malat1             158364
19:8927382-8929356:Rom1                14097
19:9982703-9985092:Fth1                10009
1:150319417-150333905:Pdc              37632
1:178323074-178337797:Hnrnpu           11532
1:4343507-4360314:Rp1                  22067
1:87803680-87845158:Sag                53013
                                       ...  
2:79452521-79456751:Neurod1            12530
3:51388415-51396738:Mgarp              18432
4:134923592-134927671:D4Wsu53e         18037
4:155491361-155559269:Gnb1             22795
4:21847583-21876475:Sfrs18             12185
5:108388391-108432397:Pde6b            17839
5:142903115-142906754:Actb             10541
5:72603696-72642752:Cnga1              10725
6:115931927-115938829:Rho              80682
6:11900373-11907446:Ndufa4             11599
6:3993797-3997436:Gngt1                64556
6:51460434-51469894:Hnrnpa2b1          14264
6:52713729-52766780:Tax1bp1            12541
7:126795234-126800751:Aldoa            12031
8:46206797-46211284:Slc25a4            15352
8:64592558-64693040:Cpe                11149
9:107674477-107679592:Gnat1            25722
9:31149557-31211815:Aplp2              12596
9:59656368-59679375:Pkm                11147
9:59942771-59960659:Nr2e3              11118
9:62341293-62378812:Anp32a             12359
9:64922861-64951607:Slc24a1            13368
MT:10167-11544:mt-Nd4                  12406
MT:1094-2675:mt-Rnr2                  105402
MT:11742-13565:mt-Nd5                  19337
MT:14145-15288:mt-Cytb                 37963
MT:2751-3707:mt-Nd1                    20787
MT:3914-4951:mt-Nd2                    14016
MT:70-1024:mt-Rnr1                     10925
X:160768013-160799663:Rs1              15217
Length: 64, dtype: int64

Look at gene's median transcript count



In [22]:

    
median_transcripts_per_gene = table1_t.median()
median_transcripts_per_gene.head()









    Out[22]:





gene
10:100015630-100100413:Kitl             0.0
10:100443902-100487350:Tmtc3            0.0
10:100488289-100573655:Cep290           0.0
10:100572274-100589259:4930430F08Rik    0.0
10:100592386-100618391:1700017N19Rik    0.0
dtype: float64



In [23]:

    
sns.distplot(median_transcripts_per_gene)
fig = plt.gcf()
fig.savefig('median_transcripts_per_gene.png')



In [24]:

    
data = median_transcripts_per_gene
mask = data > 0

sns.distplot(data[mask])
fig = plt.gcf()
fig.savefig('median_transcripts_per_gene_greater0.png')

Clean data matrix to be compatible with the cluster labels and identities

Currently, cells are labeled by their barcode, e.g. GCGCAACTGCTC, and genes are labeled by their chrom:start-end:symbol, e.g. 6:51460434-51469894:Hnrnpa2b1. But, in the supplementary data, the genes are all uppercase, e.g. HNRNPA2B1 (which is incorrect since this is mouse data.. ) and the barcodes have r1_ prepended before the id, e.g. r1_GCGCAACTGCTC.

So we need to clean the data to be compatible with this



In [29]:

    
gene_symbols = table1_t.columns.map(lambda x: x.split(':')[-1].upper())
gene_symbols.name = 'symbol'
table1_t.columnsmns = gene_symbols
table1_t.head()









    Out[29]:







  
    
      symbol
      KITL
      TMTC3
      CEP290
      4930430F08RIK
      1700017N19RIK
      MGAT4C
      RASSF9
      LRRIQ1
      ADGB
      SLC6A15
      ...
      EFNB1
      PJA1
      TMEM28
      EDA
      EIF2S3Y
      GM20775
      UTY
      DDX3Y
      KDM5D
      N-R5S1
    
    
      symbol
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      GGCCGCAGTCCG
      0
      3
      1
      2
      0
      0
      0
      0
      0
      4
      ...
      0
      8
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      CTTGTGCGGGAA
      0
      0
      3
      1
      0
      0
      0
      0
      0
      1
      ...
      0
      9
      0
      0
      2
      0
      1
      5
      0
      0
    
    
      GCGCAACTGCTC
      1
      0
      0
      2
      0
      4
      0
      0
      0
      3
      ...
      1
      11
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      GATTGGGAGGCA
      0
      0
      2
      0
      0
      1
      0
      0
      0
      2
      ...
      1
      2
      0
      0
      3
      0
      1
      0
      0
      0
    
    
      CCTCCTAGTTGG
      0
      2
      1
      1
      0
      2
      0
      0
      0
      1
      ...
      0
      3
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 20478 columns



In [30]:

    
barcodes = 'r1_' + table1_t.index
barcodes.name = 'barcode'
table1_t.index = barcodes
table1_t.head()









    Out[30]:







  
    
      symbol
      KITL
      TMTC3
      CEP290
      4930430F08RIK
      1700017N19RIK
      MGAT4C
      RASSF9
      LRRIQ1
      ADGB
      SLC6A15
      ...
      EFNB1
      PJA1
      TMEM28
      EDA
      EIF2S3Y
      GM20775
      UTY
      DDX3Y
      KDM5D
      N-R5S1
    
    
      barcode
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      r1_GGCCGCAGTCCG
      0
      3
      1
      2
      0
      0
      0
      0
      0
      4
      ...
      0
      8
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_CTTGTGCGGGAA
      0
      0
      3
      1
      0
      0
      0
      0
      0
      1
      ...
      0
      9
      0
      0
      2
      0
      1
      5
      0
      0
    
    
      r1_GCGCAACTGCTC
      1
      0
      0
      2
      0
      4
      0
      0
      0
      3
      ...
      1
      11
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      r1_GATTGGGAGGCA
      0
      0
      2
      0
      0
      1
      0
      0
      0
      2
      ...
      1
      2
      0
      0
      3
      0
      1
      0
      0
      0
    
    
      r1_CCTCCTAGTTGG
      0
      2
      1
      1
      0
      2
      0
      0
      0
      1
      ...
      0
      3
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 20478 columns



In [32]:

    
table1_t.to_csv('expression_table1.csv')



In [ ]:

	GGCCGCAGTCCG	CTTGTGCGGGAA	GCGCAACTGCTC	GATTGGGAGGCA	CCTCCTAGTTGG	AGTCAAGCCCTC	GTGCCGCCTCTC	CCTGTGACACAC	AATCTCGTTAAT	GATTTCCTCTGA	...	GATTTAATGGTA	TGTAAGGATCCG	GAGTGGCTTGAT	GCATCTTTCAGG	ACACGAGTTTGG	CACCCAGTTTCG	CCTGGAGAGTTT	TCTTCACTCTTA	GCCGTCTTACTA	GACCAAACTAAT
gene
10:100015630-100100413:Kitl	0	0	1	0	0	0	0	1	0	0	...	0	0	0	0	0	0	0	0	0	0
10:100443902-100487350:Tmtc3	3	0	0	0	2	0	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
10:100488289-100573655:Cep290	1	3	0	2	1	18	10	3	4	3	...	4	0	3	0	0	0	0	2	0	1
10:100572274-100589259:4930430F08Rik	2	1	2	0	1	1	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
10:100592386-100618391:1700017N19Rik	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

gene	10:100015630-100100413:Kitl	10:100443902-100487350:Tmtc3	10:100488289-100573655:Cep290	10:100572274-100589259:4930430F08Rik	10:101681487-102391469:Mgat4c	10:103367808-103419378:Slc6a15	...	X:99136130-99148991:Efnb1	X:99465734-99471273:Pja1	X:99821021-99848790:Tmem28	Y:1010543-1028847:Eif2s3y	Y:1096861-1245759:Uty	Y:1260715-1286613:Ddx3y
GGCCGCAGTCCG	0	3	1	2	0	4	...	0	8	1	0	0	0
CTTGTGCGGGAA	0	0	3	1	0	1	...	0	9	0	2	1	5
GCGCAACTGCTC	1	0	0	2	4	3	...	1	11	0	0	0	0
GATTGGGAGGCA	0	0	2	0	1	2	...	1	2	0	3	1	0
CCTCCTAGTTGG	0	2	1	1	2	1	...	0	3	0	0	0	0

symbol	KITL	TMTC3	CEP290	4930430F08RIK	1700017N19RIK	MGAT4C	RASSF9	LRRIQ1	ADGB	SLC6A15	...	EFNB1	PJA1	TMEM28	EDA	EIF2S3Y	GM20775	UTY	DDX3Y	KDM5D	N-R5S1
symbol
GGCCGCAGTCCG	0	3	1	2	0	0	0	0	0	4	...	0	8	1	0	0	0	0	0	0	0
CTTGTGCGGGAA	0	0	3	1	0	0	0	0	0	1	...	0	9	0	0	2	0	1	5	0	0
GCGCAACTGCTC	1	0	0	2	0	4	0	0	0	3	...	1	11	0	0	0	0	0	0	0	0
GATTGGGAGGCA	0	0	2	0	0	1	0	0	0	2	...	1	2	0	0	3	0	1	0	0	0
CCTCCTAGTTGG	0	2	1	1	0	2	0	0	0	1	...	0	3	0	0	0	0	0	0	0	0

symbol	KITL	TMTC3	CEP290	4930430F08RIK	1700017N19RIK	MGAT4C	RASSF9	LRRIQ1	ADGB	SLC6A15	...	EFNB1	PJA1	TMEM28	EDA	EIF2S3Y	GM20775	UTY	DDX3Y	KDM5D	N-R5S1
barcode
r1_GGCCGCAGTCCG	0	3	1	2	0	0	0	0	0	4	...	0	8	1	0	0	0	0	0	0	0
r1_CTTGTGCGGGAA	0	0	3	1	0	0	0	0	0	1	...	0	9	0	0	2	0	1	5	0	0
r1_GCGCAACTGCTC	1	0	0	2	0	4	0	0	0	3	...	1	11	0	0	0	0	0	0	0	0
r1_GATTGGGAGGCA	0	0	2	0	0	1	0	0	0	2	...	1	2	0	0	3	0	1	0	0	0
r1_CCTCCTAGTTGG	0	2	1	1	0	2	0	0	0	1	...	0	3	0	0	0	0	0	0	0	0