In [4]:
# Import the macosko2015 package and show which version is installed
import macosko2015
macosko2015.__version__
Out[4]:
In [3]:
# Call the package's amacrine loader; the return value is not bound to a name
# NOTE(review): presumably returns the amacrine-cell subset of the dataset — confirm against macosko2015
macosko2015.load_amacrine()
In [ ]:
import os
import common
# Assign notebook and folder names
# NOTE(review): no later cell visible in this notebook reads figure_folder or
# data_folder — confirm whether the saved figures/CSV were meant to go there.
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders (parents included; no error if they already exist).
# Pure-Python equivalent of `mkdir -p` that also copes with spaces in the
# path and does not depend on a POSIX shell being available.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)
In [6]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [7]:
# Load the gzipped, tab-delimited digital expression table; the first column
# of the file becomes the row index.
# NOTE(review): absolute, user-specific path — this cell only runs where the
# file exists at exactly this location.
table1_path = '/Users/olgabot/Downloads/GSE63473_RAW/GSM1626793_P14Retina_1.digital_expression.txt.gz'
table1 = pd.read_table(table1_path, index_col=0, compression='gzip')
print(table1.shape)
table1.head()
Out[7]:
In [8]:
# Histogram of the individual count values, restricted to entries greater than 5
data = table1.values
mask = data > 5
# kde=False: draw the histogram only, without a density curve
sns.distplot(data[mask].flat, kde=False)
Out[8]:
In [9]:
# Heatmap of the whole table with zero entries masked out and tick labels
# suppressed; the figure is written to the current working directory.
mask = table1 == 0
fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[], ax=ax)
fig.savefig('table1_heatmap.png')
In [10]:
# Column-wise totals of table1: one summed count per cell barcode
n_transcripts_per_cell = table1.sum()
n_transcripts_per_cell.head()
Out[10]:
In [11]:
# Distribution of the summed counts per cell
sns.distplot(n_transcripts_per_cell)
Out[11]:
In [12]:
# Summary statistics of the summed counts per cell
n_transcripts_per_cell.describe()
Out[12]:
In [13]:
# For each cell (column), count how many genes (rows) have a value above zero
n_expressed_genes_per_cell = (table1 > 0).sum()
n_expressed_genes_per_cell.head()
Out[13]:
In [14]:
# Distribution of the number of genes with nonzero counts per cell
sns.distplot(n_expressed_genes_per_cell)
Out[14]:
In [15]:
# Flag rows (genes) that exceed a count of 100 in more than one cell, then
# count how many rows are flagged.
# NOTE(review): the variable is named greater500 but the threshold used is
# 100 — confirm which of the two was intended.
greater500 = (table1 > 100).sum(axis=1) > 1
greater500.sum()
Out[15]:
In [16]:
# Transpose so that rows are cells (barcodes) and columns are genes
table1_t = table1.T
print(table1_t.shape)
table1_t.head()
Out[16]:
In [17]:
# Column-wise totals of the transposed table: one summed count per gene
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()
Out[17]:
In [18]:
# Column-wise totals of the transposed table: one summed count per gene
# NOTE(review): this cell repeats the preceding cell verbatim and recomputes
# the same value — it looks like a leftover duplicate.
n_transcripts_per_gene = table1_t.sum()
n_transcripts_per_gene.head()
Out[18]:
In [19]:
# Distribution of the summed counts per gene
sns.distplot(n_transcripts_per_gene)
Out[19]:
In [20]:
# Number of genes whose summed count exceeds 1,000
(n_transcripts_per_gene > 1e3).sum()
Out[20]:
In [21]:
# Show the genes whose summed count exceeds 10,000
n_transcripts_per_gene[n_transcripts_per_gene > 1e4]
Out[21]:
In [22]:
# Median count of each gene (column of table1_t) taken across all cells
median_transcripts_per_gene = table1_t.median()
median_transcripts_per_gene.head()
Out[22]:
In [23]:
# Distribution of the per-gene medians, saved to the current working directory
fig, ax = plt.subplots()
sns.distplot(median_transcripts_per_gene, ax=ax)
fig.savefig('median_transcripts_per_gene.png')
In [24]:
# Same distribution as above, restricted to genes whose median is above zero;
# the figure is saved to the current working directory.
data = median_transcripts_per_gene
mask = data > 0
fig, ax = plt.subplots()
sns.distplot(data[mask], ax=ax)
fig.savefig('median_transcripts_per_gene_greater0.png')
Currently, cells are labeled by their barcode, e.g. `GCGCAACTGCTC`, and genes are labeled by their `chrom:start-end:symbol`, e.g. `6:51460434-51469894:Hnrnpa2b1`. But in the supplementary data, the gene symbols are all uppercase, e.g. `HNRNPA2B1` (which is incorrect, since this is mouse data), and the barcodes have `r1_` prepended to the id, e.g. `r1_GCGCAACTGCTC`.

So we need to clean the row and column labels of this table to be compatible with the supplementary data.
In [29]:
# Reduce each 'chrom:start-end:symbol' column label to its upper-cased symbol
# so the gene names match the supplementary data.
gene_symbols = table1_t.columns.map(lambda x: x.split(':')[-1].upper())
gene_symbols.name = 'symbol'
# Must be assigned to `.columns`: setting any other attribute name only
# attaches a stray attribute and leaves the column labels untouched.
table1_t.columns = gene_symbols
table1_t.head()
Out[29]:
In [30]:
# Prefix every cell barcode with 'r1_' to match the supplementary data.
# The labels are built from the untouched table1.columns (the same labels
# table1_t.index starts with, since table1_t is table1 transposed) so that
# re-running this cell does not stack a second 'r1_' prefix.
barcodes = 'r1_' + table1.columns
barcodes.name = 'barcode'
table1_t.index = barcodes
table1_t.head()
Out[30]:
In [32]:
# Write table1_t (cells as rows, genes as columns) to a CSV file in the
# current working directory
table1_t.to_csv('expression_table1.csv')
In [ ]: