In [40]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Get basename of input file
import os
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!
In [3]:
# Read in the DGE data sheet
inFile = 'Test.dge.txt.gz'
expression = pd.read_table(inFile,
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0,
# Tells pandas to decompress the gzipped file if required
compression='gzip')
# Print out the shape and the top 5 rows of the newly assigned dataframe
print(expression.shape)
expression.head()
Out[3]:
In [45]:
# Convert input file name into output file names
base = os.path.basename(inFile).split(sep=".")[0]
out_filtered = base + "_filtered.mtx"
out_filtered_lr = base + "_filtered_lr.mtx"
out_filtered_lr_pos = base + "_filtered_lr_pos.mtx"
In [5]:
# See the raw distributions of data
## e.g. for gene as row name and cell as column name:
## See the distribution of gene expression level within one cell for each of the cells listed?
sns.boxplot(expression)
## Get current figure, then save to file
#fig = plt.gcf()
#fig.savefig(NameOfFigure)
Out[5]:
In [6]:
# Log transform the data if needed
expression_logged = np.log2(expression+1)
expression_logged.head()
Out[6]:
In [7]:
# Plot log transformed data with a designated size
fig, ax=plt.subplots(figsize = (10,5))
sns.boxplot(expression_logged)
# gcf = Get current figure
#fig = plt.gcf()
#fig.savefig(NameOfFigure)
Out[7]:
In [8]:
# More plots!
## 1) Hist of genes falling in bins of number of cells expressing one particular gene;
bool_exp = expression > 0
num_cells_per_gene = bool_exp.sum(axis=1)
## 2) Hist of genes falling in bins of average expression across cells;
#gene_rank_by_ave = expression.rank(axis=1,method='average').index
exp_mean = expression.mean(axis=1)
## 3) Hist of cells falling in bins of aggregate expression of all genes;
exp_all_per_cell = expression.sum()
## 4) Hist of cells falling in bins of aggregate expression of housekeeping genes.
# Assign gene list to housekeeping gene list
hglist = ["ACTB","B2M","HNRPLL","HPRT","PSMB2","PSMB4","PPIA","PRPS1","PRPS1L1","PRPS1L3","PRPS2","PRPSAP1","PRPSAP2","RPL10","RPL10A","RPL10L","RPL11","RPL12","RPL13","RPL14","RPL15","RPL17","RPL18","RPL19","RPL21","RPL22","RPL22L1","RPL23","RPL24","RPL26","RPL27","RPL28","RPL29","RPL3","RPL30","RPL32","RPL34","RPL35","RPL36","RPL37","RPL38","RPL39","RPL39L","RPL3L","RPL4","RPL41","RPL5","RPL6","RPL7","RPL7A","RPL7L1","RPL8","RPL9","RPLP0","RPLP1","RPLP2","RPS10","RPS11","RPS12","RPS13","RPS14","RPS15","RPS15A","RPS16","RPS17","RPS18","RPS19","RPS20","RPS21","RPS24","RPS25","RPS26","RPS27","RPS27A","RPS27L","RPS28","RPS29","RPS3","RPS3A","RPS4X","RPS5","RPS6","RPS6KA1","RPS6KA2","RPS6KA3","RPS6KA4","RPS6KA5","RPS6KA6","RPS6KB1","RPS6KB2","RPS6KC1","RPS6KL1","RPS7","RPS8","RPS9","RPSA","TRPS1","UB"]
exp_hkg_agg = expression.loc[hglist].sum()
# Four axes, returned as a 2-d array. e.g. for showing plots in 4
f, axarr = plt.subplots(2, 2,figsize = (8,8))
axarr[0, 0].hist(num_cells_per_gene,range=(0,100),bins=50,log=True)
axarr[0, 0].set_title('How many genes can have more than # cell expression?')
axarr[0, 1].hist(exp_mean, bins=50, log=True,color='green')
axarr[0, 1].set_title('Average expression across cells')
axarr[1, 0].hist(exp_all_per_cell, bins=50, log=False,color='red')
axarr[1, 0].set_title('Total transcripts detected in cell')
axarr[1, 1].hist(exp_hkg_agg, bins=50, log=False,color='red')
axarr[1, 1].set_title('Housekeeping transcripts detected in cell')
Out[8]:
In [9]:
## Remove low expression and high expression cells
### Remove cells that does not have certain number of transcripts detected. matrix is transposed.
mask0expcell = expression.T.sum(axis=1) > 1000
exp0cellrmd = expression.T[mask0expcell]
print(exp0cellrmd.shape)
exp0cellrmd.head()
Out[9]:
In [10]:
### Remove cells that have more than certain number of transcripts detected.
maskdoublet = exp0cellrmd.sum(axis=1) < 5000
exp_filtered = exp0cellrmd[maskdoublet]
print(exp_filtered.shape)
exp_filtered.head()
Out[10]:
In [11]:
# Apply filtering for genes and cells
## Remove low-expression genes. See below for the post-filtering gene number estimation. Here using "as least one count in more than 10 cells"
mask0exp = (exp_filtered.T > 0).sum(axis=1) >= 10
mask0exp
exp = exp_filtered.T[mask0exp]
print(exp.shape)
exp.head()
Out[11]:
In [12]:
# Estimate number of gene left for different filtering criteria
# Define a function for calculation
def numGene(mtx,reads_num=5, cell_num=50):
"""
Estimate number of gene left by applying different filtering criteria.
For each pair of expression reads and cell number, calculate the genes that can
pass the criteria of have A number of reads detected in at least more than
B number of cells.
Usage: numGene(reads_num, cell_num,mtx)
Args:
reads_num is the upper number of expression level will be tested (count by 1).
cell_num is the upper number of cell number will be set as threshold (count by 10).
mtx is the data matrix input gene X cell
"""
expreads = list(range(0,reads_num))
cellnum = list(range(10,cell_num + 1,10))
genenumdict = {}
for i in expreads:
z = []
for j in cellnum:
gmask = (mtx > i).sum(axis=1) >= j
genenum = gmask.sum()
z.append(genenum)
genenumdict[i] = z
return genenumdict
In [13]:
# Run numGene() on the matrix
genenumdict = numGene(expression)
In [14]:
# See how the output gene number estimation looks
genenumdict
Out[14]:
In [15]:
# Define a function to plot 3D distribution of the gene number
def plotnumGene(genenumdict, reads_num=5, cell_num=50):
cellnum = list(range(10,cell_num + 1,10))
x = np.asarray(cellnum)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
for i in sorted(genenumdict):
y = np.ones(x.size)*i
z = genenumdict[i]
color_code = (i+0.1)/reads_num
ax.plot(y,x,z)
plt.show()
return
In [16]:
# Run plotnumGene() to plot
plotnumGene(numGene(expression))
In [17]:
# Choose the criteria for gene filtering and apply that to the data matrix
In [20]:
# %load r_pca.py
from __future__ import division, print_function
import numpy as np
try:
from pylab import plt
except ImportError:
print('Unable to import pylab. R_pca.plot_fit() will not work.')
try:
# Python 2: 'xrange' is the iterative version
range = xrange
except NameError:
# Python 3: 'range' is iterative - no need for 'xrange'
pass
class R_pca:
def __init__(self, D, mu=None, lmbda=None):
self.D = D
self.S = np.zeros(self.D.shape)
self.Y = np.zeros(self.D.shape)
if mu:
self.mu = mu
else:
self.mu = np.prod(self.D.shape) / (4 * self.norm_p(self.D, 2))
self.mu_inv = 1 / self.mu
if lmbda:
self.lmbda = lmbda
else:
self.lmbda = 1 / np.sqrt(np.max(self.D.shape))
@staticmethod
def norm_p(M, p):
return np.sum(np.power(M, p))
@staticmethod
def shrink(M, tau):
return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))
def svd_threshold(self, M, tau):
U, S, V = np.linalg.svd(M, full_matrices=False)
return np.dot(U, np.dot(np.diag(self.shrink(S, tau)), V))
def fit(self, tol=None, max_iter=1000, iter_print=100):
iter = 0
err = np.Inf
Sk = self.S
Yk = self.Y
Lk = np.zeros(self.D.shape)
if tol:
_tol = tol
else:
_tol = 1E-7 * self.norm_p(np.abs(self.D), 2)
while (err > _tol) and iter < max_iter:
Lk = self.svd_threshold(
self.D - Sk + self.mu_inv * Yk, self.mu_inv)
Sk = self.shrink(
self.D - Lk + (self.mu_inv * Yk), self.mu_inv * self.lmbda)
Yk = Yk + self.mu * (self.D - Lk - Sk)
err = self.norm_p(np.abs(self.D - Lk - Sk), 2)
iter += 1
if (iter % iter_print) == 0 or iter == 1 or iter > max_iter or err <= _tol:
print('iteration: {0}, error: {1}'.format(iter, err))
self.L = Lk
self.S = Sk
return Lk, Sk
def plot_fit(self, size=None, tol=0.1, axis_on=True):
n, d = self.D.shape
if size:
nrows, ncols = size
else:
sq = np.ceil(np.sqrt(n))
nrows = int(sq)
ncols = int(sq)
ymin = np.nanmin(self.D)
ymax = np.nanmax(self.D)
print('ymin: {0}, ymax: {1}'.format(ymin, ymax))
numplots = np.min([n, nrows * ncols])
plt.figure()
for n in range(numplots):
plt.subplot(nrows, ncols, n + 1)
plt.ylim((ymin - tol, ymax + tol))
plt.plot(self.L[n, :] + self.S[n, :], 'r')
plt.plot(self.L[n, :], 'b')
if not axis_on:
plt.axis('off')
In [21]:
# use R_pca to estimate the degraded data as L + S, where L is low rank, and S is sparse
rpca = R_pca(np.array(exp.T))
L, S = rpca.fit(max_iter=10000, iter_print=100)
# visually inspect results (requires matplotlib)
rpca.plot_fit()
plt.show()
# Save to dataframe
exp_lr = pd.DataFrame(L, index=exp.T.index, columns=exp.T.columns)
print(exp_lr.shape)
exp_lr.head()
Out[21]:
In [33]:
# Replace all negative values with zero
exp_lr_pos = exp_lr[exp_lr > 0].fillna(value=0)
print("Shape of the filtered lowrank matrix without negative value is: ", exp_lr_pos.shape)
exp_lr_pos.head()
Out[33]:
In [46]:
# Save filtered matrix and the low rank matrix to file
exp.T.to_csv(out_filtered)
exp_lr_pos.to_csv(out_filtered_lr_pos)