In [33]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Statistic model package for Generalized Linear Models
import statsmodels.api as sm
# Stats package for chi-square distribution
from scipy.stats import chi2
# Import operating system module
import os
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!
In [2]:
# Define function GetTopGene() to Rank gene by expression level and get the list of top expressed gene
def GetTopGene(mtx, top_num = 200):
"""
Sort expression level of all genes in a cell, and pick the top expressed genes for each cell.
Make the union list of the top expressed genes.
Usage: gene_list = GetTopGene(mtx, top_num)
Args:
mtx: Gene by cell count matrix
top_num: Number of top expressed genes to be included in the top list. Default = 200
"""
tglist = []
for name in mtx.columns:
mtx_sorted = mtx.sort_values(by=name, ascending=False)
tglist.extend(list(mtx_sorted.index[0:int(top_num)]))
uniqtopgenes = list(set(tglist))
return uniqtopgenes
In [3]:
# Define a function to take care the overall process from reading in files
# to saving the matrix with only top expressed genes.
def TopGene(inFile, compression=False, index=0, top_num = 200):
"""
Reading in filtered matrix, run GetTopGene() on the matrix, and save the matrix
with only the top expressed genes to a new file.
Usage: TopGene(inFile)
Args:
inFile: Absolute path of the file containing a filtered cell-by-gene or gene-by-cell matrix
compression: boolean. If True, file will be decompressed when reading in.
index: 0 or 1. Use 0 when the indexes are genes, 1 when it is not.
top_num: Number of top expressed genes to be included in the top list. Default = 200
"""
# Read in the DGE data sheet
if compression==True:
expression = pd.read_table(inFile, sep=',',index_col=0,compression='gzip')
else:
expression = pd.read_table(inFile, sep=',',index_col=0)
# Convert input file names to output file names
base = inFile.strip(".mtx")
out = base + "_top.mtx"
print("Reading in %s" % inFile)
if index==1:
expression = expression.T
# Get top genes with GetTopGene()
topgenes = GetTopGene(expression, top_num=100)
# How many genes are included in this gene list?
print("%s genes included in topgene list" % (len(topgenes)))
# Extract matrix with topgenes
topgene_exp = expression.loc[topgenes]
print("The shape of the top gene matrix is:", topgene_exp.shape)
# Save top gene matrix to file
topgene_exp.T.to_csv(out)
print("Topgene matrix is saved as %s" % out)
In [30]:
# Run TopGene() on filtered mtx files.
TopGene('Test_filtered.mtx', index=1)
In [4]:
TopGene('Test_filtered_lr_pos.mtx', index=1)
In [100]:
## Define all functions required
def calcCV(mtx, cv2_thres = 0.3, percentile = 95):
"""
Calculate coefficient of variations.
Use only genes with a coefficient variation over cv2_thres, and within percentile in all means.
Args:
mtx: dataframe
cv2_thres: float. Minimum of coefficient of variation required.
percentile: int in [0,100]. Percentile in mean
Return:
mean, cv2, fit_mean, fit_cv2,df
"""
nrow,ncol = mtx.shape
## Get the gene names (index) from the input matrix
dfindex=mtx.index.values.tolist()
mean = mtx.mean(axis=1)
var = mtx.var(axis=1)
cv2 = var / (mean ** 2)
minmean = np.percentile(np.asarray(mean[cv2 > cv2_thres]),percentile)
useforfit = mean > minmean
print(useforfit.sum()," of genes will be used for fit.")
fit_mean = mean[useforfit]
fit_cv2 = cv2[useforfit]
return mean,var, cv2, fit_mean, fit_cv2, ncol-1,dfindex
def fitCVtoModel(mean, fit_mean, fit_cv2):
"""
Fit coefficient of variation and mean to a model.
Use fit_mean and fit_cv2 calculated by calcCV()
"""
# Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
exog1 = sm.add_constant(np.asarray(1/fit_mean))
# Fit the dataset to a generalized linear model. Here used the default Gaussian.
fitresults = sm.GLM(fit_cv2, exog1, family=sm.families.Gaussian()).fit()
## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
a0,a1 = fitresults.params
print(fitresults.summary())
# Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
minX = np.min(np.log(mean[mean>0]))
maxX = np.max(np.log(mean))
exog_pred = np.exp(np.linspace(minX,maxX,1000))
endog_pred = a0 + (a1 / exog_pred)
return exog_pred, endog_pred, a0, a1
def rankHVG(mean,var, cv2, dfindex,a0, a1):
"""
Rank genes by variant to mean ratio and return the sorted dataframe
"""
# Order genes by significance. Then label the top genes on the graph.
## Predict CV square for each mean value, then compare the raw with the prediction
afit = a0 + (a1 / mean)
varFitRatio = var/(afit*mean ** 2)
## Stack mean, cv2, varFitRation as a ndarray
ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
## Convert ndarray into pandas dataframe, using dfindex as index
nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
## Sort the dataframe by varFitRatio column in descending order
nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)
return nddf_sorted
def plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num):
# Plot log transformed CV2 versus mean. Label the top genes.
fig = plt.figure(figsize=(6, 6))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
top_m = nddf_sorted["Mean"][0:top_num]
top_cv2 = nddf_sorted["CV2"][0:top_num]
plt.plot(np.log(top_m),np.log(top_cv2),'o',mec='red',mfc='none',mew=1)
title = "Variation over average of detected genes. Top " + str(top_num) + " genes marked."
return
def findHVG(mtx, cv2_thres = 0.3, percentile = 95, top_num = 100):
"""
Find highly variable genes using coefficient of variations.
Args:
mtx: A gene by cell matrix.
cv2_thres: float. Minimum of coefficient of variation required.
percentile: int in [0,100]. Percentile in mean
top_num: int. Number of top variable genes to be labeled and reported.
"""
mean,var, cv2, fit_mean, fit_cv2, df, dfindex = calcCV(mtx, cv2_thres, percentile)
exog_pred, endog_pred, a0, a1 = fitCVtoModel(mean, fit_mean, fit_cv2)
nddf_sorted = rankHVG(mean,var, cv2, dfindex, a0, a1)
plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num)
return nddf_sorted
In [98]:
# Read in filtered cell by gene matrix
inFile = 'Test_filtered.mtx'
mtx = pd.read_table(inFile, sep=',',index_col=0)
# Use gene by cell matrix as input for findHVG(). Transpose input matrix if required.
mtxT = mtx.T
mtxT.head()
Out[98]:
In [99]:
top_num = 100
outdf = findHVG(mtxT,cv2_thres=5,top_num=top_num)
In [97]:
# Save top highly variable genes to file
# Convert input file name into output file names
base = os.path.basename(inFile).split(sep=".")[0]
outfile = base + '_HVtop_' + str(top_num) + '.mtx'
# Save top gene matrix to file
topgene_mask = outdf.index[0:top_num].values.tolist()
topgene_mtx = mtx[topgene_mask]
topgene_mtx.to_csv(outfile)
In [ ]:
In [7]:
# Read in filtered cell by gene matrix
inFile = 'Test.dge.txt.gz'
expression = pd.read_table(inFile, sep='\t',index_col=0,compression='gzip')
print(expression.shape)
expression.head()
Out[7]:
In [8]:
# Name output files using input file name
base = inFile.strip(".dge.txt.gz")
num_top = 100
outfile = base + '_top_' + str(num_top) + '.mtx'
In [9]:
# Calculate coefficient of variations.
mean = expression.mean(axis=1)
var = np.var(expression.T)
cv2 = var / (mean ** 2)
In [23]:
minmeanforfit = np.percentile(np.asarray(mean[cv2 > 0.3]),95)
minmeanforfit
Out[23]:
In [24]:
useforfit = mean > minmeanforfit
gene_mean = mean[useforfit]
gene_var = var[useforfit]
gene_cv2 = cv2[useforfit]
In [31]:
useforfit.sum()
Out[31]:
In [ ]:
# Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
exog1 = sm.add_constant(np.asarray(1/gene_mean))
exog1
## Alternatively, use np.stack
### Make an array of ones. This will be used to estimate the intercept.
# ones = np.ones(3185)
# gene1overmean_array = np.asarray(1/gene_mean)
### Combine all ones and the gene_mean column together
# exog1 = np.stack((ones,gene1overmean_array),axis=1)
In [67]:
# Fit the dataset to a generalized linear model. Here used the default Gaussian.
fitresults = sm.GLM(gene_cv2, exog1, family=sm.families.Gaussian()).fit()
## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
a0,a1 = fitresults.params
print(fitresults.summary())
print(a0,a1)
In [68]:
# Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
minX = np.min(np.log(mean[mean>0]))
maxX = np.max(np.log(mean))
exog_pred = np.exp(np.linspace(minX,maxX,1000))
endog_pred = a0 + (a1 / exog_pred)
In [120]:
# Plot log transformed CV square over mean. Include the prediction with confident interval
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
## Calculate the intervals
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
Out[120]:
In [80]:
# Order genes by significance. Then label the top genes on the graph.
## Predict CV square for each mean value, then compare the raw with the prediction
afit = a0 + (a1 / mean)
varFitRatio = var/(afit*mean ** 2)
## Get the gene names (index) from the input matrix
dfindex = expression.index.values.tolist()
## Stack mean, cv2, varFitRation as a ndarray
ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
## Convert ndarray into pandas dataframe, using dfindex as index
nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
## Sort the dataframe by varFitRatio column in descending order
nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)
In [130]:
# Redo ploting. Label the top 100 genes.
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
top100m = nddf_sorted["Mean"][0:100]
top100cv2 = nddf_sorted["CV2"][0:100]
plt.plot(np.log(top100m),np.log(top100cv2),'o',mec='red',mfc='none',mew=1)
Out[130]:
In [138]:
# Save top gene matrix to file
topgene_mask = nddf_sorted.index[0:100].values.tolist()
topgene_mtx = expression.loc[topgene_mask]
topgene_mtx.to_csv(outfile)