In [33]:
    
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Statistic model package for Generalized Linear Models
import statsmodels.api as sm
# Stats package for chi-square distribution
from scipy.stats import chi2
# Import operating system module
import os
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!
    
In [2]:
    
# Define function GetTopGene() to Rank gene by expression level and get the list of top expressed gene 
def GetTopGene(mtx, top_num = 200):
    """
    Sort expression level of all genes in a cell, and pick the top expressed genes for each cell. 
    Make the union list of the top expressed genes.
    Usage: gene_list = GetTopGene(mtx, top_num)
    Args:
        mtx: Gene by cell count matrix
        top_num: Number of top expressed genes to be included in the top list. Default = 200
    """
    tglist = []
    for name in mtx.columns:
        mtx_sorted = mtx.sort_values(by=name, ascending=False)
        tglist.extend(list(mtx_sorted.index[0:int(top_num)]))
    uniqtopgenes = list(set(tglist))
    return uniqtopgenes
    
In [3]:
    
# Define a function to take care the overall process from reading in files 
# to saving the matrix with only top expressed genes.
def TopGene(inFile, compression=False, index=0, top_num = 200):
    """
    Reading in filtered matrix, run GetTopGene() on the matrix, and save the matrix 
    with only the top expressed genes to a new file.
    Usage: TopGene(inFile)
    Args:
        inFile: Absolute path of the file containing a filtered cell-by-gene or gene-by-cell matrix
        compression: boolean. If True, file will be decompressed when reading in.
        index: 0 or 1. Use 0 when the indexes are genes, 1 when it is not.
        top_num: Number of top expressed genes to be included in the top list. Default = 200
    """
    # Read in the DGE data sheet
    if compression==True:
        expression = pd.read_table(inFile, sep=',',index_col=0,compression='gzip')
    else:
        expression = pd.read_table(inFile, sep=',',index_col=0)
    # Convert input file names to output file names
    base = inFile.strip(".mtx")
    out = base + "_top.mtx"
    print("Reading in %s" % inFile)
    if index==1:
        expression = expression.T
    # Get top genes with GetTopGene()
    topgenes = GetTopGene(expression, top_num=100)
    # How many genes are included in this gene list?
    print("%s genes included in topgene list" % (len(topgenes)))
    # Extract matrix with topgenes
    topgene_exp = expression.loc[topgenes]
    print("The shape of the top gene matrix is:", topgene_exp.shape)
    # Save top gene matrix to file
    topgene_exp.T.to_csv(out)
    print("Topgene matrix is saved as %s" % out)
    
In [30]:
    
# Run TopGene() on filtered mtx files.
TopGene('Test_filtered.mtx', index=1)
    
    
In [4]:
    
TopGene('Test_filtered_lr_pos.mtx', index=1)
    
    
In [100]:
    
## Define all functions required
def calcCV(mtx, cv2_thres = 0.3, percentile = 95):
    """
    Calculate coefficient of variations.
    Use only genes with a coefficient variation over cv2_thres, and within percentile in all means.
    Args:
        mtx: dataframe
        cv2_thres: float. Minimum of coefficient of variation required.
        percentile: int in [0,100]. Percentile in mean
    Return:
        mean, cv2, fit_mean, fit_cv2,df
    """
    nrow,ncol = mtx.shape
    ## Get the gene names (index) from the input matrix
    dfindex=mtx.index.values.tolist()
    mean = mtx.mean(axis=1)
    var = mtx.var(axis=1)
    cv2 = var / (mean ** 2)
    minmean = np.percentile(np.asarray(mean[cv2 > cv2_thres]),percentile)
    useforfit = mean > minmean
    print(useforfit.sum()," of genes will be used for fit.")
    fit_mean = mean[useforfit]
    fit_cv2 = cv2[useforfit]
    
    return mean,var, cv2, fit_mean, fit_cv2, ncol-1,dfindex
def fitCVtoModel(mean, fit_mean, fit_cv2):
    """
    Fit coefficient of variation and mean to a model.
    Use fit_mean and fit_cv2 calculated by calcCV()
    """
    # Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
    exog1 = sm.add_constant(np.asarray(1/fit_mean))
    # Fit the dataset to a generalized linear model. Here used the default Gaussian.
    fitresults = sm.GLM(fit_cv2, exog1, family=sm.families.Gaussian()).fit()
    ## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
    a0,a1 = fitresults.params
    print(fitresults.summary())
    # Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
    minX = np.min(np.log(mean[mean>0]))
    maxX = np.max(np.log(mean))
    exog_pred = np.exp(np.linspace(minX,maxX,1000))
    endog_pred = a0 + (a1 / exog_pred)
    return exog_pred, endog_pred, a0, a1
def rankHVG(mean,var, cv2, dfindex,a0, a1):
    """
    Rank genes by variant to mean ratio and return the sorted dataframe
    """
    # Order genes by significance. Then label the top genes on the graph.
    ## Predict CV square for each mean value, then compare the raw with the prediction
    afit = a0 + (a1 / mean)
    varFitRatio = var/(afit*mean ** 2) 
    ## Stack mean, cv2, varFitRation as a ndarray
    ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
    ## Convert ndarray into pandas dataframe, using dfindex as index
    nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
    ## Sort the dataframe by varFitRatio column in descending order
    nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)
    return nddf_sorted
def plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num):  
    # Plot log transformed CV2 versus mean. Label the top genes. 
    fig = plt.figure(figsize=(6, 6))
    plt.plot(np.log(mean),np.log(cv2),"g.")
    plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
    upper_endog = endog_pred * chi2.ppf(0.975,df)/df
    lower_endog = endog_pred * chi2.ppf(0.025,df)/df
    plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
    plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
    top_m = nddf_sorted["Mean"][0:top_num]
    top_cv2 = nddf_sorted["CV2"][0:top_num]
    plt.plot(np.log(top_m),np.log(top_cv2),'o',mec='red',mfc='none',mew=1)
    title = "Variation over average of detected genes. Top " + str(top_num) + " genes marked."
    return
def findHVG(mtx, cv2_thres = 0.3, percentile = 95, top_num = 100):
    """
    Find highly variable genes using coefficient of variations. 
    
    Args:
        mtx: A gene by cell matrix.
        cv2_thres: float. Minimum of coefficient of variation required.
        percentile: int in [0,100]. Percentile in mean
        top_num: int. Number of top variable genes to be labeled and reported.
    """
    mean,var, cv2, fit_mean, fit_cv2, df, dfindex = calcCV(mtx, cv2_thres, percentile)
    exog_pred, endog_pred, a0, a1 = fitCVtoModel(mean, fit_mean, fit_cv2)
    nddf_sorted = rankHVG(mean,var, cv2, dfindex, a0, a1)
    plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num)
    return nddf_sorted
    
In [98]:
    
# Read in filtered cell by gene matrix
inFile = 'Test_filtered.mtx'
mtx = pd.read_table(inFile, sep=',',index_col=0)
# Use gene by cell matrix as input for findHVG(). Transpose input matrix if required.
mtxT = mtx.T
mtxT.head()
    
    Out[98]:
In [99]:
    
top_num = 100
outdf = findHVG(mtxT,cv2_thres=5,top_num=top_num)
    
    
    
In [97]:
    
# Save top highly variable genes to file
# Convert input file name into output file names
base = os.path.basename(inFile).split(sep=".")[0]
outfile = base + '_HVtop_' + str(top_num) + '.mtx'
# Save top gene matrix to file
topgene_mask = outdf.index[0:top_num].values.tolist()
topgene_mtx = mtx[topgene_mask]
topgene_mtx.to_csv(outfile)
    
In [ ]:
    
    
In [7]:
    
# Read in filtered cell by gene matrix
inFile = 'Test.dge.txt.gz'
expression = pd.read_table(inFile, sep='\t',index_col=0,compression='gzip')
print(expression.shape)
expression.head()
    
    
    Out[7]:
In [8]:
    
# Name output files using input file name
base = inFile.strip(".dge.txt.gz")
num_top = 100
outfile = base + '_top_' + str(num_top) + '.mtx'
    
In [9]:
    
# Calculate coefficient of variations.
mean = expression.mean(axis=1)
var = np.var(expression.T)
cv2 = var / (mean ** 2)
    
In [23]:
    
minmeanforfit = np.percentile(np.asarray(mean[cv2 > 0.3]),95)
minmeanforfit
    
    Out[23]:
In [24]:
    
useforfit = mean > minmeanforfit
gene_mean = mean[useforfit]
gene_var = var[useforfit]
gene_cv2 = cv2[useforfit]
    
In [31]:
    
useforfit.sum()
    
    Out[31]:
In [ ]:
    
# Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
exog1 = sm.add_constant(np.asarray(1/gene_mean))
exog1
## Alternatively, use np.stack
### Make an array of ones. This will be used to estimate the intercept.
# ones = np.ones(3185)
# gene1overmean_array = np.asarray(1/gene_mean)
### Combine all ones and the gene_mean column together
# exog1 = np.stack((ones,gene1overmean_array),axis=1)
    
In [67]:
    
# Fit the dataset to a generalized linear model. Here used the default Gaussian.
fitresults = sm.GLM(gene_cv2, exog1, family=sm.families.Gaussian()).fit()
## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
a0,a1 = fitresults.params
print(fitresults.summary())
print(a0,a1)
    
    
In [68]:
    
# Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
minX = np.min(np.log(mean[mean>0]))
maxX = np.max(np.log(mean))
exog_pred = np.exp(np.linspace(minX,maxX,1000))
endog_pred = a0 + (a1 / exog_pred)
    
In [120]:
    
# Plot log transformed CV square over mean. Include the prediction with confident interval
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
## Calculate the intervals
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
    
    Out[120]:
    
In [80]:
    
# Order genes by significance. Then label the top genes on the graph.
## Predict CV square for each mean value, then compare the raw with the prediction
afit = a0 + (a1 / mean)
varFitRatio = var/(afit*mean ** 2)
## Get the gene names (index) from the input matrix
dfindex = expression.index.values.tolist()
## Stack mean, cv2, varFitRation as a ndarray
ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
## Convert ndarray into pandas dataframe, using dfindex as index
nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
## Sort the dataframe by varFitRatio column in descending order
nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)
    
In [130]:
    
# Redo ploting. Label the top 100 genes. 
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
top100m = nddf_sorted["Mean"][0:100]
top100cv2 = nddf_sorted["CV2"][0:100]
plt.plot(np.log(top100m),np.log(top100cv2),'o',mec='red',mfc='none',mew=1)
    
    Out[130]:
    
In [138]:
    
# Save top gene matrix to file
topgene_mask = nddf_sorted.index[0:100].values.tolist()
topgene_mtx = expression.loc[topgene_mask]
topgene_mtx.to_csv(outfile)