Obtain subsets of genes of interests


In [33]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# Statistic model package for Generalized Linear Models
import statsmodels.api as sm
# Stats package for chi-square distribution
from scipy.stats import chi2
# Import operating system module
import os
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!

I. Top expressed genes


In [2]:
# Define function GetTopGene() to Rank gene by expression level and get the list of top expressed gene 
def GetTopGene(mtx, top_num = 200):
    """
    Sort expression level of all genes in a cell, and pick the top expressed genes for each cell. 
    Make the union list of the top expressed genes.
    Usage: gene_list = GetTopGene(mtx, top_num)
    Args:
        mtx: Gene by cell count matrix
        top_num: Number of top expressed genes to be included in the top list. Default = 200
    """
    tglist = []
    for name in mtx.columns:
        mtx_sorted = mtx.sort_values(by=name, ascending=False)
        tglist.extend(list(mtx_sorted.index[0:int(top_num)]))
    uniqtopgenes = list(set(tglist))
    return uniqtopgenes

In [3]:
# Define a function to take care the overall process from reading in files 
# to saving the matrix with only top expressed genes.
def TopGene(inFile, compression=False, index=0, top_num = 200):
    """
    Reading in filtered matrix, run GetTopGene() on the matrix, and save the matrix 
    with only the top expressed genes to a new file.
    Usage: TopGene(inFile)
    Args:
        inFile: Absolute path of the file containing a filtered cell-by-gene or gene-by-cell matrix
        compression: boolean. If True, file will be decompressed when reading in.
        index: 0 or 1. Use 0 when the indexes are genes, 1 when it is not.
        top_num: Number of top expressed genes to be included in the top list. Default = 200
    """
    # Read in the DGE data sheet
    if compression==True:
        expression = pd.read_table(inFile, sep=',',index_col=0,compression='gzip')
    else:
        expression = pd.read_table(inFile, sep=',',index_col=0)
    # Convert input file names to output file names
    base = inFile.strip(".mtx")
    out = base + "_top.mtx"
    print("Reading in %s" % inFile)
    if index==1:
        expression = expression.T
    # Get top genes with GetTopGene()
    topgenes = GetTopGene(expression, top_num=100)
    # How many genes are included in this gene list?
    print("%s genes included in topgene list" % (len(topgenes)))
    # Extract matrix with topgenes
    topgene_exp = expression.loc[topgenes]
    print("The shape of the top gene matrix is:", topgene_exp.shape)
    # Save top gene matrix to file
    topgene_exp.T.to_csv(out)
    print("Topgene matrix is saved as %s" % out)

In [30]:
# Run TopGene() on filtered mtx files.
TopGene('Test_filtered.mtx', index=1)


Reading in Test_filtered.mtx
1456 genes included in topgene list
The shape of the top gene matrix is: (1456, 135)
Topgene matrix is saved as Test_filtered_top.mtx

In [4]:
TopGene('Test_filtered_lr_pos.mtx', index=1)


Reading in Test_filtered_lr_pos.mtx
260 genes included in topgene list
The shape of the top gene matrix is: (260, 135)
Topgene matrix is saved as Test_filtered_lr_pos_top.mtx

II. Highly variable genes

  • Referring to R codes for finding highly variable genes
    • Calculate CV^2 and plot it against means. (Or, plot log transformed version of the two)
    • Fit a regression line, then rank genes by the significance of deviation from the fit

In [100]:
## Define all functions required
def calcCV(mtx, cv2_thres = 0.3, percentile = 95):
    """
    Calculate coefficient of variations.
    Use only genes with a coefficient variation over cv2_thres, and within percentile in all means.
    Args:
        mtx: dataframe
        cv2_thres: float. Minimum of coefficient of variation required.
        percentile: int in [0,100]. Percentile in mean
    Return:
        mean, cv2, fit_mean, fit_cv2,df
    """
    nrow,ncol = mtx.shape
    ## Get the gene names (index) from the input matrix
    dfindex=mtx.index.values.tolist()
    mean = mtx.mean(axis=1)
    var = mtx.var(axis=1)
    cv2 = var / (mean ** 2)
    minmean = np.percentile(np.asarray(mean[cv2 > cv2_thres]),percentile)
    useforfit = mean > minmean
    print(useforfit.sum()," of genes will be used for fit.")
    fit_mean = mean[useforfit]
    fit_cv2 = cv2[useforfit]
    
    return mean,var, cv2, fit_mean, fit_cv2, ncol-1,dfindex


def fitCVtoModel(mean, fit_mean, fit_cv2):
    """
    Fit coefficient of variation and mean to a model.
    Use fit_mean and fit_cv2 calculated by calcCV()
    """
    # Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
    exog1 = sm.add_constant(np.asarray(1/fit_mean))


    # Fit the dataset to a generalized linear model. Here used the default Gaussian.
    fitresults = sm.GLM(fit_cv2, exog1, family=sm.families.Gaussian()).fit()
    ## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
    a0,a1 = fitresults.params
    print(fitresults.summary())
    # Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
    minX = np.min(np.log(mean[mean>0]))
    maxX = np.max(np.log(mean))

    exog_pred = np.exp(np.linspace(minX,maxX,1000))
    endog_pred = a0 + (a1 / exog_pred)
    return exog_pred, endog_pred, a0, a1

def rankHVG(mean,var, cv2, dfindex,a0, a1):
    """
    Rank genes by variant to mean ratio and return the sorted dataframe
    """
    # Order genes by significance. Then label the top genes on the graph.
    ## Predict CV square for each mean value, then compare the raw with the prediction
    afit = a0 + (a1 / mean)
    varFitRatio = var/(afit*mean ** 2) 
    ## Stack mean, cv2, varFitRation as a ndarray
    ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
    ## Convert ndarray into pandas dataframe, using dfindex as index
    nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
    ## Sort the dataframe by varFitRatio column in descending order
    nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)
    return nddf_sorted

def plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num):  
    # Plot log transformed CV2 versus mean. Label the top genes. 
    fig = plt.figure(figsize=(6, 6))
    plt.plot(np.log(mean),np.log(cv2),"g.")
    plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
    upper_endog = endog_pred * chi2.ppf(0.975,df)/df
    lower_endog = endog_pred * chi2.ppf(0.025,df)/df
    plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
    plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
    top_m = nddf_sorted["Mean"][0:top_num]
    top_cv2 = nddf_sorted["CV2"][0:top_num]
    plt.plot(np.log(top_m),np.log(top_cv2),'o',mec='red',mfc='none',mew=1)
    title = "Variation over average of detected genes. Top " + str(top_num) + " genes marked."
    return

def findHVG(mtx, cv2_thres = 0.3, percentile = 95, top_num = 100):
    """
    Find highly variable genes using coefficient of variations. 
    
    Args:
        mtx: A gene by cell matrix.
        cv2_thres: float. Minimum of coefficient of variation required.
        percentile: int in [0,100]. Percentile in mean
        top_num: int. Number of top variable genes to be labeled and reported.
    """
    mean,var, cv2, fit_mean, fit_cv2, df, dfindex = calcCV(mtx, cv2_thres, percentile)
    exog_pred, endog_pred, a0, a1 = fitCVtoModel(mean, fit_mean, fit_cv2)
    nddf_sorted = rankHVG(mean,var, cv2, dfindex, a0, a1)
    plotHVG(nddf_sorted, mean, cv2, endog_pred, exog_pred,df, top_num)
    return nddf_sorted

In [98]:
# Read in filtered cell by gene matrix
inFile = 'Test_filtered.mtx'
mtx = pd.read_table(inFile, sep=',',index_col=0)
# Use gene by cell matrix as input for findHVG(). Transpose input matrix if required.
mtxT = mtx.T
mtxT.head()


Out[98]:
ATCTTGGCGGCG ATAGAGTCGCAT GCGCACGTTGAG TTCACGGCTGAG GCGAGACACAGC GTTGGACGAGCG TGACTTCGCGAA CGTTCCGCATAG GGTCTGGACACC GCAGAGTGTCCG ... CCGGTCAATAGC CAGAAGCTGGTC CTCTTTGTTATC AATAGGCGTAGA CCCTTCGGCTGG CGAGTTAAGTGG CTACATATTGCC TGTTCTTGTTAT TCCAACAGTTAC GCAGTCCAGGGA
ACBD3 1 1 0 0 0 0 1 0 0 2 ... 0 0 0 0 0 0 0 0 0 0
AL928854.1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
AL954715.2 0 0 0 1 0 0 0 3 0 0 ... 0 0 0 0 0 0 0 0 0 0
ARHGEF12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BICD2 0 0 1 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 1 0

5 rows × 135 columns


In [99]:
top_num = 100
outdf = findHVG(mtxT,cv2_thres=5,top_num=top_num)


844  of genes will be used for fit.
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      y   No. Observations:                  844
Model:                            GLM   Df Residuals:                      842
Model Family:                Gaussian   Df Model:                            1
Link Function:               identity   Scale:                   14.9524951803
Method:                          IRLS   Log-Likelihood:                -2338.0
Date:                Thu, 03 Aug 2017   Deviance:                       12590.
Time:                        16:06:46   Pearson chi2:                 1.26e+04
No. Iterations:                     2                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5945      0.271      5.878      0.000       1.063       2.126
x1             1.3185      0.170      7.778      0.000       0.986       1.651
==============================================================================

In [97]:
# Save top highly variable genes to file
# Convert input file name into output file names
base = os.path.basename(inFile).split(sep=".")[0]
outfile = base + '_HVtop_' + str(top_num) + '.mtx'
# Save top gene matrix to file
topgene_mask = outdf.index[0:top_num].values.tolist()
topgene_mtx = mtx[topgene_mask]
topgene_mtx.to_csv(outfile)

In [ ]:

Run findHVG() on raw data?


In [7]:
# Read in filtered cell by gene matrix
inFile = 'Test.dge.txt.gz'
expression = pd.read_table(inFile, sep='\t',index_col=0,compression='gzip')
print(expression.shape)
expression.head()


(13785, 500)
Out[7]:
GCCAAATGCACG AGCGGTGGCAGC TGAACCCCAGCT ATCTTGGCGGCG GTTCCTACAACA ATAGAGTCGCAT GCGCACGTTGAG TTCACGGCTGAG GCGAGACACAGC CGAGTCGGTAGC ... GAGATCTTTTAC CCATGCCCATAC GTAAGCAATTTG AGAATCGAGCCT GTAGATCATCTC TAATGAAGTGGA CTCGGGATATGT GCTAAGGTTGTT AAGTTCAAGGGG GGTCATGGTTCG
GENE
ACBD3 0 0 2 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
ACSF3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ACTR2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ADAMTS14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ADAMTS7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 500 columns


In [8]:
# Name output files using input file name
base = inFile.strip(".dge.txt.gz")
num_top = 100
outfile = base + '_top_' + str(num_top) + '.mtx'

In [9]:
# Calculate coefficient of variations.
mean = expression.mean(axis=1)
var = np.var(expression.T)
cv2 = var / (mean ** 2)

In [23]:
minmeanforfit = np.percentile(np.asarray(mean[cv2 > 0.3]),95)
minmeanforfit


Out[23]:
0.20799999999999996

In [24]:
useforfit = mean > minmeanforfit
gene_mean = mean[useforfit]
gene_var = var[useforfit]
gene_cv2 = cv2[useforfit]

In [31]:
useforfit.sum()


Out[31]:
698

In [ ]:
# Estimate model by using y ~ 1/x. Use add_constant() in sm, so that the dataset will have a constant column
exog1 = sm.add_constant(np.asarray(1/gene_mean))
exog1
## Alternatively, use np.stack
### Make an array of ones. This will be used to estimate the intercept.
# ones = np.ones(3185)
# gene1overmean_array = np.asarray(1/gene_mean)
### Combine all ones and the gene_mean column together
# exog1 = np.stack((ones,gene1overmean_array),axis=1)

In [67]:
# Fit the dataset to a generalized linear model. Here used the default Gaussian.
fitresults = sm.GLM(gene_cv2, exog1, family=sm.families.Gaussian()).fit()
## Store parameters from the fit model to a0 (for intercept) and a1 (for the exogenous variable)
a0,a1 = fitresults.params
print(fitresults.summary())
print(a0,a1)


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      y   No. Observations:                  689
Model:                            GLM   Df Residuals:                      687
Model Family:                Gaussian   Df Model:                            1
Link Function:               identity   Scale:                   127.566322944
Method:                          IRLS   Log-Likelihood:                -2647.0
Date:                Fri, 28 Jul 2017   Deviance:                       87638.
Time:                        15:59:03   Pearson chi2:                 8.76e+04
No. Iterations:                     2                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.0920      0.885      3.495      0.000       1.358       4.826
x1             1.7807      0.318      5.594      0.000       1.157       2.405
==============================================================================
3.09199929632 1.78067148659

In [68]:
# Find the min and max in the raw data set. And use the calculated regression model to predict CV square. [For y ~ 1/x]
minX = np.min(np.log(mean[mean>0]))
maxX = np.max(np.log(mean))

exog_pred = np.exp(np.linspace(minX,maxX,1000))
endog_pred = a0 + (a1 / exog_pred)

In [120]:
# Plot log transformed CV square over mean. Include the prediction with confident interval
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
## Calculate the intervals
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')


Out[120]:
[<matplotlib.lines.Line2D at 0x11fb8dc18>]

In [80]:
# Order genes by significance. Then label the top genes on the graph.
## Predict CV square for each mean value, then compare the raw with the prediction
afit = a0 + (a1 / mean)
varFitRatio = var/(afit*mean ** 2)
## Get the gene names (index) from the input matrix
dfindex = expression.index.values.tolist()
## Stack mean, cv2, varFitRation as a ndarray
ndvar = np.stack((mean,cv2,varFitRatio),axis=1)
## Convert ndarray into pandas dataframe, using dfindex as index
nddf = pd.DataFrame(ndvar,index=dfindex,columns=["Mean","CV2","varFitRatio"])
## Sort the dataframe by varFitRatio column in descending order
nddf_sorted = nddf.sort_values(by="varFitRatio",ascending=False)

In [130]:
# Redo ploting. Label the top 100 genes. 
fig = plt.figure(figsize=(10, 10))
plt.plot(np.log(mean),np.log(cv2),"g.")
plt.plot(np.log(exog_pred),np.log(endog_pred),'k-')
df = ncol - 1
upper_endog = endog_pred * chi2.ppf(0.975,df)/df
lower_endog = endog_pred * chi2.ppf(0.025,df)/df
plt.plot(np.log(exog_pred),np.log(upper_endog),'k--')
plt.plot(np.log(exog_pred),np.log(lower_endog),'k--')
top100m = nddf_sorted["Mean"][0:100]
top100cv2 = nddf_sorted["CV2"][0:100]
plt.plot(np.log(top100m),np.log(top100cv2),'o',mec='red',mfc='none',mew=1)


Out[130]:
[<matplotlib.lines.Line2D at 0x1208d5908>]

In [138]:
# Save top gene matrix to file
topgene_mask = nddf_sorted.index[0:100].values.tolist()
topgene_mtx = expression.loc[topgene_mask]
topgene_mtx.to_csv(outfile)