In [1]:
%matplotlib inline

In [ ]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42  #embed TrueType fonts in PDF exports
import sklearn
from sklearn import linear_model
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')
sns.set_style('ticks')

In [84]:
def bayes_cov_col(Y,X,cols,lm):
    """
    @Y    = Expression matrix, cells x genes, expecting pandas dataframe
    @X    = Covariate matrix, cells x covariates, expecting pandas dataframe
    @cols = The subset of columns that the EM should be performed over, expecting list
    @lm   = fitted linear model object with a predict method
    """

    #EM iteration: squared error of each cell/gene under the full model
    Yhat=pd.DataFrame(lm.predict(X))
    Yhat.index=Y.index
    Yhat.columns=Y.columns
    SSE_all=np.square(Y.subtract(Yhat))
    X_adjust=X.copy()

    #per-cell diagnostics (collected but not returned)
    df_SSE   = []
    df_logit = []

    for curcov in cols:

        #cells currently assigned to this covariate
        curcells=X[X[curcov]>0].index

        if len(curcells)>2:

            #zero out the covariate and re-predict for the assigned cells
            X_notcur=X.copy()
            X_notcur[curcov]=[0]*len(X_notcur)

            X_sub=X_notcur.loc[curcells]

            Y_sub=Y.loc[curcells]

            #per-gene variance (times two) for scaling; drop zero-variance genes
            GENE_var=2.0*Y_sub.var(axis=0)
            vargenes=GENE_var[GENE_var>0].index

            Yhat_notcur=pd.DataFrame(lm.predict(X_sub))
            Yhat_notcur.index=Y_sub.index
            Yhat_notcur.columns=Y_sub.columns

            #error difference: full model minus covariate-removed model
            SSE_notcur=np.square(Y_sub.subtract(Yhat_notcur))
            SSE=SSE_all.loc[curcells].subtract(SSE_notcur)
            SSE_sum=SSE.sum(axis=1)

            #variance-scaled error difference, squashed through a logistic
            SSE_transform=SSE.div(GENE_var+0.5)[vargenes].sum(axis=1)
            logitify=np.divide(1.0,1.0+np.exp(SSE_transform))

            df_SSE.append(SSE_sum)
            df_logit.append(logitify)

            #replace the hard assignment with the soft probability
            X_adjust.loc[curcells,curcov]=logitify

    return X_adjust
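Reading the update off the code above: for a covariate $k$ and each cell $c$ currently assigned to it, the function compares the squared prediction error with the covariate active against the error with it zeroed out, scales the per-gene differences, and squashes the sum through a logistic. As a restatement of the computation (with $\hat y^{\,\text{full}}$ and $\hat y^{\,\setminus k}$ the two predictions and $\sigma_g^2$ the per-gene variance over the assigned cells):

$$p_c=\frac{1}{1+\exp\!\left(\sum_{g}\frac{\left(y_{cg}-\hat y^{\,\text{full}}_{cg}\right)^2-\left(y_{cg}-\hat y^{\,\setminus k}_{cg}\right)^2}{2\sigma_g^2+0.5}\right)}$$

Genes with zero variance are dropped and the $+0.5$ keeps the denominator away from zero. When zeroing the covariate worsens the fit, the exponent is negative and $p_c\to 1$, so cells that genuinely carry the covariate's signature keep a high assignment, while cells the covariate does not help are pushed toward 0.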

In [86]:
#Parameters for simulating two multivariate normal distributions (two classes of cells)
ncells=1000
ngenes=200
halfcells=int(ncells/2.0)
tenthcells=int(ncells/10.0)
informativegenes=0.1
effectsize=1.0

In [87]:
#Simulate Class 0 cells
y1=pd.DataFrame(np.random.normal(loc=0,scale=1,size=(halfcells,int(informativegenes*ngenes))))

#Simulate Class 1 cells
y2=pd.DataFrame(np.random.normal(loc=effectsize,scale=1,size=(halfcells,int(informativegenes*ngenes))))

#Noisy genes
y3=pd.DataFrame(np.random.normal(loc=0,scale=1,size=(ncells,int(ngenes*(1.0-informativegenes)))))

#Concatenate to form cells x genes expression matrix
Y=pd.concat([y1,y2]).reset_index(drop=True)
Y=pd.concat([Y,y3],axis=1)
Y.columns=range(np.shape(Y)[1])
Y.head()


Out[87]:
          0         1         2  ...       197       198       199
0  1.593226 -0.678334  0.010144  ...  0.998896 -0.002236  1.626706
1 -2.078125  0.355409 -0.197669  ...  0.061377  0.059999 -0.024671
2  0.162252 -0.152456 -0.135844  ...  2.138861 -1.278428  1.462149
3 -1.414552  1.780481 -0.582410  ...  1.808506  0.464646  1.411269
4  0.158225 -1.499085 -1.296374  ... -0.121546 -1.306610  0.038806

5 rows × 200 columns
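As a quick sanity check (a hypothetical cell, not in the original notebook), the group means over the informative genes should sit near 0 for class 0 and near `effectsize` for class 1:

In [ ]:
#hypothetical check: group means of the informative genes
ninform=int(informativegenes*ngenes)
print(Y.iloc[:halfcells,:ninform].values.mean())   #expected near 0.0
print(Y.iloc[halfcells:,:ninform].values.mean())   #expected near effectsize (1.0)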


In [88]:
X=pd.DataFrame()
classvec=[0]*halfcells
classvec.extend([1]*halfcells)
X['class']=classvec

#Corrupt the labels: set the first tenth of cells (all truly class 0) to class 1
X_noise=X.copy()
X_noise.iloc[0:tenthcells]=1
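A hypothetical one-line check (not in the original notebook) confirms the corruption rate:

In [ ]:
#fraction of mislabeled cells; expected to be ~0.10
print((X_noise['class']!=X['class']).mean())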

In [91]:
#Kernel density estimates of the two class distributions across all genes
sns.kdeplot(Y.iloc[:halfcells,:].values.flatten())
sns.kdeplot(Y.iloc[halfcells:,:].values.flatten())
plt.xlabel('Expression Value')
plt.ylabel('Density Estimate')


Out[91]:
[figure: overlaid kernel density estimates of expression for the two classes]

In [92]:
#plot a single gene as a function of class membership
plt.scatter(X['class'],Y[0])
plt.xlabel('class label')
plt.ylabel('Expression')


Out[92]:
[figure: expression of gene 0 vs. class label]

In [93]:
#Fit ridge regression of expression on the class covariate
lm=sklearn.linear_model.Ridge()
lm.fit(X,Y)
#lm.coef_ is genes x covariates, so B[0] holds each gene's class coefficient
B=pd.DataFrame(lm.coef_)

In [94]:
#Plot per-gene coefficients; genes left of the red line were simulated as informative
plt.plot(B[0])
plt.axvline(ngenes*informativegenes,c='red')
plt.axhline(effectsize,c='gray')
plt.xlabel('Gene Index')
plt.ylabel('Coefficient')


Out[94]:
[figure: fitted coefficients by gene index, with the informative/noise boundary and true effect size marked]

In [95]:
#Adjust covariates based on fitted coefficients
X_adjust=bayes_cov_col(Y,X,['class'],lm)

In [97]:
#Plot original and adjusted covariates
plt.plot(X,label='Original')
plt.plot(X_adjust,label='Adjusted')
plt.legend(loc='lower right')
plt.xlabel('Cell Index')
plt.ylabel('Class Probability')


Out[97]:
[figure: original hard class labels vs. adjusted class probabilities by cell index]
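As another hypothetical check (not in the original notebook), the soft assignments for the true class 1 cells should stay close to 1, since zeroing their covariate worsens the fit:

In [ ]:
#mean adjusted probability over the true class 1 cells; expected near 1
print(X_adjust.loc[halfcells:,'class'].mean())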

In [99]:
#Fit regression model on the noisy covariates and adjust them (note in the plot how the mislabeled first tenth of cells is mostly reclassified back toward class 0)
lm_noise=sklearn.linear_model.Ridge()
lm_noise.fit(X_noise,Y)
B_noise=pd.DataFrame(lm_noise.coef_)
X_adjust_noise=bayes_cov_col(Y,X_noise,['class'],lm_noise)
plt.plot(X_noise,label='Original Noisy')
plt.plot(X_adjust_noise,label='Adjusted')
plt.legend(loc='lower right')
plt.xlabel('Cell Index')
plt.ylabel('Class Probability')


Out[99]:
[figure: noisy labels vs. adjusted probabilities; the mislabeled first tenth of cells is pulled back toward class 0]

In [100]:
#Refit the regression using the adjusted (soft) covariates
lm_noise_adjust=sklearn.linear_model.Ridge()
lm_noise_adjust.fit(X_adjust_noise,Y)
B_noise_adjust=pd.DataFrame(lm_noise_adjust.coef_)

In [103]:
#plot coefficients for informative subset of genes
plt.plot(B_noise[0],label='Original')
plt.plot(B_noise_adjust[0],label='Adjusted')
plt.axvline(ngenes*informativegenes,c='red')
plt.axhline(effectsize,c='gray')
plt.legend(loc='upper right')
plt.xlabel('Gene Index')
plt.ylabel('Coefficient')


Out[103]:
[figure: per-gene coefficients from the noisy fit vs. the adjusted fit]
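To quantify the recovery, a hypothetical follow-up cell (not in the original notebook) can compare the mean coefficient over the informative genes before and after adjustment; down-weighting the mislabeled cells should move the estimate back toward `effectsize`:

In [ ]:
#hypothetical comparison of informative-gene coefficients
ninform=int(informativegenes*ngenes)
print('noisy fit:   ',B_noise[0].iloc[:ninform].mean())
print('adjusted fit:',B_noise_adjust[0].iloc[:ninform].mean())
print('true effect: ',effectsize)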