In [1]:
%matplotlib inline

In [ ]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42  #embed TrueType fonts in PDF exports
import sklearn
from sklearn import linear_model
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')
sns.set_style('ticks')

In [84]:
def bayes_cov_col(Y,X,cols,lm):
    """
    @Y    = Expression matrix, cells x genes, expecting pandas dataframe
    @X    = Covariate matrix, cells x covariates, expecting pandas dataframe
    @cols = The subset of columns that the EM should be performed over, expecting list
    @lm   = fitted linear model object with a predict method
    """

    #EM iteration: squared error of each cell/gene under the full model
    Yhat=pd.DataFrame(lm.predict(X))
    Yhat.index=Y.index
    Yhat.columns=Y.columns
    SSE_all=np.square(Y.subtract(Yhat))
    X_adjust=X.copy()

    #per-cell diagnostics (collected but not returned)
    df_SSE   = []
    df_logit = []

    for curcov in cols:

        #cells currently assigned to this covariate
        curcells=X[X[curcov]>0].index

        if len(curcells)>2:

            #zero out the covariate and re-predict for the assigned cells
            X_notcur=X.copy()
            X_notcur[curcov]=[0]*len(X_notcur)

            X_sub=X_notcur.loc[curcells]

            Y_sub=Y.loc[curcells]

            #per-gene variance (times two) for scaling; drop zero-variance genes
            GENE_var=2.0*Y_sub.var(axis=0)
            vargenes=GENE_var[GENE_var>0].index

            Yhat_notcur=pd.DataFrame(lm.predict(X_sub))
            Yhat_notcur.index=Y_sub.index
            Yhat_notcur.columns=Y_sub.columns

            #error difference: full model minus covariate-removed model
            SSE_notcur=np.square(Y_sub.subtract(Yhat_notcur))
            SSE=SSE_all.loc[curcells].subtract(SSE_notcur)
            SSE_sum=SSE.sum(axis=1)

            #variance-scaled error difference, squashed through a logistic
            SSE_transform=SSE.div(GENE_var+0.5)[vargenes].sum(axis=1)
            logitify=np.divide(1.0,1.0+np.exp(SSE_transform))

            df_SSE.append(SSE_sum)
            df_logit.append(logitify)

            #replace the hard assignment with the soft probability
            X_adjust.loc[curcells,curcov]=logitify

    return X_adjust
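Reading the update off the code above: for a covariate $k$ and each cell $c$ currently assigned to it, the function compares the squared prediction error with the covariate active against the error with it zeroed out, scales the per-gene differences, and squashes the sum through a logistic. As a restatement of the computation (with $\hat y^{\,\text{full}}$ and $\hat y^{\,\setminus k}$ the two predictions and $\sigma_g^2$ the per-gene variance over the assigned cells):

$$p_c=\frac{1}{1+\exp\!\left(\sum_{g}\frac{\left(y_{cg}-\hat y^{\,\text{full}}_{cg}\right)^2-\left(y_{cg}-\hat y^{\,\setminus k}_{cg}\right)^2}{2\sigma_g^2+0.5}\right)}$$

Genes with zero variance are dropped and the $+0.5$ keeps the denominator away from zero. When zeroing the covariate worsens the fit, the exponent is negative and $p_c\to 1$, so cells that genuinely carry the covariate's signature keep a high assignment, while cells the covariate does not help are pushed toward 0.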

In [86]:
#Parameters for simulating two multivariate normal distributions (two classes of cells)
ncells=1000
ngenes=200
halfcells=int(ncells/2.0)
tenthcells=int(ncells/10.0)
informativegenes=0.1
effectsize=1.0

In [87]:
#Simulate Class 0 cells
y1=pd.DataFrame(np.random.normal(loc=0,scale=1,size=(halfcells,int(informativegenes*ngenes))))

#Simulate Class 1 cells
y2=pd.DataFrame(np.random.normal(loc=effectsize,scale=1,size=(halfcells,int(informativegenes*ngenes))))

#Noisy genes
y3=pd.DataFrame(np.random.normal(loc=0,scale=1,size=(ncells,int(ngenes*(1.0-informativegenes)))))

#Concatenate to form cells x genes expression matrix
Y=pd.concat([y1,y2]).reset_index(drop=True)
Y=pd.concat([Y,y3],axis=1)
Y.columns=range(np.shape(Y)[1])
Y.head()


Out[87]:
          0         1         2  ...       197       198       199
0  1.593226 -0.678334  0.010144  ...  0.998896 -0.002236  1.626706
1 -2.078125  0.355409 -0.197669  ...  0.061377  0.059999 -0.024671
2  0.162252 -0.152456 -0.135844  ...  2.138861 -1.278428  1.462149
3 -1.414552  1.780481 -0.582410  ...  1.808506  0.464646  1.411269
4  0.158225 -1.499085 -1.296374  ... -0.121546 -1.306610  0.038806

5 rows × 200 columns
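As a quick sanity check (a hypothetical cell, not in the original notebook), the group means over the informative genes should sit near 0 for class 0 and near `effectsize` for class 1:

In [ ]:
#hypothetical check: group means of the informative genes
ninform=int(informativegenes*ngenes)
print(Y.iloc[:halfcells,:ninform].values.mean())   #expected near 0.0
print(Y.iloc[halfcells:,:ninform].values.mean())   #expected near effectsize (1.0)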


In [88]:
X=pd.DataFrame()
classvec=[0]*halfcells
classvec.extend([1]*halfcells)
X['class']=classvec

#Corrupt the labels: set the first tenth of cells (all truly class 0) to class 1
X_noise=X.copy()
X_noise.iloc[0:tenthcells]=1
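A hypothetical one-line check (not in the original notebook) confirms the corruption rate:

In [ ]:
#fraction of mislabeled cells; expected to be ~0.10
print((X_noise['class']!=X['class']).mean())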

In [91]:
#Kernel density estimates of the two class distributions across all genes
sns.kdeplot(Y.iloc[:halfcells,:].values.flatten())
sns.kdeplot(Y.iloc[halfcells:,:].values.flatten())
plt.xlabel('Expression Value')
plt.ylabel('Density Estimate')


Out[91]:
[figure: overlaid kernel density estimates of expression for the two classes]

In [92]:
#plot a single gene as a function of class membership
plt.scatter(X['class'],Y[0])
plt.xlabel('class label')
plt.ylabel('Expression')


Out[92]:
[figure: expression of gene 0 vs. class label]

In [93]:
#Fit ridge regression of expression on the class covariate
lm=sklearn.linear_model.Ridge()
lm.fit(X,Y)
#lm.coef_ is genes x covariates, so B[0] holds each gene's class coefficient
B=pd.DataFrame(lm.coef_)

In [94]:
#Plot per-gene coefficients; genes left of the red line were simulated as informative
plt.plot(B[0])
plt.axvline(ngenes*informativegenes,c='red')
plt.axhline(effectsize,c='gray')
plt.xlabel('Gene Index')
plt.ylabel('Coefficient')


Out[94]:
[figure: fitted coefficients by gene index, with the informative/noise boundary and true effect size marked]

In [95]:
#Adjust covariates based on fitted coefficients
X_adjust=bayes_cov_col(Y,X,['class'],lm)

In [97]:
#Plot original and adjusted covariates
plt.plot(X,label='Original')
plt.plot(X_adjust,label='Adjusted')
plt.legend(loc='lower right')
plt.xlabel('Cell Index')
plt.ylabel('Class Probability')


Out[97]:
[figure: original hard class labels vs. adjusted class probabilities by cell index]
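As another hypothetical check (not in the original notebook), the soft assignments for the true class 1 cells should stay close to 1, since zeroing their covariate worsens the fit:

In [ ]:
#mean adjusted probability over the true class 1 cells; expected near 1
print(X_adjust.loc[halfcells:,'class'].mean())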

In [99]:
#Fit regression model on the noisy covariates and adjust them (note in the plot how the mislabeled first tenth of cells is mostly reclassified back toward class 0)
lm_noise=sklearn.linear_model.Ridge()
lm_noise.fit(X_noise,Y)
B_noise=pd.DataFrame(lm_noise.coef_)
X_adjust_noise=bayes_cov_col(Y,X_noise,['class'],lm_noise)
plt.plot(X_noise,label='Original Noisy')
plt.plot(X_adjust_noise,label='Adjusted')
plt.legend(loc='lower right')
plt.xlabel('Cell Index')
plt.ylabel('Class Probability')


Out[99]:
[figure: noisy labels vs. adjusted probabilities; the mislabeled first tenth of cells is pulled back toward class 0]

In [100]:
#Refit the regression using the adjusted (soft) covariates
lm_noise_adjust=sklearn.linear_model.Ridge()
lm_noise_adjust.fit(X_adjust_noise,Y)
B_noise_adjust=pd.DataFrame(lm_noise_adjust.coef_)

In [103]:
#plot coefficients for informative subset of genes
plt.plot(B_noise[0],label='Original')
plt.plot(B_noise_adjust[0],label='Adjusted')
plt.axvline(ngenes*informativegenes,c='red')
plt.axhline(effectsize,c='gray')
plt.legend(loc='upper right')
plt.xlabel('Gene Index')
plt.ylabel('Coefficient')


Out[103]:
[figure: per-gene coefficients from the noisy fit vs. the adjusted fit]
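To quantify the recovery, a hypothetical follow-up cell (not in the original notebook) can compare the mean coefficient over the informative genes before and after adjustment; down-weighting the mislabeled cells should move the estimate back toward `effectsize`:

In [ ]:
#hypothetical comparison of informative-gene coefficients
ninform=int(informativegenes*ngenes)
print('noisy fit:   ',B_noise[0].iloc[:ninform].mean())
print('adjusted fit:',B_noise_adjust[0].iloc[:ninform].mean())
print('true effect: ',effectsize)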