notebook.community

Edit and run



In [2]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')
sns.set_style('ticks')
import scipy
from scipy import sparse, io
import h5py
import re
import time
import sklearn
import sys
import statsmodels
import statsmodels.api as sm
import sklearn.ensemble









    



/Users/atray/anaconda/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)
/Users/atray/anaconda/lib/python2.7/site-packages/matplotlib/__init__.py:1350: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)



In [17]:

    
outputpath='CHANGE THIS TO OUTPUT PATH'
path2data='CHANGE THIS TO DATA PATH'
#threshold on maximum tolerable false positive rate for random forest filter
fpr_threshold=0.02



In [4]:

    
#read, label, and concatenate human and mouse data
Th=pd.read_csv(path2data+'SpeciesMix_HundredSTAMP_bq10_star_HUMAN_auto_molBC.txt.gz',sep='\t')
Tm=pd.read_csv(path2data+'SpeciesMix_HundredSTAMP_bq10_star_MOUSE_auto_molBC.txt.gz',sep='\t')
Th['human']=[1.0]*len(Th)
Th['mouse']=[0.0]*len(Th)
Tm['human']=[0.0]*len(Tm)
Tm['mouse']=[1.0]*len(Tm)
T=pd.concat([Th,Tm])
T['bcumi']=[x+'-'+y for x,y in zip(T['Cell Barcode'],T['Molecular_Barcode'])]



In [5]:

    
#TPT calculation

BCUMI_group=T.groupby('bcumi').sum()
BCUMI_group['gpt']=BCUMI_group['human']+BCUMI_group['mouse']
BCUMI_group=pd.DataFrame(BCUMI_group[['Num_Obs','gpt']])
BCUMI_group.columns=['total_reads','gpt']
T_tot=T.copy()
T_tot.index=T_tot['bcumi']
T_tot=T_tot.join(BCUMI_group)
T_tot['TPT']=1.0*(np.divide(1.0*T_tot['Num_Obs'],T_tot['total_reads']))
TTPT_filt=T_tot.copy()


# Identify human and mouse cells (higher than 95% purity)

Tmeans=TTPT_filt.groupby('Cell Barcode').mean()
Tmeans['mouse_lab']=Tmeans['mouse']>0.95
Tmeans['human_lab']=Tmeans['human']>0.95
Tmeansf=Tmeans[['mouse_lab','human_lab']]


TTPT_filt.index=TTPT_filt['Cell Barcode']

Tfmerge=TTPT_filt.join(Tmeansf)

gmeans=Tfmerge.groupby('Gene').sum()



In [6]:

    
#TPM normalize gene means within species
gmeans.ix[gmeans['human']>0,'Num_Obs']=np.divide(1e6*gmeans.ix[gmeans['human']>0,'Num_Obs'],np.sum(gmeans.ix[gmeans['human']>0,'Num_Obs']))
gmeans.ix[gmeans['mouse']>0,'Num_Obs']=np.divide(1e6*gmeans.ix[gmeans['mouse']>0,'Num_Obs'],np.sum(gmeans.ix[gmeans['mouse']>0,'Num_Obs']))
gmeans=pd.DataFrame(np.log2(gmeans['Num_Obs']+1))
gmeans.columns=['gsums']



In [7]:

    
Tfmerge.index=Tfmerge['Gene']

Tfmerge=Tfmerge.join(gmeans)

Tfbc_group=Tfmerge.groupby('Cell Barcode').sum()

hbcsums=np.sum(Tfbc_group.ix[Tfbc_group['human_lab']>0,'Num_Obs'])
mbcsums=np.sum(Tfbc_group.ix[Tfbc_group['mouse_lab']>0,'Num_Obs'])

#TPM normalize cell barcode abundances within species
Tfbc_group.ix[Tfbc_group['human_lab']>0,'Num_Obs']=np.divide(1e6*Tfbc_group.ix[Tfbc_group['human_lab']>0,'Num_Obs'],hbcsums)
Tfbc_group.ix[Tfbc_group['mouse_lab']>0,'Num_Obs']=np.divide(1e6*Tfbc_group.ix[Tfbc_group['mouse_lab']>0,'Num_Obs'],mbcsums)

bcsums=pd.DataFrame(Tfbc_group['Num_Obs'])
bcsums.columns=['bcsums']

Tfmerge.index=Tfmerge['Cell Barcode']
Tfmerge=Tfmerge.join(bcsums)

Tfbc_group['class']=(1.0*(Tfbc_group['mouse_lab']>0))-(1.0*(Tfbc_group['human_lab']>0))



In [8]:

    
plt.scatter(Tfbc_group['human'],Tfbc_group['mouse'],c=1.0*np.array(Tfbc_group['human_lab']>0)-1.0*np.array(Tfbc_group['mouse_lab']>0),cmap='bwr')
plt.xlabel('Human UMs')
plt.ylabel('Mouse UMs')









    Out[8]:





<matplotlib.text.Text at 0x10cda3390>



In [9]:

    
#remove cross-species doublets
notdoublet=np.logical_or(Tfmerge['mouse_lab'],Tfmerge['human_lab'])
Tfmerge=Tfmerge[notdoublet]

#label mismatch molecules as ones that are genes from the opposite species
Tfmerge['mismatch']=np.logical_or(np.logical_and(Tfmerge['human']>0,Tfmerge['mouse_lab']),np.logical_and(Tfmerge['mouse']>0,Tfmerge['human_lab']))



In [10]:

    
Tfmerge_h=Tfmerge[Tfmerge['human']>0]
Tfmerge_m=Tfmerge[Tfmerge['mouse']>0]



In [11]:

    
np.sum(np.logical_and(Tfmerge['mismatch'],Tfmerge['human_lab']))









    Out[11]:





32014



In [12]:

    
np.sum(np.logical_and(Tfmerge['mismatch'],Tfmerge['mouse_lab']))









    Out[12]:





27230



In [13]:

    
np.sum(np.logical_not(Tfmerge['mismatch']))









    Out[13]:





3394611



In [63]:

    
#Calculate chimera probabilities based on observed abundances of human mouse cells and genes
hgsums=Tfmerge['human_lab'].sum()
mgsums=Tfmerge['mouse_lab'].sum()
hg=np.divide(1.0*hgsums,hgsums+mgsums)
hb=np.divide(1.0*hbcsums,hbcsums+mbcsums)
chimera_probs=pd.DataFrame([[hg*hb,hg*(1-hb)],[hb*(1-hg),(1-hb)*(1-hg)]])
chimera_probs.columns=['Human Cell','Mouse Cell']
chimera_probs.index=['Human Gene','Mouse Gene']
chimera_probs









    Out[63]:






  
    
      
      Human Cell
      Mouse Cell
    
  
  
    
      Human Gene
      0.628846
      0.148047
    
    
      Mouse Gene
      0.180591
      0.042516



In [18]:

    
#gene level summary statistics
hprop=pd.read_csv(path2data+'/hg38geneprops.csv',index_col=0)
hprop['Gene']=hprop.index
mprop=pd.read_csv(path2data+'/mm10geneprops.csv',index_col=0)
mprop['Gene']=mprop.index



In [19]:

    
#add species level gene properties
Tfmerge_h=Tfmerge_h.join(hprop,on='Gene',rsuffix='_2')
Tfmerge_h=Tfmerge_h.dropna()
Tfmerge_m=Tfmerge_m.join(mprop,on='Gene',rsuffix='_2')
Tfmerge_m=Tfmerge_m.dropna()
Tfmerge_f=pd.concat([Tfmerge_h,Tfmerge_m])



In [20]:

    
Tfmerge_f['log(Reads)']=np.log2(Tfmerge_f['Num_Obs'])



In [21]:

    
def gc_calc(seq):
    seqlen=len(seq)
    gc=np.divide(1.0*(seq.count('G')+seq.count('C')),seqlen)
    return gc

#add gc content feature
Tfmerge_f['gccbc']=[gc_calc(x) for x in Tfmerge_f['Cell Barcode']]
Tfmerge_f['gcumi']=[gc_calc(x) for x in Tfmerge_f['Molecular_Barcode']]



In [41]:

    
#extract final sample labels and features

y=Tfmerge_f['mismatch']
X=Tfmerge_f[['log(Reads)','TPT','gccbc','gcumi','gsums','bcsums','Avg Length','Avg GC','Avg KPK']]



In [42]:

    
Xlog=X.copy()
Xlog['Avg Length']=np.log2(Xlog['Avg Length'])
Xlog['bcsums']=np.log2(Xlog['bcsums'])



In [43]:

    
rf=sklearn.ensemble.RandomForestClassifier(n_estimators=300,oob_score=True,n_jobs=-1,class_weight='auto',verbose=1)



In [44]:

    
#fit random forest
rf.fit(Xlog,y)









    



[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.9min finished






    Out[44]:





RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=True, random_state=None, verbose=1, warm_start=False)



In [45]:

    
probs=[(x[0]-x[1]) for x in rf.oob_decision_function_]



In [46]:

    
fprrf, tprrf, thresholds = sklearn.metrics.roc_curve(y,-np.array(probs))



In [50]:

    
plt.plot(fprrf,tprrf,label='Random Forest')
plt.plot([0,1],[0,1],c='black')
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.axvline(0.04,c='gray')



In [48]:

    
aucrf=sklearn.metrics.auc(fprrf,tprrf)
print('AUC is '+str(aucrf))
probthresh=thresholds[fprrf<fpr_threshold][-1]
prob_logical=-np.array(probs)<probthresh
print('Filtered '+str(np.round(100.0*(1.0-np.mean(prob_logical)),2))+'% of molecules')









    



AUC is 0.933610558166
Filtered 3.05% of molecules



In [49]:

    
df_rf=pd.DataFrame(rf.feature_importances_)
df_rf.index=X.columns
df_rf.sort_values(0,ascending=False)



In [ ]:

    
T_F=Tfmerge_f[prob_logical]
T_F['unique']=np.array([1.0]*len(T_F))
T_F['bcgene']=[(x,y) for x,y in zip(T_F['Cell Barcode'],T_F['Gene'])]
T_FF = T_F.groupby('bcgene').sum()



In [54]:

    
def str2index(strlist):
    reduced=pd.DataFrame(list(set(np.sort(strlist))))
    reduced=reduced.reset_index()
    reduced.index=reduced[0]
    dftmp=pd.DataFrame(strlist,index=strlist)
    dftmp=dftmp.merge(reduced,how='left')['index']
    return np.array(dftmp),list(reduced[0])



In [55]:

    
tfrow=[x[1] for x in T_FF.index]
tfrow,gnames=str2index(tfrow)
tfcol=[x[0] for x in T_FF.index]
tfcol,cnames=str2index(tfcol)
tfdata=np.array(T_FF['unique'])
tmpcol=pd.DataFrame(np.unique(tfcol))
tmpcol['unind']=range(len(tmpcol))

dftfcol=pd.DataFrame(tfcol)
dftfcol=dftfcol.merge(tmpcol,on=0)
tfcol=np.array(dftfcol['unind'])
EXPR_MAT=scipy.sparse.csr_matrix((tfdata,(tfrow,tfcol)),shape=(np.max(tfrow)+1,np.max(tfcol)+1)).toarray()
EXPR_MAT=pd.DataFrame(EXPR_MAT)
EXPR_MAT.index=gnames
EXPR_MAT.columns=cnames



In [56]:

    
#WARNING this matrix contains both human and mouse genes, but you can filter as needed
EXPR_MAT.head()









    Out[56]:






  
    
      
      ATCTTCGGATCA
      TACAATTAAGGC
      TTGCCTTACGCG
      TCTGGATGCGCA
      GGATACATGGTA
      CACTCTGCAAGG
      GAAGTTGGCGGA
      AGACTTCCCTCG
      AAGTAAGCAAGC
      TGGCTATACCAA
      ...
      CGCCCAGTACCG
      GAGACTGCCCCA
      GAATGGCATGCT
      CCTAGACGTAGT
      GACACTTCATCG
      CGTGATGACCCT
      AACCTATGGGCC
      CCTTGTACTTTT
      TGAATGTTTGAC
      TCTCACAATGGG
    
  
  
    
      Plekhg2
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      Plekhg3
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      Plekhg4
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      Plekhg5
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      Nsa2
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      3.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      2.0
      4.0
    
  

5 rows × 76 columns



In [ ]:

    
EXPR_MAT.to_csv(outputpath+'/expr_mat_Yorimasafiltered.txt',sep='\t')

	0
log(Reads)	0.214529
gsums	0.147509
Avg Length	0.136966
Avg GC	0.133134
bcsums	0.123919
TPT	0.109936
Avg KPK	0.062631
gccbc	0.035955
gcumi	0.035422

	CACTCTGCAAGG	...	GAGACTGCCCCA	TGAATGTTTGAC	TCTCACAATGGG
Plekhg2	1.0	...	0.0	0.0	1.0
Plekhg3	0.0	...	2.0	0.0	0.0
Plekhg4	0.0	...	0.0	0.0	0.0
Plekhg5	0.0	...	0.0	0.0	0.0
Nsa2	1.0	...	3.0	2.0	4.0