Integrated computational prediction and experimental validation identifies promiscuous T cell epitopes in the proteome of pathogenic mycobacteria. Damien Farrell, Gareth Jones, Chris Pirson, Kerri M Malone, Kevin Rue-Albrecht, Anthony J Chubb, H Martin Vordermeier, Stephen V. Gordon. 2016
In [16]:
import os,sys
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
home = os.path.expanduser('~')
#we still need the older version of the module for repeating the original work
sys.path.append(os.path.join(home,'python/sandbox/mhcpredict'))
import Base, Genome, Tepitope, EpitopePipeline
from matplotlib_venn import venn2,venn3
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',80)
import seaborn as sns
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','axes.grid': False,'legend.frameon':True, 'legend.fontsize':12})
sns.set_context("notebook", font_scale=1.4)
plt.rcParams['savefig.dpi']=150
from IPython.display import display, HTML
#newer module version we use for remainder of analysis
from epitopepredict import base, analysis, sequtils, utilities
In [17]:
genome = sequtils.genbank2Dataframe('MTB-H37Rv.gb', cds=True)
mbovis = sequtils.genbank2Dataframe('Mbovis.gb', cds=True)
In [18]:
#srm data
srm = pd.read_csv('srm_mtb.csv')
srm = genome.merge(srm[['locus_tag','concentration']],on='locus_tag',how='inner')
srm = srm[srm.concentration>0]
print len(srm)
#srm = srm[srm.length<=400]
#print len(srm)
In [9]:
n=3
minsize=2
mhc1alleles = ['BoLA-N:00101','BoLA-N:00201','BoLA-N:00301','BoLA-N:00401',
'BoLA-N:00501','BoLA-N:00601','BoLA-N:00801','BoLA-N:00901',
'BoLA-N:01001']
otherbinders = {}
otherbinders['iedbmhc1_9'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_9_3.csv')
otherbinders['iedbmhc1_11'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_11_3.csv')
otherbinders['tepitope'] = pd.read_csv('binders_MTB-H37Rv_tepitope_bovine_3.csv')
otherbinders['netmhciipan'] = pd.read_csv('binders_MTB-H37Rv_netmhciipan_bovine_3.csv')
mapping = pd.read_csv('mbovis_mapping_new.csv',usecols=[1,2,3,4])#,index_col=0)
#show filtered numbers (done inside the pipeline)
for n in otherbinders:
d=otherbinders[n]
dsrm = d.merge(srm[['locus_tag','length']],left_on='name',right_on='locus_tag')
print n, len(d), len(dsrm), len(dsrm[dsrm.length>400])
In [4]:
def responsesbreakdown(res, key1='#pos',key2='method',label=''):
sns.set_context("notebook", font_scale=1.4)
s = res.groupby([key1,key2]).agg({'id':np.size})
s = s.unstack(key2)
s.columns = s.columns.get_level_values(1)
perc = s.sum(1)
s=s.sort(axis=1,ascending=False)
#print s
f,ax=plt.subplots(1,2,figsize=(10,5))
grid=ax.flat
perc.plot(kind='pie',ax=grid[0],cmap='Greens',figsize=(9.5,5),startangle=0)
grid[0].set_title('%s (all methods)' %label,fontsize=18)
grid[0].set_ylabel('')
s.plot(kind='barh', grid=False,ax=grid[1])
grid[1].set_title('%s per method' %label,fontsize=20)
grid[1].set_ylabel('')
sns.despine()
plt.tight_layout()
t=s.T
t=t.fillna(0)
#print t
#print t[t.columns[t.columns>=1]].sum(1)
print t.sum()
#en = t[t.columns[t.columns>=1]].sum(1)/t.sum(1)
return t
def hydroplot(res, key):
res.boxplot('hydro',by=key,vert=False,grid=False)
plt.title('peptide hydrophobicity by response')
plt.suptitle('')
plt.xlabel('hydrophobicity'); plt.ylabel(key)
plt.tight_layout()
plt.savefig('responses_hydrophobicity.png')
In [5]:
EpitopePipeline=reload(EpitopePipeline)
plt.rcParams['font.size']=20
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})
#reload peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
#whole blood
wb = pd.read_csv('vla_wholeblood_results.csv')
wbcols = ['8740','8742','8743','8744']
tags = wb.tag.apply( lambda x: pd.Series(x.split('_')))
wb['mbname'] = tags[0]
wb['id'] = tags[1].astype(int)
#merge them
wb = plist.merge(wb,left_index=True,right_on='id',how='right')
wb.ix[wb.method.isnull(), 'method'] = '_pos ctrls'
#wb['both'] = wb.peptide.isin(ovlps.peptide_y)
ctrls = wb[wb.method=='_pos ctrls']
print ctrls[['tag','#pos']]
s=responsesbreakdown(wb, label='no. positives')
plt.savefig('responses_bymethod_wb.png')
s['%pos'] = s[s.columns[-4:]].sum(1)/s.sum(1)*100
s
Out[5]:
In [6]:
def responsesheatmap(res, cols, labels=None):
"""heatmap of responses by method"""
l = res['strategy'].nunique()
fig,axs = plt.subplots(l,1,figsize=(6,6))
grid=axs.flatten()
i=0
for n,g in res.groupby('strategy'):
g['mean'] = g[cols].mean(1)
x = g.sort('mean')[cols]
x = x.loc[(x>0).any(1)]
ax=grid[i]
hm = ax.pcolor(x.T,cmap='Blues')
ax.set_yticks(np.arange(0.5, len(x.columns)))
if labels == None:
names = cols
else:
names = labels
ax.set_yticklabels(names, minor=False, fontsize=10)
#ax.set_yticks(np.arange(0.5, len(x.index)))
ax.set_xticks([])
ax.set_xticklabels('')
ax.set_xlim(0, len(x.index)-1)
ax.set_title(n)
i+=1
#cb=fig.colorbar(hm, ax=axs[0],orientation='horizontal', aspect=5)
#plt.setp(cb.ax.get_xticklabels(), rotation='vertical', fontsize=10)
plt.tight_layout()
fig.subplots_adjust(hspace=0.4)
In [7]:
x = wb.copy()
x = x.replace({'-': 0, 'positive': 1})
methlabels = {'cl_tepitopepan': 'binder clusters set 1', 'cl_netmhciipan':'binder clusters set 2',
'topsharedbinders': 'top scoring binders'} # 'random binders': 'C'}
x['strategy'] = x.method.replace(methlabels)
responsesheatmap(x, cols=wbcols)
plt.savefig('wb_responses_heatmap.png')
In [8]:
#pbmc data already has wb positives column in it
pbmc = pd.read_csv('vla_pbmc_results.csv')
pcols = ['7010','7447','7448','7855','7853','7852','7449']
x = pbmc[pcols]
#derive positive rates for each peptide using gareth method of nil mean+3SD and 0.1 cutoff
#nil wells for cutoffs
nil = pd.read_csv('vla_pbmc_nil.csv')
pcut = nil.mean()+nil.std()*3+0.1
pcut2 = nil.mean()+0.1
#pbmcpos = ((x>=pcut) & (x>=pcut2)).astype(int)
#pbmcpos = ((x>=pcut+0.1)).astype(int)
#pbmcpos['method'] = res.method
#responsesheatmap(pbmcpos, cols=pcols)
pbmc['#pbmcpos'] = x[(x>=pcut)].count(1)
#pbmc.plot('#garethpos','#pbmcpos',kind='scatter', figsize=(3,3))
#pbmc['mean'] = pbmc[pcols].mean(1)
#pbmc['#wbpos'] = pbmc['#wbpos']/4*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos']/7*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos'].astype(int)
pbmc = pbmc.sort_values('#pbmcpos')
#get the mean fraction of both positive counts
#pbmc['score'] = (pbmc['#pbmcpos']+pbmc['#wbpos'])/2.0
#pbmc['score'] = pbmc.score.round(0)
#just get the total fraction of positives
pbmc['score'] = ((pbmc['#pbmcpos']+pbmc['#wbpos'])/11*100).round(0)
tags = pbmc.tag.apply( lambda x: pd.Series(x.split('_')))
#pbmc['mbname'] = tags[0]
pbmc['id'] = tags[1].astype(int)
#merge with peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
pbmc = plist.merge(pbmc,left_index=True,right_on='id',how='right')
pbmc.ix[pbmc.method.isnull(), 'method'] = '_pos ctrls'
print pbmc.groupby('method').agg({'id':np.size})
#pbmc = pbmc[pbmc.method!='_pos ctrls']
#responsesbreakdown(r, key='#pbmcpos', label='% positives')
ctrls = pbmc[pbmc.method=='_pos ctrls']
print ctrls[['tag','#pbmcpos','#wbpos','score']]
print ctrls.groupby('#pbmcpos').agg({'id':np.size}).T
print
print 'mean score of positive controls:', ctrls.score.mean()
In [9]:
#pbmc data plot
sns.set_context("notebook", font_scale=1.2)
c=['#pbmcpos']+pcols
#subtract nil mean
x = pbmc[pbmc['#pbmcpos']>=1]
x = x[x.method!='_pos ctrls']
x[pcols] = x[pcols]-nil.mean()
#x['numpos'] = (x['#pbmcpos']*7/100+1).astype(int)
x['numpos'] = x['#pbmcpos']
x['mean'] = x[pcols].mean(1)
t = pd.melt(x,id_vars=['numpos','method'],value_vars=pcols,var_name='animal',value_name='OD')
fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="numpos", y="mean", data=x,
alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax) #jitter=True
ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="numpos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
fliersize=0 ,color='lightblue')
ax.set_ylim(-0.2,1.4)
ax.set_xlabel('responding animals')
plt.title('nil subtracted OD PBMC')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_pbmc.png',dpi=300)
In [10]:
#whole blood data plot
#whole blood values already nil subtracted
wb = pd.read_csv('vla_wholeblood_results.csv')
wb = wb.sort_values('#pos')
cols = ['8740_val','8742_val','8743_val','8744_val']
wb['mean'] = wb[cols].mean(1)
x = wb[wb['#pos']>=1]
x = x[-x.tag.isin(ctrls.tag)]
x['#pos'] = wb['#pos']
t = pd.melt(x,id_vars='#pos',value_vars=cols,var_name='animal',value_name='OD')
fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="#pos", y="mean", data=x, #hue='animal', palette='Set1', jitter=True,
alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax)
ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="#pos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
fliersize=1 ,color='lightgreen')
plt.title('nil subtracted OD Whole Blood')
ax.set_xlabel('responding animals')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_wb.png',dpi=300)
In [11]:
pbmcbl = pd.read_csv('vla_pbmc_baseline.csv')
x = pbmcbl[pcols]
#nil wells for cutoffs
nil = pd.read_csv('vla_pbmc_nil_baseline.csv')
blcut = nil.mean()+nil.std()*3+0.1
blcut2 = nil.mean()+0.1
pbmcbl['#pbmcpos'] = x[(x>=blcut)].count(1)
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos']/7*100
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos'].astype(int)
pbmcbl['score'] = (pbmcbl['#pbmcpos']/7*100).round(0)
pbmcbl.rename(columns={'method':'predictor'})
neglist = pd.read_csv('negatives_list.csv',index_col=0)
neglist = neglist.drop(['score'],axis=1)
pbmcbl = neglist.merge(pbmcbl,on='id')
print len(pbmcbl)
pbmcbl['method'] = 'random binders'
In [12]:
final = pd.concat([pbmc,pbmcbl])
#calculate final confidence levels by binning score based on positive controls
bins = [0,26,100]
final['confidence'] = pd.cut(final.score, bins=bins, precision=0, include_lowest=True,
labels=['low','high'])
#create column label combining cluster methods
methlabels = {'cl_tepitopepan': 'Binder Clusters', 'cl_netmhciipan':'Binder Clusters',
'topsharedbinders': 'Top Scoring Binders', 'random binders': 'Random Binders (Control)'}
final['strategy'] = final.method.replace(methlabels)
final = final.drop(['tbname','translation','product','name','nearest','allele','1-log50k(aff)','core','order',
'plate_no','#garethpos','rank','pos','source','length'],axis=1)
#print final[:2]
import csv
final.to_csv('final_results_all.csv', quoting=csv.QUOTE_NONNUMERIC)
#print final.ix[0]
print
print final.groupby('method').size()
print final.groupby(['method','confidence']).size().T
#print
#s2 = responsesbreakdown(final, key='#pbmcpos', label='% pbmc positives')
#plt.savefig('responses_bymethod_pbmc.png')
#s2.cumsum().sum()
#s2['%pos'] = s2[s2.columns[-4:]].sum(1)/s2.sum(1)*100
#s2 = s.merge(s2,left_index=1,right_index=1,how='right').fillna(0)
#display(s2)
In [ ]:
#get boolean arrays for heatmap by subtracting cuts
xfinal = (final[pcols]>=pcut).astype(int)
xfinal['method'] = final.method
xbl = (pbmcbl[pcols]>=blcut).astype(int)
xbl['method'] = 'baseline'
xctrls = (ctrls[pcols]>=pcut).astype(int)
xctrls['method'] = '_pos ctrls'
xcomb = pd.concat([xfinal,xbl,xctrls])
responsesheatmap(xcomb, cols=pcols)
plt.savefig('pbmc_responses_heatmap.png')
In [13]:
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','positive controls')
final = final.sort('#wbpos')
g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='swarm',size=3,marker='x',#jitter=True,
aspect=1.5,alpha=.6,edgecolor='black')
ax=g.axes.flatten()[0]
#g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='box',size=3,aspect=1.5,ax=ax)
ax.set_xlabel('whole blood responder freq (%)')
ax.set_ylabel('PBMC responder freq')
Out[13]:
In [14]:
sns.set_context("notebook", font_scale=1.1)
#final.hist('score',by='method',stacked=True,figsize=(5,3),sharex=True,bins=range(0,100,5))
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','Positive Controls')
#custom sort
names = {'Binder Clusters':0, 'Top Scoring Binders':1,'Positive Controls':2,'Random Binders (Control)':3}
final['order'] = final.strategy.map(names)
final=final.sort_index(by='order')
final=final.drop('order',1)
x = final[-final.method.isin(['Positive Controls'])]
#print x[x.method=='random binders'][['confidence','score']]
sf = x.groupby(['confidence','strategy']).agg({'id':np.size})
sf = sf.unstack('strategy')
sf.columns = sf.columns.get_level_values(1)
perc = sf.sum(1)
sf = sf.T.fillna(0)
'''sf['order'] = sf.index.to_series().map(names)
sf=sf.sort_index(by='order')
sf=sf.drop('order',1)'''
sf['success'] = sf['high']/sf.sum(1)*100
#hydroplot(final, key='confidence')
display(sf)
#from statsmodels.stats.weightstats import ztest
g=sns.factorplot(y='strategy',x='score',data=final,kind='swarm',s=6,#size=4,
aspect=1.7,alpha=.9,edgecolor='black',linewidth=.5)
ax=g.axes.flatten()[0]
ax.set_xlabel('responder frequency')
#ax.set_xticks(x.score.unique())
#for i in ax.get_xticklabels():
# i.set(rotation=90)
ax.axvline(26,0,100,linewidth=1, color='red',linestyle='--')
#g.fig.text(.7, 1.0, 'positive', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
#g.fig.text(.4, 1.0, 'negative', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
ax.set_xlim((-5,100))
sns.despine()
plt.subplots_adjust(top=0.8)
from pandas.tools.plotting import table
t=table(ax, sf.astype(int),loc='lower right',colWidths=[0.08, 0.08, 0.1])
#t.auto_set_font_size(False)
#t.set_fontsize(5.8)
#plt.tight_layout()
plt.savefig('responder_freqs_stripplot.png',bbox_inches='tight',dpi=300)
In [15]:
final = pd.read_csv('final_results_all.csv')
#top peptides (high confidence) and their protein info
cols = ['locus_tag','mbname','peptide','method','strategy','score','confidence','start']
top = final[final.confidence=='positive']
top = top[top.method != '_pos ctrls']
neg = final[final.confidence=='negative']
#overlap of our final sets with known antigens
EpitopePipeline=reload(EpitopePipeline)
antigens = EpitopePipeline.getAntigenDatasets()
antigens['source'] = antigens.source.apply(lambda r: ';'.join(r))
top = antigens.merge(top[cols],on='locus_tag',how='right')
annot = EpitopePipeline.combineAnnotationData()
#kruh = pd.read_csv('Kruh_proteome_30d.csv')
#print top[top.locus_tag.isin(kruh.locus_tag)]
#add annotations
top = top.merge(annot[['locus_tag','gene','length','product']],on='locus_tag',how='left')
top = top.sort_values(['score','locus_tag'],ascending=[False,True]).fillna('')
top = top.reset_index(drop=True)
top.to_csv('top_peptides.csv',index=False)
print '%s peptides in positive category' %len(top)
print 'total positive rate: %s' %str(len(top)/270.0)
print
#print top.columns
topc = top.groupby('locus_tag').agg({'peptide':np.size,
'product':base.first}).sort_values('peptide',ascending=False)
topc = topc.rename(columns={'peptide':'positives'})
#print topc[:5]
tsp = final.groupby('locus_tag').agg({'strategy':base.first,'R':np.size})
tsp = tsp.rename(columns={'R':'prots'})
#lt=['Rv0655']
#lt=['Rv3676','Rv3584']
#lt='Rv0757'
lt=['Rv3584']
display(final[final.locus_tag.isin(lt)][cols].sort_values(['locus_tag','start']))
print 'proteins with most positives:'
x = tsp.merge(topc,left_index=True,right_index=True)
x = x[x.positives>1].sort_values(['positives','strategy'],ascending=False)
display(x)
#print x.groupby('prots').agg({'positives':np.sum})
#x['frac'] = x.peptide/x.prots
#x.plot('prots','frac',kind='scatter',figsize=(4,4))
print 'peptides with highest scores:'
display( top[top.score>=60][cols])
In [40]:
#cg = EpitopePipeline.getCategories(genome,plot=False)
cg = EpitopePipeline.getCategories(srm,plot=False)
cneg = EpitopePipeline.getCategories(neg,plot=False)
cpos = EpitopePipeline.getCategories(top,plot=False)
print len(neg), cneg.sum()
cats = pd.concat([cg,cpos,cneg],axis=1)
cats = cats/cats.sum()
cats.columns=(['genome (filtered)','negative','positive'])
ax=cats.plot(kind='barh',cmap='winter',alpha=0.8,width=.7,grid=False,figsize=(9,6))
plt.xlabel('fraction of total')
ax.set_title('host protein MTB category representation')
plt.tight_layout()
plt.savefig('vlaresults_cats.png')
In [17]:
#srm concentrations of positives?
c = top.merge(srm, on='locus_tag', how='left')
f,ax=plt.subplots(1,1,figsize=(6,3))
c.hist('concentration',bins=30,ax=ax,grid=False)
#ax.set_yscale('log')
#print c[c['concentration_y']>10][['locus_tag','concentration_y','score','product_y']]
Out[17]:
In [18]:
#clusters correlation with positives and no. binders..?
f=pd.read_csv('final_results_all.csv')
print f.columns
f.plot('score','length',kind='scatter',figsize=(3,3))
#tsb correlation with binders in each epitope R?..
Out[18]:
In [78]:
f=pd.read_csv('final_results_all.csv')
#f['confidence'] = f.confidence.map({'negative':0,'low':1,'medium':2,'high':3})
f['confidence'] = pd.Categorical(f.confidence)
sns.set_context("notebook", font_scale=1.0)
#f = f[f.method.isin(['topsharedbinders'])]
#f['mhc1overlaps'] = f.filter(regex="iedbmhc1_").sum(axis=1)
f['overlap'] = (f.mhc1overlaps>=1).astype(int)
xx = f[-f.mhc1overlaps.isnull()]
cols=['peptide','method','iedbmhc1_9_overlap','iedbmhc1_11_overlap','overlap']
#print xx[cols][22:34]
print xx.groupby(['overlap','method']).size()
gg = xx.groupby(['confidence']).agg({'overlap': [np.sum,np.size]})
gg.columns = gg.columns.get_level_values(1)
gg['frac'] = gg['sum']/gg['size']
#print gg
#gg.plot(y='overlap',kind='barh',by='overlap',figsize=(3,3))
gg.plot(y='frac',kind='barh',figsize=(3,3))
#xx.plot('mhc1overlaps','confidence',kind='scatter',figsize=(2,2))
gg = xx.groupby(['confidence']).agg({'mhc1overlaps': [np.mean,np.std,np.size]})
gg.columns = gg.columns.get_level_values(1)
print gg
gg.plot(y='mean',xerr='std',kind='barh',figsize=(3,3))
plt.title('mhc overlaps')
sns.despine()
#xx.hist('mhc1overlaps',by='confidence')
references:
In [24]:
#bola alleles plots
bd = pd.read_csv('bola_alleles_dietz.csv')
bd=bd.set_index('allele').sort('freq',ascending=False)
bd['name'] = 'dietz'
bo = pd.read_csv('bola_alleles_oprzadek.csv')
bo = bo.groupby('allele_short').agg({'freq':np.max})
bo=bo.sort('freq',ascending=False)
bo['name'] = 'oprzadek'
bx = pd.read_csv('bola_alleles_baxter.csv')
bx = bx.groupby('allele_short').agg({'freq':np.max})
bx=bx.sort('freq',ascending=False)
bx['name'] = 'baxter'
x=pd.concat([bd,bo,bx]).reset_index('allele')
x = x.pivot('index','name','freq')
x=x[x>0.005].dropna(how='all')
table = x[x>0.02].reindex_axis(x.mean(1).order(ascending=False).index).dropna(how='all').fillna('-')
sns.set_context("notebook", font_scale=1.3)
x.plot(y=['baxter','dietz','oprzadek'],kind='bar',figsize=(10,5),cmap='YlOrRd',grid=False,width=0.8)
plt.title('BoLA DRB3.2 allele frequencies')
plt.xlabel('allele')
plt.ylabel('frequency')
plt.legend(labels=['Holstein-Charolais','Holstein USA','Polish Holstein-Freisian'],loc=9)
sns.despine()
plt.tight_layout()
plt.savefig('bola_allelefreqs.png')
targetbola = x.index
table
Out[24]:
In [29]:
reload(Tepitope)
d=Tepitope.datadir
boladrb = os.path.join(d,'IPD_MHC/bola.drb3.fa')
hladrb = os.path.join(d,'IPD_MHC/hla.drb345.fa')
ref = os.path.join(d,'IPD_MHC/hlaref.fa')
aln = Tepitope.drbaln
alnindex = dict([(a.id,a) for a in aln])
Tepitope.compareRef(hladrb,boladrb,ref,alnindex)