This notebook contains the analysis code for the paper: Integrated computational prediction and experimental validation identifies promiscuous T cell epitopes in the proteome of pathogenic mycobacteria. Damien Farrell, Gareth Jones, Chris Pirson, Kerri M. Malone, Kevin Rue-Albrecht, Anthony J. Chubb, H. Martin Vordermeier, Stephen V. Gordon. 2016.
In [16]:
import os,sys
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
home = os.path.expanduser('~')
#the older version of the module is still needed to reproduce the original work
sys.path.append(os.path.join(home,'python/sandbox/mhcpredict'))
import Base, Genome, Tepitope, EpitopePipeline
from matplotlib_venn import venn2,venn3
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',80)
import seaborn as sns
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','axes.grid': False,'legend.frameon':True, 'legend.fontsize':12})
sns.set_context("notebook", font_scale=1.4)
plt.rcParams['savefig.dpi']=150
from IPython.display import display, HTML
#the newer module version is used for the remainder of the analysis
from epitopepredict import base, analysis, sequtils, utilities
In [17]:
genome = sequtils.genbank2Dataframe('MTB-H37Rv.gb', cds=True)
mbovis = sequtils.genbank2Dataframe('Mbovis.gb', cds=True)
In [18]:
#SRM proteomics data: MTB protein concentrations
srm = pd.read_csv('srm_mtb.csv')
srm = genome.merge(srm[['locus_tag','concentration']],on='locus_tag',how='inner')
srm = srm[srm.concentration>0]
print len(srm)
#srm = srm[srm.length<=400]
#print len(srm)
In [9]:
n=3
minsize=2
mhc1alleles = ['BoLA-N:00101','BoLA-N:00201','BoLA-N:00301','BoLA-N:00401',
'BoLA-N:00501','BoLA-N:00601','BoLA-N:00801','BoLA-N:00901',
'BoLA-N:01001']
otherbinders = {}
otherbinders['iedbmhc1_9'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_9_3.csv')
otherbinders['iedbmhc1_11'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_11_3.csv')
otherbinders['tepitope'] = pd.read_csv('binders_MTB-H37Rv_tepitope_bovine_3.csv')
otherbinders['netmhciipan'] = pd.read_csv('binders_MTB-H37Rv_netmhciipan_bovine_3.csv')
mapping = pd.read_csv('mbovis_mapping_new.csv',usecols=[1,2,3,4])#,index_col=0)
#show the filtered binder numbers (filtering is done inside the pipeline)
for n in otherbinders:
    d=otherbinders[n]
    dsrm = d.merge(srm[['locus_tag','length']],left_on='name',right_on='locus_tag')
    print n, len(d), len(dsrm), len(dsrm[dsrm.length>400])
In [4]:
def responsesbreakdown(res, key1='#pos',key2='method',label=''):
    """Summarise and plot response counts grouped by no. of positive animals (key1)
    and prediction method (key2); returns the counts table (methods x no. positives)."""
    sns.set_context("notebook", font_scale=1.4)
    s = res.groupby([key1,key2]).agg({'id':np.size})
    s = s.unstack(key2)
    s.columns = s.columns.get_level_values(1)
    perc = s.sum(1)
    s=s.sort(axis=1,ascending=False)
    #print s
    f,ax=plt.subplots(1,2,figsize=(10,5))
    grid=ax.flat
    perc.plot(kind='pie',ax=grid[0],cmap='Greens',figsize=(9.5,5),startangle=0)
    grid[0].set_title('%s (all methods)' %label,fontsize=18)
    grid[0].set_ylabel('')
    s.plot(kind='barh', grid=False,ax=grid[1])
    grid[1].set_title('%s per method' %label,fontsize=20)
    grid[1].set_ylabel('')
    sns.despine()
    plt.tight_layout()
    t=s.T
    t=t.fillna(0)
    #print t
    #print t[t.columns[t.columns>=1]].sum(1)
    print t.sum()
    #en = t[t.columns[t.columns>=1]].sum(1)/t.sum(1)
    return t

def hydroplot(res, key):
    """Boxplot of peptide hydrophobicity ('hydro' column) grouped by the response column `key`."""
    res.boxplot('hydro',by=key,vert=False,grid=False)
    plt.title('peptide hydrophobicity by response')
    plt.suptitle('')
    plt.xlabel('hydrophobicity'); plt.ylabel(key)
    plt.tight_layout()
    plt.savefig('responses_hydrophobicity.png')
In [5]:
EpitopePipeline=reload(EpitopePipeline)
plt.rcParams['font.size']=20
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})
#reload peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
#whole blood
wb = pd.read_csv('vla_wholeblood_results.csv')
wbcols = ['8740','8742','8743','8744']
tags = wb.tag.apply( lambda x: pd.Series(x.split('_')))
wb['mbname'] = tags[0]
wb['id'] = tags[1].astype(int)
#merge them
wb = plist.merge(wb,left_index=True,right_on='id',how='right')
wb.ix[wb.method.isnull(), 'method'] = '_pos ctrls'
#wb['both'] = wb.peptide.isin(ovlps.peptide_y)
ctrls = wb[wb.method=='_pos ctrls']
print ctrls[['tag','#pos']]
s=responsesbreakdown(wb, label='no. positives')
plt.savefig('responses_bymethod_wb.png')
#%pos = percentage of peptides with at least one responding animal (last 4 columns are #pos 1-4)
s['%pos'] = s[s.columns[-4:]].sum(1)/s.sum(1)*100
s
Out[5]:
In [6]:
def responsesheatmap(res, cols, labels=None):
    """heatmap of responses by method"""
    l = res['strategy'].nunique()
    fig,axs = plt.subplots(l,1,figsize=(6,6))
    grid=axs.flatten()
    i=0
    for n,g in res.groupby('strategy'):
        g['mean'] = g[cols].mean(1)
        x = g.sort('mean')[cols]
        x = x.loc[(x>0).any(1)]
        ax=grid[i]
        hm = ax.pcolor(x.T,cmap='Blues')
        ax.set_yticks(np.arange(0.5, len(x.columns)))
        if labels is None:
            names = cols
        else:
            names = labels
        ax.set_yticklabels(names, minor=False, fontsize=10)
        #ax.set_yticks(np.arange(0.5, len(x.index)))
        ax.set_xticks([])
        ax.set_xticklabels('')
        ax.set_xlim(0, len(x.index)-1)
        ax.set_title(n)
        i+=1
    #cb=fig.colorbar(hm, ax=axs[0],orientation='horizontal', aspect=5)
    #plt.setp(cb.ax.get_xticklabels(), rotation='vertical', fontsize=10)
    plt.tight_layout()
    fig.subplots_adjust(hspace=0.4)
In [7]:
x = wb.copy()
x = x.replace({'-': 0, 'positive': 1})
methlabels = {'cl_tepitopepan': 'binder clusters set 1', 'cl_netmhciipan':'binder clusters set 2',
'topsharedbinders': 'top scoring binders'} # 'random binders': 'C'}
x['strategy'] = x.method.replace(methlabels)
responsesheatmap(x, cols=wbcols)
plt.savefig('wb_responses_heatmap.png')
In [8]:
#the PBMC data already contains the whole blood positives column (#wbpos)
pbmc = pd.read_csv('vla_pbmc_results.csv')
pcols = ['7010','7447','7448','7855','7853','7852','7449']
x = pbmc[pcols]
#derive positive calls for each peptide using the cutoff of nil mean + 3*SD + 0.1 (method of G. Jones)
#nil (unstimulated) wells are used to set the per-animal cutoffs
nil = pd.read_csv('vla_pbmc_nil.csv')
pcut = nil.mean()+nil.std()*3+0.1
pcut2 = nil.mean()+0.1
#pbmcpos = ((x>=pcut) & (x>=pcut2)).astype(int)
#pbmcpos = ((x>=pcut+0.1)).astype(int)
#pbmcpos['method'] = res.method
#responsesheatmap(pbmcpos, cols=pcols)
pbmc['#pbmcpos'] = x[(x>=pcut)].count(1)
#pbmc.plot('#garethpos','#pbmcpos',kind='scatter', figsize=(3,3))
#pbmc['mean'] = pbmc[pcols].mean(1)
#pbmc['#wbpos'] = pbmc['#wbpos']/4*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos']/7*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos'].astype(int)
pbmc = pbmc.sort_values('#pbmcpos')
#get the mean fraction of both positive counts
#pbmc['score'] = (pbmc['#pbmcpos']+pbmc['#wbpos'])/2.0
#pbmc['score'] = pbmc.score.round(0)
#score = percentage of positive responses across all 11 animals (7 PBMC + 4 whole blood)
pbmc['score'] = ((pbmc['#pbmcpos']+pbmc['#wbpos'])/11*100).round(0)
tags = pbmc.tag.apply( lambda x: pd.Series(x.split('_')))
#pbmc['mbname'] = tags[0]
pbmc['id'] = tags[1].astype(int)
#merge with peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
pbmc = plist.merge(pbmc,left_index=True,right_on='id',how='right')
pbmc.ix[pbmc.method.isnull(), 'method'] = '_pos ctrls'
print pbmc.groupby('method').agg({'id':np.size})
#pbmc = pbmc[pbmc.method!='_pos ctrls']
#responsesbreakdown(r, key='#pbmcpos', label='% positives')
ctrls = pbmc[pbmc.method=='_pos ctrls']
print ctrls[['tag','#pbmcpos','#wbpos','score']]
print ctrls.groupby('#pbmcpos').agg({'id':np.size}).T
print
print 'mean score of positive controls:', ctrls.score.mean()
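To illustrate the positivity rule used above (a minimal sketch with made-up OD values and hypothetical animal columns A1-A3, not the assay data): an animal is called positive for a peptide when its OD reaches that animal's nil mean + 3*SD + 0.1 cutoff, and counting such calls per row gives the responder count used for #pbmcpos.
toy = pd.DataFrame({'A1':[0.05,0.90],'A2':[0.40,0.02],'A3':[0.10,0.60]}, index=['pep1','pep2'])
toynil = pd.DataFrame({'A1':[0.02,0.04],'A2':[0.03,0.05],'A3':[0.02,0.03]})
toycut = toynil.mean()+toynil.std()*3+0.1   #per-animal cutoff from the nil wells
print (toy>=toycut).sum(1)   #responding animals per peptide: pep1 -> 1, pep2 -> 2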
In [9]:
#pbmc data plot
sns.set_context("notebook", font_scale=1.2)
c=['#pbmcpos']+pcols
#subtract nil mean
x = pbmc[pbmc['#pbmcpos']>=1]
x = x[x.method!='_pos ctrls']
x[pcols] = x[pcols]-nil.mean()
#x['numpos'] = (x['#pbmcpos']*7/100+1).astype(int)
x['numpos'] = x['#pbmcpos']
x['mean'] = x[pcols].mean(1)
t = pd.melt(x,id_vars=['numpos','method'],value_vars=pcols,var_name='animal',value_name='OD')
fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="numpos", y="mean", data=x,
alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax) #jitter=True
ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="numpos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
fliersize=0 ,color='lightblue')
ax.set_ylim(-0.2,1.4)
ax.set_xlabel('responding animals')
plt.title('nil subtracted OD PBMC')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_pbmc.png',dpi=300)
In [10]:
#whole blood data plot
#whole blood values already nil subtracted
wb = pd.read_csv('vla_wholeblood_results.csv')
wb = wb.sort_values('#pos')
cols = ['8740_val','8742_val','8743_val','8744_val']
wb['mean'] = wb[cols].mean(1)
x = wb[wb['#pos']>=1]
x = x[-x.tag.isin(ctrls.tag)]
x['#pos'] = wb['#pos']
t = pd.melt(x,id_vars='#pos',value_vars=cols,var_name='animal',value_name='OD')
fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="#pos", y="mean", data=x, #hue='animal', palette='Set1', jitter=True,
alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax)
ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="#pos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
fliersize=1 ,color='lightgreen')
plt.title('nil subtracted OD Whole Blood')
ax.set_xlabel('responding animals')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_wb.png',dpi=300)
In [11]:
pbmcbl = pd.read_csv('vla_pbmc_baseline.csv')
x = pbmcbl[pcols]
#nil wells for cutoffs
nil = pd.read_csv('vla_pbmc_nil_baseline.csv')
blcut = nil.mean()+nil.std()*3+0.1
blcut2 = nil.mean()+0.1
pbmcbl['#pbmcpos'] = x[(x>=blcut)].count(1)
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos']/7*100
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos'].astype(int)
pbmcbl['score'] = (pbmcbl['#pbmcpos']/7*100).round(0)
pbmcbl = pbmcbl.rename(columns={'method':'predictor'})
neglist = pd.read_csv('negatives_list.csv',index_col=0)
neglist = neglist.drop(['score'],axis=1)
pbmcbl = neglist.merge(pbmcbl,on='id')
print len(pbmcbl)
pbmcbl['method'] = 'random binders'
In [12]:
final = pd.concat([pbmc,pbmcbl])
#calculate final confidence levels by binning score based on positive controls
bins = [0,26,100]
final['confidence'] = pd.cut(final.score, bins=bins, precision=0, include_lowest=True,
labels=['low','high'])
#create column label combining cluster methods
methlabels = {'cl_tepitopepan': 'Binder Clusters', 'cl_netmhciipan':'Binder Clusters',
'topsharedbinders': 'Top Scoring Binders', 'random binders': 'Random Binders (Control)'}
final['strategy'] = final.method.replace(methlabels)
final = final.drop(['tbname','translation','product','name','nearest','allele','1-log50k(aff)','core','order',
'plate_no','#garethpos','rank','pos','source','length'],axis=1)
#print final[:2]
import csv
final.to_csv('final_results_all.csv', quoting=csv.QUOTE_NONNUMERIC)
#print final.ix[0]
print
print final.groupby('method').size()
print final.groupby(['method','confidence']).size().T
#print
#s2 = responsesbreakdown(final, key='#pbmcpos', label='% pbmc positives')
#plt.savefig('responses_bymethod_pbmc.png')
#s2.cumsum().sum()
#s2['%pos'] = s2[s2.columns[-4:]].sum(1)/s2.sum(1)*100
#s2 = s.merge(s2,left_index=1,right_index=1,how='right').fillna(0)
#display(s2)
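For reference, the confidence binning above behaves as in this minimal sketch (made-up scores; the bin edge of 26 reflects the positive control responder frequencies, per the comment above):
demo = pd.Series([0, 9, 26, 27, 64, 100])   #hypothetical responder frequency scores
print pd.cut(demo, bins=[0,26,100], precision=0, include_lowest=True, labels=['low','high'])
#scores up to 26 fall in 'low'; anything above 26 falls in 'high'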
In [ ]:
#get boolean (0/1) arrays for the heatmap by applying the positivity cutoffs
xfinal = (final[pcols]>=pcut).astype(int)
xfinal['method'] = final.method
xbl = (pbmcbl[pcols]>=blcut).astype(int)
xbl['method'] = 'baseline'
xctrls = (ctrls[pcols]>=pcut).astype(int)
xctrls['method'] = '_pos ctrls'
xcomb = pd.concat([xfinal,xbl,xctrls])
responsesheatmap(xcomb, cols=pcols)
plt.savefig('pbmc_responses_heatmap.png')
In [13]:
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','positive controls')
final = final.sort('#wbpos')
g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='swarm',size=3,marker='x',#jitter=True,
aspect=1.5,alpha=.6,edgecolor='black')
ax=g.axes.flatten()[0]
#g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='box',size=3,aspect=1.5,ax=ax)
ax.set_xlabel('whole blood responder freq (%)')
ax.set_ylabel('PBMC responder freq')
Out[13]:
In [14]:
sns.set_context("notebook", font_scale=1.1)
#final.hist('score',by='method',stacked=True,figsize=(5,3),sharex=True,bins=range(0,100,5))
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','Positive Controls')
#custom sort
names = {'Binder Clusters':0, 'Top Scoring Binders':1,'Positive Controls':2,'Random Binders (Control)':3}
final['order'] = final.strategy.map(names)
final=final.sort_index(by='order')
final=final.drop('order',1)
x = final[-final.method.isin(['Positive Controls'])]
#print x[x.method=='random binders'][['confidence','score']]
sf = x.groupby(['confidence','strategy']).agg({'id':np.size})
sf = sf.unstack('strategy')
sf.columns = sf.columns.get_level_values(1)
perc = sf.sum(1)
sf = sf.T.fillna(0)
'''sf['order'] = sf.index.to_series().map(names)
sf=sf.sort_index(by='order')
sf=sf.drop('order',1)'''
sf['success'] = sf['high']/sf.sum(1)*100
#hydroplot(final, key='confidence')
display(sf)
#from statsmodels.stats.weightstats import ztest
g=sns.factorplot(y='strategy',x='score',data=final,kind='swarm',s=6,#size=4,
aspect=1.7,alpha=.9,edgecolor='black',linewidth=.5)
ax=g.axes.flatten()[0]
ax.set_xlabel('responder frequency')
#ax.set_xticks(x.score.unique())
#for i in ax.get_xticklabels():
# i.set(rotation=90)
ax.axvline(26,0,100,linewidth=1, color='red',linestyle='--')
#g.fig.text(.7, 1.0, 'positive', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
#g.fig.text(.4, 1.0, 'negative', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
ax.set_xlim((-5,100))
sns.despine()
plt.subplots_adjust(top=0.8)
from pandas.tools.plotting import table
t=table(ax, sf.astype(int),loc='lower right',colWidths=[0.08, 0.08, 0.1])
#t.auto_set_font_size(False)
#t.set_fontsize(5.8)
#plt.tight_layout()
plt.savefig('responder_freqs_stripplot.png',bbox_inches='tight',dpi=300)
In [15]:
final = pd.read_csv('final_results_all.csv')
#top peptides (high confidence) and their protein info
cols = ['locus_tag','mbname','peptide','method','strategy','score','confidence','start']
top = final[final.confidence=='high']
top = top[top.method != '_pos ctrls']
neg = final[final.confidence=='low']
#overlap of our final sets with known antigens
EpitopePipeline=reload(EpitopePipeline)
antigens = EpitopePipeline.getAntigenDatasets()
antigens['source'] = antigens.source.apply(lambda r: ';'.join(r))
top = antigens.merge(top[cols],on='locus_tag',how='right')
annot = EpitopePipeline.combineAnnotationData()
#kruh = pd.read_csv('Kruh_proteome_30d.csv')
#print top[top.locus_tag.isin(kruh.locus_tag)]
#add annotations
top = top.merge(annot[['locus_tag','gene','length','product']],on='locus_tag',how='left')
top = top.sort_values(['score','locus_tag'],ascending=[False,True]).fillna('')
top = top.reset_index(drop=True)
top.to_csv('top_peptides.csv',index=False)
print '%s peptides in the high confidence category' %len(top)
print 'total positive rate: %s' %str(len(top)/270.0)   #out of 270 test peptides
print
#print top.columns
topc = top.groupby('locus_tag').agg({'peptide':np.size,
'product':base.first}).sort_values('peptide',ascending=False)
topc = topc.rename(columns={'peptide':'positives'})
#print topc[:5]
tsp = final.groupby('locus_tag').agg({'strategy':base.first,'R':np.size})
tsp = tsp.rename(columns={'R':'prots'})
#lt=['Rv0655']
#lt=['Rv3676','Rv3584']
#lt='Rv0757'
lt=['Rv3584']
display(final[final.locus_tag.isin(lt)][cols].sort_values(['locus_tag','start']))
print 'proteins with most positives:'
x = tsp.merge(topc,left_index=True,right_index=True)
x = x[x.positives>1].sort_values(['positives','strategy'],ascending=False)
display(x)
#print x.groupby('prots').agg({'positives':np.sum})
#x['frac'] = x.peptide/x.prots
#x.plot('prots','frac',kind='scatter',figsize=(4,4))
print 'peptides with highest scores:'
display( top[top.score>=60][cols])
In [40]:
#cg = EpitopePipeline.getCategories(genome,plot=False)
cg = EpitopePipeline.getCategories(srm,plot=False)
cneg = EpitopePipeline.getCategories(neg,plot=False)
cpos = EpitopePipeline.getCategories(top,plot=False)
print len(neg), cneg.sum()
cats = pd.concat([cg,cpos,cneg],axis=1)
cats = cats/cats.sum()
cats.columns = ['genome (filtered)','positive','negative']
ax=cats.plot(kind='barh',cmap='winter',alpha=0.8,width=.7,grid=False,figsize=(9,6))
plt.xlabel('fraction of total')
ax.set_title('MTB functional category representation of source proteins')
plt.tight_layout()
plt.savefig('vlaresults_cats.png')
In [17]:
#SRM concentrations of the source proteins of positive peptides?
c = top.merge(srm, on='locus_tag', how='left')
f,ax=plt.subplots(1,1,figsize=(6,3))
c.hist('concentration',bins=30,ax=ax,grid=False)
#ax.set_yscale('log')
#print c[c['concentration_y']>10][['locus_tag','concentration_y','score','product_y']]
Out[17]:
In [18]:
#do the binder clusters correlate with positives and the number of binders?
f=pd.read_csv('final_results_all.csv')
print f.columns
f.plot('score','length',kind='scatter',figsize=(3,3))
#top shared binders: correlation with the number of binders in each epitope region?
Out[18]:
In [78]:
f=pd.read_csv('final_results_all.csv')
#f['confidence'] = f.confidence.map({'negative':0,'low':1,'medium':2,'high':3})
f['confidence'] = pd.Categorical(f.confidence)
sns.set_context("notebook", font_scale=1.0)
#f = f[f.method.isin(['topsharedbinders'])]
#f['mhc1overlaps'] = f.filter(regex="iedbmhc1_").sum(axis=1)
f['overlap'] = (f.mhc1overlaps>=1).astype(int)
xx = f[-f.mhc1overlaps.isnull()]
cols=['peptide','method','iedbmhc1_9_overlap','iedbmhc1_11_overlap','overlap']
#print xx[cols][22:34]
print xx.groupby(['overlap','method']).size()
gg = xx.groupby(['confidence']).agg({'overlap': [np.sum,np.size]})
gg.columns = gg.columns.get_level_values(1)
gg['frac'] = gg['sum']/gg['size']
#print gg
#gg.plot(y='overlap',kind='barh',by='overlap',figsize=(3,3))
gg.plot(y='frac',kind='barh',figsize=(3,3))
#xx.plot('mhc1overlaps','confidence',kind='scatter',figsize=(2,2))
gg = xx.groupby(['confidence']).agg({'mhc1overlaps': [np.mean,np.std,np.size]})
gg.columns = gg.columns.get_level_values(1)
print gg
gg.plot(y='mean',xerr='std',kind='barh',figsize=(3,3))
plt.title('mhc overlaps')
sns.despine()
#xx.hist('mhc1overlaps',by='confidence')
In [24]:
#BoLA allele frequency data and plots
bd = pd.read_csv('bola_alleles_dietz.csv')
bd=bd.set_index('allele').sort('freq',ascending=False)
bd['name'] = 'dietz'
bo = pd.read_csv('bola_alleles_oprzadek.csv')
bo = bo.groupby('allele_short').agg({'freq':np.max})
bo=bo.sort('freq',ascending=False)
bo['name'] = 'oprzadek'
bx = pd.read_csv('bola_alleles_baxter.csv')
bx = bx.groupby('allele_short').agg({'freq':np.max})
bx=bx.sort('freq',ascending=False)
bx['name'] = 'baxter'
x=pd.concat([bd,bo,bx]).reset_index('allele')
x = x.pivot('index','name','freq')
x=x[x>0.005].dropna(how='all')
table = x[x>0.02].reindex_axis(x.mean(1).order(ascending=False).index).dropna(how='all').fillna('-')
sns.set_context("notebook", font_scale=1.3)
x.plot(y=['baxter','dietz','oprzadek'],kind='bar',figsize=(10,5),cmap='YlOrRd',grid=False,width=0.8)
plt.title('BoLA DRB3.2 allele frequencies')
plt.xlabel('allele')
plt.ylabel('frequency')
plt.legend(labels=['Holstein-Charolais','Holstein USA','Polish Holstein-Friesian'],loc=9)
sns.despine()
plt.tight_layout()
plt.savefig('bola_allelefreqs.png')
targetbola = x.index
table
Out[24]:
In [29]:
reload(Tepitope)
d=Tepitope.datadir
boladrb = os.path.join(d,'IPD_MHC/bola.drb3.fa')
hladrb = os.path.join(d,'IPD_MHC/hla.drb345.fa')
ref = os.path.join(d,'IPD_MHC/hlaref.fa')
aln = Tepitope.drbaln
alnindex = dict([(a.id,a) for a in aln])
Tepitope.compareRef(hladrb,boladrb,ref,alnindex)
In [30]:
reload(Tepitope)
a = Tepitope.compare(hladrb, boladrb, alnindex)
hlas = a.min()
hlas.sort()
print hlas[:10]
In [118]:
#previously tested VLA negative peptides (G. Jones set)
gp = pd.read_csv('gareth_negatives.csv')
#where do these epitopes rank among our predicted binders?
#print b1[:10]
#print neg[:2]
def findinGenome(genome, peptide):
    """Return the locus_tag of the first protein whose translation contains the peptide, or '' if not found."""
    t = genome[genome.translation.str.contains(peptide)]
    if len(t)>0:
        return t.head(1).squeeze().locus_tag
    else:
        return ''

gp['locus_tag'] = gp.apply( lambda x: findinGenome(genome, x.sequence), 1)
print gp[:10]
In [ ]:
def hasseq(r, seqs):
    """Return 1 if any sequence in seqs occurs within r.sequence, else 0."""
    for s in seqs:
        if s in r.sequence:
            return 1
    return 0

x = b4[b4.name.isin(gp.locus_tag)]
#need to reduce similarity of the G. Jones peptides first
gp['pred_binder'] = gp.apply(lambda r: hasseq(r, x.core),1)
print gp[gp['pred_binder']==1]
In [212]:
sys.path.append(os.path.join(home,'gitprojects'))
reload(analysis)
db = os.path.join(home,'myco_genomes/all_genomes') #local blastdb
res = pd.read_csv('final_results_all.csv')
res = res[res.method!='_pos ctrls']
print len(res)
cons=[]
for i,r in list(res.iterrows()):
    #print i, r.peptide, r.locus_tag, r.confidence
    tag = r.locus_tag
    p = genome[genome['locus_tag']==tag]
    seq = p.translation.head(1).squeeze()
    #blast against the local genome db
    recs = analysis.getLocalOrthologs(seq,db)
    recs['accession'] = recs.apply(lambda x: x.subj.split('|')[2].split()[1],1)
    recs = recs[recs.perc_ident>50]
    #use the central 16mer of the 20mer peptide
    pep = r.peptide[2:18]
    c = analysis.findConservedPeptide(pep,recs)
    c = c.reset_index()
    #species code = leading letters of the accession (e.g. Mb, Rv, MAP)
    c['species'] = c.accession.apply(lambda x : re.split("\d+", x)[0][:3])
    c['name'] = r.peptide
    c = c.drop_duplicates('species')
    #print c
    cons.append(c)
df = pd.concat(cons)
df = df.dropna()
s = df.set_index(['name','species'])
s = s.drop('accession',1)
s = s.unstack(level=-1)
s=s.clip(upper=1)
s=s.fillna(0)
s.columns = s.columns.get_level_values(1)
s = s.merge(res[['peptide','locus_tag','confidence','score']], left_index=True, right_on='peptide')
s.to_csv('peptide_conservation.csv')
print len(s)
In [278]:
sns.set_context("notebook", font_scale=1.0)
s = pd.read_csv('peptide_conservation.csv',index_col=0)
#replace col names with species
spnames = {'MAB':'M.abscessus','MAF':'M.africanum','MAP':'M.avium.para','ML':'M.leprae','MMA':'M.marinum',
'Rv':'MTB-H37Rv','MUL':'M.ulcerans','BN':'M.canetti','Mb':'M.bovis','MSM':'M.smegmatis'}
species = spnames.values()
s=s.rename(columns=spnames)
s=s.fillna('negative')
#sort by counts on both axes
#s=s.set_index(['confidence','peptide']) #'['locus_tag','peptide'])
#s=s.reindex_axis(s.sum().order().index, axis=1)
#s=s.reindex_axis(s.sum(1).order().index)
s.ix[s.confidence!='negative','confidence'] = 'positive'
#s=s.sortlevel()
totals = s.sum(1).order(ascending=False)
a = s[s.confidence=='negative']
b = s[s.confidence=='positive']
g = s.groupby('confidence').agg(lambda x: x.sum()/x.count())
print g
#z-test for the difference in conservation proportions between the negative and positive groups
from statsmodels.stats.weightstats import ztest
for i in s.columns[:7]:
    z=ztest(a[i],b[i])
    print i, z[1]   #p-value per species
#fig,ax = plt.subplots(1,1,figsize=(5,5))
#analysis.plotheatmap(g,ax)
g.plot(y=g.columns[:7],layout=(4,2),legend=False,colormap='Set1',
kind='bar',figsize=(5,5),subplots=True)
sns.despine()
#plt.savefig('peptide_conservation_map.png')
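The ztest above compares the raw 0/1 conservation flags between the negative and positive groups. A closely related two-proportion z-test can be written with statsmodels' proportions_ztest; a minimal sketch for a single species column ('M.bovis' is assumed to be present after the renaming above):
from statsmodels.stats.proportion import proportions_ztest
col = 'M.bovis'   #assumed column name
count = np.array([a[col].sum(), b[col].sum()])   #conserved peptides in the negative and positive groups
nobs = np.array([len(a), len(b)])                #group sizes
z, p = proportions_ztest(count, nobs)
print col, p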
In [ ]:
#homology of the 20mer peptides within the M. bovis genome?
reload(analysis)
db = os.path.join(home,'myco_genomes/mbovis') #local blastdb
x = res[res['#wbpos']>=50].dropna(subset=['pep_no'])
cons=[]
for i,r in list(x.iterrows()):
    #print i, r.peptide, r.locus_tag
    tag = r.locus_tag
    seq = r.peptide
    #blast against the local M. bovis db
    recs = analysis.getLocalOrthologs(seq,db)
    if len(recs)>1:
        print tag, r['product']
        print recs
In [31]:
#overlap with known IEDB T cell epitopes?
iedb = pd.read_csv('iedb_myco_tcell.csv')
iedb=iedb.drop_duplicates('peptide')
iedb = iedb[iedb['Source Organism Name'].str.contains('bovis')]
#print iedb.groupby('Source Molecule Name').size()
In [117]:
#position (rank) of Rv3874 (CFP-10) among all tepitope binders
b=b4
sb = b.sort('score',ascending=0)
#sb = b.sort('1-log50k(aff)')
sb =sb.reset_index()
print len(sb)
print sb[sb.name=='Rv3874']
In [276]:
#calculate the number of overlapping 20mers in the proteome
n=20
prots=len(mbovis)
#a protein of length L contains L-n+1 overlapping n-mers
nmers=(mbovis.length-n+1).sum()
print prots, nmers
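As a quick check of the n-mer count above (a minimal sketch on a toy 21-residue sequence, independent of the genome files): a sequence of length L contains L-n+1 overlapping n-mers.
toyseq = 'MTEQQWNFAGIEAAASAIQGN'   #toy sequence, length 21
kmers = [toyseq[i:i+n] for i in range(len(toyseq)-n+1)]
print len(kmers)   #21-20+1 = 2 overlapping 20mers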