Epitope mapping results for M.bovis (part 2)

EMIDA antigen mining project

Reference:

Integrated computational prediction and experimental validation identifies promiscuous T cell epitopes in the proteome of pathogenic mycobacteria. Damien Farrell, Gareth Jones, Chris Pirson, Kerri M Malone, Kevin Rue-Albrecht, Anthony J Chubb, H Martin Vordermeier, Stephen V. Gordon. 2016


In [16]:
import os,sys
import re
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
home = os.path.expanduser('~')
#we still need the older version of the module for repeating the original work
sys.path.append(os.path.join(home,'python/sandbox/mhcpredict'))
import Base, Genome, Tepitope, EpitopePipeline
from matplotlib_venn import venn2,venn3
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',80)
import seaborn as sns
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','axes.grid': False,'legend.frameon':True, 'legend.fontsize':12})
sns.set_context("notebook", font_scale=1.4)
plt.rcParams['savefig.dpi']=150
from IPython.display import display, HTML
#newer module version we use for remainder of analysis
from epitopepredict import base, analysis, sequtils, utilities

In [17]:
genome = sequtils.genbank2Dataframe('MTB-H37Rv.gb', cds=True)
mbovis = sequtils.genbank2Dataframe('Mbovis.gb', cds=True)

In [18]:
#srm data
srm = pd.read_csv('srm_mtb.csv')
srm = genome.merge(srm[['locus_tag','concentration']],on='locus_tag',how='inner')
srm = srm[srm.concentration>0]
print len(srm)
#srm = srm[srm.length<=400]
#print len(srm)


1870

In [9]:
n=3
minsize=2
mhc1alleles = ['BoLA-N:00101','BoLA-N:00201','BoLA-N:00301','BoLA-N:00401',
                 'BoLA-N:00501','BoLA-N:00601','BoLA-N:00801','BoLA-N:00901',
                 'BoLA-N:01001']
otherbinders = {}
otherbinders['iedbmhc1_9'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_9_3.csv')
otherbinders['iedbmhc1_11'] = pd.read_csv('binders_MTB-H37Rv_iedbmhc1_11_3.csv')
otherbinders['tepitope'] = pd.read_csv('binders_MTB-H37Rv_tepitope_bovine_3.csv')
otherbinders['netmhciipan'] = pd.read_csv('binders_MTB-H37Rv_netmhciipan_bovine_3.csv')
mapping = pd.read_csv('mbovis_mapping_new.csv',usecols=[1,2,3,4])#,index_col=0)

#show filtered numbers (done inside the pipeline)
for n in otherbinders:    
    d=otherbinders[n]
    dsrm = d.merge(srm[['locus_tag','length']],left_on='name',right_on='locus_tag')
    print n, len(d), len(dsrm), len(dsrm[dsrm.length>400])


iedbmhc1_9 45480 22458 12388
iedbmhc1_11 47500 23203 12746
netmhciipan 27155 14152 7584
tepitope 47370 24172 13004

In [4]:
def responsesbreakdown(res, key1='#pos',key2='method',label=''):
    sns.set_context("notebook", font_scale=1.4)
    s = res.groupby([key1,key2]).agg({'id':np.size})
    s = s.unstack(key2)
    s.columns = s.columns.get_level_values(1)
    perc = s.sum(1)
    s=s.sort(axis=1,ascending=False)
    #print s
    f,ax=plt.subplots(1,2,figsize=(10,5))
    grid=ax.flat
    perc.plot(kind='pie',ax=grid[0],cmap='Greens',figsize=(9.5,5),startangle=0)
    grid[0].set_title('%s (all methods)' %label,fontsize=18)
    grid[0].set_ylabel('')
    s.plot(kind='barh', grid=False,ax=grid[1])
    grid[1].set_title('%s per method' %label,fontsize=20)
    grid[1].set_ylabel('')
    sns.despine()
    plt.tight_layout()    
    t=s.T
    t=t.fillna(0)  
    #print t
    #print t[t.columns[t.columns>=1]].sum(1)
    print t.sum()
    #en = t[t.columns[t.columns>=1]].sum(1)/t.sum(1)
    return t

def hydroplot(res, key):
    res.boxplot('hydro',by=key,vert=False,grid=False)
    plt.title('peptide hydrophobicity by response')
    plt.suptitle('')
    plt.xlabel('hydrophobicity'); plt.ylabel(key)
    plt.tight_layout()
    plt.savefig('responses_hydrophobicity.png')

VLA whole blood results:


In [5]:
EpitopePipeline=reload(EpitopePipeline)
plt.rcParams['font.size']=20
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})

#reload peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
#whole blood
wb = pd.read_csv('vla_wholeblood_results.csv')
wbcols = ['8740','8742','8743','8744']

tags = wb.tag.apply( lambda x: pd.Series(x.split('_')))
wb['mbname'] = tags[0]
wb['id'] = tags[1].astype(int)

#merge them
wb = plist.merge(wb,left_index=True,right_on='id',how='right')
wb.ix[wb.method.isnull(), 'method'] = '_pos ctrls'
#wb['both'] = wb.peptide.isin(ovlps.peptide_y)

ctrls = wb[wb.method=='_pos ctrls']
print ctrls[['tag','#pos']]

s=responsesbreakdown(wb, label='no. positives')
plt.savefig('responses_bymethod_wb.png')
s['%pos'] = s[s.columns[-4:]].sum(1)/s.sum(1)*100
s


               tag  #pos
0      CFP10.1_300     2
1      CFP10.2_301     2
2      CFP10.7_302     3
3      CFP10.8_303     2
4      CFP10.9_304     4
5   ESAT-6 p45_305     4
6   ESAT-6 p46_306     3
7   ESAT-6 p47_307     3
8   ESAT-6 p51_308     4
9   ESAT-6 p52_309     4
10  ESAT-6 p53_310     4
11  ESAT-6 p55_311     4
#pos
0    171
1     26
2     37
3     24
4     24
dtype: float64
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:7: FutureWarning: sort(....) is deprecated, use sort_index(.....)
Out[5]:
#pos 0 1 2 3 4 %pos
method
topsharedbinders 67 8 5 9 5 28.723404
cl_tepitopepan 52 6 17 8 5 40.909091
cl_netmhciipan 52 12 12 4 8 40.909091
_pos ctrls 0 0 3 3 6 100.000000

In [6]:
def responsesheatmap(res, cols, labels=None):
    """heatmap of responses by method"""
    
    l = res['strategy'].nunique()
    fig,axs = plt.subplots(l,1,figsize=(6,6))
    grid=axs.flatten()
    i=0
    for n,g in res.groupby('strategy'):
        g['mean'] = g[cols].mean(1)        
        x = g.sort('mean')[cols]        
        x = x.loc[(x>0).any(1)]
        ax=grid[i]
        hm = ax.pcolor(x.T,cmap='Blues')
        ax.set_yticks(np.arange(0.5, len(x.columns)))
        if labels == None:
            names = cols
        else:
            names = labels
        ax.set_yticklabels(names, minor=False, fontsize=10)
        #ax.set_yticks(np.arange(0.5, len(x.index)))
        ax.set_xticks([])
        ax.set_xticklabels('')
        ax.set_xlim(0, len(x.index)-1)
        ax.set_title(n)        
        i+=1
        
    #cb=fig.colorbar(hm, ax=axs[0],orientation='horizontal', aspect=5)
    #plt.setp(cb.ax.get_xticklabels(), rotation='vertical', fontsize=10)
    plt.tight_layout()
    fig.subplots_adjust(hspace=0.4)

In [7]:
x = wb.copy()
x = x.replace({'-': 0, 'positive': 1})
methlabels = {'cl_tepitopepan': 'binder clusters set 1', 'cl_netmhciipan':'binder clusters set 2',
              'topsharedbinders': 'top scoring binders'} # 'random binders': 'C'}
x['strategy'] = x.method.replace(methlabels)
responsesheatmap(x, cols=wbcols)
plt.savefig('wb_responses_heatmap.png')


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:10: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/usr/lib/python2.7/dist-packages/matplotlib/collections.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

VLA pbmc results:


In [8]:
#pbmc data already has wb positives column in it
pbmc = pd.read_csv('vla_pbmc_results.csv')
pcols = ['7010','7447','7448','7855','7853','7852','7449']
x = pbmc[pcols]
#derive positive rates for each peptide using gareth method of nil mean+3SD and 0.1 cutoff
#nil wells for cutoffs
nil = pd.read_csv('vla_pbmc_nil.csv')

pcut = nil.mean()+nil.std()*3+0.1
pcut2 = nil.mean()+0.1
#pbmcpos = ((x>=pcut) & (x>=pcut2)).astype(int)
#pbmcpos = ((x>=pcut+0.1)).astype(int)
#pbmcpos['method'] = res.method
#responsesheatmap(pbmcpos, cols=pcols)

pbmc['#pbmcpos'] = x[(x>=pcut)].count(1)
#pbmc.plot('#garethpos','#pbmcpos',kind='scatter', figsize=(3,3))

#pbmc['mean'] = pbmc[pcols].mean(1)
#pbmc['#wbpos'] = pbmc['#wbpos']/4*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos']/7*100
#pbmc['#pbmcpos'] = pbmc['#pbmcpos'].astype(int)
pbmc = pbmc.sort_values('#pbmcpos')

#get the mean fraction of both positive counts
#pbmc['score'] = (pbmc['#pbmcpos']+pbmc['#wbpos'])/2.0
#pbmc['score'] = pbmc.score.round(0)

#just get the total fraction of positives
pbmc['score'] = ((pbmc['#pbmcpos']+pbmc['#wbpos'])/11*100).round(0)

tags = pbmc.tag.apply( lambda x: pd.Series(x.split('_')))
#pbmc['mbname'] = tags[0]
pbmc['id'] = tags[1].astype(int)

#merge with peptide list
plist = pd.read_csv('peptide_lists.csv',index_col=0)
pbmc = plist.merge(pbmc,left_index=True,right_on='id',how='right')
pbmc.ix[pbmc.method.isnull(), 'method'] = '_pos ctrls'
print pbmc.groupby('method').agg({'id':np.size})

#pbmc = pbmc[pbmc.method!='_pos ctrls']
#responsesbreakdown(r, key='#pbmcpos', label='% positives')

ctrls = pbmc[pbmc.method=='_pos ctrls']
print ctrls[['tag','#pbmcpos','#wbpos','score']]
print ctrls.groupby('#pbmcpos').agg({'id':np.size}).T
print 
print 'mean score of positive controls:', ctrls.score.mean()


                  id
method              
_pos ctrls        12
cl_netmhciipan    88
cl_tepitopepan    88
topsharedbinders  94
               tag  #pbmcpos  #wbpos  score
0      CFP10.1_300         1       2     27
1      CFP10.2_301         1       2     27
11  ESAT-6 p55_311         3       4     64
3      CFP10.8_303         3       2     45
8   ESAT-6 p51_308         3       4     64
7   ESAT-6 p47_307         4       3     64
4      CFP10.9_304         4       4     73
6   ESAT-6 p46_306         4       3     64
10  ESAT-6 p53_310         4       4     73
9   ESAT-6 p52_309         4       4     73
5   ESAT-6 p45_305         5       4     82
2      CFP10.7_302         5       3     73
#pbmcpos  1  3  4  5
id        2  3  5  2

mean score of positive controls: 60.75

data plots


In [9]:
#pbmc data plot
sns.set_context("notebook", font_scale=1.2)
c=['#pbmcpos']+pcols
#subtract nil mean
x = pbmc[pbmc['#pbmcpos']>=1]
x = x[x.method!='_pos ctrls']
x[pcols] = x[pcols]-nil.mean()
#x['numpos'] = (x['#pbmcpos']*7/100+1).astype(int)
x['numpos'] = x['#pbmcpos']
x['mean'] = x[pcols].mean(1)

t = pd.melt(x,id_vars=['numpos','method'],value_vars=pcols,var_name='animal',value_name='OD')

fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="numpos", y="mean", data=x,
            alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax) #jitter=True

ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="numpos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
           fliersize=0 ,color='lightblue')
ax.set_ylim(-0.2,1.4)
ax.set_xlabel('responding animals')
plt.title('nil subtracted OD PBMC')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_pbmc.png',dpi=300)



In [10]:
#whole blood data plot
#whole blood values already nil subtracted

wb = pd.read_csv('vla_wholeblood_results.csv')
wb = wb.sort_values('#pos')
cols = ['8740_val','8742_val','8743_val','8744_val']
wb['mean'] = wb[cols].mean(1)

x = wb[wb['#pos']>=1]
x = x[-x.tag.isin(ctrls.tag)]
x['#pos'] = wb['#pos']

t = pd.melt(x,id_vars='#pos',value_vars=cols,var_name='animal',value_name='OD')

fig,ax = plt.subplots(1,1,figsize=(4,4))
sns.swarmplot(x="#pos", y="mean", data=x, #hue='animal', palette='Set1', jitter=True,
            alpha=0.8, size=4, linewidth=.8, edgecolor="black", color='white', ax=ax)
ax.legend(['mean OD'],fontsize=7)
sns.boxplot(x="#pos", y="OD", data=t, ax=ax, linewidth=1, saturation=0.5,
           fliersize=1 ,color='lightgreen')
plt.title('nil subtracted OD Whole Blood')
ax.set_xlabel('responding animals')
plt.tight_layout()
sns.despine()
plt.savefig('oddata_wb.png',dpi=300)


pbmc results for baseline/control set


In [11]:
pbmcbl = pd.read_csv('vla_pbmc_baseline.csv')
x = pbmcbl[pcols]

#nil wells for cutoffs
nil = pd.read_csv('vla_pbmc_nil_baseline.csv')
blcut = nil.mean()+nil.std()*3+0.1
blcut2 = nil.mean()+0.1
pbmcbl['#pbmcpos'] = x[(x>=blcut)].count(1)
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos']/7*100
#pbmcbl['#pbmcpos'] = pbmcbl['#pbmcpos'].astype(int)
pbmcbl['score'] = (pbmcbl['#pbmcpos']/7*100).round(0)

pbmcbl.rename(columns={'method':'predictor'})

neglist = pd.read_csv('negatives_list.csv',index_col=0)
neglist = neglist.drop(['score'],axis=1)
pbmcbl = neglist.merge(pbmcbl,on='id')
print len(pbmcbl)
pbmcbl['method'] = 'random binders'


94

create a final dataframe with all results


In [12]:
final = pd.concat([pbmc,pbmcbl])

#calculate final confidence levels by binning score based on positive controls
bins = [0,26,100]
final['confidence'] = pd.cut(final.score, bins=bins, precision=0, include_lowest=True,
                             labels=['low','high'])
#create column label combining cluster methods
methlabels = {'cl_tepitopepan': 'Binder Clusters', 'cl_netmhciipan':'Binder Clusters',
              'topsharedbinders': 'Top Scoring Binders', 'random binders': 'Random Binders (Control)'}    
final['strategy'] = final.method.replace(methlabels)

final = final.drop(['tbname','translation','product','name','nearest','allele','1-log50k(aff)','core','order',
                    'plate_no','#garethpos','rank','pos','source','length'],axis=1)
#print final[:2]
import csv
final.to_csv('final_results_all.csv', quoting=csv.QUOTE_NONNUMERIC)
#print final.ix[0]
print
print final.groupby('method').size()
print final.groupby(['method','confidence']).size().T
#print

#s2 = responsesbreakdown(final, key='#pbmcpos', label='% pbmc positives')
#plt.savefig('responses_bymethod_pbmc.png')
#s2.cumsum().sum()
#s2['%pos'] = s2[s2.columns[-4:]].sum(1)/s2.sum(1)*100
#s2 = s.merge(s2,left_index=1,right_index=1,how='right').fillna(0)
#display(s2)


method
_pos ctrls          12
cl_netmhciipan      88
cl_tepitopepan      88
random binders      94
topsharedbinders    94
dtype: int64
method            confidence
_pos ctrls        high          12
cl_netmhciipan    low           60
                  high          28
cl_tepitopepan    low           68
                  high          20
random binders    low           92
                  high           2
topsharedbinders  low           76
                  high          18
dtype: int64

pbmc responses breakdown


In [ ]:
#get boolean arrays for heatmap by subtracting cuts

xfinal = (final[pcols]>=pcut).astype(int)
xfinal['method'] = final.method
xbl = (pbmcbl[pcols]>=blcut).astype(int)
xbl['method'] = 'baseline'
xctrls = (ctrls[pcols]>=pcut).astype(int)
xctrls['method'] = '_pos ctrls'
xcomb = pd.concat([xfinal,xbl,xctrls])

responsesheatmap(xcomb, cols=pcols)
plt.savefig('pbmc_responses_heatmap.png')

In [13]:
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','positive controls')
final = final.sort('#wbpos')
g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='swarm',size=3,marker='x',#jitter=True,
                 aspect=1.5,alpha=.6,edgecolor='black')
ax=g.axes.flatten()[0]
#g=sns.factorplot(x='#wbpos',y='#pbmcpos',data=final,kind='box',size=3,aspect=1.5,ax=ax)
ax.set_xlabel('whole blood responder freq (%)')
ax.set_ylabel('PBMC responder freq')


/usr/local/lib/python2.7/dist-packages/pandas/core/common.py:449: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask = arr == x
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()
Out[13]:
<matplotlib.text.Text at 0x7f0d577cc050>

final breakdown of combined responses


In [14]:
sns.set_context("notebook", font_scale=1.1)
#final.hist('score',by='method',stacked=True,figsize=(5,3),sharex=True,bins=range(0,100,5))
final = pd.read_csv('final_results_all.csv')
final = final.replace('_pos ctrls','Positive Controls')
#custom sort 
names = {'Binder Clusters':0, 'Top Scoring Binders':1,'Positive Controls':2,'Random Binders (Control)':3}

final['order'] = final.strategy.map(names)
final=final.sort_index(by='order')
final=final.drop('order',1)
x = final[-final.method.isin(['Positive Controls'])]
#print x[x.method=='random binders'][['confidence','score']]

sf = x.groupby(['confidence','strategy']).agg({'id':np.size})
sf = sf.unstack('strategy')
sf.columns = sf.columns.get_level_values(1)
perc = sf.sum(1)
sf = sf.T.fillna(0)

'''sf['order'] = sf.index.to_series().map(names)
sf=sf.sort_index(by='order')
sf=sf.drop('order',1)'''

sf['success'] = sf['high']/sf.sum(1)*100
#hydroplot(final, key='confidence')
display(sf)
#from statsmodels.stats.weightstats import ztest

g=sns.factorplot(y='strategy',x='score',data=final,kind='swarm',s=6,#size=4,
                 aspect=1.7,alpha=.9,edgecolor='black',linewidth=.5)
ax=g.axes.flatten()[0]
ax.set_xlabel('responder frequency')
#ax.set_xticks(x.score.unique())
#for i in ax.get_xticklabels():
#    i.set(rotation=90)

ax.axvline(26,0,100,linewidth=1, color='red',linestyle='--')    
#g.fig.text(.7, 1.0, 'positive', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
#g.fig.text(.4, 1.0, 'negative', bbox={'facecolor':'white','alpha':0.8, 'pad':5})
ax.set_xlim((-5,100))
sns.despine()
plt.subplots_adjust(top=0.8)
from pandas.tools.plotting import table
t=table(ax, sf.astype(int),loc='lower right',colWidths=[0.08, 0.08, 0.1])
#t.auto_set_font_size(False)
#t.set_fontsize(5.8)
#plt.tight_layout()

plt.savefig('responder_freqs_stripplot.png',bbox_inches='tight',dpi=300)


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:9: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
confidence high low success
strategy
Binder Clusters 48 128 27.272727
Random Binders (Control) 2 92 2.127660
Top Scoring Binders 18 76 19.148936

look at top peptide and shared binders


In [15]:
final = pd.read_csv('final_results_all.csv')
#top peptides (high confidence) and their protein info
cols = ['locus_tag','mbname','peptide','method','strategy','score','confidence','start']

top = final[final.confidence=='positive']
top = top[top.method != '_pos ctrls']
neg = final[final.confidence=='negative']

#overlap of our final sets with known antigens
EpitopePipeline=reload(EpitopePipeline)
antigens = EpitopePipeline.getAntigenDatasets()
antigens['source'] = antigens.source.apply(lambda r: ';'.join(r))
top = antigens.merge(top[cols],on='locus_tag',how='right')

annot = EpitopePipeline.combineAnnotationData()
#kruh = pd.read_csv('Kruh_proteome_30d.csv')
#print top[top.locus_tag.isin(kruh.locus_tag)]
#add annotations
top = top.merge(annot[['locus_tag','gene','length','product']],on='locus_tag',how='left')
top = top.sort_values(['score','locus_tag'],ascending=[False,True]).fillna('')
top = top.reset_index(drop=True)
top.to_csv('top_peptides.csv',index=False)

print '%s peptides in positive category' %len(top)
print 'total positive rate: %s' %str(len(top)/270.0)
print

#print top.columns
topc = top.groupby('locus_tag').agg({'peptide':np.size,
                                     'product':base.first}).sort_values('peptide',ascending=False)
topc = topc.rename(columns={'peptide':'positives'})
#print topc[:5]

tsp = final.groupby('locus_tag').agg({'strategy':base.first,'R':np.size})
tsp = tsp.rename(columns={'R':'prots'})

#lt=['Rv0655']
#lt=['Rv3676','Rv3584']
#lt='Rv0757'
lt=['Rv3584']
display(final[final.locus_tag.isin(lt)][cols].sort_values(['locus_tag','start']))

print 'proteins with most positives:'
x = tsp.merge(topc,left_index=True,right_index=True)
x = x[x.positives>1].sort_values(['positives','strategy'],ascending=False)
display(x)
#print x.groupby('prots').agg({'positives':np.sum})
#x['frac'] = x.peptide/x.prots
#x.plot('prots','frac',kind='scatter',figsize=(4,4))
print 'peptides with highest scores:'
display( top[top.score>=60][cols])


0 peptides in positive category
total positive rate: 0.0

locus_tag mbname peptide method strategy score confidence start
187 Rv3584 Mb3615 AVNGNRLTINNVLLRDIRIQ topsharedbinders Top Scoring Binders 36 high 43
168 Rv3584 Mb3615 LTINNVLLRDIRIQAVQTSD topsharedbinders Top Scoring Binders 27 high 49
185 Rv3584 Mb3615 PGKAVDLVLVAVNQSPDVSD topsharedbinders Top Scoring Binders 9 low 72
proteins with most positives:
prots strategy product positives
peptides with highest scores:
locus_tag mbname peptide method strategy score confidence start

In [40]:
#cg = EpitopePipeline.getCategories(genome,plot=False)
cg = EpitopePipeline.getCategories(srm,plot=False)

cneg = EpitopePipeline.getCategories(neg,plot=False)
cpos = EpitopePipeline.getCategories(top,plot=False)

print len(neg), cneg.sum()

cats = pd.concat([cg,cpos,cneg],axis=1)
cats = cats/cats.sum()
cats.columns=(['genome (filtered)','negative','positive'])
ax=cats.plot(kind='barh',cmap='winter',alpha=0.8,width=.7,grid=False,figsize=(9,6))
plt.xlabel('fraction of total')
ax.set_title('host protein MTB category representation')
plt.tight_layout()
plt.savefig('vlaresults_cats.png')


260 260

In [17]:
#srm concentrations of positives?
c = top.merge(srm, on='locus_tag', how='left')
f,ax=plt.subplots(1,1,figsize=(6,3))
c.hist('concentration',bins=30,ax=ax,grid=False)
#ax.set_yscale('log')
#print c[c['concentration_y']>10][['locus_tag','concentration_y','score','product_y']]


Out[17]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f20c0bdab10>], dtype=object)

In [18]:
#clusters correlation with positives and no. binders..?
f=pd.read_csv('final_results_all.csv')
print f.columns
f.plot('score','length',kind='scatter',figsize=(3,3))

#tsb correlation with binders in each epitope R?..


Index([u'Unnamed: 0', u'#garethpos', u'#pbmcpos', u'#wbpos', u'1-log50k(aff)', u'7010', u'7447',
       u'7448', u'7449', u'7852', u'7853', u'7855', u'8740', u'8742', u'8743', u'8744', u'R',
       u'RVL', u'allele', u'binders', u'clustersize', u'concentration', u'core', u'end', u'gene',
       u'hydro', u'id', u'id_x', u'id_y', u'identity', u'iedbmhc1_11_overlap',
       u'iedbmhc1_9_overlap', u'length', u'locus_tag', u'mbname', u'mean', u'method',
       u'mhc1overlaps', u'name', u'nearest', u'order', u'overlap', u'pep_no', u'peplength',
       u'peptide', u'pos', u'product', u'rank', u'score', u'source', u'start', u'tag', u'tbname',
       u'translation', u'wbmean', u'well_no', u'confidence'],
      dtype='object')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f20c098bb10>

mhc1 overlaps and positives


In [78]:
f=pd.read_csv('final_results_all.csv')
#f['confidence'] = f.confidence.map({'negative':0,'low':1,'medium':2,'high':3})
f['confidence'] = pd.Categorical(f.confidence)
sns.set_context("notebook", font_scale=1.0)
#f = f[f.method.isin(['topsharedbinders'])]
#f['mhc1overlaps'] = f.filter(regex="iedbmhc1_").sum(axis=1)
f['overlap'] = (f.mhc1overlaps>=1).astype(int)

xx = f[-f.mhc1overlaps.isnull()]
cols=['peptide','method','iedbmhc1_9_overlap','iedbmhc1_11_overlap','overlap']
#print xx[cols][22:34]

print xx.groupby(['overlap','method']).size()

gg = xx.groupby(['confidence']).agg({'overlap': [np.sum,np.size]})
gg.columns = gg.columns.get_level_values(1)
gg['frac'] = gg['sum']/gg['size']
#print gg

#gg.plot(y='overlap',kind='barh',by='overlap',figsize=(3,3))
gg.plot(y='frac',kind='barh',figsize=(3,3))


#xx.plot('mhc1overlaps','confidence',kind='scatter',figsize=(2,2))
gg = xx.groupby(['confidence']).agg({'mhc1overlaps': [np.mean,np.std,np.size]})
gg.columns = gg.columns.get_level_values(1)
print gg
gg.plot(y='mean',xerr='std',kind='barh',figsize=(3,3))
plt.title('mhc overlaps')
sns.despine()
#xx.hist('mhc1overlaps',by='confidence')


overlap  method          
0        random binders      31
         topsharedbinders    46
1        cl_netmhciipan      88
         cl_tepitopepan      88
         random binders      63
         topsharedbinders    48
dtype: int64
                mean       std  size
confidence                          
negative    1.692308  1.523794   260
positive    2.125000  1.629975    56

distributions of BoLA alleles

references:

  • A. B. Dietz, N. D. Cohen, L. Timms, and M. E. Kehrli, “Bovine lymphocyte antigen class II alleles as risk factors for high somatic cell counts in milk of lactating dairy cows.,” J. Dairy Sci., vol. 80, no. 2, pp. 406–412, 1997.
  • J. Oprządek, P. Urtnowski, G. Sender, A. Pawlik, and M. Łukaszewicz, “Frequency of BoLA-DRB3 alleles in Polish Holstein-Friesian cattle,” Anim. Sci. Pap. Reports, vol. 30, no. 2, pp. 91–101, 2012.
  • R. Baxter, N. Hastings, a. Law, and E. J. Glass, “A rapid and robust sequence-based genotyping method for BoLA-DRB3 alleles in large numbers of heterozygous cattle,” Anim. Genet., vol. 39, no. 5, pp. 561–563, 2008.

In [24]:
#bola alleles plots
bd = pd.read_csv('bola_alleles_dietz.csv')
bd=bd.set_index('allele').sort('freq',ascending=False)
bd['name'] = 'dietz'

bo = pd.read_csv('bola_alleles_oprzadek.csv')
bo = bo.groupby('allele_short').agg({'freq':np.max})
bo=bo.sort('freq',ascending=False)
bo['name'] = 'oprzadek'

bx = pd.read_csv('bola_alleles_baxter.csv')
bx = bx.groupby('allele_short').agg({'freq':np.max})
bx=bx.sort('freq',ascending=False)
bx['name'] = 'baxter'

x=pd.concat([bd,bo,bx]).reset_index('allele')
x = x.pivot('index','name','freq')
x=x[x>0.005].dropna(how='all')
table = x[x>0.02].reindex_axis(x.mean(1).order(ascending=False).index).dropna(how='all').fillna('-')

sns.set_context("notebook", font_scale=1.3)
x.plot(y=['baxter','dietz','oprzadek'],kind='bar',figsize=(10,5),cmap='YlOrRd',grid=False,width=0.8)
plt.title('BoLA DRB3.2 allele frequencies')
plt.xlabel('allele')
plt.ylabel('frequency')
plt.legend(labels=['Holstein-Charolais','Holstein USA','Polish Holstein-Freisian'],loc=9)
sns.despine()
plt.tight_layout()
plt.savefig('bola_allelefreqs.png')
targetbola = x.index
table


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:8: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:13: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:19: FutureWarning: order is deprecated, use sort_values(...)
Out[24]:
name baxter dietz oprzadek
index
*24 - 0.1429 0.207
*08 - 0.141 0.137
*22 - 0.1429 0.1239
*27 0.2 0.0369 -
*11 0.16 0.0851 0.031
*16 0.1 0.1001 0.0752
*23 - 0.091 0.0819
*06 0.06 - -
*09 0.06 - -
*01 0.06 - -
*02 0.07 - 0.0288
*05 0.04 - -
*36 - - 0.0376
*12 0.06 0.0314 0.0208
*07 - 0.0532 -
*28 - - 0.0586
*03 - 0.0396 0.0255
*10 0.05 - 0.0221
*26 - 0.0232 -

In [29]:
reload(Tepitope)
d=Tepitope.datadir
boladrb = os.path.join(d,'IPD_MHC/bola.drb3.fa')
hladrb = os.path.join(d,'IPD_MHC/hla.drb345.fa')
ref = os.path.join(d,'IPD_MHC/hlaref.fa')
aln = Tepitope.drbaln
alnindex = dict([(a.id,a) for a in aln])
Tepitope.compareRef(hladrb,boladrb,ref,alnindex)