In [6]:
import numpy as np
import pandas as pd
pd.set_option('display.width', 100)
pd.set_option('max_colwidth', 80)
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)
from IPython.display import display, HTML
import epitopepredict as ep
from epitopepredict import base, sequtils, analysis
In [7]:
genbankfile = '../testing/zaire-ebolavirus.gb'
zaire = sequtils.genbank_to_dataframe(genbankfile, cds=True)
savepath = 'test_iedbmhc1'
Pi = ep.get_predictor('iedbmhc1')
alleles = ep.mhc1_supertypes
Pi.predictProteins(zaire,length=11,alleles=alleles,path=savepath,overwrite=False)
Pi.load(path=savepath)
One application of immunoinformatics is to screen out likely candidate antigens from the genome for further study. To do this using epitope prediction requires filtering the many potential binders in each protein. There is no theoretical basis for doing this.
Epitope clustering has been previously observed to be an indicator of T cell epitope regions. The findClusters method in the analysis module allows automatic cluster detection from a set of predicted binders from one or more proteins. It can be done for a whole genome.
The result is a table of sequence regions with the number of binders and density of epitope cluster.
In [8]:
#find clusters of binders in these results
pb = Pi.promiscuousBinders(cutoff=5,n=2)
cl = analysis.find_clusters(pb, dist=10, min_size=3, genome=zaire)
display(cl[:20])
In [12]:
name = 'ZEBOVgp6'
#ep.plotting.plot_multiple([Pi], name, regions=cl, n=2)
In [13]:
Pn = ep.get_predictor('netmhciipan')
savepath2='test_netmhciipan'
alleles2 = ep.mhc2_supertypes
Pn.predictProteins(zaire,length=11,alleles=alleles2,path=savepath2,overwrite=False)
Pn.load(path=savepath2)
cl = analysis.get_overlaps(cl,Pn.promiscuousBinders(n=2,cutoff=5),label='mhc2_ovlp')
#plot both sets of binders and overlay region of cluster in previous data
ax = ep.plot_tracks([Pi,Pn],name=name,legend=True,figsize=(14,4),n=2)
r = cl[cl.name==name]
print r
coords = (list(r.start),list(r.end-r.start))
coords = zip(*coords)
ep.plot_regions(coords, ax, color='gray')
In [14]:
reload(ep.base)
reload(analysis)
reload(sequtils)
name = 'ZEBOVgp2'
proteinseq = zaire[zaire.locus_tag==name].translation.iloc[0]
#print proteinseq
#print pb
seqs = pb.peptide
#provide a list of seqs (e.g. strains)
filename='ebola_VP35.fa'
r = sequtils.fetch_protein_sequences('Filovirus[Orgn] AND VP35[Gene]', filename=filename)
#align fasta sequences
aln = sequtils.muscle_alignment(filename)
#sequtils.showAlignment(aln)
alnrows = sequtils.alignment_to_dataframe(aln)
#print (sequtils.formatAlignment(aln))
#print alnrows[:25][['accession','definition','perc_ident']]
c = analysis.epitope_conservation(seqs, alnrows=alnrows)
Blast the protein sequence locally to get an alignment with any orthologs, then perform the analysis as above. In this case we have created a blast database of viral refseq proteins locally. Online blast is generally too slow for such an analysis, especially if we want to examine epitopes in many proteins.
In [19]:
#localdb must be present in your file system, in the case it's in a folder called db
localdb = '../db/viral_refseq'
blr = analysis.get_orthologs(proteinseq, db=localdb)
alnrows, aln = analysis.align_blast_results(blr)
alnrows.to_csv('%s_aligned.csv' %name)
print alnrows
#now we run the same analysis using the set of refseq orthologs
c = analysis.epitope_conservation(seqs, alnrows=alnrows)
In [ ]: