>HIV1_env_HXB2
MRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWKEATT
TLFCASDAKAYDTEVHNVWATHACVPTDPNPQEVVLVNVTENFNMWKNDM
VEQMHEDIISLWDQSLKPCVKLTPLCVSLKCTDLKNDTNTNSSSGRMIME
KGEIKNCSFNISTSIRGKVQKEYAFFYKLDIIPIDNDTTSYKLTSCNTSV
ITQACPKVSFEPIPIHYCAPAGFAILKCNNKTFNGTGPCTNVSTVQCTHG
IRPVVSTQLLLNGSLAEEEVVIRSVNFTDNAKTIIVQLNTSVEINCTRPN
NNTRKRIRIQRGPGRAFVTIGKIGNMRQAHCNISRAKWNNTLKQIASKLR
EQFGNNKTIIFKQSSGGDPEIVTHSFNCGGEFFYCNSTQLFNSTWFNSTW
STEGSNNTEGSDTITLPCRIKQIINMWQKVGKAMYAPPISGQIRCSSNIT
GLLLTRDGGNSNNESEIFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTK
AKRRVVQREKRAVGIGALFLGFLGAAGSTMGAASMTLTVQARQLLSGIVQ
QQNNLLRAIEAQQHLLQLTVWGIKQLQARILAVERYLKDQQLLGIWGCSG
KLICTTAVPWNASWSNKSLEQIWNHTTWMEWDREINNYTSLIHSLIEESQ
NQQEKNEQELLELDKWASLWNWFNITNWLWYIKLFIMIVGGLVGLRIVFA
VLSIVNRVRQGYSPLSFQTHLPTPRGPDRPEGIEEEGGERDRDRSIRLVN
GSLALIWDDLRSLCLFSYHRLRDLLLIVTRIVELLGRRGWEALKYWWNLL
QYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQGACRAIRHIPRRIRQG
LERILL
In [1]:
%pylab inline
import pandas as pd
from Bio import SeqIO
In [44]:
"""Load in single HXB2 sequence.
Note that DATA_PATH should point to a general data directory and within that there should be
an "epitope_hotspot" folder containing the file. Or you can supply the path to the file at any location."""
with open(DATA_PATH + 'epitope_hotspot/env_hxb2.fasta','r') as fileHandle:
"""We'll use seq as a string instead of a sequnece object that is returned by SeqIO"""
seq = str(SeqIO.read(fileHandle,'fasta').seq)
"""Remove gaps and the stop codon character at the end of the sequence"""
seq = seq.replace('-','').replace('*','')
print "HXB2 (%d): %s" % (len(seq),seq)
In [45]:
"""Load in the predicted log-IC50 values for each allele:peptide pair"""
bindingDf = pd.read_csv(DATA_PATH + 'epitope_hotspot/env_hxb2.9.netmhcpan.out')
"""Replace the * (asterisk) with _ (underscore) in each HLA allele so that they match those in the frequency file"""
bindingDf['HLA'] = bindingDf['HLA'].map(lambda s: s.replace('*','_'))
In [48]:
"""Make a dict out of the predictions for rapid and convenient access in the future.
In this example since we're only using the values once to make the 2D matrix, this step
may seem pointless, but it is very efficient and useful in other cases.
The key is a tuple of the HLA allele and the peptide, and the value is log-IC50
e.g. {('A_0201','MRVKEKYQH'):10.3}"""
ba = {(row['HLA'],row['Peptide']):row['Value'] for i,row in bindingDf.iterrows()}
for k in ba.keys()[:10]:
print k, ba[k]
In [49]:
"""Load HLA frequencies from a comma delimited file (CSV).
We'll use the allele column as an index for looking up frequencies."""
hlaFreq = pd.read_csv(DATA_PATH + 'epitope_hotspot/alleleFrequencies.csv',index_col='allele')
hlaFreq['Average'].head()
Out[49]:
In [52]:
"""Create a 2D matrix of binding affinities for the single sequence of shape [Npos x NHLAs]"""
uHLAs = unique(bindingDf.HLA)
k = 9
"""Initialize matrix to 15 (not a binder): any missing predictions will be counted as non-binders"""
baMat = 15 * ones((len(seq)-k+1,len(uHLAs)))
for starti in arange(len(seq)-k+1):
if not seq[starti] == '-':
"""If there is a gap character at this start position, it is not a valid kmer"""
"""Grab a "gapless" kmer by starting at position starti and taking k non-gap characters of the str"""
pep = seq[starti:].replace('-','')[:k]
for hlai, h in enumerate(uHLAs):
try:
"""In python we say "Better to ask forgiveness than permission."
This means we will try to get the prediction with key (h,pep)
but if it doesn't exist python will generate a KeyError.
Rather than halt the program we will "catch" this error,
print an error message and move on. """
baMat[starti,hlai] = ba[(h,pep)]
except KeyError:
print "Could not find prediction for HLA %s and peptide %s (giving it default level of log-IC50 = 15)" % (h,pep)
"""Instead of try/except we could do this (they have the same effect)"""
if ba.has_key((h,pep)):
baMat[starti,hlai] = ba[(h,pep)]
else:
print "Could not find prediction for HLA %s and peptide %s (giving it default level of log-IC50 = 15)" % (h,pep)
figure(figsize=(20,5))
pcolor(baMat.T,cmap='gray')
xlim((0,len(seq)))
ylim((0,len(uHLAs)))
ylabel('HLA alleles')
xlabel('9mer start position')
title('Plot of binding affinity (log-IC50)')
Out[52]:
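In [ ]:
"""A third alternative to the try/except and membership-test patterns in the cell above
(shown here purely for illustration, not part of the original analysis): dict.get accepts a
default value, so a missing (HLA, peptide) key simply falls back to the non-binder level of 15.
The peptide name examplePep is just a hypothetical example."""
examplePep = seq[:k]
for h in uHLAs[:3]:
    print h, examplePep, ba.get((h, examplePep), 15)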
In [53]:
"""Make a couple descriptive figures to make sure the data looks right."""
"""HLA A*0203 and A*0206 should be highly correlated because they are very similar alleles."""
figure(figsize=(15,5))
subplot(1,2,1)
scatter(baMat[:,uHLAs=='A_0203'],baMat[:,uHLAs=='A_0206'],alpha=0.3)
plot([1,12],[1,12],'--',lw=2,color='gray')
xlabel('A*0203 log-IC50')
ylabel('A*0206 log-IC50')
xlim((1,12))
ylim((1,12))
"""Whereas, HLA A*0203 and B*1501 should not bind the same peptides."""
subplot(1,2,2)
scatter(baMat[:,uHLAs=='A_0203'],baMat[:,uHLAs=='B_1501'],alpha=0.3)
plot([1,12],[1,12],'--',lw=2,color='gray')
xlabel('A*0203 log-IC50')
ylabel('B*1501 log-IC50')
xlim((1,12))
ylim((1,12))
Out[53]:
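In [ ]:
"""To put numbers on the claim above that A*0203 and A*0206 are highly correlated while
A*0203 and B*1501 are not, we can compute Pearson correlations (a small check added for
illustration; corrcoef comes from numpy via %pylab, and the variable names are ad hoc)."""
a0203 = baMat[:,uHLAs=='A_0203'].ravel()
a0206 = baMat[:,uHLAs=='A_0206'].ravel()
b1501 = baMat[:,uHLAs=='B_1501'].ravel()
print 'Correlation A*0203 vs A*0206: %1.2f' % corrcoef(a0203,a0206)[0,1]
print 'Correlation A*0203 vs B*1501: %1.2f' % corrcoef(a0203,b1501)[0,1]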
In [54]:
"""All binding affinities less than log(500) will be counted as "binders" """
binders = baMat < log(500)
figure(figsize=(20,5))
pcolor(binders.T,cmap='gray')
xlim((0,len(seq)))
ylim((0,len(uHLAs)))
ylabel('HLA alleles')
xlabel('9mer start position')
title('Plot of binders (True = 1 = white; False = 0 = black)')
Out[54]:
In [66]:
"""Compute percent of population binding each 9mer based on HLA allele frequencies."""
hfreq = array([hlaFreq.Average[h] for h in uHLAs])
hlaBindingFreq = binders * tile(hfreq[None,:],(binders.shape[0],1))
"""Need to compute phenotypic frequency separately for HLA-A and HLA-B alleles"""
locusFreq = zeros((2,binders.shape[0]))
for loci,locus in enumerate('AB'):
"""Index to slice only the alleles for each locus at a time"""
locusInd = [i for i,h in enumerate(uHLAs) if h[0]==locus]
tmpFreq = hlaBindingFreq[:,locusInd].sum(axis=1)
locusFreq[loci,:] = tmpFreq**2 + 2*tmpFreq*(1-tmpFreq)
"""Compute density based on the probability of having either an HLA-A or HLA-B allele that binds (or both)"""
hlaBindingDensity = locusFreq[0,:]*locusFreq[1,:] + locusFreq[0,:]*(1-locusFreq[1,:]) + locusFreq[1,:]*(1-locusFreq[0,:])
"""Convert to units of percent"""
hlaBindingDensity = 100*hlaBindingDensity
figure(figsize=(20,5))
plot(arange(len(hlaBindingDensity)),hlaBindingDensity,'-')
xlabel('9mer start position in HIV Env')
ylabel('Percent of population predicted to bind\n(log-IC50 < 500 nM')
Out[66]:
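In [ ]:
"""Sanity check of the phenotypic-frequency arithmetic above, on toy numbers (illustration only;
pA and pB are made-up frequencies). At a single locus with total binding-allele frequency p, the
Hardy-Weinberg chance of carrying at least one binding allele is p**2 + 2*p*(1-p), which equals
1-(1-p)**2; combining the two loci with pA*pB + pA*(1-pB) + pB*(1-pA) is likewise the same as
1-(1-pA)*(1-pB)."""
pA, pB = 0.3, 0.2
print pA**2 + 2*pA*(1-pA), 1-(1-pA)**2
print pA*pB + pA*(1-pB) + pB*(1-pA), 1-(1-pA)*(1-pB)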
In [ ]:
"""TODO: Compute a p-value for each 9mer and its HLA binding density,
under the null distribution that HLA binding is uniform"""
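In [ ]:
"""One possible way to approach the TODO above (a sketch only, not part of the original analysis):
permute each HLA allele's binder calls across start positions, recompute the binding density with
the same phenotypic-frequency arithmetic as above, and assign each 9mer an empirical p-value as
the fraction of null densities at least as large as the observed density. The helper name
bindingDensityFromBinders and nPerms are introduced here for illustration."""
def bindingDensityFromBinders(bindersMat):
    """Percent-of-population binding density from a boolean binder matrix of shape [Npos x NHLAs]"""
    freqMat = bindersMat * tile(hfreq[None,:],(bindersMat.shape[0],1))
    lf = zeros((2,bindersMat.shape[0]))
    for loci,locus in enumerate('AB'):
        ind = [i for i,h in enumerate(uHLAs) if h[0]==locus]
        tmp = freqMat[:,ind].sum(axis=1)
        lf[loci,:] = tmp**2 + 2*tmp*(1-tmp)
    return 100*(lf[0,:]*lf[1,:] + lf[0,:]*(1-lf[1,:]) + lf[1,:]*(1-lf[0,:]))

nPerms = 100
nullDensity = zeros((nPerms,binders.shape[0]))
for permi in arange(nPerms):
    """Shuffle each HLA's binder calls independently across positions"""
    shuffled = zeros(binders.shape,dtype=bool)
    for hlai in arange(binders.shape[1]):
        shuffled[:,hlai] = binders[permutation(binders.shape[0]),hlai]
    nullDensity[permi,:] = bindingDensityFromBinders(shuffled)
"""Empirical p-value for each 9mer start position, using the pooled null densities"""
pvalues = array([(nullDensity.ravel() >= obs).mean() for obs in hlaBindingDensity])
print pvalues[:10]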
In [68]:
"""Q: Is the V2 region of Env a HLA binding "hotspot"?"""
V2Region = (157,177)
V2Binding = hlaBindingDensity[V2Region[0]:V2Region[1]].mean()
print 'Env V2 Region (HXB2):',seq[V2Region[0]:V2Region[1]+k]
print 'Mean V2 binding: %1.1f%%' % (V2Binding)
print 'Mean overall Env binding: %1.1f%%' % (hlaBindingDensity.mean())
figure(figsize=(20,5))
fill_between(V2Region,[hlaBindingDensity.max()]*2,color='blue',alpha=0.4)
plot(arange(len(hlaBindingDensity)),hlaBindingDensity,'-')
xlabel('9mer start position in HIV Env')
ylabel('Percent of population predicted to bind\n(IC50 < 500 nM)')
Out[68]:
In [69]:
"""Compute a null distribution of mean binding values using random non-contiguous regions of the same length as V2"""
nSamps = 10000
nullBinding = zeros(nSamps)
for i in arange(nSamps):
inds = permutation(hlaBindingDensity.shape[0])[:(V2Region[1]-V2Region[0])]
nullBinding[i] = hlaBindingDensity[inds].mean()
In [70]:
"""Compute a null distribution of mean binding values using all contiguous regions of the same length as V2"""
contigBinding = []
for i in arange(hlaBindingDensity.shape[0]):
inds = i + arange(V2Region[1]-V2Region[0])
if max(inds)<hlaBindingDensity.shape[0] and (i < V2Region[0] or i > V2Region[1]):
contigBinding.append(hlaBindingDensity[inds].mean())
contigBinding = array(contigBinding)
In [71]:
"""Plot the two null distributions of mean percent binding and the observed percent binding in V2"""
figure(figsize=(15,5))
counts,edges=histogram(nullBinding,bins=50)
fill_between(edges[:-1],counts/counts.sum(),color='gray')
counts,edges=histogram(contigBinding,bins=15)
fill_between(edges[:-1],counts/counts.sum(),color='blue',alpha=0.5)
plot([V2Binding,V2Binding],[0,(counts/counts.sum()).max()],'-',lw=2,color='red')
xlabel('Mean percent binding in a V2 sized region')
Out[71]:
In [72]:
print 'Fraction of random non-contiguous regions with more binding than Env V2:',(nullBinding>V2Binding).mean()
print 'Fraction of all contiguous regions with more binding than Env V2:',(contigBinding>V2Binding).mean()
In [ ]:
"""OLD CODE FOR HANDLING MULTIPLE SEQUENCES"""
'''seqs = {}
with open(DATA_PATH + 'epitope_hotspot/env_hxb2.fasta','r') as fileHandle:
for record in SeqIO.parse(fileHandle,'fasta'):
seqs[record.name]=str(record.seq)
seqs = pd.Series(seqs)
print seqs.head()
seqs.map(len).describe()
"""Now create a 2D matrix of binding affinities for the single sequence of shape [Npos x NHLAs]"""
uHLAs = unique(bindingDf.HLA)
k = 9
"""Initialize matrix to 15 (not a binder)"""
baMat = 15 * ones((len(seqs),len(seqs.iloc[0]),len(uHLA)))
for seqi,seq in enumerate(seqs):
for starti in arange(len(seq)-k+1):
if not seq[starti] == '-':
pep = seq[starti:].replace('-','')[:k]
for hlai, h in enumerate(uHLAs):
try:
baMat[seqi,starti,hlai] = ba[(h,pep)]
except KeyError:
pass
binders = baMat < log(500)
hfreq = array([hlaFreq.Average[h.replace('*','_')] for h in uHLAs])
hfreq = hfreq**2 + 2*hfreq*(1-hfreq)
binders = binders * tile(hfreq[None,None,:],(binders.shape[0],binders.shape[1],1))
V2Region = (236,259)
for s in seqs.iloc[:10]:
print s[V2Region[0]:V2Region[1]+k]
V2Binding = binders[:,V2Region[0]:V2Region[1],:].mean()
bindingMap = binders.mean(axis=2).mean(axis=0)
figure(figsize=(20,5))
fill_between(V2Region,[bindingMap.max()]*2,color='blue',alpha=0.4)
plot(arange(len(bindingMap)),bindingMap,'-')
xlabel('9mer start position in HIV Env')
ylabel('Fraction of HLA alleles that bind\n(averaged over all sequences)')
'''