In [1]:
from Bio import Seq
from Bio import SeqIO
import pandas as pd
import numpy as np
import sys
import os
sys.path.append('/home/will/PySeqUtils/')
from GeneralSeqTools import fasta_reader, fasta_writer
from HIVAlignTools import SeqTransformer, build_aligners

In [2]:
import HIVAlignTools

In [3]:
HIVAlignTools.build_aligners()


gag
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   20.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed: 16.8min
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-3-52ec053a8004> in <module>()
----> 1 HIVAlignTools.build_aligners()

/home/will/PySeqUtils/HIVAlignTools.py in build_aligners(base_path, verbose)
    382              'gp41', 'v3']
    383     for prot in prots:
--> 384         print prot
    385         path = base_path+prot
    386         aligner = train_aligner(prot, path+'.fasta',

/home/will/PySeqUtils/HIVAlignTools.py in train_aligner(prot, path, train_type, test_size, n_jobs, verbose)
    363                       cv=cv,
    364                       n_jobs=n_jobs,
--> 365                       verbose=verbose,
    366                       refit=False)
    367     gd.fit(X, y)

/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.pyc in fit(self, X, y, **params)
    705                           " The params argument will be removed in 0.15.",
    706                           DeprecationWarning)
--> 707         return self._fit(X, y, ParameterGrid(self.param_grid))
    708 
    709 

/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
    491                     X, y, base_estimator, parameters, train, test,
    492                     self.scorer_, self.verbose, **self.fit_params)
--> 493                 for parameters in parameter_iterable
    494                 for train, test in cv)
    495 

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    515         try:
    516             for function, args, kwargs in iterable:
--> 517                 self.dispatch(function, args, kwargs)
    518 
    519             self.retrieve()

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.pyc in dispatch(self, func, args, kwargs)
    310         """
    311         if self._pool is None:
--> 312             job = ImmediateApply(func, args, kwargs)
    313             index = len(self._jobs)
    314             if not _verbosity_filter(index, self.verbose):

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, func, args, kwargs)
    134         # Don't delay the application, to avoid keeping the input
    135         # arguments in memory
--> 136         self.results = func(*args, **kwargs)
    137 
    138     def get(self):

/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.pyc in fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, verbose, loss_func, **fit_params)
    309             this_score = scorer(clf, X_test, y_test)
    310         else:
--> 311             this_score = clf.score(X_test, y_test)
    312     else:
    313         clf.fit(X_train, **fit_params)

/home/will/PySeqUtils/HIVAlignTools.py in score(self, X, y)
    292 
    293         empty_mask = y == 'XX'
--> 294         out_aligns = self.predict(X)
    295 
    296         pos_scores = score_seqs(y[~empty_mask], out_aligns[~empty_mask])

/home/will/PySeqUtils/HIVAlignTools.py in predict(self, X)
    272                                      max_intron_length=self.max_intron_length,
    273                                      num_threads=self.num_threads)
--> 274             stdout, stderr = blastx_cline()
    275 
    276         blast_records = NCBIXML.parse(StringIO(stdout))

/usr/local/lib/python2.7/dist-packages/Bio/Application/__init__.pyc in __call__(self, stdin, stdout, stderr, cwd, env)
    434                                          shell=(sys.platform!="win32"))
    435         #Use .communicate as can get deadlocks with .wait(), see Bug 2804
--> 436         stdout_str, stderr_str = child_process.communicate(stdin)
    437         if not stdout:
    438             assert not stdout_str

/usr/lib/python2.7/subprocess.pyc in communicate(self, input)
    752             return (stdout, stderr)
    753 
--> 754         return self._communicate(input)
    755 
    756 

/usr/lib/python2.7/subprocess.pyc in _communicate(self, input)
   1310 
   1311             if _has_poll:
-> 1312                 stdout, stderr = self._communicate_with_poll(input)
   1313             else:
   1314                 stdout, stderr = self._communicate_with_select(input)

/usr/lib/python2.7/subprocess.pyc in _communicate_with_poll(self, input)
   1364             while fd2file:
   1365                 try:
-> 1366                     ready = poller.poll()
   1367                 except select.error, e:
   1368                     if e.args[0] == errno.EINTR:

KeyboardInterrupt: 


In [11]:
import shlex
from subprocess import check_call

def score_seq(known, guess, gapopen=10, gapextend=1):
    
    cmd = 'needle -asequence %(cb)s -bsequence %(seq)s -aformat score -gapopen %(go)f -gapextend %(ge)s -outfile %(out)s'
    with NamedTemporaryFile() as conb_handle:
        fasta_writer(conb_handle, [('SeqA', known)])
        conb_handle.flush()
        os.fsync(conb_handle.fileno())
        with NamedTemporaryFile() as seq_handle:
            fasta_writer(seq_handle, [('Seq1', guess)])
            seq_handle.flush()
            os.fsync(seq_handle.fileno())
            with NamedTemporaryFile() as out_handle:
                param_dict = {
                              'cb':conb_handle.name,
                              'seq':seq_handle.name,
                              'out':out_handle.name,
                              'go':gapopen,
                              'ge':gapextend
                              }
                cmd_list = shlex.split(cmd % param_dict)
                check_call(cmd_list)
                for line in out_handle:
                    parts = line.split()
                    if (len(parts) == 4):
                        return float(parts[-1][1:-2])
    


def score_seqs(known_seqs, guess_seqs, gapopen=10, gapextend=1):
    
    score = 0.0
    for ind in range(known_seqs.shape[0]):
        score += score_seq(known_seqs[ind], guess_seqs[ind],
                           gapopen=gapopen, gapextend=gapextend)
    return score

In [12]:
from sklearn.base import BaseEstimator, ClusterMixin
from tempfile import NamedTemporaryFile
from Bio.SubsMat import MatrixInfo as matlist
from Bio.Blast import NCBIXML
from StringIO import StringIO
from Bio.Blast.Applications import NcbiblastxCommandline, NcbiblastnCommandline


class BlastAligner(BaseEstimator, ClusterMixin):
    
    def __init__(self, evalue=10, word_size=2, gapopen=11, gapextend=1, 
                 max_intron_length = 20, tmp_path = '/tmp/', result_type = 'aa',
                 db_path=NamedTemporaryFile(suffix='.fasta').name, num_threads=1):
        self.evalue = evalue
        self.word_size = word_size
        self.gapopen = gapopen
        self.gapextend = gapextend
        self.max_intron_length = max_intron_length
        self.tmp_path = tmp_path
        self.result_type = result_type
        self.db_path = db_path
        self.num_threads = num_threads
    
    def _write_seqs(self, X, handle):
        
        seqs = []
        for row in range(X.shape[0]):
            seq = ''.join(X[row])
            seqs.append(('Seq-%03i' % row, ''.join(l for l in seq if l.isalpha())))
            
        fasta_writer(handle, seqs)
        handle.flush()
        os.fsync(handle.fileno())
    
    
    def fit(self, X, y):
        
        
        empty_mask = y == 'XX'
        
        with open(self.db_path, 'w') as handle:
            self._write_seqs(y[~empty_mask], handle)
        cmd = 'makeblastdb -in %s -dbtype ' % self.db_path
        if self.result_type == 'aa':
            cmd += 'prot'
        else:
            cmd += 'nucl'
        
        check_call(shlex.split(cmd))
        
        
        return self
   
    
    def predict(self, X):
        
        if self.result_type == 'aa':
            blast_cmd = NcbiblastxCommandline
        else:
            blast_cmd = NcbiblastnCommandline
        
        
        with NamedTemporaryFile(dir=self.tmp_path, delete=True) as fasta_handle:    
            self._write_seqs(X, fasta_handle)
            blastx_cline = blast_cmd(query=fasta_handle.name,
                                     db = self.db_path, outfmt=5, 
                                     out = '-',
                                     evalue=self.evalue,
                                     word_size=self.word_size,
                                     gapopen=self.gapopen,
                                     gapextend=self.gapextend,
                                     max_intron_length=self.max_intron_length,
                                     num_threads=self.num_threads)
            stdout, stderr = blastx_cline()
        
        blast_records = NCBIXML.parse(StringIO(stdout))
        seqs = []
        names = []
        prots = []
        for rec in blast_records:
            for align in rec.alignments:
                hsp = align.hsps[0]
                prots.append({
                              'ID':rec.query,
                              'Seq':hsp.query
                              })
        blast_out = pd.DataFrame(prots).groupby('ID')['Seq'].first()
        wanted_out = pd.DataFrame({
                                   'ID':['Seq-%03i' % i for i in range(X.shape[0])],
                                   'want_seq':[True]*X.shape[0],
                                   }).groupby('ID')['want_seq'].first()
        out, _ = blast_out.align(wanted_out, join='right')
        
        return SeqTransformer().transform(out.fillna('XX').values)
    
    def score(self, X, y):
        
        empty_mask = y == 'XX'
        out_aligns = self.predict(X)
        
        pos_scores = score_seqs(y[~empty_mask], out_aligns[~empty_mask])
        bad_scores = score_seqs(out_aligns[empty_mask], out_aligns[empty_mask])
        return (pos_scores - bad_scores)/y.shape[0]

In [13]:
neg_controls = {
                'env':['gag', 'pol', 'vif', 'vpr', 'ltr'],
                'gag':['ltr', 'vif', 'vpr', 'vpu', 'tat', 'rev', 'env'],
                #'ltr':['gag', 'pol', 'vpr', 'vpu', 'env'],
                'nef':['pol', 'gag', 'vpu', 'tat'],
                'pol':['env', 'vpr', 'vpu', 'nef', 'rev', 'ltr'],
                'rev':['ltr', 'gag', 'pol', 'vif', 'nef'],
                'tat':['ltr', 'pol', 'vif', 'nef'],
                'vif':['ltr', 'tat', 'vpu', 'rev', 'env', 'nef'],
                'vpr':['ltr', 'gag', 'pol', 'rev', 'env', 'nef'],
                'vpu':['ltr', 'gag', 'pol', 'vif', 'vpr', 'nef'],
                }

In [5]:
def get_seq(prot_name, typ):
    trans_path = '/home/will/PySeqUtils/TransToolStuff/'
    tmp = 'HIV1_ALL_2012_%s_%s.fasta' % (prot_name.lower(), typ.upper())
    with open(trans_path + tmp) as handle:
        return SeqTransformer.get_from_fasta_handle(handle)


pos_names, pos_X = get_seq('genome', 'DNA')
env_names, env_y = get_seq('env', 'pro')
neg_names = []
neg_X = None
for neg_prot in neg_controls['env']:
    tnames, tx = get_seq(neg_prot, 'DNA')
    neg_names += tnames
    if neg_X is None:
        neg_X = tx.copy()
    else:
        neg_X = np.concatenate((neg_X, tx))

In [6]:
pos_X_ser = pd.Series(pos_X, index=pos_names)
env_y_ser = pd.Series(env_y, index=env_names)

X_ser, y_ser = pos_X_ser.align(env_y_ser, join='inner')
X = X_ser.values
y = y_ser.values
in_env = set(env_names)
neg_inds = [num for num, name in enumerate(neg_names) if name not in in_env]
wneg_X = neg_X[neg_inds]
wneg_y = np.array(['XX']*wneg_X.shape[0])

print X.shape, y.shape, wneg_X.shape, wneg_y.shape


(1984,) (1984,) (6782,) (6782,)

In [7]:
Xall = np.concatenate((X, wneg_X))
yall = np.concatenate((y, wneg_y))

yclass = yall == 'XX'

In [17]:
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score, StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

param_dict = {'evalue':np.logspace(-100, 1, 20)}

cv = StratifiedShuffleSplit(yclass, n_iter=3, test_size=500, train_size=100)
aligner = BlastAligner(num_threads=100)
gd = GridSearchCV(aligner, param_dict, refit=False, cv=cv, verbose=5)
gd.fit(Xall, yall)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[GridSearchCV] evalue=1e-100 ...................................................
[GridSearchCV] ........................ evalue=1e-100, score=928.680000 -   3.4s
[GridSearchCV] evalue=1e-100 ...................................................
[GridSearchCV] ........................ evalue=1e-100, score=931.140000 -   3.4s
[GridSearchCV] evalue=1e-100 ...................................................
[GridSearchCV] ........................ evalue=1e-100, score=919.860000 -   3.5s
[GridSearchCV] evalue=2.06913808111e-95 ........................................
[GridSearchCV] ............. evalue=2.06913808111e-95, score=922.960000 -   3.4s
[GridSearchCV] evalue=2.06913808111e-95 ........................................
[GridSearchCV] ............. evalue=2.06913808111e-95, score=934.160000 -   3.5s
[GridSearchCV] evalue=2.06913808111e-95 ........................................
[GridSearchCV] ............. evalue=2.06913808111e-95, score=916.580000 -   3.4s
[GridSearchCV] evalue=4.28133239872e-90 ........................................
[GridSearchCV] ............. evalue=4.28133239872e-90, score=935.020000 -   3.5s
[GridSearchCV] evalue=4.28133239872e-90 ........................................
[GridSearchCV] ............. evalue=4.28133239872e-90, score=930.240000 -   3.4s
[GridSearchCV] evalue=4.28133239872e-90 ........................................
[GridSearchCV] ............. evalue=4.28133239872e-90, score=920.000000 -   3.5s
[GridSearchCV] evalue=8.8586679041e-85 .........................................
[GridSearchCV] .............. evalue=8.8586679041e-85, score=916.820000 -   3.4s
[GridSearchCV] evalue=8.8586679041e-85 .........................................
[GridSearchCV] .............. evalue=8.8586679041e-85, score=940.280000 -   3.4s
[GridSearchCV] evalue=8.8586679041e-85 .........................................
[GridSearchCV] .............. evalue=8.8586679041e-85, score=914.180000 -   3.5s
[GridSearchCV] evalue=1.83298071083e-79 ........................................
[GridSearchCV] ............. evalue=1.83298071083e-79, score=927.200000 -   3.3s
[GridSearchCV] evalue=1.83298071083e-79 ........................................
[GridSearchCV] ............. evalue=1.83298071083e-79, score=874.660000 -   3.4s
[GridSearchCV] evalue=1.83298071083e-79 ........................................
[GridSearchCV] ............. evalue=1.83298071083e-79, score=894.420000 -   3.6s
[GridSearchCV] evalue=3.79269019073e-74 ........................................
[GridSearchCV] ............. evalue=3.79269019073e-74, score=933.620000 -   3.4s
[GridSearchCV] evalue=3.79269019073e-74 ........................................
[GridSearchCV] ............. evalue=3.79269019073e-74, score=897.940000 -   3.4s
[GridSearchCV] evalue=3.79269019073e-74 ........................................
[GridSearchCV] ............. evalue=3.79269019073e-74, score=940.380000 -   3.5s
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  1.0min
[GridSearchCV] evalue=7.84759970351e-69 ........................................
[GridSearchCV] ............. evalue=7.84759970351e-69, score=881.360000 -   3.4s
[GridSearchCV] evalue=7.84759970351e-69 ........................................
[GridSearchCV] ............. evalue=7.84759970351e-69, score=921.060000 -   3.6s
[GridSearchCV] evalue=7.84759970351e-69 ........................................
[GridSearchCV] ............. evalue=7.84759970351e-69, score=939.420000 -   3.4s
[GridSearchCV] evalue=1.62377673919e-63 ........................................
[GridSearchCV] ............. evalue=1.62377673919e-63, score=935.060000 -   3.5s
[GridSearchCV] evalue=1.62377673919e-63 ........................................
[GridSearchCV] ............. evalue=1.62377673919e-63, score=895.600000 -   3.5s
[GridSearchCV] evalue=1.62377673919e-63 ........................................
[GridSearchCV] ............. evalue=1.62377673919e-63, score=932.520000 -   3.4s
[GridSearchCV] evalue=3.35981828628e-58 ........................................
[GridSearchCV] ............. evalue=3.35981828628e-58, score=922.300000 -   3.5s
[GridSearchCV] evalue=3.35981828628e-58 ........................................
[GridSearchCV] ............. evalue=3.35981828628e-58, score=928.860000 -   3.4s
[GridSearchCV] evalue=3.35981828628e-58 ........................................
[GridSearchCV] ............. evalue=3.35981828628e-58, score=903.040000 -   3.5s
[GridSearchCV] evalue=6.95192796178e-53 ........................................
[GridSearchCV] ............. evalue=6.95192796178e-53, score=922.240000 -   3.5s
[GridSearchCV] evalue=6.95192796178e-53 ........................................
[GridSearchCV] ............. evalue=6.95192796178e-53, score=932.340000 -   3.4s
[GridSearchCV] evalue=6.95192796178e-53 ........................................
[GridSearchCV] ............. evalue=6.95192796178e-53, score=882.340000 -   3.4s
[GridSearchCV] evalue=1.43844988829e-47 ........................................
[GridSearchCV] ............. evalue=1.43844988829e-47, score=884.720000 -   3.4s
[GridSearchCV] evalue=1.43844988829e-47 ........................................
[GridSearchCV] ............. evalue=1.43844988829e-47, score=896.180000 -   3.5s
[GridSearchCV] evalue=1.43844988829e-47 ........................................
[GridSearchCV] ............. evalue=1.43844988829e-47, score=922.260000 -   3.4s
[GridSearchCV] evalue=2.97635144163e-42 ........................................
[GridSearchCV] ............. evalue=2.97635144163e-42, score=914.900000 -   3.5s
[GridSearchCV] evalue=2.97635144163e-42 ........................................
[GridSearchCV] ............. evalue=2.97635144163e-42, score=914.460000 -   3.5s
[GridSearchCV] evalue=2.97635144163e-42 ........................................
[GridSearchCV] ............. evalue=2.97635144163e-42, score=934.300000 -   3.4s
[GridSearchCV] evalue=6.15848211066e-37 ........................................
[GridSearchCV] ............. evalue=6.15848211066e-37, score=891.680000 -   3.4s
[GridSearchCV] evalue=6.15848211066e-37 ........................................
[GridSearchCV] ............. evalue=6.15848211066e-37, score=943.400000 -   3.5s
[GridSearchCV] evalue=6.15848211066e-37 ........................................
[GridSearchCV] ............. evalue=6.15848211066e-37, score=930.080000 -   3.4s
[GridSearchCV] evalue=1.2742749857e-31 .........................................
[GridSearchCV] .............. evalue=1.2742749857e-31, score=922.320000 -   3.4s
[GridSearchCV] evalue=1.2742749857e-31 .........................................
[GridSearchCV] .............. evalue=1.2742749857e-31, score=877.680000 -   3.5s
[GridSearchCV] evalue=1.2742749857e-31 .........................................
[GridSearchCV] .............. evalue=1.2742749857e-31, score=907.720000 -   3.4s
[GridSearchCV] evalue=2.63665089873e-26 ........................................
[GridSearchCV] ............. evalue=2.63665089873e-26, score=923.040000 -   3.4s
[GridSearchCV] evalue=2.63665089873e-26 ........................................
[GridSearchCV] ............. evalue=2.63665089873e-26, score=925.020000 -   3.5s
[GridSearchCV] evalue=2.63665089873e-26 ........................................
[GridSearchCV] ............. evalue=2.63665089873e-26, score=939.060000 -   3.4s
[GridSearchCV] evalue=5.45559478117e-21 ........................................
[GridSearchCV] ............. evalue=5.45559478117e-21, score=892.540000 -   3.4s
[GridSearchCV] evalue=5.45559478117e-21 ........................................
[GridSearchCV] ............. evalue=5.45559478117e-21, score=923.700000 -   3.5s
[GridSearchCV] evalue=5.45559478117e-21 ........................................
[GridSearchCV] ............. evalue=5.45559478117e-21, score=934.220000 -   3.4s
[GridSearchCV] evalue=1.12883789168e-15 ........................................
[GridSearchCV] ............. evalue=1.12883789168e-15, score=951.900000 -   3.4s
[GridSearchCV] evalue=1.12883789168e-15 ........................................
[GridSearchCV] ............. evalue=1.12883789168e-15, score=946.220000 -   3.5s
[GridSearchCV] evalue=1.12883789168e-15 ........................................
[GridSearchCV] ............. evalue=1.12883789168e-15, score=902.500000 -   3.4s
[GridSearchCV] evalue=2.33572146909e-10 ........................................
[GridSearchCV] ............. evalue=2.33572146909e-10, score=938.820000 -   3.4s
[GridSearchCV] evalue=2.33572146909e-10 ........................................
[GridSearchCV] ............. evalue=2.33572146909e-10, score=893.400000 -   3.4s
[GridSearchCV] evalue=2.33572146909e-10 ........................................
[GridSearchCV] ............. evalue=2.33572146909e-10, score=917.560000 -   3.4s
[GridSearchCV] evalue=4.83293023857e-05 ........................................
[GridSearchCV] ............. evalue=4.83293023857e-05, score=920.520000 -   3.4s
[GridSearchCV] evalue=4.83293023857e-05 ........................................
[GridSearchCV] ............. evalue=4.83293023857e-05, score=942.780000 -   3.3s
[GridSearchCV] evalue=4.83293023857e-05 ........................................
[GridSearchCV] ............. evalue=4.83293023857e-05, score=901.340000 -   3.5s
[GridSearchCV] evalue=10.0 .....................................................
[GridSearchCV] .......................... evalue=10.0, score=753.180000 -   3.9s
[GridSearchCV] evalue=10.0 .....................................................
[GridSearchCV] .......................... evalue=10.0, score=729.080000 -   3.9s
[GridSearchCV] evalue=10.0 .....................................................
[GridSearchCV] .......................... evalue=10.0, score=731.180000 -   3.8s
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  3.5min finished
Out[17]:
GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True  True], n_iter=3, test_size=50, indices=True, random_state=None),
       estimator=BlastAligner(db_path='/tmp/tmpjDibUz.fasta', evalue=10, gapextend=1,
       gapopen=11, max_intron_length=20, num_threads=100, result_type='aa',
       tmp_path='/tmp/', word_size=2),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'evalue': array([  1.00000e-100,   2.06914e-095,   4.28133e-090,   8.85867e-085,
         1.83298e-079,   3.79269e-074,   7.84760e-069,   1.62378e-063,
         3.35982e-058,   6.95193e-053,   1.43845e-047,   2.97635e-042,
         6.15848e-037,   1.27427e-031,   2.63665e-026,   5.45559e-021,
         1.12884e-015,   2.33572e-010,   4.83293e-005,   1.00000e+001])},
       pre_dispatch='2*n_jobs', refit=False, score_func=None, scoring=None,
       verbose=5)

In [ ]:
import

In [81]:



Out[81]:
'MRVKGIRKNYQHLWRWGTMLLGMLMICSAAEKLWVTVYYGVPVWKEATTTLFCASDAKAYDTEVHNVWATHACVPTDPNPQEVVLENVTENFNMWKNNMVEQMHEDIISLWDQSLKPCVKLTPLCVTLNCTDLMNATNTNTTIIYRWRGEIKNCSFNITTSIRDKVQKEYALFYKLDVVPIDNDNTSYRLISCNTSVITQACPKVSFEPIPIHYCAPAGFAILKCNDKKFNGTGPCTNVSTVQCTHGIRPVVSTQLLLNGSLAEEEVVIRSENFTDNAKTIIVQLNESVEINCTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHCNISRAKWNNTLKQIVKKLREQFGNKTIVFNQSSGGDPEIVMHSFNCGGEFFYCNTTQLFNSTWNGTWNNTEGNITLPCRIKQIINMWQEVGKAMYAPPIRGQIRCSSNITGLLLTRDGGNNETEIFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTKAKRRVVQREKRAVGIGAMFLGFLGAAGSTMGAASMTLTVQARQLLSGIVQQQNNLLRAIEAQQHLLQLTVWGIKQLQARVLAVERYLKDQQLLGIWGCSGKLICTTAVPWNASWSNKSLDEIWDNMTWMEWEREIDNYTSLIYTLIEESQNQQEKNEQELLELDKWASLWNWFDITNWLWYIKIFIMIVGGLVGLRIVFAVLSIVNRVRQGYSPLSFQTRLPAPRGPDRPEGIEEEGGERDRDRSGRLVDGFLALIWDDLRSLCLFSYHRLRDLLLIVTRIVELLGRRGWEVLKYWWNLLQYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQRACRAILHIPRRIRQGLERALL'

In [ ]: