Mapping Multi-Chain Ab Structure to Amino Acid Sequence


  1. Align to PDB Sequence
  2. Highlight Corresponding Positions

In [1]:
import sys
import os
import nglview as ngl
from Bio.PDB import *
from Bio.Seq import Seq
from Bio import pairwise2 # pairwise alignment

In [2]:
# Globals
# PDB_ID: identifier of the structure; it is handed to the NGL viewer below
#         and to PDBList.retrieve_pdb_file in the download cell.
# DIR:    directory name passed as `pdir` when downloading, and used to
#         locate the downloaded .cif file when parsing.
PDB_ID = '4hj0'
DIR = 'PDB_Struct'

# Structure Viewer from PDB_ID(NGL)
# NOTE(review): show_pdbid presumably fetches the entry over the network -
# confirm before running offline.
view = ngl.show_pdbid(PDB_ID)
# Bare last expression so the notebook renders the widget as the cell output.
view


The installed widget Javascript is the wrong version.

In [4]:
# Download the structure from the PDB into DIR.
# The format is requested explicitly (mmCif) so the call does not depend on
# the library's changing default and does not emit the default-format warning.
pdbl = PDBList()
# retrieve_pdb_file returns the path of the local file. Use it instead of
# rebuilding '<DIR>/<PDB_ID>.cif' by hand: PDBList names the file from the
# lower-cased code, so a hand-built path breaks for an upper-case PDB_ID on
# case-sensitive filesystems.
cif_path = pdbl.retrieve_pdb_file(PDB_ID, pdir=DIR, file_format='mmCif')

# Parse the structure.
# `p` is not used in this cell; it is kept only so that any later cell that
# refers to it keeps working.
p = PDBParser()
parser = MMCIFParser()
structure = parser.get_structure('STRUCT_OBJ', cif_path)


WARNING: The default download format has changed from PDB to PDBx/mmCif
Structure exists: 'PDB_Struct/4hj0.cif' 

In [5]:
# Build the polypeptides found in the parsed structure, print each one, and
# collect them (as plain strings) in complete_aa_seq.
ppb = PPBuilder()
complete_aa_seq = []
for pp in ppb.build_peptides(structure):
    seq = pp.get_sequence()
    print(seq)
    complete_aa_seq.append(str(seq))

    # Only peptides of at least 55 residues get the extra summary line.
    if len(seq) < 55:
        continue

    # Summary line: length, first residue, and the residues at indices 50..56.
    print('{}{}: {} -> {}'.format(' ' * 8, len(seq), seq[0], str(seq[50:57])))

# Separator (75 '=' characters), then every peptide joined end to end.
print('=' * 75)
print('{}{}'.format('Full PDB AA SEQ: \n\n', ''.join(complete_aa_seq)))


TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPE
        92: T -> RASCPWY
TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENP
        91: T -> RASCPWY
QLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPS
        131: Q -> PTFGTAN
GTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSS
TQTYICNVNHKPSNTKVDKRV
SYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHE
        201: S -> SNNQRPS
VEKTVAPTE
SYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEG
        202: S -> SNNQRPS
VEKTVAPTE
QLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAP
        130: Q -> PTFGTAN
GTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSS
TQTYICNVNHKPSNTKVDKRV
===========================================================================
Full PDB AA SEQ: 

TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV

In [6]:
# Align Sequences
# ref_seq is the reference amino-acid sequence; pdb_seq is the sequence taken
# from the PDB structure (here both literals are identical).
ref_seq = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPE'
pdb_seq = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPE'

# Output file for the aligned pair. It is relative to the notebook's working
# directory so the notebook runs on any machine; the previous version wrote
# to a hard-coded absolute path under one user's home directory.
ALIGNED_OUT_PATH = 'aligned_out.txt'

# Local alignment; the four numbers are, in order: match score, mismatch
# score, gap-open penalty, gap-extend penalty.
align = pairwise2.align.localms(ref_seq, pdb_seq, 2, -1, -.5, -.1)
if not align:
    # No alignment found: fail with a clear message instead of an IndexError.
    raise ValueError('No local alignment found between ref_seq and pdb_seq')

# Take only the two aligned sequences of the first alignment. Slicing avoids
# unpacking into `_`, which would shadow IPython's last-output variable.
ref_seq_a, pdb_seq_a = align[0][:2]

print(ref_seq_a)

# Write the aligned reference, a blank line, then the aligned PDB sequence.
with open(ALIGNED_OUT_PATH, 'w') as out_f:
    out_f.write(ref_seq_a)
    out_f.write('\n\n')
    out_f.write(pdb_seq_a)


TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPE