In [1]:
import sys
import os
from collections import OrderedDict, defaultdict
import pprint
import nglview as ngl
from Bio.PDB import *
from Bio.Seq import Seq
from Bio import pairwise2 # pairwise alignment
pretty = pprint.PrettyPrinter(indent=2)
In [2]:
# Globals
PDB_FILE = '4hj0.pdb'
DIR = 'PDB_Struct'
# Structure Viewer from PDB FILE(NGL)
view = ngl.show_structure_file(os.path.join(DIR, PDB_FILE))
view
In [3]:
# download structure from PDB
# pdbl = PDBList()
# pdbl.retrieve_pdb_file(PDB_ID, pdir=DIR)
# parse the structure
p = PDBParser()
parser = MMCIFParser()
structure = p.get_structure('STRUCT_OBJ', os.path.join(DIR, PDB_FILE))
chains = structure.get_chains()
residue_list = []
for chain in chains:
residues = chain.get_residues()
ress = []
for residue in residues:
ress.append(residue.get_resname())
residue_list.append(ress)
print(len(residue_list[0]))
print(residue_list)
In [4]:
# Extract Polypeptides from a Structure Object:
import pdb
ppb = PPBuilder()
complete_aa_seq = []
complete_aa_chain_map = []
chain_lengths = []
for pp in ppb.build_peptides(structure):
seq = pp.get_sequence()
complete_aa_seq.append(str(seq))
chain_lengths.append(len(seq))
if len(pp.get_sequence()) >= 55:
print (' ' + str(len(seq)) + ': ' + seq[0] + ' -> ' + seq[50:57])
print('=====' * 15)
print('Full PDB AA SEQ: \n\n' + ''.join(complete_aa_seq))
print(chain_lengths)
In [5]:
# Align PDB to Reference
ref_seq = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
pdb_seq = 'GELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCW'
align = pairwise2.align.localms(ref_seq, pdb_seq, 2, -1, -.5, -.1)
ref_seq_a, pdb_seq_a, _, _, _ = align[0]
print(ref_seq_a)
print('\n')
print(pdb_seq_a)
with open('aligned_out.txt', 'w') as out_f:
out_f.write('>ref_group_1\n')
out_f.write(ref_seq_a)
out_f.write('\n\n')
out_f.write('>pdb_seq_2128-1\n')
out_f.write(pdb_seq_a)
In [ ]:
# Align Clone to Reference
clone_seq_1 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
align = pairwise2.align.localms(ref_seq, clone_seq_1, 2, -1, -.5, -.1)
ref_seq_a, clone_seq_1, _, _, _ = align[0]
print(ref_seq_a)
print('\n')
print(clone_seq_1)
with open('aligned_out.txt', 'w') as out_f:
out_f.write(ref_seq_a)
out_f.write('\n\n')
out_f.write(clone_seq_1)
In [ ]:
# Multiple Alignment
## Create a new fasta file
fasta_div = '>'
seq_1 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_2 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_3 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_4 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_5 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_6 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq_7 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seqs = OrderedDict([
('pdb_seq', pdb_seq),
('ref_seq', ref_seq),
('2130', seq_1),
('2131', seq_2),
('2133', seq_3),
('2699', seq_4),
('2701', seq_5),
('2703', seq_6),
('2704', seq_7)])
seqs_keys = seqs.keys()
# build the fasta file
with open('create.fasta', 'w') as create_fasta:
for seq_id, seq in seqs.items():
create_fasta.write('{}{}\n'.format(fasta_div, seq_id))
create_fasta.write(seq)
create_fasta.write('\n')
## Multipe align the file
clustal_out = 'clustal_out.fasta'
from Bio.Align.Applications import ClustalwCommandline
clustalw_cline = ClustalwCommandline('/usr/local/Cellar/clustal-w/2.1/bin/clustalw2',
infile='create.fasta', outfile=clustal_out)
print(clustalw_cline)
stdout, stderr = clustalw_cline()
# print(stdout, '\n', stderr)
## Let's read seqs in a nicer format
from Bio import AlignIO
align = AlignIO.read(clustal_out, "clustal")
for record in align:
seqs[record.id] = str(record.seq)
req_seq_aligned = seqs['ref_seq']
pdb_seq_aligned = seqs['pdb_seq']
map_pdb_to_ref = OrderedDict()
pdb_counter = 1
chain_name = 'L'
for i, aa in enumerate(pdb_seq_aligned):
if aa != '-':
map_pdb_to_ref['{}.{}'.format(pdb_counter, chain_name)] = [aa, req_seq_aligned[i], i+1]
if pdb_counter == 107 and chain_name == 'L':
pdb_counter = 0
chain_name = 'H'
pdb_counter += 1
# pretty.pprint(map_pdb_to_ref)
import json
print json.dumps(map_pdb_to_ref, indent=4)
In [ ]:
# Local Alignment
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum62
from Bio.Seq import Seq
seq1 = 'TAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRV'
seq2 = 'ATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPETAGELYQRWERYRRECQETLAAAEPPSGLACNGSFDMYVCWDYAAPNATARASCPWYLPWHHHVAAGFVLRQCGSDGQWGLWRDHTQCENPQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKGTLVTVSSASTKGPSVFPLAPSGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSTQTYICNVNHKPSNTKVDKRVSYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEVEKTVAPTESYVLTQPPSASGTPGQRVAISCSGSNSNIGSNTVHWYQQLPGAAPKLLIYSNNQRPSGVPDRFSGSNSGTSASLAISRLQSEDEADYYCAAWDDSLNGVVFGGGTKVTVLQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGVEKTVAPTEQLQQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPTFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAQGPIVGAPTDYWGKG'
ref_seq = Seq(seq1.replace("-", ""))
pdb_seq = Seq(seq2.replace("-", ""))
alignments = pairwise2.align.globalds(ref_seq, pdb_seq, blosum62, -10, -0.5)
print(pairwise2.format_alignment(*alignments[0]))