Map Multi-Chain Ab Structure to Amino Acid Sequence


  1. Extract PDB Sequence
  2. Parse the Input File with reference and clone sequences
  3. Multiple Alignment
  4. Post-processing and generation of the final JSON

In [ ]:
''' 
    Library Imports
'''
import sys
import os
from collections import OrderedDict, defaultdict
import pprint
from Bio.PDB import *
from Bio.Seq import Seq

In [ ]:
'''
   pdbparser.py - Yevheniy Chuba - 6/1/2017
   Parse local or external (PDB Database) 3D structure files (.pdb, .cif)
'''
import os
from Bio.PDB import *
from Bio.Seq import Seq

class PdbParser:
    '''
        PdbParser extracts amino acid sequence from PDB structure, 
        either downloaded from PDB database or uploaded localy .pdb file
        
        Args:
            pdb_struct_name (string): name of the file or PDB database ID
            external (bool): True indicates the structure comes from an external
                             PDB database; default is False (local .pdb file)
        Returns:
            Extracted amino acid sequence from the PDB file, including chain information
    '''
    
    def __init__(self, struct_name, struct_dir='PDB_Struct', external=False):
        self.struct_name = struct_name
        self.struct_dir = struct_dir
        self.external = external
        
    def pdb_processing(self):
        """
            Process either uploaded or externally downloaded 3D structure
        """
        if self.external:
            self.pdb_struct = self.get_external_struct()
        else:
            self.pdb_struct = self.get_uploaded_struct()
            
        extracted_seq = self.extract_seq_from_structure(self.pdb_struct)
        return extracted_seq
        
    def get_external_struct(self):
        """
            Create Structure object from externally downloed (PDB Database) structure file (.cif)
        """
        self.download_structure()
        parser = MMCIFParser()
        structure = parser.get_structure('STRUCT_OBJ', 
                                         os.path.join(self.struct_dir, self.struct_name) + '.cif')
        return structure
    
    def get_uploaded_struct(self):
        """
            Create Structure object from locally uploaded structure file (.pdb)
        """
        parser = PDBParser()
        structure = parser.get_structure('STRUCT_OBJ',
                                         os.path.join(self.struct_dir, self.struct_name))
        return structure
            
    def download_structure(self):
        """
            Download structure from PDB database based on PDB ID
        """
        pdbl = PDBList()
        pdbl.retrieve_pdb_file(self.struct_name, pdir=self.struct_dir)
        
    def extract_seq_from_structure(self, struct):
        """
            Extract Polypeptides from a Structure Object
        """
        ppb = PPBuilder() # Polypeptide builder object
        aa_seqs = []
        chains = struct.get_chains()
        for pp in ppb.build_peptides(struct):
            seq = pp.get_sequence()
            aa_seqs.append(str(seq))
        chain_aa_map = [[chain.id, aa_seqs[index]] for index, chain in enumerate(chains)]
        return chain_aa_map

In [ ]:
parser = PdbParser("4hj0.pdb")
print(parser.pdb_processing())

In [ ]:
'''
    MultiAlign.py - Yevheniy Chuba - 6/1/2017
    Perform multiple alignment using ClustalW
'''
from collections import OrderedDict
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO

class MultiAlign:
    '''
        MultiAlign performs multiple alignment using BioPython's 
        ClustalW wrapper.
        
        Args:
            clustal_input(str): .fasta input file with multiple sequences
            clustal_output(str): .fasta output file in Clustal format
            clustal_w(str): the location of the clustalw2 command line utility
                            - make sure it is in the system's or user's bin directory
                            - this tool is usually downloaded manually and doesn't come
                            as BioPython's dependency
                            - make sure it has proper permissions (give it 777 if not sure)
            
        Returns:
            A dictionary, mapping sequence id to sequence string
    '''
    def __init__(self, clustal_input='clustal_in.fasta', 
                 clustal_output='clustal_out.fasta', clustalw='clustalw2'):
        self.clustal_input = clustal_input
        self.clustal_output = clustal_output
        self.clustalw = clustalw
        
    def perform_alignment(self):
        clustalw_cline = ClustalwCommandline(self.clustalw, 
                                             infile=self.clustal_input,
                                             outfile=self.clustal_output)
        print(clustalw_cline)
        stdout, stderr = clustalw_cline()
        
        align = AlignIO.read(self.clustal_output, "clustal")
        id_seq = self.extract_seqs(align)
        return id_seq
        
    def extract_seqs(self, align):
        return {record.id: str(record.seq) for record in align}

In [ ]:
multi_align = MultiAlign(clustal_input='create.fasta')
id_seq_map = multi_align.perform_alignment()
id_seq_map

In [ ]:
'''
    InputProcessor.py - Yevheniy Chuba - 6/1/2017
    Currently Parses input CSV file and extracts: Sequence ID, Group, Sequence.
    It then returns the following data structure for further pre-processing:
        {group_id: [['seq_id', 'seq], ... ], 
         ...
         }
'''

import os
from collections import OrderedDict
import csv
import pandas as pd

class InputProcessor:
    '''
        Takes in any data format (various excell versions, csv, tsv)
        and then extracts group, seq_id, seq information
        
        Args:
            input_file(str): the input file to be processed
        
        Returns:
            {group_id: [['seq_id', 'seq], ... ], 
             ...
             }
    '''
    def __init__(self, input_file):
        self.input_file = input_file
        
    def process_input(self):
        """
            Combines file conversion and sequence information extraction
        """
        csv_file_name = self.convert_to_csv()
        group_id_seq = self.get_sequences(csv_file_name)
        return group_id_seq
        
        
    def convert_to_csv(self):
        """
            Convert any given format to csv for cleaner processing
        """
        csv_file_name = ''
        with open(self.input_file, 'r+') as input_f:
            # open either excell or csv file
            file_ext = input_f.name.lower()
            if file_ext.endswith(('.xlsx', 'xls', 'xlt')):
                df = pd.read_excel(input_f, header=None)
            else:
                df = pd.read_csv(input_f, header=None)
            # convert to .csv
            csv_file_name = os.path.splitext(input_f.name)[0] + ".csv"
            df.to_csv(csv_file_name, header=None, index=None, sep=',')
        return csv_file_name
    
    def get_sequences(self, file_name):
        """
            Extract Group Name, Sequence ID, Sequence String (no gaps)
        """
        main_ds = OrderedDict()
        with open(file_name, 'r+') as input_f:
            for line in input_f:
                flag = line.split(',')[3]
                if line.split(',')[3] in ('0', '1'):
                    column_data = line.split(',')
                    molecule_name = column_data[1]
                    group_id = column_data[2]
                    seq = column_data[4].replace('-', '')
                    if group_id in main_ds.keys():
                        main_ds[group_id].append([molecule_name, seq])
                    else:
                        main_ds[group_id] = [[molecule_name, seq]]
        return main_ds

In [ ]:
input_processor = InputProcessor('Specific_File.xlsx')
ds = input_processor.process_input()
print(ds)

In [ ]:
'''
    FastaGen - Yevheniy Chuba - 6/1/2017
    Prepares sequences from various groups for multiple alignment,
    by combining sequences of the same group into a single fasta file.
    The output is the list of fasta file names separated by group.
'''

class FastaGen:
    
    def __init__(self, seq_ds):
        self.seq_ds = seq_ds
        
    def process_seqs(self):
        file_names = []
        for group_id, seqs in seq_ds.iteritems():
            output_file_name = 'group_{}.fasta'.format(group_id)
            file_names.append(output_file_name)
            with open(output_file_name, 'w') as output_f:
                for record in seqs:
                    seq_id = record[0]
                    seq = record[1]
                    output_f.write('>{}\n'.format(seq_id))
                    output_f.write(seq)
                    output_f.write('\n')
        return file_names