In [ ]:
'''
Library Imports
'''
import sys
import os
from collections import OrderedDict, defaultdict
import pprint
from Bio.PDB import *
from Bio.Seq import Seq
In [ ]:
'''
pdbparser.py - Yevheniy Chuba - 6/1/2017
Parse local or external (PDB Database) 3D structure files (.pdb, .cif)
'''
import os
from Bio.PDB import *
from Bio.Seq import Seq
class PdbParser:
'''
PdbParser extracts amino acid sequence from PDB structure,
either downloaded from PDB database or uploaded localy .pdb file
Args:
pdb_struct_name (string): name of the file or PDB database ID
external (bool): True indicates the structure comes from an external
PDB database; default is False (local .pdb file)
Returns:
Extracted amino acid sequence from the PDB file, including chain information
'''
def __init__(self, struct_name, struct_dir='PDB_Struct', external=False):
self.struct_name = struct_name
self.struct_dir = struct_dir
self.external = external
def pdb_processing(self):
"""
Process either uploaded or externally downloaded 3D structure
"""
if self.external:
self.pdb_struct = self.get_external_struct()
else:
self.pdb_struct = self.get_uploaded_struct()
extracted_seq = self.extract_seq_from_structure(self.pdb_struct)
return extracted_seq
def get_external_struct(self):
"""
Create Structure object from externally downloed (PDB Database) structure file (.cif)
"""
self.download_structure()
parser = MMCIFParser()
structure = parser.get_structure('STRUCT_OBJ',
os.path.join(self.struct_dir, self.struct_name) + '.cif')
return structure
def get_uploaded_struct(self):
"""
Create Structure object from locally uploaded structure file (.pdb)
"""
parser = PDBParser()
structure = parser.get_structure('STRUCT_OBJ',
os.path.join(self.struct_dir, self.struct_name))
return structure
def download_structure(self):
"""
Download structure from PDB database based on PDB ID
"""
pdbl = PDBList()
pdbl.retrieve_pdb_file(self.struct_name, pdir=self.struct_dir)
def extract_seq_from_structure(self, struct):
"""
Extract Polypeptides from a Structure Object
"""
ppb = PPBuilder() # Polypeptide builder object
aa_seqs = []
chains = struct.get_chains()
for pp in ppb.build_peptides(struct):
seq = pp.get_sequence()
aa_seqs.append(str(seq))
chain_aa_map = [[chain.id, aa_seqs[index]] for index, chain in enumerate(chains)]
return chain_aa_map
In [ ]:
parser = PdbParser("4hj0.pdb")
print(parser.pdb_processing())
In [ ]:
'''
MultiAlign.py - Yevheniy Chuba - 6/1/2017
Perform multiple alignment using ClustalW
'''
from collections import OrderedDict
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
class MultiAlign:
'''
MultiAlign performs multiple alignment using BioPython's
ClustalW wrapper.
Args:
clustal_input(str): .fasta input file with multiple sequences
clustal_output(str): .fasta output file in Clustal format
clustal_w(str): the location of the clustalw2 command line utility
- make sure it is in the system's or user's bin directory
- this tool is usually downloaded manually and doesn't come
as BioPython's dependency
- make sure it has proper permissions (give it 777 if not sure)
Returns:
A dictionary, mapping sequence id to sequence string
'''
def __init__(self, clustal_input='clustal_in.fasta',
clustal_output='clustal_out.fasta', clustalw='clustalw2'):
self.clustal_input = clustal_input
self.clustal_output = clustal_output
self.clustalw = clustalw
def perform_alignment(self):
clustalw_cline = ClustalwCommandline(self.clustalw,
infile=self.clustal_input,
outfile=self.clustal_output)
print(clustalw_cline)
stdout, stderr = clustalw_cline()
align = AlignIO.read(self.clustal_output, "clustal")
id_seq = self.extract_seqs(align)
return id_seq
def extract_seqs(self, align):
return {record.id: str(record.seq) for record in align}
In [ ]:
multi_align = MultiAlign(clustal_input='create.fasta')
id_seq_map = multi_align.perform_alignment()
id_seq_map
In [ ]:
'''
InputProcessor.py - Yevheniy Chuba - 6/1/2017
Currently Parses input CSV file and extracts: Sequence ID, Group, Sequence.
It then returns the following data structure for further pre-processing:
{group_id: [['seq_id', 'seq], ... ],
...
}
'''
import os
from collections import OrderedDict
import csv
import pandas as pd
class InputProcessor:
'''
Takes in any data format (various excell versions, csv, tsv)
and then extracts group, seq_id, seq information
Args:
input_file(str): the input file to be processed
Returns:
{group_id: [['seq_id', 'seq], ... ],
...
}
'''
def __init__(self, input_file):
self.input_file = input_file
def process_input(self):
"""
Combines file conversion and sequence information extraction
"""
csv_file_name = self.convert_to_csv()
group_id_seq = self.get_sequences(csv_file_name)
return group_id_seq
def convert_to_csv(self):
"""
Convert any given format to csv for cleaner processing
"""
csv_file_name = ''
with open(self.input_file, 'r+') as input_f:
# open either excell or csv file
file_ext = input_f.name.lower()
if file_ext.endswith(('.xlsx', 'xls', 'xlt')):
df = pd.read_excel(input_f, header=None)
else:
df = pd.read_csv(input_f, header=None)
# convert to .csv
csv_file_name = os.path.splitext(input_f.name)[0] + ".csv"
df.to_csv(csv_file_name, header=None, index=None, sep=',')
return csv_file_name
def get_sequences(self, file_name):
"""
Extract Group Name, Sequence ID, Sequence String (no gaps)
"""
main_ds = OrderedDict()
with open(file_name, 'r+') as input_f:
for line in input_f:
flag = line.split(',')[3]
if line.split(',')[3] in ('0', '1'):
column_data = line.split(',')
molecule_name = column_data[1]
group_id = column_data[2]
seq = column_data[4].replace('-', '')
if group_id in main_ds.keys():
main_ds[group_id].append([molecule_name, seq])
else:
main_ds[group_id] = [[molecule_name, seq]]
return main_ds
In [ ]:
input_processor = InputProcessor('Specific_File.xlsx')
ds = input_processor.process_input()
print(ds)
In [ ]:
'''
FastaGen - Yevheniy Chuba - 6/1/2017
Prepares sequences from various groups for multiple alignment,
by combining sequences of the same group into a single fasta file.
The output is the list of fasta file names separated by group.
'''
class FastaGen:
def __init__(self, seq_ds):
self.seq_ds = seq_ds
def process_seqs(self):
file_names = []
for group_id, seqs in seq_ds.iteritems():
output_file_name = 'group_{}.fasta'.format(group_id)
file_names.append(output_file_name)
with open(output_file_name, 'w') as output_f:
for record in seqs:
seq_id = record[0]
seq = record[1]
output_f.write('>{}\n'.format(seq_id))
output_f.write(seq)
output_f.write('\n')
return file_names