This illustrates how to use global alignment to find a short sequence (e.g., a PCR primer) in a longer sequence (e.g., a 454 read) using Needleman-Wunsch alignment.
In [1]:
from skbio.core.alignment.pairwise import global_pairwise_align_nucleotide
In [15]:
s = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
t = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25
a = global_pairwise_align_nucleotide(s, t)
In [16]:
print s
print t
The alignment then looks like the following. Note that in scikit-bio 0.1.4 we do strict NW alignment, which means that the alignment always starts at the last position of both sequences. I need to fix this, as it's effectively useless for this problem due to the long artifical gap that it introduces.
In [3]:
print a.to_fasta()
We can then slice just the targeted region as follows.
In [4]:
gap_vector = a[0].gap_vector()
start_index = gap_vector.index(False)
end_index = (a.sequence_length() - 1) - gap_vector[::-1].index(False)
print a[0][start_index:end_index+1]
print a[1][start_index:end_index+1]
And count the mismatches and gaps.
In [5]:
mismatch_count = 0
for i in range(start_index, end_index+1):
if a[0][i] != a[1][i]:
mismatch_count += 1
print mismatch_count
Pulling some code from skbio and iab to explore this problem
In [12]:
import numpy as np
from skbio import Alignment, BiologicalSequence
from skbio.core.alignment.pairwise import (_compute_score_and_traceback_matrices, _make_nt_substitution_matrix,
_init_matrices_nw, _traceback, _get_seq_ids, blosum50, _init_matrices_nw_no_terminal_gap_penalty)
nt_substitution_matrix = _make_nt_substitution_matrix(1,-2)
traceback_decoding = {1: '\\', 2:'|', 3: '-', -1: 'E', 0: '*'}
def format_dynamic_programming_matrix(seq1, seq2, matrix, cell_width=6):
""" define a function for formatting dynamic programming matrices
"""
lines = []
cell_format = "%" + str(cell_width) + "s"
line_format = cell_format * (len(seq1) + 2)
# print seq1 (start the line with two empty strings)
lines.append(line_format % tuple([' ',' '] + map(str,list(seq1))))
# iterate over the rows and print each (starting with the
# corresponding base in sequence2)
for row, base in zip(matrix,' ' + seq2):
lines.append(line_format % tuple([base] + map(str,row)))
return '\n'.join(lines)
def format_traceback_matrix(seq1, seq2, matrix, cell_width=3):
translated_m = np.chararray(matrix.shape)
for i in range(matrix.shape[0]):
for j in range(matrix.shape[1]):
translated_m[i, j] = traceback_decoding[matrix[i, j]]
return format_dynamic_programming_matrix(seq1, seq2, translated_m,
cell_width)
def global_pairwise_align_(seq1, seq2, gap_open_penalty=10, gap_extend_penalty=5,
substitution_matrix=blosum50):
init_matrices_f = _init_matrices_nw_no_terminal_gap_penalty
score_matrix, traceback_matrix = \
_compute_score_and_traceback_matrices(
seq1, seq2, gap_open_penalty, gap_extend_penalty,
substitution_matrix, new_alignment_score=-np.inf,
init_matrices_f=init_matrices_f,
penalize_terminal_gaps=False)
print format_dynamic_programming_matrix(seq1, seq2, score_matrix, )
print format_traceback_matrix(seq1, seq2, traceback_matrix)
end_row_position = traceback_matrix.shape[0] - 1
end_col_position = traceback_matrix.shape[1] - 1
aligned1, aligned2, score, seq1_start_position, seq2_start_position = \
_traceback(traceback_matrix, score_matrix, seq1, seq2,
end_row_position, end_col_position)
start_end_positions = [(seq1_start_position, end_col_position-1),
(seq2_start_position, end_row_position-1)]
# Get ids to assign to the output sequences in the result Alignment object
seq1_id, seq2_id = _get_seq_ids(seq1, seq2)
return Alignment(
[BiologicalSequence(aligned1, id=seq1_id),
BiologicalSequence(aligned2, id=seq2_id)],
score=score, start_end_positions=start_end_positions)
In [13]:
s = "HEAGAWGHEE"
t = "PAWHEAE"
print global_pairwise_align_(s, t).to_fasta()
In [7]:
In [7]:
In [7]:
In [7]: