This illustrates how to use global alignment to find a short sequence (e.g., a PCR primer) in a longer sequence (e.g., a 454 read) using Needleman-Wunsch alignment.



In [1]:

    
from skbio.core.alignment.pairwise import global_pairwise_align_nucleotide



In [15]:

    
s = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
t = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25

a = global_pairwise_align_nucleotide(s, t)



In [16]:

    
print s
print t









    



ACCGTGGACCGTTAGGATTGGACCCAAGGTTG
TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGTAGGATTGGACCAAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA

The alignment then looks like the following. Note that in scikit-bio 0.1.4 we do strict NW alignment, which means that the alignment always starts at the last position of both sequences. I need to fix this, as it's effectively useless for this problem due to the long artifical gap that it introduces.



In [3]:

    
print a.to_fasta()









    



>0
-------------------------ACCGTGGACCGTTAGGATTGGACCCAAGGTTG-------------------------
>1
TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGATTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA

We can then slice just the targeted region as follows.



In [4]:

    
gap_vector = a[0].gap_vector()
start_index = gap_vector.index(False)
end_index = (a.sequence_length() - 1) - gap_vector[::-1].index(False)
print a[0][start_index:end_index+1]
print a[1][start_index:end_index+1]









    



ACCGTGGACCGTTAGGATTGGACCCAAGGTTG
ACCGTGGACCGT-AGGATTGGACC-AAGGTTA

And count the mismatches and gaps.



In [5]:

    
mismatch_count = 0
for i in range(start_index, end_index+1):
    if a[0][i] != a[1][i]:
        mismatch_count += 1
print mismatch_count

Pulling some code from skbio and iab to explore this problem



In [12]:

    
import numpy as np
from skbio import Alignment, BiologicalSequence 
from skbio.core.alignment.pairwise import (_compute_score_and_traceback_matrices, _make_nt_substitution_matrix,
                                           _init_matrices_nw, _traceback, _get_seq_ids, blosum50, _init_matrices_nw_no_terminal_gap_penalty)

nt_substitution_matrix = _make_nt_substitution_matrix(1,-2)
traceback_decoding = {1: '\\', 2:'|', 3: '-', -1: 'E', 0: '*'}

def format_dynamic_programming_matrix(seq1, seq2, matrix, cell_width=6):
    """ define a function for formatting dynamic programming matrices
    """
    lines = []

    cell_format = "%" + str(cell_width) + "s"
    line_format = cell_format * (len(seq1) + 2)
    # print seq1 (start the line with two empty strings)
    lines.append(line_format % tuple([' ',' '] + map(str,list(seq1))))

    # iterate over the rows and print each (starting with the
    # corresponding base in sequence2)
    for row, base in zip(matrix,' ' + seq2):
        lines.append(line_format % tuple([base] + map(str,row)))

    return '\n'.join(lines)

def format_traceback_matrix(seq1, seq2, matrix, cell_width=3):
    translated_m = np.chararray(matrix.shape)
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            translated_m[i, j] = traceback_decoding[matrix[i, j]]

    return format_dynamic_programming_matrix(seq1, seq2, translated_m,
                                             cell_width)


def global_pairwise_align_(seq1, seq2, gap_open_penalty=10, gap_extend_penalty=5, 
                          substitution_matrix=blosum50):


    init_matrices_f = _init_matrices_nw_no_terminal_gap_penalty

    score_matrix, traceback_matrix = \
        _compute_score_and_traceback_matrices(
            seq1, seq2, gap_open_penalty, gap_extend_penalty,
            substitution_matrix, new_alignment_score=-np.inf,
            init_matrices_f=init_matrices_f,
            penalize_terminal_gaps=False)
    print format_dynamic_programming_matrix(seq1, seq2, score_matrix, )
    print format_traceback_matrix(seq1, seq2, traceback_matrix)

    end_row_position = traceback_matrix.shape[0] - 1
    end_col_position = traceback_matrix.shape[1] - 1

    aligned1, aligned2, score, seq1_start_position, seq2_start_position = \
        _traceback(traceback_matrix, score_matrix, seq1, seq2,
                   end_row_position, end_col_position)
    start_end_positions = [(seq1_start_position, end_col_position-1),
                           (seq2_start_position, end_row_position-1)]

    # Get ids to assign to the output sequences in the result Alignment object
    seq1_id, seq2_id = _get_seq_ids(seq1, seq2)

    return Alignment(
        [BiologicalSequence(aligned1, id=seq1_id),
         BiologicalSequence(aligned2, id=seq2_id)],
        score=score, start_end_positions=start_end_positions)



In [13]:

    
s = "HEAGAWGHEE"
t = "PAWHEAE"
print global_pairwise_align_(s, t).to_fasta()









    



                 H     E     A     G     A     W     G     H     E     E
         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0
     P   0.0  -2.0  -1.0  -1.0  -2.0  -1.0  -4.0  -2.0  -2.0  -1.0   0.0
     A   0.0  -2.0  -3.0   4.0  -1.0   3.0  -4.0  -4.0  -4.0  -3.0   0.0
     W   0.0  -3.0  -5.0  -6.0   1.0  -4.0  18.0   8.0   3.0  -2.0   0.0
     H   0.0  10.0   0.0  -5.0  -8.0  -1.0   8.0  16.0  18.0   8.0   3.0
     E   0.0   0.0  16.0   6.0   1.0  -4.0   3.0   6.0  16.0  24.0  14.0
     A   0.0  -2.0   6.0  21.0  11.0   6.0   1.0   3.0   6.0  15.0  23.0
     E   0.0   0.0   4.0  11.0  18.0  18.0  18.0  18.0  18.0  18.0  23.0
        H  E  A  G  A  W  G  H  E  E
     *  -  -  -  -  -  -  -  -  -  -
  P  |  \  \  \  \  \  \  \  \  \  |
  A  |  \  \  \  \  \  \  \  \  \  |
  W  |  \  \  \  \  \  \  -  -  -  |
  H  |  \  -  -  \  \  |  \  \  -  -
  E  |  \  \  -  -  -  |  |  \  \  -
  A  |  \  |  \  -  -  -  \  |  \  \
  E  |  -  \  |  \  -  -  -  -  -  |
>0
HEAGAWGHEE-
>1
---PAW-HEAE



In [7]:



In [7]:



In [7]:



In [7]: