Local alignment

We define a simple cost function exampleCost and a function smithWaterman that, given strings x and y and a cost function, fills in and returns the corresponding local alignment matrix.


In [1]:
import numpy

def exampleCost(xc, yc):
    ''' Cost function: 2 to match, -6 to gap, -4 to mismatch '''
    if xc == yc: return 2 # match
    if xc == '-' or yc == '-': return -6 # gap
    return -4

def smithWaterman(x, y, s):
    ''' Calculate local alignment values of sequences x and y using
        dynamic programming.  Return maximal local alignment value. '''
    V = numpy.zeros((len(x)+1, len(y)+1), dtype=int)
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            V[i, j] = max(V[i-1, j-1] + s(x[i-1], y[j-1]), # diagonal
                          V[i-1, j  ] + s(x[i-1], '-'),    # vertical
                          V[i  , j-1] + s('-',    y[j-1]), # horizontal
                          0)                               # empty
    argmax = numpy.where(V == V.max())
    return V, int(V[argmax])

In [2]:
x, y = 'GGTATGCTGGCGCTA', 'TATATGCGGCGTTT'
V, best = smithWaterman(x, y, exampleCost)
print(V)
print("Best score=%d, in cell %s" % (best, numpy.unravel_index(numpy.argmax(V), V.shape)))


[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  2  2  0  2  0  0  0]
 [ 0  0  0  0  0  0  2  0  2  4  0  2  0  0  0]
 [ 0  2  0  2  0  2  0  0  0  0  0  0  4  2  2]
 [ 0  0  4  0  4  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0  6  0  6  0  0  0  0  0  0  2  2  2]
 [ 0  0  0  0  2  0  8  2  2  2  0  2  0  0  0]
 [ 0  0  0  0  0  0  2 10  4  0  4  0  0  0  0]
 [ 0  2  0  2  0  2  0  4  6  0  0  0  2  2  2]
 [ 0  0  0  0  0  0  4  0  6  8  2  2  0  0  0]
 [ 0  0  0  0  0  0  2  0  2  8  4  4  0  0  0]
 [ 0  0  0  0  0  0  0  4  0  2 10  4  0  0  0]
 [ 0  0  0  0  0  0  2  0  6  2  4 12  6  0  0]
 [ 0  0  0  0  0  0  0  4  0  2  4  6  8  2  0]
 [ 0  2  0  2  0  2  0  0  0  0  0  0  8 10  4]
 [ 0  0  4  0  4  0  0  0  0  0  0  0  2  4  6]]
Best score=12, in cell (12, 11)

Next we define a traceback function that, given a local alignment matrix, strings x and y, and a scoring function, returns the optimal edit transcript and alignment for the optimal substrings of x and y.


In [3]:
def traceback(V, x, y, s):
    """ Trace back from given cell in local-alignment matrix V """
    # get i, j for maximal cell
    i, j = numpy.unravel_index(numpy.argmax(V), V.shape)
    xscript, alx, aly, alm = [], [], [], []
    while (i > 0 or j > 0) and V[i, j] != 0:
        diag, vert, horz = 0, 0, 0
        if i > 0 and j > 0:
            diag = V[i-1, j-1] + s(x[i-1], y[j-1])
        if i > 0:
            vert = V[i-1, j] + s(x[i-1], '-')
        if j > 0:
            horz = V[i, j-1] + s('-', y[j-1])
        if diag >= vert and diag >= horz:
            match = x[i-1] == y[j-1]
            xscript.append('M' if match else 'R')
            alm.append('|' if match else ' ')
            alx.append(x[i-1]); aly.append(y[j-1])
            i -= 1; j -= 1
        elif vert >= horz:
            xscript.append('D')
            alx.append(x[i-1]); aly.append('-'); alm.append(' ')
            i -= 1
        else:
            xscript.append('I')
            aly.append(y[j-1]); alx.append('-'); alm.append(' ')
            j -= 1
    xscript = (''.join(xscript))[::-1]
    alignment = '\n'.join(map(lambda x: ''.join(x), [alx[::-1], alm[::-1], aly[::-1]]))
    return xscript, alignment

In [4]:
print(traceback(V, x, y, exampleCost)[1])


TATGCTGGCG
||||| ||||
TATGC-GGCG

In [5]:
def editDistanceLikeCost(xc, yc):
    return 1 if xc == yc else -1

x = 'he_will_after_his_sour_fashion_tell_you'
y = 'struts_and_frets_his_hour_upon_the_stage '
V, best = smithWaterman(x, y, editDistanceLikeCost)
print('Best score: %d' % best)
print(traceback(V, x, y, exampleCost)[1])


Best score: 8
_his_sour_
||||| ||||
_his_hour_