In [1]:
# Using an edit-distance-like dynamic programming formulation, we can
# look for approximate occurrences of p in t.
import sys
import numpy
# Assume x is the string labeling rows of the matrix and y is the
# string labeling the columns
def trace(D, x, y):
''' Backtrace edit-distance matrix D for strings x and y '''
i, j = len(x), len(y)
xscript = []
while i > 0:
diag, vert, horz = sys.maxsize, sys.maxsize, sys.maxsize
delt = None
if i > 0 and j > 0:
delt = 0 if x[i-1] == y[j-1] else 1
diag = D[i-1, j-1] + delt
if i > 0:
vert = D[i-1, j] + 1
if j > 0:
horz = D[i, j-1] + 1
if diag <= vert and diag <= horz:
# diagonal was best
xscript.append('R' if delt == 1 else 'M')
i -= 1; j -= 1
elif vert <= horz:
# vertical was best; this is an insertion in x w/r/t y
xscript.append('I')
i -= 1
else:
# horizontal was best
xscript.append('D')
j -= 1
# j = offset of the first (leftmost) character of t involved in the
# alignment
return j, (''.join(xscript))[::-1] # reverse and string-ize
def kEditDp(p, t):
''' Find and return the alignment of p to a substring of t with the
fewest edits. We return the edit distance, the offset of the
substring aligned to, and the edit transcript. If multiple
alignments tie for best, we report the leftmost. '''
D = numpy.zeros((len(p)+1, len(t)+1), dtype=int)
# Note: First row gets zeros. First column initialized as usual.
D[1:, 0] = range(1, len(p)+1)
for i in range(1, len(p)+1):
for j in range(1, len(t)+1):
delt = 1 if p[i-1] != t[j-1] else 0
D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1)
# Find minimum edit distance in last row
mnJ, mn = None, len(p) + len(t)
for j in range(len(t)+1):
if D[len(p), j] < mn:
mnJ, mn = j, D[len(p), j]
# Backtrace; note: stops as soon as it gets to first row
off, xcript = trace(D, p, t[:mnJ])
# Return edit distance, offset into T, edit transcript
return mn, off, xcript, D
In [2]:
p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA'
mn, off, xscript, D = kEditDp(p, t)
mn, off, xscript
Out[2]:
In [3]:
D
Out[3]:
In [ ]: