In [2]:
# First we make a function that splits a string p up into a set of
# non-overlapping, non-empty substrings.
def partition(p, pieces=2):
assert len(p) >= pieces
base, mod = len(p) / pieces, len(p) % pieces
idx = 0
ps = []
modAdjust = 1
for i in xrange(0, pieces):
if i >= mod: modAdjust = 0
newIdx = idx + base + modAdjust
ps.append(p[idx:newIdx])
idx = newIdx
return ps
In [3]:
def bmApproximate(p, t, k, alph="ACGT"):
""" Use the pigeonhole principle together with Boyer-Moore to find
approximate matches with up to a specified number of mismatches. """
assert len(p) >= k+1
ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings
off = 0 # offset into p of current partition
occurrences = set() # note we might see the same occurrence >1 time
for pi in ps: # for each partition
# NOTE: I haven't given the implementation for the BMPreprocessing object.
# It implements the Boyer-Moore skipping rules as we discussed in class.
bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition
for hit in bm_prep.match(t)[0]:
if hit - off < 0: continue # pattern falls off left end of T?
if hit + len(p) - off > len(t): continue # falls off right end?
# Count mismatches to left and right of the matching partition
nmm = 0
for i in range(0, off) + range(off+len(pi), len(p)):
if t[hit-off+i] != p[i]:
nmm += 1
if nmm > k: break # exceeded maximum # mismatches
if nmm <= k:
occurrences.add(hit-off) # approximate match
off += len(pi) # Update offset of current partition
return sorted(list(occurrences))
In [4]:
bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')
Out[4]:
In [4]: