In [2]:
# First we make a function that splits a string p up into a set of
# non-overlapping, non-empty substrings.

def partition(p, pieces=2):
    assert len(p) >= pieces
    base, mod = len(p) / pieces, len(p) % pieces
    idx = 0
    ps = []
    modAdjust = 1
    for i in xrange(0, pieces):
        if i >= mod: modAdjust = 0
        newIdx = idx + base + modAdjust
        ps.append(p[idx:newIdx])
        idx = newIdx
    return ps

In [3]:
def bmApproximate(p, t, k, alph="ACGT"):
    """ Use the pigeonhole principle together with Boyer-Moore to find
        approximate matches with up to a specified number of mismatches. """
    assert len(p) >= k+1
    ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings
    off = 0                # offset into p of current partition
    occurrences = set()    # note we might see the same occurrence >1 time
    for pi in ps:          # for each partition
        # NOTE: I haven't given the implementation for the BMPreprocessing object.
        # It implements the Boyer-Moore skipping rules as we discussed in class.
        bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition
        for hit in bm_prep.match(t)[0]:
            if hit - off < 0: continue # pattern falls off left end of T?
            if hit + len(p) - off > len(t): continue # falls off right end?
            # Count mismatches to left and right of the matching partition
            nmm = 0
            for i in range(0, off) + range(off+len(pi), len(p)):
                if t[hit-off+i] != p[i]:
                    nmm += 1
                    if nmm > k: break # exceeded maximum # mismatches
            if nmm <= k:
                occurrences.add(hit-off) # approximate match
        off += len(pi) # Update offset of current partition
    return sorted(list(occurrences))

In [4]:
bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')


Out[4]:
[0, 7]

In [4]: