Approximate matching with pigeonhole principle and Boyer-Moore

Most of this notebook is the Boyer-Moore implementation, as we saw in a previous notebook. Scroll to the bottom for the bmApproximate function that uses it to do approximate matching with the pigeonhole principle.


In [1]:
import (
    "bytes"
    "sort"
)

In [2]:
func zArray(s string) []int {
    // Use Z-algorithm to preprocess given string.  See
    // Gusfield for complete description of algorithm.
    Z := make([]int, len(s)+1)
    Z[0] = len(s)

    // Initial comparison of s[1:] with prefix
    for i := 1; i < len(s); i++ {
        if s[i] == s[i-1] {
            Z[1] += 1
        } else {
            break
        }
    }

    r, l := 0, 0
    if Z[1] > 0 {
        r, l = Z[1], 1
    }

    for k := 2; k < len(s); k++ {
        if k > r {
            // Case 1
            for i := k; i < len(s); i++ {
                if s[i] == s[i-k] {
                    Z[k] += 1
                } else {
                    break
                }
            }
            r, l = k + Z[k] - 1, k
        } else {
            // Case 2
            // Calculate length of beta
            nbeta := r - k + 1
            Zkp := Z[k - l]
            if nbeta > Zkp {
                // Case 2a: Zkp wins
                Z[k] = Zkp
            } else {
                // Case 2b: Compare characters just past r
                nmatch := 0
                for i := r+1; i < len(s); i++ {
                    if s[i] == s[i - k] {
                        nmatch += 1
                    } else {
                        break
                    }
                }
                l, r = k, r + nmatch
                Z[k] = r - k + 1
            }
        }
    }
    return Z
}

In [3]:
// Helper function that returns a new string
// that is the reverse of the argument
func reverseString(s string) string {
    var buf bytes.Buffer
    for i := 0; i < len(s); i++ {
        buf.WriteByte(s[len(s)-i-1])
    }
    return buf.String()
}

// Helper function that returns a new int slice
// that is the reverse of the argument
func reverseInts(is []int) []int {
    buf := make([]int, len(is))
    for i := 0; i < len(is); i++ {
        buf[len(is)-i-1] = is[i]
    }
    return buf
}

In [4]:
// Make N array (Gusfield theorem 2.2.2) from Z array
func nArray(s string) []int {
    return reverseInts(zArray(reverseString(s)))
}

// Make L' array (Gusfield theorem 2.2.2) using p and N array.
// L'[i] = largest index j less than n such that N[j] = |P[i:]|
func bigLPrimeArray(length int, n []int) []int {
    lp := make([]int, length)
    for j := 0; j < length-1; j++ {
        i := length - n[j]
        if i < length {
            lp[i] = j + 1
        }
    }
    return lp
}

// Compile L array (Gusfield theorem 2.2.2) using p and L' array.
// L[i] = largest index j less than n such that N[j] >= |P[i:]|
func bigLArray(length int, lp []int) []int {
    l := make([]int, length)
    l[1] = lp[1]
    for i := 2; i < length; i++ {
        l[i] = l[i-1]
        if lp[i] > l[i] {
            l[i] = lp[i]
        }
    }
    return l
}

// Compile lp' array (Gusfield theorem 2.2.4) using N array
func smallLPrimeArray(n []int) []int {
    smallLps := make([]int, len(n))
    for i := 0; i < len(n); i++ {
        if n[i] == i+1 {  // prefix matching a suffix
            smallLps[len(n)-i-1] = i+1
        }
    }
    for i := len(n)-2; i > -1; i-- {  // "smear" them out to the left
        if smallLps[i] == 0 {
            smallLps[i] = smallLps[i+1]
        }
    }
    return smallLps
}

In [5]:
// Return tables needed to apply good suffix rule
func goodSuffixTable(p string) ([]int, []int) {
    n := nArray(p)
    lp := bigLPrimeArray(len(p), n)
    return bigLArray(len(p), lp), smallLPrimeArray(n)
}

// Given pattern string and list with ordered alphabet characters, create
// and return a dense bad character table.  Table is indexed by offset
// then by character.
func denseBadCharTab(p string, amap map[byte]int) [][]int {
    tab := make([][]int, len(p))
    nxt := make([]int, len(amap))
    for i, c := range []byte(p) {
        tab[i] = nxt[:]
        nxt[amap[c]] = i+1
    }
    return tab
}

In [6]:
// Boyer-Moore preprocessing object
type BoyerMoore struct {
    pattern             string
    amap                map[byte]int
    badChars            [][]int
    bigLS, smallLPrimes []int
}

// Constructor for Boyer-Moore preprocessing object
func NewBoyerMoore(p string, alphabet string) *BoyerMoore {
    m := new(BoyerMoore)
    m.pattern = p
    m.amap = make(map[byte]int)
    for i, c := range []byte(alphabet) {
        m.amap[c] = i
    }
    m.badChars = denseBadCharTab(p, m.amap)
    m.bigLS, m.smallLPrimes = goodSuffixTable(p)
    return m
}

In [7]:
// Return # skips given by bad character rule at offset i
func (bm *BoyerMoore) badCharacterRule(i int, c byte) int {
    ci := bm.amap[c]
    return i - (bm.badChars[i][ci]-1)
}

// Given a mismatch at offset i, return amount to shift
// as determined by (weak) good suffix rule.
func (bm *BoyerMoore) goodSuffixRule(i int) int {
    length := len(bm.bigLS)
    if i == length - 1 {
        return 0
    }
    i += 1   // i points to leftmost matching position of P
    if bm.bigLS[i] > 0 {
        return length - bm.bigLS[i]
    }
    return length - bm.smallLPrimes[i]
}

In [8]:
// Return amount to shift in case where P matches T
func (bm *BoyerMoore) matchSkip() int {
    return len(bm.smallLPrimes) - bm.smallLPrimes[1]
}

// Utility function for taking max of integers
func Max(x, y int) int {
    if x > y {
        return x
    }
    return y
}

// Do Boyer-Moore matching
func (bm *BoyerMoore) match(t string) []int {
    i := 0
    occurrences := make([]int, 0)
    p := bm.pattern
    for i < len(t) - len(p) + 1 {
        shift, skipGs := 1, 1
        mismatched := false
        for j := len(p) - 1; j >= 0; j-- {
            if p[j] != t[i+j] {
                skipBc := bm.badCharacterRule(j, t[i+j])
                skipGs = bm.goodSuffixRule(j)
                shift = Max(shift, Max(skipBc, skipGs))
                mismatched = true
                break
            }
        }
        if ! mismatched {
            occurrences = append(occurrences, i)
            skipGs := Max(shift, skipGs)
            shift = Max(shift, skipGs)
        }
        i += shift
    }
    return occurrences
}

We start the approximate matching implementation with a partition function that splits the pattern up into equal-sized (or nearly equal-sized) pieces.


In [9]:
// Return a string splice containing non-overlapping,
// non-empty substrings that cover p.  They should be
// as close to equal-length as possible.
func partition(p string, pieces int) []string {
    base, mod := len(p) / pieces, len(p) % pieces
    idx := 0
    ps := make([]string, 0)
    modAdjust := 1
    for i := 0; i < pieces; i++ {
        if i >= mod {
            modAdjust = 0
        }
        newIdx := idx + base + modAdjust
        ps = append(ps, p[idx:newIdx])
        idx = newIdx
    }
    return ps
}

In [10]:
// Use the pigeonhole principle together with Boyer-Moore to find
// approximate matches with up to a specified number of mismatches.
func bmApproximate(p string, t string, k int, alph string) []int {
    ps := partition(p, k+1) // split p into k+1 non-empty, non-overlapping substrings
    off := 0                // offset into p of current partition
    occurrences := make(map[int]int) // we might see the same occurrence >1 time
    for _, pi := range ps { // for each partition
        bm := NewBoyerMoore(pi, alph)  // BM preprocess the partition
        for _, hit := range bm.match(t) {
            if hit - off < 0 {
                continue  // pattern falls off left end of T?
            }
            if hit + len(p) - off > len(t) {
                continue  // falls off right end?
            }
            // Count mismatches to left and right of the matching partition
            nmm := 0
            for i := 0; i < off; i++ {
                if t[hit-off+i] != p[i] {
                    nmm++
                    if nmm > k {
                        break  // exceeded maximum # mismatches
                    }
                }
            }
            if nmm <= k {
                for i := off+len(pi); i < len(p); i++ {
                    if t[hit-off+i] != p[i] {
                        nmm++
                        if nmm > k {
                            break  // exceeded maximum # mismatches
                        }
                    }
                }
            }
            if nmm <= k {
                occurrences[hit-off]++  // approximate match
            }
        }
        off += len(pi)  // Update offset of current partition
    }
    var occurrence_list []int
    for k := range occurrences {
        occurrence_list = append(occurrence_list, k)
    }
    sort.Ints(occurrence_list)
    return occurrence_list
}

In [11]:
bmApproximate("needle", "needle noodle nargle", 2, "abcdefghijklmnopqrstuvwxyz ")


Out[11]:
[0 7]