In [1]:
import (
    "fmt"
    "sort"
)

Sorted list as index


In [2]:
// Let's make a class that implements an inverted (substring)
// index where the map data structure is a sorted list of
// (substring, offset) pairs.  Query w/ binary search.

type IndexItem struct {
    mer string
    offset int
}

type IndexSorted struct {
    length, interval int
    index []IndexItem
}

In [3]:
// Constructor for Boyer-Moore preprocessing object
func NewIndexSorted(t string, ln, ival int) *IndexSorted {
    idx := new(IndexSorted)
    idx.length = ln
    idx.interval = ival
    // add all k-mers to the index
    for i := 0; i < len(t) - ln + 1; i++ {
        idx.index = append(idx.index, IndexItem{mer: t[i:i+ln], offset:i})
    }
    // sort the index, first by k-mer (alphabetically) then by offset
    sort.Slice(idx.index, func(i, j int) bool {
        if idx.index[i].mer != idx.index[j].mer {
            return idx.index[i].mer < idx.index[j].mer
        } else {
            return idx.index[i].offset < idx.index[j].offset
        }
    })
    return idx
}

In [4]:
func (idx *IndexSorted) query(p string) []int {
    qu := p[:idx.length]
    // sort.Search does binary search, returning first
    // element for which function returns true
    st := sort.Search(len(idx.index), func(i int) bool { return qu <= idx.index[i].mer })
    en := sort.Search(len(idx.index), func(i int) bool { return qu <  idx.index[i].mer })
    ret := make([]int, 0)
    for i := st; i < en; i++ {
        ret = append(ret, idx.index[i].offset)
    }
    return ret
}

In [5]:
index := NewIndexSorted("CGTGCCTACTTACTTACAT", 5, 4)
index.query("CTTACTTA")


Out[5]:
[8 12]

In [6]:
index.query("TTTTTTTT")


Out[6]:
[]

Map (hash table) as index


In [7]:
// Now let's make a similar class but using a map instead of a sorted
// list.  A Go map is basically a hash table.

type IndexHash struct {
    length, interval int
    index map[string][]int
}

In [8]:
func NewIndexHash(t string, ln, ival int) *IndexHash {
    idx := new(IndexHash)
    idx.length = ln
    idx.interval = ival
    idx.index = make(map[string][]int)
    // add all k-mers to the index (map)
    for i := 0; i < len(t) - ln + 1; i++ {
        mer := t[i:i+ln]
        idx.index[mer] = append(idx.index[mer], i)
    }
    return idx
}

In [9]:
func (idx *IndexHash) query(p string) []int {
    qu := p[:idx.length]
    // No need for binary search here; just a lookup
    return idx.index[qu]
}

In [10]:
index := NewIndexHash("CGTGCCTACTTACTTACAT", 5, 4)
index.query("CTTACTTA")


Out[10]:
[8 12]

In [11]:
index.query("TTTTTTTT")


Out[11]:
[]