In [1]:
import (
"fmt"
"sort"
)
In [2]:
// Let's make a class that implements an inverted (substring)
// index where the map data structure is a sorted list of
// (substring, offset) pairs. Query w/ binary search.
type IndexItem struct {
mer string
offset int
}
type IndexSorted struct {
length, interval int
index []IndexItem
}
In [3]:
// Constructor for Boyer-Moore preprocessing object
func NewIndexSorted(t string, ln, ival int) *IndexSorted {
idx := new(IndexSorted)
idx.length = ln
idx.interval = ival
// add all k-mers to the index
for i := 0; i < len(t) - ln + 1; i++ {
idx.index = append(idx.index, IndexItem{mer: t[i:i+ln], offset:i})
}
// sort the index, first by k-mer (alphabetically) then by offset
sort.Slice(idx.index, func(i, j int) bool {
if idx.index[i].mer != idx.index[j].mer {
return idx.index[i].mer < idx.index[j].mer
} else {
return idx.index[i].offset < idx.index[j].offset
}
})
return idx
}
In [4]:
func (idx *IndexSorted) query(p string) []int {
qu := p[:idx.length]
// sort.Search does binary search, returning first
// element for which function returns true
st := sort.Search(len(idx.index), func(i int) bool { return qu <= idx.index[i].mer })
en := sort.Search(len(idx.index), func(i int) bool { return qu < idx.index[i].mer })
ret := make([]int, 0)
for i := st; i < en; i++ {
ret = append(ret, idx.index[i].offset)
}
return ret
}
In [5]:
index := NewIndexSorted("CGTGCCTACTTACTTACAT", 5, 4)
index.query("CTTACTTA")
Out[5]:
In [6]:
index.query("TTTTTTTT")
Out[6]:
In [7]:
// Now let's make a similar class but using a map instead of a sorted
// list. A Go map is basically a hash table.
type IndexHash struct {
length, interval int
index map[string][]int
}
In [8]:
func NewIndexHash(t string, ln, ival int) *IndexHash {
idx := new(IndexHash)
idx.length = ln
idx.interval = ival
idx.index = make(map[string][]int)
// add all k-mers to the index (map)
for i := 0; i < len(t) - ln + 1; i++ {
mer := t[i:i+ln]
idx.index[mer] = append(idx.index[mer], i)
}
return idx
}
In [9]:
func (idx *IndexHash) query(p string) []int {
qu := p[:idx.length]
// No need for binary search here; just a lookup
return idx.index[qu]
}
In [10]:
index := NewIndexHash("CGTGCCTACTTACTTACAT", 5, 4)
index.query("CTTACTTA")
Out[10]:
In [11]:
index.query("TTTTTTTT")
Out[11]: