In [1]:
# Let's make a class that implements an inverted (substring) index where the map data
# structure is a sorted list of (substring, offset) pairs.  Query w/ binary search.

import sys
import bisect # for binary search

class IndexSorted(object):
    
    def __init__(self, t, ln, ival):
        """ Create index, extracting substrings of length 'ln' """
        self.t = t
        self.ln = ln
        self.ival = ival
        self.index = []
        for i in range(len(t)-ln+1):
            self.index.append((t[i:i+ln], i)) # add <substr, offset> pair
        self.index.sort() # sort pairs
    
    def query(self, p):
        """ Return candidate alignments for p """
        st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search
        en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxsize)) # binary search
        hits = self.index[st:en] # this range of elements corresponds to the hits
        return [ h[1] for h in hits ] # return just the offsets

In [2]:
index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4)

In [3]:
index.query('CTTACTTA') # index: give us hints on where to look for this pattern


Out[3]:
[8, 12]

In [4]:
index.query('TTTTTTTT') # index: give us hints on where to look for this pattern


Out[4]:
[]

In [5]:
# Now let's make a similar class but using a Python dictionary instead of a sorted
# list.  A Python dictionary is basically a hash table.

class IndexHash(object):
    
    def __init__(self, t, ln, ival):
        """ Create index, extracting substrings of length 'ln' """
        self.t = t
        self.ln = ln
        self.ival = ival
        self.index = {}
        for i in range(len(t)-ln+1):
            substr = t[i:i+ln]
            if substr in self.index:
                self.index[substr].append(i) # substring already in dictionary
            else:
                self.index[substr] = [i] # add to dictionary
    
    def query(self, p):
        """ Return candidate alignments for p """
        return self.index.get(p[:self.ln], [])

In [6]:
index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4)

In [7]:
index2.query('CTTACTTA') # index: give us hints on where to look for this pattern


Out[7]:
[8, 12]

In [8]:
index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern


Out[8]:
[]