``````

In [1]:

# Let's make a class that implements an inverted (substring) index where the map data
# structure is a sorted list of (substring, offset) pairs.  Query w/ binary search.

import sys
import bisect # for binary search

class IndexSorted(object):

def __init__(self, t, ln, ival):
""" Create index, extracting substrings of length 'ln' """
self.t = t
self.ln = ln
self.ival = ival
self.index = []
for i in range(len(t)-ln+1):
self.index.append((t[i:i+ln], i)) # add <substr, offset> pair
self.index.sort() # sort pairs

def query(self, p):
""" Return candidate alignments for p """
st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search
en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxsize)) # binary search
hits = self.index[st:en] # this range of elements corresponds to the hits
return [ h[1] for h in hits ] # return just the offsets

``````
``````

In [2]:

index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4)

``````
``````

In [3]:

index.query('CTTACTTA') # index: give us hints on where to look for this pattern

``````
``````

Out[3]:

[8, 12]

``````
``````

In [4]:

index.query('TTTTTTTT') # index: give us hints on where to look for this pattern

``````
``````

Out[4]:

[]

``````
``````

In [5]:

# Now let's make a similar class but using a Python dictionary instead of a sorted
# list.  A Python dictionary is basically a hash table.

class IndexHash(object):

def __init__(self, t, ln, ival):
""" Create index, extracting substrings of length 'ln' """
self.t = t
self.ln = ln
self.ival = ival
self.index = {}
for i in range(len(t)-ln+1):
substr = t[i:i+ln]
if substr in self.index:
self.index[substr].append(i) # substring already in dictionary
else:
self.index[substr] = [i] # add to dictionary

def query(self, p):
""" Return candidate alignments for p """
return self.index.get(p[:self.ln], [])

``````
``````

In [6]:

index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4)

``````
``````

In [7]:

index2.query('CTTACTTA') # index: give us hints on where to look for this pattern

``````
``````

Out[7]:

[8, 12]

``````
``````

In [8]:

index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern

``````
``````

Out[8]:

[]

``````