In [1]:
# A simple example of a substring index; mirrors example from lecture notes

# we're going to extract 4 substrings like this:
# t:           CGTGCCTACTTACTTACAT
# substring 1: CGTGC
# substring 2:     CCTAC
# substring 3:         CTTAC
# substring 4:             CTTAC
t = 'CGTGCCTACTTACTTACAT'

In [2]:
# From t, make list of pairs, where first pair item is substring, second is its offset
def substringize(t, ln, iv):
    # ln = length of substrings to extract
    # iv = distance between substings to extract; e.g. 1 means take *every* substring
    pairs = []
    # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)
    for i in range(0, len(t) - ln + 1, iv):
        pairs.append((t[i:i+ln], i))
    return pairs
    # Could also have used list comprehension:
    # return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]

In [3]:
substringize('CGTGCCTACTTACTTACAT', 5, 4)


Out[3]:
[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]

In [4]:
# Like substringize, but uses a map data structure
def mapize(t, ln, iv):
    index = {}
    for i in range(0, len(t) - ln + 1, iv):
        sub = t[i:i+ln]
        if sub in index:
            index[sub].append(i) # already have an entry; append to it
        else:
            index[sub] = [i] # don't yet have entry, make new one
    return index

In [5]:
index = mapize('CGTGCCTACTTACTTACAT', 5, 4)
index


Out[5]:
{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}

In [6]:
p = 'CTTACTTA'

In [7]:
# index: give me a hint where I should look for occurrences of p in t
index[p[:5]]


Out[7]:
[8, 12]