``````

In :

# A simple example of a substring index; mirrors example from lecture notes

# we're going to extract 4 substrings like this:
# t:           CGTGCCTACTTACTTACAT
# substring 1: CGTGC
# substring 2:     CCTAC
# substring 3:         CTTAC
# substring 4:             CTTAC
t = 'CGTGCCTACTTACTTACAT'

``````
``````

In :

# From t, make list of pairs, where first pair item is substring, second is its offset
def substringize(t, ln, iv):
# ln = length of substrings to extract
# iv = distance between substings to extract; e.g. 1 means take *every* substring
pairs = []
# Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)
for i in range(0, len(t) - ln + 1, iv):
pairs.append((t[i:i+ln], i))
return pairs
# Could also have used list comprehension:
# return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]

``````
``````

In :

substringize('CGTGCCTACTTACTTACAT', 5, 4)

``````
``````

Out:

[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]

``````
``````

In :

# Like substringize, but uses a map data structure
def mapize(t, ln, iv):
index = {}
for i in range(0, len(t) - ln + 1, iv):
sub = t[i:i+ln]
if sub in index:
index[sub].append(i) # already have an entry; append to it
else:
index[sub] = [i] # don't yet have entry, make new one
return index

``````
``````

In :

index = mapize('CGTGCCTACTTACTTACAT', 5, 4)
index

``````
``````

Out:

{'CCTAC': , 'CGTGC': , 'CTTAC': [8, 12]}

``````
``````

In :

p = 'CTTACTTA'

``````
``````

In :

# index: give me a hint where I should look for occurrences of p in t
index[p[:5]]

``````
``````

Out:

[8, 12]

``````