In [1]:
# A simple example of a substring index; mirrors example from lecture notes
# we're going to extract 4 substrings like this:
# t: CGTGCCTACTTACTTACAT
# substring 1: CGTGC
# substring 2: CCTAC
# substring 3: CTTAC
# substring 4: CTTAC
t = 'CGTGCCTACTTACTTACAT'
In [2]:
# From t, make list of pairs, where first pair item is substring, second is its offset
def substringize(t, ln, iv):
# ln = length of substrings to extract
# iv = distance between substings to extract; e.g. 1 means take *every* substring
pairs = []
# Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)
for i in range(0, len(t) - ln + 1, iv):
pairs.append((t[i:i+ln], i))
return pairs
# Could also have used list comprehension:
# return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]
In [3]:
substringize('CGTGCCTACTTACTTACAT', 5, 4)
Out[3]:
In [4]:
# Like substringize, but uses a map data structure
def mapize(t, ln, iv):
index = {}
for i in range(0, len(t) - ln + 1, iv):
sub = t[i:i+ln]
if sub in index:
index[sub].append(i) # already have an entry; append to it
else:
index[sub] = [i] # don't yet have entry, make new one
return index
In [5]:
index = mapize('CGTGCCTACTTACTTACAT', 5, 4)
index
Out[5]:
In [6]:
p = 'CTTACTTA'
In [7]:
# index: give me a hint where I should look for occurrences of p in t
index[p[:5]]
Out[7]: