notebook.community

Edit and run



In [1]:

    
# A simple example of a substring index; mirrors example from lecture notes

# we're going to extract 4 substrings like this:
# t:           CGTGCCTACTTACTTACAT
# substring 1: CGTGC
# substring 2:     CCTAC
# substring 3:         CTTAC
# substring 4:             CTTAC
t = 'CGTGCCTACTTACTTACAT'



In [2]:

    
# From t, make list of pairs, where first pair item is substring, second is its offset
def substringize(t, ln, iv):
    # ln = length of substrings to extract
    # iv = distance between substings to extract; e.g. 1 means take *every* substring
    pairs = []
    # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)
    for i in range(0, len(t) - ln + 1, iv):
        pairs.append((t[i:i+ln], i))
    return pairs
    # Could also have used list comprehension:
    # return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]



In [3]:

    
substringize('CGTGCCTACTTACTTACAT', 5, 4)









    Out[3]:





[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]



In [4]:

    
# Like substringize, but uses a map data structure
def mapize(t, ln, iv):
    index = {}
    for i in range(0, len(t) - ln + 1, iv):
        sub = t[i:i+ln]
        if sub in index:
            index[sub].append(i) # already have an entry; append to it
        else:
            index[sub] = [i] # don't yet have entry, make new one
    return index



In [5]:

    
index = mapize('CGTGCCTACTTACTTACAT', 5, 4)
index









    Out[5]:





{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}



In [6]:

    
p = 'CTTACTTA'



In [7]:

    
# index: give me a hint where I should look for occurrences of p in t
index[p[:5]]









    Out[7]:





[8, 12]