notebook.community

Edit and run



In [1]:

    
# Non-greedy SCS, for comparison:

import itertools

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest



In [2]:

    
def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps."""
    reada, readb = None, None
    best_olen = 0
    for a, b in itertools.permutations(reads, 2):
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen
    return reada, readb, best_olen

def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain. """
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[-(len(read_b) - olen):])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)



In [3]:

    
# Small example where brute-force gives shorter superstring than greedy
scs(['ABCD', 'CDBC', 'BCDA'])









    Out[3]:





'ABCDBCDA'



In [4]:

    
greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)









    Out[4]:





'CDBCABCDA'



In [5]:

    
# Example from lecture notes
greedy_sup = greedy_scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'], 1)



In [6]:

    
greedy_sup, len(greedy_sup)









    Out[6]:





('AAABBBABAAB', 11)



In [7]:

    
scs_sup = scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'])



In [8]:

    
# Greedy SCS superstring might be longer than optimal
scs_sup, len(scs_sup)









    Out[8]:





('BAAABABBBA', 10)