In [1]:
# Non-greedy SCS, for comparison:
import itertools
def overlap(a, b, min_length=3):
""" Return length of longest suffix of 'a' matching
a prefix of 'b' that is at least 'min_length'
characters long. If no such overlap exists,
return 0. """
start = 0 # start all the way at the left
while True:
start = a.find(b[:min_length], start) # look for b's suffx in a
if start == -1: # no more occurrences to right
return 0
# found occurrence; check for full suffix/prefix match
if b.startswith(a[start:]):
return len(a)-start
start += 1 # move just past previous match
def scs(ss):
""" Returns shortest common superstring of given
strings, which must be the same length """
shortest_sup = None
for ssperm in itertools.permutations(ss):
sup = ssperm[0] # superstring starts as first string
for i in range(len(ss)-1):
# overlap adjacent strings A and B in the permutation
olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
# add non-overlapping portion of B to superstring
sup += ssperm[i+1][olen:]
if shortest_sup is None or len(sup) < len(shortest_sup):
shortest_sup = sup # found shorter superstring
return shortest_sup # return shortest
In [2]:
def pick_maximal_overlap(reads, k):
""" Return a pair of reads from the list with a
maximal suffix/prefix overlap >= k. Returns
overlap length 0 if there are no such overlaps."""
reada, readb = None, None
best_olen = 0
for a, b in itertools.permutations(reads, 2):
olen = overlap(a, b, min_length=k)
if olen > best_olen:
reada, readb = a, b
best_olen = olen
return reada, readb, best_olen
def greedy_scs(reads, k):
""" Greedy shortest-common-superstring merge.
Repeat until no edges (overlaps of length >= k)
remain. """
read_a, read_b, olen = pick_maximal_overlap(reads, k)
while olen > 0:
reads.remove(read_a)
reads.remove(read_b)
reads.append(read_a + read_b[-(len(read_b) - olen):])
read_a, read_b, olen = pick_maximal_overlap(reads, k)
return ''.join(reads)
In [3]:
# Small example where brute-force gives shorter superstring than greedy
scs(['ABCD', 'CDBC', 'BCDA'])
Out[3]:
In [4]:
greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)
Out[4]:
In [5]:
# Example from lecture notes
greedy_sup = greedy_scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'], 1)
In [6]:
greedy_sup, len(greedy_sup)
Out[6]:
In [7]:
scs_sup = scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'])
In [8]:
# Greedy SCS superstring might be longer than optimal
scs_sup, len(scs_sup)
Out[8]: