In [1]:
import pandas as pd
import numpy as np
import math, sys, logging
logging.basicConfig()
log = logging.getLogger('dynamic')
log.setLevel(logging.INFO)

Shortest common supersequence

Ok, so I was confused here, I thought a supersequence was one that directly contained its subsequences intact, so like how "ABCDE" contains "ABC" and "CDE". But the definition of subsequence also includes "ACE" as a subsequence, because it contains the same elements, the same number of times, in the same order. This is the difference between subsequences and substrings.


In [2]:
a = {0: ('a', 'b'), 1: ('c', 'd')}
print(a[1])
df = pd.DataFrame(a)
print(df)
ssl = pd.DataFrame(-1, index=range(6), columns=range(5), dtype=int)
ssl
ssl.rank(1)
ssl.iat[3,4] = 1
ssl[:] = -1
ssl

sss = 'abcd'
print(sss[1:3])


('c', 'd')
   0  1
0  a  c
1  b  d
bc

In [3]:
def shortest_superseq(s1, s2):
    l1, l2 = len(s1), len(s2)
    seqs = pd.DataFrame(-1,
        index=range(l1+1), columns=range(l2+1),
        dtype=int)
    _ss(s1, s2, l1, l2, seqs)
    return seqs
    
def _ss(s1, s2, i, j, seqs):
    prev = seqs.iat[i,j]
    if prev != -1:
        return prev
    if i == 0:
        ret = j
    elif j == 0:
        ret = i
    elif s1[i-1] == s2[j-1]:
        ret = _ss(s1, s2, i-1, j-1, seqs) + 1
    else:
        r1 = _ss(s1, s2, i-1, j, seqs) + 1
        r2 = _ss(s1, s2, i, j-1, seqs) + 1
        ret = min(r1, r2)
    seqs.iat[i,j] = ret
    return ret

In [4]:
def find_supersequences(seqs, s1, s2):
    l1, l2 = len(s1), len(s2)
    sl = []
    msl = seqs.iat[l1, l2]
    _fss(seqs, s1, s2, l1, l2, sl, '')
    return sl
    
def _fss(seqs, s1, s2, i, j, sl, s):
    p = seqs.iat[i,j]
    if i == 0:
        sl.append(s2[0:j]+s)
    elif j == 0:
        sl.append(s1[0:i]+s)
    else:
        if s1[i-1] == s2[j-1]:
            _fss(seqs, s1, s2, i-1, j-1, sl, s1[i-1]+s)
        else:
            if seqs.iat[i,j] - 1 == seqs.iat[i-1,j]:
                _fss(seqs, s1, s2, i-1, j, sl, s1[i-1]+s)
            if seqs.iat[i,j] - 1 == seqs.iat[i,j-1]:
                _fss(seqs, s1, s2, i, j-1, sl, s2[j-1]+s)

In [5]:
s1 = 'ABCBDAB'
s2 = 'BDCABA'
seqs = shortest_superseq(s1, s2)
seqslabeled = seqs.copy()
seqslabeled.loc[1:,('str')] = list(s1)
seqslabeled = seqslabeled.T
seqslabeled.loc[1:-1,('str')] = list(s2)
print(seqslabeled)
seqlist = find_supersequences(seqs, s1, s2)
seqlist


       0   1   2   3   4   5   6   7  str
0     -1   1  -1   3  -1  -1  -1  -1  NaN
1      1   2   2   3   4  -1  -1  -1    B
2      2   3   3   4   5   5  -1  -1    D
3      3   4   4   4   5   6  -1  -1    C
4     -1   4   5   5   6   7   7  -1    A
5     -1  -1  -1  -1   6   7  -1   8    B
6     -1  -1  -1  -1  -1  -1   8   9    A
str  NaN   A   B   C   B   D   A   B  NaN
Out[5]:
['ABDCABDAB', 'ABDCBDABA', 'ABCBDCABA']

Word break problem

Given a dictionary and a condensed string of non-whitespace characters, split the string up into combinations of words in the dictionary. Both determine if it is possible, and find all the satisfying strings.


In [6]:
def word_break(dct, st):
    last = len(st)
    words = []
    if _wb(dct, st, 0, 1, last, words):
        return words
    else:
        return None

def _wb(dct, st, start, end, last, words):
    word = st[start:end]
    if (word in dct):
        log.debug('adding word: %s', word)
        words.append(word)
        if end == last:
            return True
        else:
            return _wb(dct, st, end, end+1, last, words)
    else:
        if end == last:
            return False
        else:
            return _wb(dct, st, start, end+1, last, words)

Dynamic approach

Given that this is from a dynamic programming list, it makes sense that this would exist. Once we identify any word or group of words from one portion of the string, and we know those are all the words in that substring, then we can do the same for the remaining string separately, and each combination of breakups of each of those two strings becomes the total combinations for all breaks that have a separation at that point. Each individual calculation is also a microcosm of the original problem, which is to say, the algorithm that solves the problem generally would be applied as-is to each substring.

Ex: "pickaxeatone" -> if separated into "pickaxe" and "atone", we can look at each separate string in isolation. If you look at "atone" and find "atone", "a tone", and "at one", then you know that the total combination involving that substring will be those combinations crossed with the combinations made from the remaining substring.

Therefore, what I want to do is constract a 2D array, where each place in the outer array represents the starting substring index, and each item in the inner array at that point is the list of words beginning at that point ("a" , "at", and "atone" in the prior example, if we start at "a"). Then to find the other combinations, you move forward in the outer array by the length of that string. Do that recursives and get all your combinations.


In [22]:
def word_break_complete(dct, st):
    size = len(st)
    prefixes = [None for _ in range(size)]
    build_pref_list(dct, st, 0, size, prefixes)
    log.debug('prefix list: %s', prefixes)
    break_list = []
    create_breaks(prefixes, break_list, size, 0, [])
    return break_list

def build_pref_list(dct, st, start, size, prefixes):
    if prefixes[start] is None:
        words = []
        prefixes[start] = words
        for i in range(start+1, size+1):
            word = st[start:i]
            if (word in dct):
                log.debug('adding word: %s', word)
                words.append(word)
                if i < size:
                    build_pref_list(dct, st, i, size, prefixes)
                    
def create_breaks(prefixes, break_list, size, i, l):
    if prefixes[i] is not None:
        for w in prefixes[i]:
            wl, ll = len(w), list(l)
            ll.append(w)
            if i+wl < size:
                create_breaks(prefixes, break_list, size, i+wl, ll)
            else:
                break_list.append(' '.join(ll))

In [23]:
log.setLevel(logging.DEBUG)
dct = { "this", "th", "is", "famous", "Word", "break", "b","r", "e", "a", "k", "br", "bre", "brea", "ak", "problem" }
st = "Wordbreakproblem"
print(word_break(dct, st))
print(word_break_complete(dct, st))


DEBUG:dynamic:adding word: Word
DEBUG:dynamic:adding word: b
DEBUG:dynamic:adding word: r
DEBUG:dynamic:adding word: e
DEBUG:dynamic:adding word: a
DEBUG:dynamic:adding word: k
DEBUG:dynamic:adding word: problem
DEBUG:dynamic:adding word: Word
DEBUG:dynamic:adding word: b
DEBUG:dynamic:adding word: r
DEBUG:dynamic:adding word: e
DEBUG:dynamic:adding word: a
DEBUG:dynamic:adding word: k
DEBUG:dynamic:adding word: problem
DEBUG:dynamic:adding word: ak
DEBUG:dynamic:adding word: br
DEBUG:dynamic:adding word: bre
DEBUG:dynamic:adding word: brea
DEBUG:dynamic:adding word: break
DEBUG:dynamic:prefix list: [['Word'], None, None, None, ['b', 'br', 'bre', 'brea', 'break'], ['r'], ['e'], ['a', 'ak'], ['k'], ['problem'], None, None, None, None, None, None]
['Word', 'b', 'r', 'e', 'a', 'k', 'problem']
['Word b r e a k problem', 'Word b r e ak problem', 'Word br e a k problem', 'Word br e ak problem', 'Word bre a k problem', 'Word bre ak problem', 'Word brea k problem', 'Word break problem']

Cheapest path matrix problem

You can only move right or down. Object is to get from top-left to bottom-right. The cost of the path is the sum of the numbers traversed. Find the lowest-cost path. Here is an example:

$$ \begin{equation*} \begin{bmatrix} \color{red}{4} & 7 & 8 & 6 & 4 \\ \color{red}6 & \color{red}{7} & \color{red}3 & 9 & 2 \\ 3 & 8 & \color{red}1 & \color{red}2 & 4 \\ 7 & 1 & 7 & \color{red}3 & \color{red}7 \\ 2 & 9 & 8 & 9 & \color{red}3 \\ \end{bmatrix} \end{equation*} $$

Path is highlighted in red.


In [41]:
def find_path(m):
    costs = np.full(fill_value=np.inf, shape=m.shape)
    w,h = m.shape
    costs[h-1, w-1] = m[h-1, w-1]
    _fp(m, costs, 0, 0)
    path = [(0,0)]
    i,j = 0,0
    while i+1 < h or j+1 < w:
        if i+1 == h:
            j += 1
        elif j+1 == w:
            i += 1
        elif costs[i+1,j] < costs[i,j+1]:
            i += 1
        else:
            j += 1
        path.append((i,j))
    return path

def _fp(m, costs, i, j):
    w,h = m.shape
    if i == h or j == w:
        return np.inf
    if costs[i,j] < np.inf:
        return costs[i,j]
    cost = min(_fp(m, costs, i+1, j),
               _fp(m, costs, i, j+1))
    costs[i,j] = cost + m[i,j]
    return costs[i,j]

In [51]:
m = np.array(
[[ 4, 7, 8, 6, 4 ],
[ 6, 7, 3, 9, 2 ],
[ 3, 8, 1, 2, 4 ],
[ 7, 1, 7, 3, 7 ],
[ 2, 9, 8, 9, 3 ]
])
print(find_path(m))
rm = np.random.randint(100, size=(20,20))
# print(rm)
path = find_path(rm)
print(path)
pdrm = pd.DataFrame(data=rm, dtype=str)
for i,j in path:
    pdrm.iloc[i,j] = '*' + pdrm.iloc[i,j]
# print(pdrm)


[(0, 0), (1, 0), (1, 1), (1, 2), (2, 2), (2, 3), (3, 3), (3, 4), (4, 4)]
[(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 1), (5, 1), (5, 2), (6, 2), (7, 2), (7, 3), (8, 3), (8, 4), (8, 5), (9, 5), (9, 6), (9, 7), (9, 8), (9, 9), (10, 9), (10, 10), (10, 11), (10, 12), (10, 13), (11, 13), (11, 14), (11, 15), (11, 16), (12, 16), (13, 16), (13, 17), (14, 17), (15, 17), (16, 17), (17, 17), (18, 17), (18, 18), (18, 19), (19, 19)]

Longest repeated subsequence

Given the definition of a subsequence above (as distinct from a substring) find the longest sequence that occurs twice in the given string. The two sequences must share no letters, meaning that if you identify and remove a subsequence from the original string, you must be able to do that same step again given the remainder.

How am I going to do this

First thing is to create a dict of the characters which points to the indexes where they show up.

Then, go through the string. If a character only shows up once in the whole string, it can be completely ignored, since it won't be in a repeated subsequence and it will not affect the presence of another subsequence. When we encounter a character that occurs more than once, then we have to consider if it will be in the repeated subsquence or not, as it's not guaranteed that it will be.

If we pick a letter, then we find its next pair, and check the letters in between. For each one in between, its pair must be after the pair for the previous letter, or else you cannot maintain the same order of the sequence. As soon as we pick the next paired letter, we have a new constraint for what that pair must be after. When there are no more letter pairs beginning in between the first and last of the found pairs, we have an isolated portion. Then, if the largest repeated subsequence contains the found sequences to this point, then the largest total repeated subsequence will be the found portion added to the longest repeated subsequence of the remaining string. That is a lot of words to describe what I am saying, I don't think that was very efficient. Here is an example:

[ADEFBGHCIABC][JKJK]

First we find A, jump ahead to the next A, which becomes the last character, scan in between until we find another paired letter, B, and extend the end to the next B, do it again with C. When we get to C, we will have scanned all the way to the end (the last C) without finding another character with a pair outside the end boundary, so we can consider that portion of the string exhausted. Then we can start again at the beginning of the remaining sequence, JKJK, and analyze that one separately.

The issue is, when we hit a paired character, we can't assume that character will be in the longest repeated subsequence. Here's an example:

ABCBCADEFEFDGHIHIG

A naive algorithm would find A and assume it is in the subsequence, whereas the actual longest ss is BCEFHI. To address this, we need to also consider the subsequence that results if we just ignore a pair we encounter each time we encounter it. I haven't gotten to this part yet.


In [175]:
def longest_repeat_subsequence(st):
    char_pos = {}
    char_arr = {}
    for i,c in enumerate(st):
        # if character is in the array,
        # take it out, and assign it to its pair
        try:
            i0 = char_pos[c]
            char_arr[i0] = i
            del char_pos[c]
        except KeyError:
            char_pos[c] = i
            
    log.debug("char maps: %s", char_arr)
    subseqs = _build_isolated_portions(st, char_arr)
    log.debug('sub sequences: %s', subseqs)
    seqs = _construct_full_sequences(subseqs, [], [], 0)
    return max(seqs, key=lambda s: len(s))
    
def _lrss_portion(st, char_arr, start):
    i, end = start, start
    seq, checked = [], set([])
    try:
        while True:
            c = st[i]
            try:
                pair = char_arr[i]
                if pair >= end:
                    log.debug('Found character with pair: %s, index %d',
                              c, i)
                    checked.update(range(end+1, pair))
                    end = pair
                    seq.append(c)
            except IndexError:
                pass
            i = checked.pop()
            log.debug("next i: %d", i)
    except KeyError:
        log.debug('No items left in queue to check, exiting')
    log.debug('end index: %d, returning sequence: %s', end, seq)
    return end, seq

def _build_isolated_portions(st, char_arr):
    end, subseqs = -1, []
    _tmpq = {}
    for start in range(len(st)):
        end, s = _lrss_portion(st, char_arr, start)
        if len(s) > 0:
            log.debug('in temp queue:')
            # Filter out sequences that are worse than the
            # one just found. If it ends later and is no bigger,
            # it is not needed.
            _tmpq = {k:v for k,v in
                     filter(lambda g: g[1][1] < end or
                                      len(g[1][2]) > len(s),
                            _tmpq.items())}
            if not end in _tmpq:
                grp = (start, end, s)            
                _tmpq[end] = grp
        try:
            grp = _tmpq[start]
            if grp is not None:
                subseqs.append(grp)
            del _tmpq[start]
        except KeyError:
            pass
#     for start, end, s in seqs:
        
    return subseqs

def _construct_full_sequences(subseqs, seqs, seq, start):
    end = start
    myseq = list(seq)
    for i,s in enumerate(subseqs):
        if s[0] < end:
            log.debug('found an overlapping subsequnce, checking that too: %s', s)
            _construct_full_sequences(subseqs[i:], seqs, seq, s[0])
        else:
            end = s[1]
            myseq.extend(s[2])
    seqs.append(myseq)
    return seqs

In [178]:
log.setLevel(logging.INFO)
st = 'ATACTCGGA'
st2 = 'ABCBCDEFEFGHIHIADG'
print(longest_repeat_subsequence(st))
print(longest_repeat_subsequence(st2))


['A', 'T', 'C', 'G']
['B', 'C', 'E', 'F', 'H', 'I']