In [1]:
import pandas as pd
import numpy as np
import math, sys, logging
logging.basicConfig()
log = logging.getLogger('dynamic')
log.setLevel(logging.INFO)
Ok, so I was confused here, I thought a supersequence was one that directly contained its subsequences intact, so like how "ABCDE" contains "ABC" and "CDE". But the definition of subsequence also includes "ACE" as a subsequence, because it contains the same elements, the same number of times, in the same order. This is the difference between subsequences and substrings.
In [2]:
a = {0: ('a', 'b'), 1: ('c', 'd')}
print(a[1])
df = pd.DataFrame(a)
print(df)
ssl = pd.DataFrame(-1, index=range(6), columns=range(5), dtype=int)
ssl
ssl.rank(1)
ssl.iat[3,4] = 1
ssl[:] = -1
ssl
sss = 'abcd'
print(sss[1:3])
In [3]:
def shortest_superseq(s1, s2):
l1, l2 = len(s1), len(s2)
seqs = pd.DataFrame(-1,
index=range(l1+1), columns=range(l2+1),
dtype=int)
_ss(s1, s2, l1, l2, seqs)
return seqs
def _ss(s1, s2, i, j, seqs):
prev = seqs.iat[i,j]
if prev != -1:
return prev
if i == 0:
ret = j
elif j == 0:
ret = i
elif s1[i-1] == s2[j-1]:
ret = _ss(s1, s2, i-1, j-1, seqs) + 1
else:
r1 = _ss(s1, s2, i-1, j, seqs) + 1
r2 = _ss(s1, s2, i, j-1, seqs) + 1
ret = min(r1, r2)
seqs.iat[i,j] = ret
return ret
In [4]:
def find_supersequences(seqs, s1, s2):
l1, l2 = len(s1), len(s2)
sl = []
msl = seqs.iat[l1, l2]
_fss(seqs, s1, s2, l1, l2, sl, '')
return sl
def _fss(seqs, s1, s2, i, j, sl, s):
p = seqs.iat[i,j]
if i == 0:
sl.append(s2[0:j]+s)
elif j == 0:
sl.append(s1[0:i]+s)
else:
if s1[i-1] == s2[j-1]:
_fss(seqs, s1, s2, i-1, j-1, sl, s1[i-1]+s)
else:
if seqs.iat[i,j] - 1 == seqs.iat[i-1,j]:
_fss(seqs, s1, s2, i-1, j, sl, s1[i-1]+s)
if seqs.iat[i,j] - 1 == seqs.iat[i,j-1]:
_fss(seqs, s1, s2, i, j-1, sl, s2[j-1]+s)
In [5]:
s1 = 'ABCBDAB'
s2 = 'BDCABA'
seqs = shortest_superseq(s1, s2)
seqslabeled = seqs.copy()
seqslabeled.loc[1:,('str')] = list(s1)
seqslabeled = seqslabeled.T
seqslabeled.loc[1:-1,('str')] = list(s2)
print(seqslabeled)
seqlist = find_supersequences(seqs, s1, s2)
seqlist
Out[5]:
In [6]:
def word_break(dct, st):
last = len(st)
words = []
if _wb(dct, st, 0, 1, last, words):
return words
else:
return None
def _wb(dct, st, start, end, last, words):
word = st[start:end]
if (word in dct):
log.debug('adding word: %s', word)
words.append(word)
if end == last:
return True
else:
return _wb(dct, st, end, end+1, last, words)
else:
if end == last:
return False
else:
return _wb(dct, st, start, end+1, last, words)
Given that this is from a dynamic programming list, it makes sense that this would exist. Once we identify any word or group of words from one portion of the string, and we know those are all the words in that substring, then we can do the same for the remaining string separately, and each combination of breakups of each of those two strings becomes the total combinations for all breaks that have a separation at that point. Each individual calculation is also a microcosm of the original problem, which is to say, the algorithm that solves the problem generally would be applied as-is to each substring.
Ex: "pickaxeatone" -> if separated into "pickaxe" and "atone", we can look at each separate string in isolation. If you look at "atone" and find "atone", "a tone", and "at one", then you know that the total combination involving that substring will be those combinations crossed with the combinations made from the remaining substring.
Therefore, what I want to do is constract a 2D array, where each place in the outer array represents the starting substring index, and each item in the inner array at that point is the list of words beginning at that point ("a" , "at", and "atone" in the prior example, if we start at "a"). Then to find the other combinations, you move forward in the outer array by the length of that string. Do that recursives and get all your combinations.
In [22]:
def word_break_complete(dct, st):
size = len(st)
prefixes = [None for _ in range(size)]
build_pref_list(dct, st, 0, size, prefixes)
log.debug('prefix list: %s', prefixes)
break_list = []
create_breaks(prefixes, break_list, size, 0, [])
return break_list
def build_pref_list(dct, st, start, size, prefixes):
if prefixes[start] is None:
words = []
prefixes[start] = words
for i in range(start+1, size+1):
word = st[start:i]
if (word in dct):
log.debug('adding word: %s', word)
words.append(word)
if i < size:
build_pref_list(dct, st, i, size, prefixes)
def create_breaks(prefixes, break_list, size, i, l):
if prefixes[i] is not None:
for w in prefixes[i]:
wl, ll = len(w), list(l)
ll.append(w)
if i+wl < size:
create_breaks(prefixes, break_list, size, i+wl, ll)
else:
break_list.append(' '.join(ll))
In [23]:
log.setLevel(logging.DEBUG)
dct = { "this", "th", "is", "famous", "Word", "break", "b","r", "e", "a", "k", "br", "bre", "brea", "ak", "problem" }
st = "Wordbreakproblem"
print(word_break(dct, st))
print(word_break_complete(dct, st))
You can only move right or down. Object is to get from top-left to bottom-right. The cost of the path is the sum of the numbers traversed. Find the lowest-cost path. Here is an example:
$$ \begin{equation*} \begin{bmatrix} \color{red}{4} & 7 & 8 & 6 & 4 \\ \color{red}6 & \color{red}{7} & \color{red}3 & 9 & 2 \\ 3 & 8 & \color{red}1 & \color{red}2 & 4 \\ 7 & 1 & 7 & \color{red}3 & \color{red}7 \\ 2 & 9 & 8 & 9 & \color{red}3 \\ \end{bmatrix} \end{equation*} $$Path is highlighted in red.
In [41]:
def find_path(m):
costs = np.full(fill_value=np.inf, shape=m.shape)
w,h = m.shape
costs[h-1, w-1] = m[h-1, w-1]
_fp(m, costs, 0, 0)
path = [(0,0)]
i,j = 0,0
while i+1 < h or j+1 < w:
if i+1 == h:
j += 1
elif j+1 == w:
i += 1
elif costs[i+1,j] < costs[i,j+1]:
i += 1
else:
j += 1
path.append((i,j))
return path
def _fp(m, costs, i, j):
w,h = m.shape
if i == h or j == w:
return np.inf
if costs[i,j] < np.inf:
return costs[i,j]
cost = min(_fp(m, costs, i+1, j),
_fp(m, costs, i, j+1))
costs[i,j] = cost + m[i,j]
return costs[i,j]
In [51]:
m = np.array(
[[ 4, 7, 8, 6, 4 ],
[ 6, 7, 3, 9, 2 ],
[ 3, 8, 1, 2, 4 ],
[ 7, 1, 7, 3, 7 ],
[ 2, 9, 8, 9, 3 ]
])
print(find_path(m))
rm = np.random.randint(100, size=(20,20))
# print(rm)
path = find_path(rm)
print(path)
pdrm = pd.DataFrame(data=rm, dtype=str)
for i,j in path:
pdrm.iloc[i,j] = '*' + pdrm.iloc[i,j]
# print(pdrm)
Given the definition of a subsequence above (as distinct from a substring) find the longest sequence that occurs twice in the given string. The two sequences must share no letters, meaning that if you identify and remove a subsequence from the original string, you must be able to do that same step again given the remainder.
First thing is to create a dict of the characters which points to the indexes where they show up.
Then, go through the string. If a character only shows up once in the whole string, it can be completely ignored, since it won't be in a repeated subsequence and it will not affect the presence of another subsequence. When we encounter a character that occurs more than once, then we have to consider if it will be in the repeated subsquence or not, as it's not guaranteed that it will be.
If we pick a letter, then we find its next pair, and check the letters in between. For each one in between, its pair must be after the pair for the previous letter, or else you cannot maintain the same order of the sequence. As soon as we pick the next paired letter, we have a new constraint for what that pair must be after. When there are no more letter pairs beginning in between the first and last of the found pairs, we have an isolated portion. Then, if the largest repeated subsequence contains the found sequences to this point, then the largest total repeated subsequence will be the found portion added to the longest repeated subsequence of the remaining string. That is a lot of words to describe what I am saying, I don't think that was very efficient. Here is an example:
[ADEFBGHCIABC][JKJK]
First we find A, jump ahead to the next A, which becomes the last character, scan in between until we find another paired letter, B, and extend the end to the next B, do it again with C. When we get to C, we will have scanned all the way to the end (the last C) without finding another character with a pair outside the end boundary, so we can consider that portion of the string exhausted. Then we can start again at the beginning of the remaining sequence, JKJK, and analyze that one separately.
The issue is, when we hit a paired character, we can't assume that character will be in the longest repeated subsequence. Here's an example:
ABCBCADEFEFDGHIHIG
A naive algorithm would find A and assume it is in the subsequence, whereas the actual longest ss is BCEFHI
. To address this, we need to also consider the subsequence that results if we just ignore a pair we encounter each time we encounter it. I haven't gotten to this part yet.
In [175]:
def longest_repeat_subsequence(st):
char_pos = {}
char_arr = {}
for i,c in enumerate(st):
# if character is in the array,
# take it out, and assign it to its pair
try:
i0 = char_pos[c]
char_arr[i0] = i
del char_pos[c]
except KeyError:
char_pos[c] = i
log.debug("char maps: %s", char_arr)
subseqs = _build_isolated_portions(st, char_arr)
log.debug('sub sequences: %s', subseqs)
seqs = _construct_full_sequences(subseqs, [], [], 0)
return max(seqs, key=lambda s: len(s))
def _lrss_portion(st, char_arr, start):
i, end = start, start
seq, checked = [], set([])
try:
while True:
c = st[i]
try:
pair = char_arr[i]
if pair >= end:
log.debug('Found character with pair: %s, index %d',
c, i)
checked.update(range(end+1, pair))
end = pair
seq.append(c)
except IndexError:
pass
i = checked.pop()
log.debug("next i: %d", i)
except KeyError:
log.debug('No items left in queue to check, exiting')
log.debug('end index: %d, returning sequence: %s', end, seq)
return end, seq
def _build_isolated_portions(st, char_arr):
end, subseqs = -1, []
_tmpq = {}
for start in range(len(st)):
end, s = _lrss_portion(st, char_arr, start)
if len(s) > 0:
log.debug('in temp queue:')
# Filter out sequences that are worse than the
# one just found. If it ends later and is no bigger,
# it is not needed.
_tmpq = {k:v for k,v in
filter(lambda g: g[1][1] < end or
len(g[1][2]) > len(s),
_tmpq.items())}
if not end in _tmpq:
grp = (start, end, s)
_tmpq[end] = grp
try:
grp = _tmpq[start]
if grp is not None:
subseqs.append(grp)
del _tmpq[start]
except KeyError:
pass
# for start, end, s in seqs:
return subseqs
def _construct_full_sequences(subseqs, seqs, seq, start):
end = start
myseq = list(seq)
for i,s in enumerate(subseqs):
if s[0] < end:
log.debug('found an overlapping subsequnce, checking that too: %s', s)
_construct_full_sequences(subseqs[i:], seqs, seq, s[0])
else:
end = s[1]
myseq.extend(s[2])
seqs.append(myseq)
return seqs
In [178]:
log.setLevel(logging.INFO)
st = 'ATACTCGGA'
st2 = 'ABCBCDEFEFGHIHIADG'
print(longest_repeat_subsequence(st))
print(longest_repeat_subsequence(st2))