In [1]:
# given a long string, find shortest substring with alphabet in order
# relies on the string being in Unicode, but the logic of Find_AZ can easily be adapted to any ordered alphabet
# runs in O(N) time, with N=len(long string)
# and uses O(k) space, with k=size of alphabet
# .txt files from project Gutemberg https://www.gutenberg.org/
In [2]:
### - Preliminaries functions
In [3]:
def previous_char(c):
# return the previous character in the alphabet
# simple function but might be confusing if in main logic
# can also be changed if a different alphabet is required
# relies on unicode
return chr(ord(c)-1)
assert previous_char('f')=='e'
assert
In [4]:
def pretty_print_AZ(solution_text):
# function to hightlight the A-Z alphabet in a solution
# will crash if run past the final Z
BOLD = '\033[1m'
RED = '\033[91m'
END = '\033[0m'
alphabet = [chr(c) for c in range(97, 97+26)]
next_char = 0
r = solution_text[:]
for p,c in enumerate(r):
if c.lower()==alphabet[next_char]:
r = r[:p+(13*next_char)] + BOLD+RED+c+END + r[p+1+(13*next_char):]
next_char+=1
print(r)
pretty_print_AZ("This is a test string with many characters to embolded and reddify, such as D and E and F, and so on...")
In [5]:
#####
# MAIN FUNCTION
#####
In [6]:
def find_AZ(text):
alphabet = [chr(c) for c in range(97, 97+26)]
# list containing where to start the current substring that ends with character c
# best to walk through an example to understand
V = {c:None for c in alphabet}
cur_best_start_end = None
cur_best_len = float('inf')
for p,c in enumerate(text):
# convert char to lower case
c = c.lower()
# if it's an alphabet letter
# relies on internal ord() function
if ord(c)<97+26 and ord(c)>96:
# case 1: character is first of the alphabet
if c=='a':
V[c] = p
# case 2: character is in the middle of alphabet
else:
# if the previous char has a value
if V[previous_char(c)] != None:
V[c] = V[previous_char(c)]
V[previous_char(c)] = None
# we could do a chec khere to see if p-V[c] < cur_best_len
# but it wouldn't actually save anything
# instead only check when you hit a Z character
# debug print statements
# if V[c] != None:
# print(p, c, V)
# finally, if it's the end of the alphabet
# and V['z'] exists
if c=='z' and V[c]!=None:
# len from finish to start
cur_len = p-V[c]
if cur_len < cur_best_len:
cur_best_len = cur_len
cur_best_start_end = (V[c], p)
print("final V: {}".format(V))
return cur_best_start_end
In [7]:
r = find_AZ("aaaaabcdezzzfghijzzzklmnzzopqrzzstuvwxyzzzaabcdefghijklmnopqrstuvwxyzabcdeabcdabcaba")
print(r)
In [8]:
# close to a worst case? not really, it's still constant
# just a cast where almost all the dic values are filled
r = find_AZ("zabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxabcdefghijklmnopqrstuvwabcdefghijklmnopqrstuvabcdefghijklmnopqrstuvabcdefghijklmnopqrstuabcdefghijklmnopqrstabcdefghijklmnopqrsabcdefghijklmnopqrabcdefghijklmnopqabcdefghijklmnopabcdefghijklmnoabcdefghijklmnabcdefghijklmabcdefghijklabcdefghijkabcdefghijabcdefghiabcdefghabcdefgabcdefabcdeabcdabcabaxyz")
print(r)
In [9]:
# now load a large unicode text
In [10]:
# Open a file:
# file = open('bible_king_james.txt',mode='r')
file = open('heart_of_darkness.txt',mode='r')
# file = open('xmas_carol.txt',mode='r')
# file = open('frankenstein.txt',mode='r')
# file = open('huck_finn.txt',mode='r')
# file = open('dracula.txt',mode='r')
# file = open('war_peace.txt',mode='r')
# file = open('metamorphosis.txt',mode='r')
# file = open('works_of_poe.txt',mode='r')
# file = open('ulysses.txt',mode='r')
# read all lines at once
text = file.read()
# close the file
file.close()
In [11]:
# number of characters in the long string
len(text)
Out[11]:
In [12]:
r = find_AZ(text)
In [13]:
print("Shortest solution interval is {}".format(r))
In [14]:
pretty_print_AZ(text[r[0]:r[1]+1])
In [ ]: