In [1]:
# given a long string, find shortest substring with alphabet in order

# relies on the string being in Unicode, but the logic of Find_AZ can easily be adapted to any ordered alphabet

# runs in O(N) time, with N=len(long string)
# and uses O(k) space, with k=size of alphabet

# .txt files from project Gutemberg https://www.gutenberg.org/

In [2]:
### - Preliminaries functions

In [3]:
def previous_char(c):
    # return the previous character in the alphabet
    # simple function but might be confusing if in main logic
    # can also be changed if a different alphabet is required
    # relies on unicode
    return chr(ord(c)-1)

assert previous_char('f')=='e'
assert

In [4]:
def pretty_print_AZ(solution_text):
    # function to hightlight the A-Z alphabet in a solution
    # will crash if run past the final Z
    
    BOLD = '\033[1m'
    RED = '\033[91m'
    END = '\033[0m'
    
    alphabet = [chr(c) for c in range(97, 97+26)]
    next_char = 0 
    
    r = solution_text[:]
    
    for p,c in enumerate(r):
        
        if c.lower()==alphabet[next_char]:
            r = r[:p+(13*next_char)] + BOLD+RED+c+END + r[p+1+(13*next_char):]
            next_char+=1
    
    print(r)
    
pretty_print_AZ("This is a test string with many characters to embolded and reddify, such as D and E and F, and so on...")


This is a test string with many characters to embolded and reddify, such as D and E and F, and so on...

In [5]:
#####
# MAIN FUNCTION
#####

In [6]:
def find_AZ(text):
    
    alphabet = [chr(c) for c in range(97, 97+26)]
    # list containing where to start the current substring that ends with character c
    # best to walk through an example to understand
    V = {c:None for c in alphabet}
    
    cur_best_start_end = None
    cur_best_len = float('inf')
    
    for p,c in enumerate(text):
                
        # convert char to lower case
        c = c.lower()
                
        # if it's an alphabet letter
        # relies on internal ord() function
        if ord(c)<97+26 and ord(c)>96:
                        
            # case 1: character is first of the alphabet
            if c=='a':
                V[c] = p

            # case 2: character is in the middle of alphabet
            else:
                # if the previous char has a value
                if V[previous_char(c)] != None: 
                    V[c] = V[previous_char(c)]
                    V[previous_char(c)] = None
                    
                # we could do a chec khere to see if p-V[c] < cur_best_len
                # but it wouldn't actually save anything
                # instead only check when you hit a Z character
                
            # debug print statements
#             if V[c] != None:
#                 print(p, c, V)

            # finally, if it's the end of the alphabet
            # and V['z'] exists
            if c=='z' and V[c]!=None:
                # len from finish to start
                cur_len = p-V[c]
                if cur_len < cur_best_len:
                    cur_best_len = cur_len
                    cur_best_start_end = (V[c], p)
                    
    print("final V: {}".format(V))
    return cur_best_start_end

In [7]:
r = find_AZ("aaaaabcdezzzfghijzzzklmnzzopqrzzstuvwxyzzzaabcdefghijklmnopqrstuvwxyzabcdeabcdabcaba")
print(r)


final V: {'a': 83, 'c': 78, 'b': 81, 'e': 69, 'd': 74, 'g': None, 'f': None, 'i': None, 'h': None, 'k': None, 'j': None, 'm': None, 'l': None, 'o': None, 'n': None, 'q': None, 'p': None, 's': None, 'r': None, 'u': None, 't': None, 'w': None, 'v': None, 'y': None, 'x': None, 'z': 43}
(43, 68)

In [8]:
# close to a worst case? not really, it's still constant
# just a cast where almost all the dic values are filled
r = find_AZ("zabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxabcdefghijklmnopqrstuvwabcdefghijklmnopqrstuvabcdefghijklmnopqrstuvabcdefghijklmnopqrstuabcdefghijklmnopqrstabcdefghijklmnopqrsabcdefghijklmnopqrabcdefghijklmnopqabcdefghijklmnopabcdefghijklmnoabcdefghijklmnabcdefghijklmabcdefghijklabcdefghijkabcdefghijabcdefghiabcdefghabcdefgabcdefabcdeabcdabcabaxyz")
print(r)


final V: {'a': 347, 'c': 342, 'b': 345, 'e': 333, 'd': 338, 'g': 320, 'f': 327, 'i': 303, 'h': 312, 'k': 282, 'j': 293, 'm': 257, 'l': 270, 'o': 228, 'n': 243, 'q': 195, 'p': 212, 's': 158, 'r': 177, 'u': 117, 't': 138, 'w': None, 'v': 95, 'y': None, 'x': None, 'z': 50}
(50, 350)

In [9]:
# now load a large unicode text

In [10]:
# Open a file: 
# file = open('bible_king_james.txt',mode='r')
file = open('heart_of_darkness.txt',mode='r')
# file = open('xmas_carol.txt',mode='r')
# file = open('frankenstein.txt',mode='r')
# file = open('huck_finn.txt',mode='r')
# file = open('dracula.txt',mode='r')
# file = open('war_peace.txt',mode='r')
# file = open('metamorphosis.txt',mode='r')
# file = open('works_of_poe.txt',mode='r')
# file = open('ulysses.txt',mode='r')

# read all lines at once
text = file.read()
 
# close the file
file.close()

In [11]:
# number of characters in the long string
len(text)


Out[11]:
234089

In [12]:
r = find_AZ(text)


final V: {'a': None, 'c': 234016, 'b': 234070, 'e': 233918, 'd': None, 'g': None, 'f': 233864, 'i': None, 'h': None, 'k': None, 'j': None, 'm': None, 'l': 233784, 'o': 233647, 'n': None, 'q': None, 'p': 232944, 's': None, 'r': None, 'u': None, 't': None, 'w': None, 'v': None, 'y': 231224, 'x': None, 'z': 226149}

In [13]:
print("Shortest solution interval is {}".format(r))


Shortest solution interval is (92395, 94143)

In [14]:
pretty_print_AZ(text[r[0]:r[1]+1])


a sluggish beetle crawling on the
floor of a lofty portico. It made you feel very small, very lost, and
yet it was not altogether depressing, that feeling. After all, if you
were small, the grimy beetle crawled on--which was just what you wanted
it to do. Where the pilgrims imagined it crawled to I don't know.
To some place where they expected to get something. I bet! For me it
crawled towards Kurtz--exclusively; but when the steam-pipes started
leaking we crawled very slow. The reaches opened before us and closed
behind, as if the forest had stepped leisurely across the water to bar
the way for our return. We penetrated deeper and deeper into the heart
of darkness. It was very quiet there. At night sometimes the roll of
drums behind the curtain of trees would run up the river and remain
sustained faintly, as if hovering in the air high over our heads, till
the first break of day. Whether it meant war, peace, or prayer we could
not tell. The dawns were heralded by the descent of a chill stillness;
the wood-cutters slept, their fires burned low; the snapping of a twig
would make you start. We were wanderers on a prehistoric earth, on an
earth that wore the aspect of an unknown planet. We could have fancied
ourselves the first of men taking possession of an accursed inheritance,
to be subdued at the cost of profound anguish and of excessive toil. But
suddenly, as we struggled round a bend, there would be a glimpse of rush
walls, of peaked grass-roofs, a burst of yells, a whirl of black limbs,
a mass of hands clapping of feet stamping, of bodies swaying, of eyes
rolling, under the droop of heavy and motionless foliage. The steamer
toiled along slowly on the edge of a black and incomprehensible frenz

In [ ]: