In [1]:
import re

def words(text):
    """Return the lowercase alphabetic tokens found in ``text``.

    The text is lowercased first, then every maximal run of the letters
    ``a``-``z`` is collected in order of appearance. Any other character
    (digits, underscores, whitespace, punctuation) acts as a separator and
    is dropped.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

In [12]:
# Toy corpus used to build the word-frequency dictionary in the following cells.
# "die_hard" and "cool7" are included to show that words() splits on, and
# discards, any character that is not a letter.
sample_words = "a b c try big die_hard cool7 cool"
print(words(sample_words))


['a', 'b', 'c', 'try', 'big', 'die', 'hard', 'cool', 'cool']

In [13]:
from itertools import groupby

# Word -> number of occurrences in the sample corpus. Sorting first puts equal
# words next to each other, so groupby() yields exactly one group per
# distinct word and the size of that group is the word's count.
sorted_tokens = sorted(words(sample_words))
dictionary = {
    word: sum(1 for _ in occurrences)
    for word, occurrences in groupby(sorted_tokens)
}

print(dictionary)


{'big': 1, 'b': 1, 'hard': 1, 'a': 1, 'die': 1, 'cool': 2, 'c': 1, 'try': 1}

In [14]:
# Length of the longest known word; iterating a dict yields its keys (the words).
max_word_length = max(len(word) for word in dictionary)
# Total number of word occurrences, stored as a float so that dividing a
# count by it always produces a float.
total = float(sum(dictionary.values()))
print(total)
print(max_word_length)


9.0
4

In [16]:
def word_prob(word):
    """Return the relative frequency of ``word`` in the corpus.

    The count is looked up in the module-level ``dictionary`` (missing words
    count as 0) and divided by the module-level ``total``.
    """
    occurrences = dictionary.get(word, 0)
    return occurrences / total

# Spot-check word_prob(): 't' is not in the dictionary, so its probability is
# 0.0; 'a' and 'hard' each occur once and 'cool' occurs twice.
for w in ['t', 'a', 'hard', 'cool']:
    print(w, word_prob(w))


t 0.0
a 0.1111111111111111
hard 0.1111111111111111
cool 0.2222222222222222

In [17]:
def viterbi_segment(text):
    """Split ``text`` into the highest-probability sequence of words.

    Dynamic programming over prefixes of ``text``: for every end position the
    best-scoring last word is chosen, where a segmentation's score is the
    product of ``word_prob`` over its words. Only candidate words of at most
    ``max_word_length`` characters are considered.

    Returns a tuple ``(segments, probability)``: the list of words covering
    ``text`` from left to right and the score of that segmentation. For an
    empty ``text`` the result is ``([], 1.0)``.
    """
    text_length = len(text)
    # best_prob[end]  -> best score for segmenting text[:end]
    # best_start[end] -> start index of the last word in that best segmentation
    best_prob = [1.0]
    best_start = [0]
    for end in range(1, text_length + 1):
        earliest = max(0, end - max_word_length)
        candidates = [
            (best_prob[start] * word_prob(text[start:end]), start)
            for start in range(earliest, end)
        ]
        # Tuples compare by score first and then by start index, so on equal
        # scores the latest split point (i.e. the shortest last word) wins.
        score, chosen_start = max(candidates)
        best_prob.append(score)
        best_start.append(chosen_start)

    # Follow the back-pointers from the end of the text to the beginning;
    # this collects the words last-to-first.
    backwards = []
    end = text_length
    while end > 0:
        start = best_start[end]
        backwards.append(text[start:end])
        end = start
    return backwards[::-1], best_prob[-1]

In [20]:
# Exercise viterbi_segment() on: the empty string, inputs containing text that
# is not in the dictionary ('gogo', 'coolweather' -- their probability is 0.0
# and the unknown parts come back as single characters), and inputs made up
# entirely of dictionary words ('hardtry', 'ac').
for w in ['', 'gogo', 'coolweather', 'hardtry', 'ac']:
    print(w, viterbi_segment(w))


 ([], 1.0)
gogo (['g', 'o', 'g', 'o'], 0.0)
coolweather (['cool', 'w', 'e', 'a', 't', 'h', 'e', 'r'], 0.0)
hardtry (['hard', 'try'], 0.012345679012345678)
ac (['a', 'c'], 0.012345679012345678)

In [ ]: