Experiments with the Stack Overflow question "How can I split multiple joined words?"



In [1]:

    
import re

def words(text):
    """Return the lower-cased runs of ASCII letters found in ``text``.

    Anything that is not a letter a-z (digits, underscores, whitespace,
    punctuation) acts as a separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]



In [12]:

    
# Tiny corpus used to build the toy dictionary below. It deliberately mixes in
# an underscore ("die_hard") and a digit ("cool7") to show that words() treats
# every non-letter as a separator.
sample_words = "a b c try big die_hard cool7 cool"
print(words(sample_words))









    



['a', 'b', 'c', 'try', 'big', 'die', 'hard', 'cool', 'cool']



In [13]:

    
from itertools import groupby

# Word -> number of occurrences in the sample text. Sorting first places equal
# words next to each other, so each groupby run is exactly one distinct word
# and the length of the run is its count.
dictionary = {
    word: sum(1 for _ in run)
    for word, run in groupby(sorted(words(sample_words)))
}

print(dictionary)









    



{'big': 1, 'b': 1, 'hard': 1, 'a': 1, 'die': 1, 'cool': 2, 'c': 1, 'try': 1}



In [14]:

    
# Length of the longest dictionary word: bounds how far back the segmenter
# has to look for the start of a candidate word.
max_word_length = max(len(word) for word in dictionary)
# Total number of word occurrences, kept as a float so that the division in
# word_prob always yields a float.
total = float(sum(count for count in dictionary.values()))
print(total, max_word_length, sep='\n')



In [16]:

    
def word_prob(word):
    """Return the relative frequency of ``word`` in ``dictionary``.

    The count of ``word`` is divided by ``total`` (the sum of all counts);
    a word that is not in ``dictionary`` has count 0 and so probability 0.0.
    """
    count = dictionary[word] if word in dictionary else 0
    return count / total

# Spot-check word_prob: 't' is not a dictionary word, so it scores 0.0, while
# 'cool' occurs twice in the sample and scores twice as high as 'a' or 'hard'.
for w in ['t', 'a', 'hard', 'cool']:
    print(w, word_prob(w))









    



t 0.0
a 0.1111111111111111
hard 0.1111111111111111
cool 0.2222222222222222



In [17]:

    
def viterbi_segment(text):
    """Split ``text`` into the most probable sequence of words.

    Works through every prefix ``text[:end]`` in increasing length. For each
    prefix it records the best achievable score (the product of ``word_prob``
    over the chosen pieces) and the index at which the final piece starts.
    Only pieces of at most ``max_word_length`` characters are considered.
    Candidates are compared as ``(score, start)`` tuples, so when scores tie
    (for example when every candidate scores 0.0) the rightmost start wins
    and the final piece is as short as possible.

    Returns a ``(pieces, score)`` tuple, where ``pieces`` is the list of
    substrings in order and ``score`` is the best score for the whole text.
    An empty ``text`` yields ``([], 1.0)``.
    """
    length = len(text)
    best_score = [1.0]  # best_score[end]: best score for text[:end]
    piece_start = [0]   # piece_start[end]: where the last piece of text[:end] begins

    for end in range(1, length + 1):
        earliest = max(0, end - max_word_length)
        candidates = [
            (best_score[start] * word_prob(text[start:end]), start)
            for start in range(earliest, end)
        ]
        score, start = max(candidates)
        best_score.append(score)
        piece_start.append(start)

    # Follow the recorded start indices back from the end of the text; the
    # pieces come out last-to-first, so flip them before returning.
    backwards = []
    end = length
    while end > 0:
        start = piece_start[end]
        backwards.append(text[start:end])
        end = start
    return backwards[::-1], best_score[-1]



In [20]:

    
# Try the segmenter on: the empty string, text containing no dictionary words
# ('gogo'), a dictionary word followed by unknown text ('coolweather'), and
# two concatenations of dictionary words ('hardtry', 'ac').
for w in ['', 'gogo', 'coolweather', 'hardtry', 'ac']:
    print(w, viterbi_segment(w))









    



 ([], 1.0)
gogo (['g', 'o', 'g', 'o'], 0.0)
coolweather (['cool', 'w', 'e', 'a', 't', 'h', 'e', 'r'], 0.0)
hardtry (['hard', 'try'], 0.012345679012345678)
ac (['a', 'c'], 0.012345679012345678)



In [ ]: