Experiments with "How can I split multiple joined words?" from Stack Overflow
In [1]:
import re
def words(text):
    """Return the lower-cased runs of ASCII letters found in *text*.

    The text is lower-cased first, then every maximal run of the letters
    a-z is collected in order of appearance. Any other character (digits,
    underscores, whitespace, punctuation) acts as a separator and is dropped.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]
In [12]:
# Toy corpus that the word-frequency table is built from.
# words() keeps only runs of a-z, so "die_hard" contributes "die" and "hard",
# and "cool7" contributes a second "cool".
sample_words = "a b c try big die_hard cool7 cool"
print(words(sample_words))
In [13]:
from itertools import groupby
# Word -> number of occurrences in the sample corpus.
# groupby() only merges *adjacent* equal items, hence the sort beforehand;
# as a side effect the keys are inserted in sorted order.
dictionary = {
    word: sum(1 for _ in run)
    for word, run in groupby(sorted(words(sample_words)))
}
print(dictionary)
In [14]:
# Length of the longest dictionary word; the segmenter uses it to bound how
# far back from each position it looks for a candidate word.
max_word_length = max(len(entry) for entry in dictionary)
# Total number of word occurrences, stored as a float so that word_prob()
# divides in floating point.
total = float(sum(dictionary.values()))
print(total)
print(max_word_length)
In [16]:
def word_prob(word):
    """Return the relative frequency of *word* in the sample corpus.

    Looks the word up in the module-level ``dictionary`` of counts and
    divides by the module-level ``total``. A word that is not in the
    dictionary has a count of 0 and therefore a probability of 0.
    """
    occurrences = dictionary.get(word, 0)
    return occurrences / total
# Spot-check word_prob(): 't' does not occur in the sample corpus, while
# 'a', 'hard' and 'cool' do.
for w in ('t', 'a', 'hard', 'cool'):
    print(w, word_prob(w))
In [17]:
def viterbi_segment(text, prob=None, max_len=None):
    """Split *text* into its most probable sequence of words.

    Dynamic programming over prefixes of *text*: for every end position the
    best-scoring way to segment ``text[:end]`` is recorded, where the score
    of a segmentation is the product of the probabilities of its words.
    The winning segmentation is then recovered by walking the recorded
    split points backwards from the end of the text.

    Args:
        text: The string of joined words to split.
        prob: Optional callable mapping a candidate word to its
            probability. Defaults to the module-level ``word_prob``.
        max_len: Optional length of the longest candidate word to try.
            Defaults to the module-level ``max_word_length``. Must be at
            least 1 for non-empty *text*, otherwise there are no
            candidates to choose from and ``max()`` raises ``ValueError``.

    Returns:
        A ``(segments, probability)`` tuple. Empty *text* yields
        ``([], 1.0)``. Candidates are compared as ``(score, start)``
        pairs, so among equal scores the latest split point wins; as a
        result a stretch that cannot be matched to any known word is
        returned as single characters and the overall probability is 0.0.
    """
    # Fall back to the model built at module level; resolved at call time so
    # later re-definitions of those globals are picked up.
    if prob is None:
        prob = word_prob
    if max_len is None:
        max_len = max_word_length

    best_scores = [1.0]  # best_scores[i]: best score for text[:i]
    split_points = [0]   # split_points[i]: start of the last word in text[:i]
    for end in range(1, len(text) + 1):
        earliest = max(0, end - max_len)
        score, start = max(
            (best_scores[begin] * prob(text[begin:end]), begin)
            for begin in range(earliest, end)
        )
        best_scores.append(score)
        split_points.append(start)

    # Walk the split points from the end of the text back to the beginning.
    segments = []
    end = len(text)
    while end > 0:
        start = split_points[end]
        segments.append(text[start:end])
        end = start
    segments.reverse()
    return segments, best_scores[-1]
In [20]:
# Exercise the segmenter on: the empty string, a string with no known words
# ('gogo'), a known word followed by an unknown one ('coolweather'), and two
# strings made entirely of known words ('hardtry', 'ac').
for w in ('', 'gogo', 'coolweather', 'hardtry', 'ac'):
    print(w, viterbi_segment(w))
In [ ]: