In [1]:
"""Imports that we'll want throughout this notebook"""
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
In [5]:
"""Load the words set"""
from string import ascii_lowercase
import re
def get_word(word_candidate):
    """
    Return the word contained in ``word_candidate``, or None if the line
    is not a single word.

    A candidate is a word if it comprises word characters (letters,
    digits, underscore -- the regex class ``\\w``) and is optionally
    terminated with trailing whitespace.
    """
    word_pattern = r'^(?P<word>\w+)[\r\n\t ]*$'
    m = re.match(word_pattern, word_candidate)
    if m is None:
        return None
    return m.group('word')
# Build the word set from Moby Words, calling get_word only once per line
# (the original called it twice: once in the filter, once in the value).
with open('354984si.ngl') as f:
    wordset = set(filter(None, (get_word(line) for line in f)))
# Quirk of Moby Words: remove all one-letter words but 'a' and 'I'
wordset = {w for w in wordset if len(w) > 1} | {'a', 'i'}
wordset |= {''}  # empty string needs to be in there as a base case
In [ ]:
"""
Find the longest recursively-decomposable subword: bottom-up
Start with ["a", "i"], add letters in each position.
"""
from string import ascii_lowercase
from collections import deque
def bottom_up(wordset):
    """
    Find the longest recursively-decomposable words, bottom-up.

    Starting from the empty string, insert a single letter at every
    position; a candidate is recorded and expanded only if it is itself
    in ``wordset``, so every word reached is recursively decomposable.

    Returns ``(longest_words, results)``, where ``results`` maps each
    shorter word to a list of ``(inserted_char, longer_word)``
    derivations.  Note ``results['']`` also contains the bootstrap entry
    ``('', '')``.
    """
    results = {}
    longest_words = ['']
    # Stack of (candidate, inserted_char, parent_word) triples.
    Q = deque([('', '', '')])
    seen = set()      # (word, delta, child) triples already recorded
    expanded = set()  # words whose extensions were already enqueued
    while Q:
        word, delta, child = Q.pop()
        if word not in wordset:
            continue
        key = (word, delta, child)
        # Without this guard, words reachable along many derivation
        # paths are re-processed once per path (exponential blowup) and
        # results accumulates duplicate entries.
        if key in seen:
            continue
        seen.add(key)
        results.setdefault(child, []).append((delta, word))
        if len(word) > len(longest_words[0]):
            longest_words = [word]
        elif len(word) == len(longest_words[0]):
            longest_words.append(word)
        if word in expanded:
            continue
        expanded.add(word)
        # Try inserting every letter at every position.
        for i in range(len(word) + 1):
            for c in ascii_lowercase:
                Q.append((word[:i] + c + word[i:], c, word))
    return longest_words, results
# Run the bottom-up search over the full word set; display the distinct
# longest recursively-decomposable words found.
bu_list, bu_results = bottom_up(wordset)
set(bu_list)
In [ ]:
"""
Find the subword graph, from root to longest
"""
from collections import deque
from graph_tool import Graph, PropertyMap
def build_bottomup_subword_graph(bu_results):
    """
    Build the subword derivation graph from '' up to the longest words.

    bu_results: shorter -> [(inserted_char, longer_word)], as produced
        by ``bottom_up``.

    Returns ``(G, names, deltas, root)``: the graph, a vertex property
    holding each vertex's word, an edge property holding the inserted
    character, and the vertex for the empty string.
    """
    G = Graph(directed=False)
    names = G.new_vertex_property('string')
    deltas = G.new_edge_property('string')
    # Add '' first to prevent an infinite loop: bu_results[''] contains
    # the bootstrap entry ('', ''), so '' must already be known.
    v = G.add_vertex()
    names[v] = ''
    w2v = {'': v}
    # Create a vertex for every word reachable from ''.
    Q = deque(longer for _, longer in bu_results[''])
    while len(Q) > 0:
        word = Q.pop()
        if word not in w2v:
            v = G.add_vertex()
            w2v[word] = v
            names[v] = word
            for _, longer in bu_results.get(word, []):
                Q.append(longer)
    # Add the character-insertion transitions as edges.
    for w, v in w2v.items():
        for delta, longer in bu_results.get(w, []):
            e = G.add_edge(v, w2v[longer])
            deltas[e] = delta
    return G, names, deltas, w2v['']
# Build the graph over the bottom-up results; ``root`` is the '' vertex.
G, names, deltas, root = build_bottomup_subword_graph(bu_results)
In [ ]:
from graph_tool.draw import graph_draw, radial_tree_layout
# Lay the graph out radially around the '' root and render it to a
# (very large) PNG, labelling vertices with words and edges with the
# inserted character.
pos = radial_tree_layout(G, root=root)
graph_draw(G, pos=pos, vertex_text=names, edge_text=deltas,
           vertex_font_size=16, output_size=(10000, 10000), output='words.png')
In [ ]:
"""
Find the longest recursively-decomposable subwords: top-down
Start with each word, work on decomposition graph. Memoized for speed.
"""
def get_subwords(word):
    """Return all strings formed by deleting exactly one character of ``word``."""
    return [word[:i] + word[i+1:] for i in range(len(word))]


def is_recursively_decomposable(word, wordset, res_cache=None):
    """
    Decide whether ``word`` is recursively decomposable over ``wordset``.

    A word is recursively decomposable if it is the empty string, or if
    deleting some single character yields a word in ``wordset`` that is
    itself recursively decomposable.

    Returns a ``(decomposable, parents)`` tuple, where ``parents`` lists
    the one-character-shorter words in ``wordset`` that are themselves
    decomposable.  Results are memoized in ``res_cache``; pass the same
    dict across calls to share work.  (The original used the mutable
    default ``res_cache={}``, which silently shares one cache across all
    call sites -- a classic pitfall; a fresh dict is now created when
    none is given.)
    """
    if res_cache is None:
        res_cache = {}
    cached = res_cache.get(word)
    if cached is not None:
        return cached
    if len(word) == 0:
        # Base case: '' is decomposable by definition.
        result, parents = True, []
    else:
        parents = [w for w in get_subwords(word)
                   if w in wordset
                   and is_recursively_decomposable(w, wordset, res_cache)[0]]
        result = len(parents) > 0
    res_cache[word] = (result, parents)
    return (result, parents)
def top_down(wordset):
    """
    Find the longest recursively-decomposable words, top-down.

    Checks every word in ``wordset`` with the memoized
    ``is_recursively_decomposable`` and tracks the longest hits.

    Returns ``(longest_words, result_cache)`` -- the list of longest
    decomposable words found and the shared memoization cache.
    """
    longest = ['']
    cache = {}
    for i, word in enumerate(wordset):
        decomposable, _ = is_recursively_decomposable(word, wordset, cache)
        if decomposable:
            if len(word) > len(longest[0]):
                # Strictly longer: start a fresh list.
                longest = [word]
            elif len(word) == len(longest[0]):
                # Ties accumulate.
                longest.append(word)
        if i % 10000 == 0:
            print(i)
    return (longest, cache)
# Run the top-down search over the full word set (prints progress every
# 10000 words).
td_list, td_results = top_down(wordset)
In [ ]:
"""
Build the subword graph, from the longest words down to ''
"""
from collections import deque
from graph_tool import Graph, PropertyMap
def build_topdown_subword_graph(word_list, result_cache):
    """
    Build the derivation graph from the longest words down to ''.

    word_list: iterable of starting (longest) words.
    result_cache: word -> (decomposable, parents) mapping, as produced
        by ``is_recursively_decomposable`` / ``top_down``.

    Returns ``(G, P, root)``: the graph, a vertex property holding each
    vertex's word, and the vertex for the empty string (None if '' was
    never reached).
    """
    w2v = {}
    G = Graph()
    P = G.new_vertex_property('string')
    # Queue of (word, parent_vertex); parent is None for the seed words.
    Q = deque([(w, None) for w in word_list])
    root = None
    while len(Q) > 0:
        word, parent = Q.pop()
        if word in w2v:
            v = w2v[word]
        else:
            v = G.add_vertex()
            P[v] = word
            w2v[word] = v
            if word == '':
                root = v
            # Enqueue children only on the first visit: the original
            # re-enqueued them on every visit, duplicating edges and
            # re-traversing shared subgraphs once per derivation path.
            _, children = result_cache[word]
            for child in children:
                Q.append((child, v))
        if parent is not None:
            G.add_edge(parent, v)
    return G, P, root
# Fix: the original referenced undefined names ``topdown_list`` and
# ``results``; the top-down cell binds ``td_list`` and ``td_results``.
G, P, root = build_topdown_subword_graph(td_list, td_results)
In [ ]:
"""
Visualize derivation graph of longest words -> ''
"""
from graph_tool.draw import graph_draw, radial_tree_layout
# Radial layout rooted at the '' vertex; render the derivation graph to
# a PDF with each vertex labelled by its word.
pos = radial_tree_layout(G, root=root)
graph_draw(G, pos=pos, vertex_text=P, vertex_font_size=16, output_size=(5000, 5000), output='words.pdf')
In [ ]:
# Raw string: '\(' '\i' '\)' are invalid escape sequences in a plain
# string literal (SyntaxWarning on modern Python); the value is unchanged.
r"""Across all \(w \in wordset\), how are word lengths distributed?"""
all_word_lengths = [len(w) for w in wordset]
# Roughly one bin per length ('' contributes length 0).
n, bins, patches = plt.hist(all_word_lengths, max(all_word_lengths)-1, facecolor='red')
plt.title('Moby Words: Word count vs. Word length')
plt.savefig('moby_words_hist.png')
plt.show()
# bu_results: shorter -> [(char, longer)].  (The original displayed the
# entire dict as cell output -- it is huge; inspect bu_results[''] instead.)
# Walk the bottom-up derivation graph from '' and collect the length of
# every distinct recursively-decomposable word.
seen = set([''])
lengths = []
Q = deque([w for _, w in bu_results['']])
while len(Q) > 0:
    w = Q.pop()
    if w not in seen:
        seen.add(w)
        lengths.append(len(w))
        for _, longer in bu_results.get(w, []):
            Q.append(longer)
nc = np.bincount(lengths)           # decomposable-word count per length
ac = np.bincount(all_word_lengths)  # total word count per length
# Fraction of words of each length that are recursively decomposable;
# skip lengths with no words at all to avoid ZeroDivisionError.
x, y = zip(*[(i, n / a) for i, (n, a) in enumerate(zip(nc, ac)) if a > 0])
plt.plot(x, y, 'r-')
plt.title('Moby Words: Percentage Rec. Decomp. words vs. Length')
plt.savefig('rec_comp_perc_wc.png')
plt.show()