In [1]:
"""Imports that we'll want throughout this notebook"""
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint

In [5]:
"""Load the words set"""
from string import ascii_lowercase
import re

def get_word(word_candidate):
    """
    Return the word if the candidate is a word, else None.

    Namely, a candidate is a word if it comprises letters, and is
    optionally terminated with whitespace.
    """
    word_pattern = r'^(?P<word>\w+)[\r\n\t ]*$'
    m = re.match(word_pattern, word_candidate)
    if m is None:
        return None
    else:
        return m.group('word')

with open('354984si.ngl') as f:
    wordset = set(filter(None, (get_word(line) for line in f)))

# Quirk of Moby Words: remove all one-letter words but 'a' and 'I'
wordset = set([w for w in wordset if len(w) > 1]) | set(['a', 'i'])
wordset |= set([''])  # empty string needs to be in there as a base case
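
In [ ]:
"""
Quick sanity check on the loader (illustrative cell, not part of the
original analysis): get_word accepts plain words, rejects lines with
punctuation or internal whitespace, and the resulting wordset should
be large.
"""
print(get_word('sing\n'))     # -> 'sing'
print(get_word('two words'))  # -> None
print(len(wordset))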

In [ ]:
"""
Find the longest recursively-decomposable subword: bottom-up

Start with ["a", "i"], add letters in each position.
"""
from string import ascii_lowercase
from collections import deque


def bottom_up(wordset):
    results = {}           # shorter word -> [(added char, longer word)]
    longest_words = ['']
    expanded = set()       # words whose extensions are already enqueued
    Q = deque([('', '', '')])  # (word, added char, shorter word)
    while len(Q) > 0:
        word, delta, child = Q.pop()
        if word in wordset:
            if delta:  # skip the root's ('', '') self-entry
                results.setdefault(child, []).append((delta, word))
            if len(word) > len(longest_words[0]):
                longest_words = [word]
            elif len(word) == len(longest_words[0]):
                longest_words.append(word)

            # extend each word only once; a later pop via another
            # parent still records the edge above, but re-expanding
            # would duplicate work for every derivation path
            if word not in expanded:
                expanded.add(word)
                for i in range(len(word) + 1):
                    for c in ascii_lowercase:
                        Q.append((word[:i] + c + word[i:], c, word))
    return longest_words, results


bu_list, bu_results = bottom_up(wordset)
set(bu_list)
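
In [ ]:
"""
Illustrative sanity check (not in the original notebook): verify the
classic single-letter-deletion chain for 'startling'. Assumes every
word in the chain appears in Moby Words.
"""
def is_valid_chain(chain, wordset):
    """True iff every word is in wordset and each yields the next by one deletion."""
    ok = all(w in wordset for w in chain)
    for longer, shorter in zip(chain, chain[1:]):
        ok = ok and any(longer[:i] + longer[i+1:] == shorter
                        for i in range(len(longer)))
    return ok


is_valid_chain(['startling', 'starling', 'staring', 'string',
                'sting', 'sing', 'sin', 'in', 'i'], wordset)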

In [ ]:
"""
Find the subword graph, from root to longest
"""
from collections import deque
from graph_tool import Graph


def build_bottomup_subword_graph(bu_results):    
    # bu_results: shorter -> [(char, longer)]
    G = Graph(directed=False)
    names = G.new_vertex_property('string')
    deltas = G.new_edge_property('string')
    
    # add the root '' up front; it never appears as a 'longer' word below
    v = G.add_vertex()
    names[v] = ''
    w2v = {'': v}

    # create a vertex for each reachable word
    Q = deque([c for _, c in bu_results['']])  # Q <- [word]
    while len(Q) > 0:
        word = Q.pop()
        if word not in w2v:
            v = G.add_vertex()
            w2v[word] = v
            names[v] = word
            for _, longer in bu_results.get(word, []):
                Q.append(longer)

    # add character transitions
    for w, v in w2v.items():
        for delta, longer in bu_results.get(w, []):
            e = G.add_edge(v, w2v[longer])
            deltas[e] = delta

    return G, names, deltas, w2v['']


G, names, deltas, root = build_bottomup_subword_graph(bu_results)
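
In [ ]:
"""Basic size check on the bottom-up subword graph (illustrative)."""
print(G.num_vertices(), G.num_edges())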

In [ ]:
from graph_tool.draw import graph_draw, radial_tree_layout

pos = radial_tree_layout(G, root=root)
graph_draw(G, pos=pos, vertex_text=names, edge_text=deltas,
           vertex_font_size=16, output_size=(10000, 10000), output='words.png')

In [ ]:
"""
Find the longest recursively-decomposable subwords: top-down

Start with each word, work on decomposition graph. Memoized for speed.
"""
def get_subwords(word):
    return [word[:i] + word[i+1:] for i in range(len(word))]
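# e.g. get_subwords('cat') -> ['at', 'ct', 'ca']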


def is_recursively_decomposable(word, wordset, res_cache=None):
    """
    Return (decomposable?, decomposable one-letter-shorter subwords).

    A word is recursively decomposable if deleting some single letter
    yields another word that is itself recursively decomposable.
    """
    if res_cache is None:  # avoid the mutable-default-argument pitfall
        res_cache = {}
    cached = res_cache.get(word)
    if cached is not None:
        return cached
    if len(word) == 0:
        res, children = True, []
    else:
        subwords = [(w, is_recursively_decomposable(w, wordset, res_cache)[0])
                    for w in get_subwords(word)
                    if w in wordset]
        children = [w for w, d in subwords if d]
        res = len(children) > 0
    res_cache[word] = (res, children)
    return (res, children)


def top_down(wordset):
    longest_words = ['']
    result_cache = {}
    for i, w in enumerate(wordset):
        is_rec_decomp, _ = is_recursively_decomposable(w, wordset, result_cache)
        if is_rec_decomp:
            if len(w) > len(longest_words[0]):
                longest_words = [w]
            elif len(w) == len(longest_words[0]):
                longest_words.append(w)
        if i % 10000 == 0:
            print(i)
    return (longest_words, result_cache)


td_list, td_results = top_down(wordset)
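
In [ ]:
"""
Cross-check (illustrative): the bottom-up and top-down searches should
agree on the set of longest recursively-decomposable words.
"""
set(td_list) == set(bu_list)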

In [ ]:
"""
Build the subword graph, from longest words to ''
"""
from collections import deque
from graph_tool import Graph


def build_topdown_subword_graph(word_list, result_cache):
    w2v = {}
    G = Graph()
    P = G.new_vertex_property('string')
    Q = deque([(w, None) for w in word_list])  # Q <- [(word, parent_vertex)]
    root = None

    while len(Q) > 0:
        word, parent = Q.pop()
        is_new = word not in w2v
        if is_new:
            v = G.add_vertex()
            P[v] = word
            w2v[word] = v
        else:
            v = w2v[word]
        if word == '':
            root = v
        if parent is not None:
            G.add_edge(parent, v)
        if is_new:
            # expand each word only once; later visits just add the edge
            _, children = result_cache[word]
            for child in children:
                Q.append((child, v))
    return G, P, root


G, P, root = build_topdown_subword_graph(td_list, td_results)

In [ ]:
"""
Visualize derivation graph of longest words -> ''
"""
from graph_tool.draw import graph_draw, radial_tree_layout

pos = radial_tree_layout(G, root=root)
graph_draw(G, pos=pos, vertex_text=P, vertex_font_size=16, output_size=(5000, 5000), output='words.pdf')

In [ ]:
"""Across all \(w \in wordset\), how are word lengths distributed?"""
all_word_lengths = [len(w) for w in wordset]
n, bins, patches = plt.hist(all_word_lengths, max(all_word_lengths)-1, facecolor='red')
plt.title('Moby Words: Word count vs. Word length')
plt.savefig('moby_words_hist.png')
plt.show()

# bu_results maps shorter -> [(char, longer)]

seen = set([''])
lengths = []
Q = deque([w for _, w in bu_results['']])
while len(Q) > 0:
    w = Q.pop()
    if w not in seen:
        seen.add(w)
        lengths.append(len(w))
        for _, longer in bu_results.get(w, []):
            Q.append(longer)

nc = np.bincount(lengths)
ac = np.bincount(all_word_lengths)

x, y = zip(*[(i, n / a) for i, (n, a) in enumerate(zip(nc, ac))])
plt.plot(x, y, 'r-')
plt.title('Moby Words: Fraction of rec. decomp. words vs. word length')
plt.savefig('rec_comp_perc_wc.png')
plt.show()