In [1]:
"""
Compare word frequencies to log-uniform (sampling model for nce Candidate Sampling)
Patrick Coady (pcoady@alum.mit.edu)
"""
from wordvector import WordVector
from windowmodel import WindowModel
import docload
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
In [2]:
files = ['../data/adventures_of_sherlock_holmes.txt',
'../data/hound_of_the_baskervilles.txt',
'../data/sign_of_the_four.txt']
word_counter, word_list, num_lines, num_words = docload.load_books(
files, gutenberg=True)
print('Document loaded and processed: {} lines, {} words.'
.format(num_lines, num_words))
dictionary = docload.build_dict(word_counter)
reverse_dictionary = {v: k for k, v in dictionary.items()}
In [3]:
top_words = word_counter.most_common(50000)
top_words.sort(key=lambda t: -t[1])
counts = np.array([t[1] for t in top_words])
counts = counts/sum(counts) # actual work probability
x = np.array(range(len(counts)))
log_uniform = (np.log(x+2) - np.log(x+1))/np.log(len(counts)+1) # ideal log-uniform
In [6]:
plt.xscale('log')
plt.yscale('log')
plt.plot(x, counts)
plt.plot(x, log_uniform)
plt.show()
# reasonable match, but definitely under-samples first 200 words
In [ ]: