In [1]:
import os
import zipfile
import numpy as np
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
from keras.layers import Activation, Embedding, Merge, Reshape
from keras.models import Sequential
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from keras.preprocessing.text import Tokenizer, base_filter
from keras.utils.data_utils import get_file
In [11]:
# Hyperparameter settings
embedding_size = 200
epochs_to_train = 10
num_neg_samples = 5
sampling_factor = 1e-5
window_size = 5
save_path = './word_vectors.txt'
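As a quick, illustrative check of what `sampling_factor` controls (assuming only the imports above): `make_sampling_table` assumes a Zipf-like word-frequency distribution and returns, per frequency rank, the probability of keeping a word as a training target, so very frequent words are subsampled more aggressively.
table = make_sampling_table(100, sampling_factor=sampling_factor)
print(table[1], table[50])  # the rank-1 word is kept far less often than the rank-50 word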
In [3]:
def maybe_download(url):
    """
    Download a file if it is not already present.
    """
    filename = url.split('/')[-1]
    path = get_file(filename, url)
    return path

def unzip(zip_filename):
    """
    Extract the contents of the zip file.
    """
    with zipfile.ZipFile(zip_filename) as f:
        for filename in f.namelist():
            dirname = os.path.dirname(filename)
            f.extract(filename, dirname)
    # text8.zip contains a single member, so the last extracted name is the one we want.
    return os.path.abspath(filename)
# Download Data
url = 'http://mattmahoney.net/dc/text8.zip'
filename = maybe_download(url)
text_file = unzip(filename)
url = 'http://download.tensorflow.org/data/questions-words.txt'
eval_data = maybe_download(url)
In [4]:
# Load Data: Text8Corpus yields tokenized sentences; join them back into
# strings so the Keras Tokenizer can build the vocabulary.
sentences = word2vec.Text8Corpus(text_file)
sentences = [' '.join(sent) for sent in sentences]
tokenizer = Tokenizer(filters=base_filter() + "'")
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_sequences(sentences)
V = len(tokenizer.word_index) + 1
print('Vocabulary:', V)
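A small round-trip check of the tokenizer (illustrative; exact ids depend on corpus frequencies, since `Tokenizer` assigns lower ids to more frequent words):
print(tokenizer.word_index['the'])  # 'the' is the most frequent word in text8, so this should print 1
print(tokenizer.texts_to_sequences(['the quick brown fox'])[0])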
In [5]:
def build_model():
    # Two embedding branches: one for the target word, one for the context word.
    target_word = Sequential()
    target_word.add(Embedding(V, embedding_size, input_length=1))

    context = Sequential()
    context.add(Embedding(V, embedding_size, input_length=1))

    # Dot product of the two embeddings, squashed to a probability.
    model = Sequential()
    model.add(Merge([target_word, context], mode='dot', dot_axes=2))
    model.add(Reshape((1,), input_shape=(1, 1)))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    return model
model = build_model()
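With the legacy Keras 1.x `Merge` API used here, the model takes a list of two id arrays. A dummy forward pass (arbitrary ids, untrained weights) sketches the expected shapes:
dummy_target = np.array([[1]], dtype=np.int32)   # shape (batch=1, input_length=1)
dummy_context = np.array([[2]], dtype=np.int32)
p = model.predict([dummy_target, dummy_context])
print(p.shape, p)  # (1, 1); a sigmoid score near 0.5 before training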
In [ ]:
def train_model(model):
    sampling_table = make_sampling_table(V, sampling_factor=sampling_factor)
    for epoch in range(epochs_to_train):
        loss = 0.
        for i, sent in enumerate(sentences):
            if i % 500 == 0:
                print('{}/{}'.format(i, len(sentences)))
            # Generate (target, context) pairs: label 1 for true contexts,
            # label 0 for negative samples.
            couples, labels = skipgrams(sequence=sent, vocabulary_size=V,
                                        window_size=window_size,
                                        negative_samples=num_neg_samples,
                                        sampling_table=sampling_table)
            if couples:
                words, contexts = zip(*couples)
                words = np.array(words, dtype=np.int32)
                contexts = np.array(contexts, dtype=np.int32)
                y = np.array(labels, dtype=np.int32)
                loss += model.train_on_batch([words, contexts], y)
        print('epoch: {} loss: {}'.format(epoch, loss))
    return model
model = train_model(model)
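For intuition about what the `skipgrams` call above feeds the model, here is a toy run on a three-word sequence (synthetic ids, no sampling table): pairs labelled 1 are true (target, context) windows, and pairs labelled 0 contain randomly drawn negative contexts.
toy_couples, toy_labels = skipgrams(sequence=[1, 2, 3], vocabulary_size=10,
                                    window_size=1, negative_samples=1.0)
for couple, label in zip(toy_couples, toy_labels):
    print(couple, label)  # e.g. [2, 3] 1 ... [2, 7] 0 (negatives vary run to run)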
In [12]:
def save_model(model):
    # Write the target-word embeddings in the word2vec text format:
    # a "<vocab size> <dim>" header, then one word and its vector per line.
    with open(save_path, 'w') as f:
        f.write(' '.join([str(V - 1), str(embedding_size)]))
        f.write('\n')
        vectors = model.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(' ')
            f.write(' '.join(map(str, vectors[i, :])))
            f.write('\n')
save_model(model)
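The file should now be in the plain-text word2vec format that gensim can load back; a quick header check (illustrative):
with open(save_path) as f:
    print(f.readline().strip())  # "<vocab size> <dim>", i.e. V - 1 and 200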
In [13]:
def read_analogies(filename, word2id):
    """
    Reads through the analogy question file.
    Returns:
        questions: a list of analogy questions, each a list of four words
            that are all present in the vocabulary.
    """
    questions = []
    questions_skipped = 0
    with open(filename, 'r') as analogy_f:
        for line in analogy_f:
            if line.startswith(':'):  # Skip section headers.
                continue
            words = line.strip().lower().split()
            if len(words) != 4 or not all(w in word2id for w in words):
                # Skip questions containing out-of-vocabulary words.
                questions_skipped += 1
            else:
                questions.append(words)
    print('Eval analogy file: {}'.format(filename))
    print('Questions: {}'.format(len(questions)))
    print('Skipped: {}'.format(questions_skipped))
    return questions
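A standalone usage sketch with a hypothetical toy vocabulary (`eval_model` below builds the real `word2id` from the loaded vectors); with only four known words, nearly every question is skipped:
toy_word2id = {'athens': 0, 'greece': 1, 'baghdad': 2, 'iraq': 3}
toy_questions = read_analogies(eval_data, toy_word2id)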
In [14]:
def eval_model():
    w2v = Word2Vec.load_word2vec_format(save_path, binary=False)
    print(w2v.most_similar(positive=['country']))  # quick sanity check on the loaded vectors
    word2id = dict((w, i) for i, w in enumerate(w2v.index2word))
    analogy_questions = read_analogies(eval_data, word2id)
    correct = 0
    total = len(analogy_questions)
    for question in analogy_questions:
        a, b, c, d = question  # e.g. [athens, greece, baghdad, iraq]
        # Predict d from "a is to b as c is to ?" and accept a hit in the top 4.
        analogies = w2v.most_similar(positive=[b, c], negative=[a], topn=4)
        for word, _ in analogies:
            if d == word:
                correct += 1
                break
    print('Eval %4d/%d accuracy = %4.1f%%' % (correct, total, correct * 100.0 / total))
eval_model()
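Once the vectors are saved, individual analogy queries can also be run directly; an illustrative query (results depend entirely on how well this short training run converged):
w2v = Word2Vec.load_word2vec_format(save_path, binary=False)
# "man is to king as woman is to ?" -- a well-trained model tends to rank
# 'queen' highly, though a short run like this one may not.
print(w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))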