In [1]:
import os
import zipfile

import numpy as np
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
from keras.layers import Activation, Embedding, Merge, Reshape
from keras.models import Sequential
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from keras.preprocessing.text import Tokenizer, base_filter
from keras.utils.data_utils import get_file


Using TensorFlow backend.

Setting Hyperparameters

First, set the hyperparameters for skip-gram with negative sampling.

The values used in this notebook are as follows.


In [11]:
# Hyperparameter settings
embedding_size = 200
epochs_to_train = 10
num_neg_samples = 5
sampling_factor = 1e-5
window_size = 5
save_path = './word_vectors.txt'
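
The sampling_factor is only used later, when the training loop builds a word-frequency sampling table with Keras's make_sampling_table. As a quick, illustrative peek (not part of the original notebook), the table gives, for each word rank (1 = most frequent), the probability of keeping that word when generating skip-gram pairs, assuming a Zipf-like frequency distribution.


In [ ]:
# Illustration only: frequent ranks get low keep-probabilities, so very common
# words are subsampled when the skip-gram pairs are generated.
from keras.preprocessing.sequence import make_sampling_table

table = make_sampling_table(1000, sampling_factor=1e-5)
print(table[1], table[100], table[999])  # keep-probability grows with rank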

Download training and evaluation data

Download the training corpus (text8) and the evaluation data (a file of analogy questions).


In [3]:
def maybe_download(url):
    """
    Download a file if not present.
    """
    filename = url.split('/')[-1]
    path = get_file(filename, url)
    return path
    

def unzip(zip_filename):
    """
    Extract the archive and return the path of the first extracted file
    (text8.zip contains just one file).
    """
    with zipfile.ZipFile(zip_filename) as f:
        for filename in f.namelist():
            dirname = os.path.dirname(filename)
            f.extract(filename, dirname)
            return os.path.abspath(filename)
            

# Download Data
url = 'http://mattmahoney.net/dc/text8.zip'
filename = maybe_download(url)
text_file = unzip(filename)
url = 'http://download.tensorflow.org/data/questions-words.txt'
eval_data = maybe_download(url)


Downloading data from http://mattmahoney.net/dc/text8.zip
31309824/31344016 [============================>.] - ETA: 0s
Downloading data from http://download.tensorflow.org/data/questions-words.txt
  8192/603955 [..............................] - ETA: 0s

Reading training data

You can read the training data from the extracted text file with gensim's word2vec.Text8Corpus class, which takes the path to the file and streams the corpus sentence by sentence.

Keras's Tokenizer then tokenizes the sentences and assigns an integer ID to each word in the vocabulary. The vocabulary size V is len(word_index) + 1 because ID 0 is reserved and never assigned to a word.


In [4]:
# Load Data
sentences = word2vec.Text8Corpus(text_file)
sentences = [' '.join(sent) for sent in sentences]
tokenizer = Tokenizer(filters=base_filter() + "'")
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_sequences(sentences)
V = len(tokenizer.word_index) + 1
print('Vocabulary:', V)


Vocabulary: 253855
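
As a quick illustration (a toy corpus, not text8), this is what the Tokenizer does: fit_on_texts builds the word-to-ID mapping and texts_to_sequences replaces each word with its ID. ID 0 is reserved, which is why V is len(word_index) + 1.


In [ ]:
# Illustration only: a toy corpus showing how IDs are assigned.
toy = Tokenizer(filters=base_filter() + "'")
toy.fit_on_texts(['the king and the queen', 'the man and the woman'])
print(toy.word_index)  # word -> ID; the most frequent word gets ID 1, ID 0 is reserved
print(toy.texts_to_sequences(['the king and the queen']))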

Layers and Activation Functions

The two main building blocks of a network architecture in Keras are layers and activation functions, and Keras provides many of both.

Here, let's build a binary classifier out of two Embedding layers: one embeds the target word, the other embeds the context word, and the sigmoid of their dot product is the predicted probability that the pair really co-occurred:


In [5]:
def build_model():
    # Embedding for the target (centre) word.
    target_word = Sequential()
    target_word.add(Embedding(V, embedding_size, input_length=1))

    # Embedding for the context word.
    context = Sequential()
    context.add(Embedding(V, embedding_size, input_length=1))

    # Dot product of the two embeddings, reshaped to a scalar and squashed
    # by a sigmoid: the probability that the pair is a true co-occurrence.
    model = Sequential()
    model.add(Merge([target_word, context], mode='dot', dot_axes=2))
    model.add(Reshape((1,), input_shape=(1, 1)))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    return model

model = build_model()
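
To make the architecture concrete, here is a minimal numpy sketch (illustration only, with random vectors standing in for rows of the two embedding matrices) of the score the model assigns to a single (target, context) pair.


In [ ]:
# Illustration only: the model's output for one pair is
# sigmoid(target_vector . context_vector), interpreted as the probability
# that the pair is a true co-occurrence.
target_vec = np.random.randn(embedding_size)
context_vec = np.random.randn(embedding_size)
score = 1.0 / (1.0 + np.exp(-np.dot(target_vec, context_vec)))
print(score)  # a value in (0, 1)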

Training Model

Now we have a skip-gram model. The training pairs come from Keras's skipgrams helper, illustrated below on a toy sequence; the cell after the sketch then trains the model by repeatedly calling train_on_batch.
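
As a quick, illustrative sketch (not part of the training code), here is what skipgrams returns for a toy ID sequence: (target, context) pairs labelled 1 for true context words and 0 for randomly drawn negatives. The exact pairs vary from run to run because the negatives are sampled at random.


In [ ]:
# Illustration only: skipgrams on a toy sequence of word IDs.
toy_couples, toy_labels = skipgrams(sequence=[1, 2, 3, 4, 5], vocabulary_size=10,
                                    window_size=2, negative_samples=1.0)
print(list(zip(toy_couples, toy_labels))[:5])  # e.g. ([2, 3], 1), ([4, 7], 0), ...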


In [ ]:
def train_model(model):
    sampling_table = make_sampling_table(V, sampling_factor=sampling_factor)
    for epoch in range(epochs_to_train):
        loss = 0.
        for i, sent in enumerate(sentences):
            if i % 500 == 0:
                print('{}/{}'.format(i, len(sentences)))
            couples, labels = skipgrams(sequence=sent, vocabulary_size=V, window_size=window_size,
                                        negative_samples=num_neg_samples, sampling_table=sampling_table)
            if couples:
                words, contexts = zip(*couples)
                words = np.array(words, dtype=np.int32)
                contexts = np.array(contexts, dtype=np.int32)
                y = np.array(labels, dtype=np.int32)
                loss += model.train_on_batch([words, contexts], y)
        print('num epoch: {} loss: {}'.format(epoch, loss))

    return model

model = train_model(model)


0/1701
500/1701
1000/1701
1500/1701
num epoch: 0 loss: 529.3742761611938
0/1701
500/1701
1000/1701
1500/1701
num epoch: 1 loss: 290.1124480217695
0/1701
500/1701
1000/1701
1500/1701
num epoch: 2 loss: 260.58118246495724
0/1701
500/1701
1000/1701
1500/1701
num epoch: 3 loss: 245.68536188453436
0/1701
500/1701
1000/1701
1500/1701

Save word embeddings

Congratulations! Training is finished. Let's save the word embeddings to a text file:


In [12]:
def save_model(model):
    with open(save_path, 'w') as f:
        # word2vec text-format header: "<vocabulary size> <embedding size>".
        # V - 1 because ID 0 is a reserved padding index with no word attached.
        f.write(' '.join([str(V - 1), str(embedding_size)]))
        f.write('\n')
        # The first weight matrix is the target-word Embedding: one row per word ID.
        vectors = model.get_weights()[0]
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(' ')
            f.write(' '.join(map(str, list(vectors[i, :]))))
            f.write('\n')

save_model(model)
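
As an optional sanity check (not in the original notebook), the file should start with the word2vec text-format header, followed by one word and its vector per line.


In [ ]:
# Optional sanity check: print the header line and the first stored word.
with open(save_path) as f:
    print(f.readline().strip())      # "<vocabulary size> <embedding size>"
    print(f.readline().split()[0])   # the first stored word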

Reading evaluation data

The analogy questions are read from the downloaded text file. Questions containing words that are not in the vocabulary are skipped:


In [13]:
def read_analogies(filename, word2id):
    """
    Reads through the analogy question file.

    Returns:
      questions: a list of [a, b, c, d] word lists, one per analogy question
        whose four words are all in the vocabulary.
    Questions containing unknown words are counted and skipped.
    """
    questions = []
    questions_skipped = 0
    with open(filename, 'r') as analogy_f:
        for line in analogy_f:
            if line.startswith(':'):  # Skip the section headers.
                continue
            words = line.strip().lower().split()
            known = [w in word2id for w in words]
            if False in known or len(known) != 4:
                questions_skipped += 1
            else:
                questions.append(words)
    print('Eval analogy file: {}'.format(filename))
    print('Questions: {}'.format(len(questions)))
    print('Skipped: {}'.format(questions_skipped))
    return questions

Evaluation

We evaluate the obtained word embeddings on the analogy task.

In the analogy task, given words A, B and C, you need to find the word D such that A is to B as C is to D (e.g. man is to king as woman is to queen). gensim answers such questions with vector arithmetic, ranking the words closest to the vector B - A + C; the sketch below shows this on one example, and the cell after it runs the full evaluation.
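
The sketch below is illustrative only; the exact neighbours it prints depend on the vectors you just trained.


In [ ]:
# Illustration only: "man is to king as woman is to ?" is answered by ranking
# words near the vector king - man + woman.
w2v = Word2Vec.load_word2vec_format(save_path, binary=False)
print(w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))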


In [14]:
def eval_model():
    # Reload the saved vectors with gensim.
    w2v = Word2Vec.load_word2vec_format(save_path, binary=False)
    # Quick check that querying the vectors works (the result is discarded).
    w2v.most_similar(positive=['country'])
    word2id = dict([(w, i) for i, w in enumerate(w2v.index2word)])
    analogy_questions = read_analogies(eval_data, word2id)
    correct = 0
    total = len(analogy_questions)
    for question in analogy_questions:
        a, b, c, d = question  # E.g. [Athens, Greece, Baghdad, Iraq]
        # Rank the words closest to the vector b - a + c; the question counts
        # as correct if d appears among the top 4 candidates.
        analogies = w2v.most_similar(positive=[b, c], negative=[a], topn=4)
        for analogy in analogies:
            word, _ = analogy
            if d == word:
                # Predicted correctly!
                correct += 1
                break
    print('Eval %4d/%d accuracy = %4.1f%%' % (correct, total, correct * 100.0 / total))

eval_model()


Eval analogy file: /home/ubuntu/.keras/datasets/questions-words.txt
Questions: 19106
Skipped: 438
Eval 2936/19106 accuracy = 15.4%