Word2Vec

The code for this lecture is based on the excellent tutorial example from TensorFlow!

Walkthrough:

https://www.tensorflow.org/tutorials/word2vec

Raw Code: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py

Step 0: Imports


In [ ]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange 
import tensorflow as tf

Step 1: The data.


In [ ]:
data_dir = ".data/word2vec_data/words"
data_url = 'http://mattmahoney.net/dc/text8.zip'

In [ ]:
def fetch_words_data(url = data_url, words_data = data_dir):
    
    # Make the Dir if it does not exist
    os.makedirs(words_data, exist_ok=True)
    
    # Path to zip file 
    zip_path = os.path.join(words_data, "words.zip")
    
    # If the zip file isn't there, download it from the data url
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
        
    # Now that the zip file is there, get the data from it
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    
    # Return a list of all the words in the data source.
    return data.decode("ascii").split()

In [ ]:
# Use defaults (this may take a while!)
words = fetch_words_data()

In [ ]:
# Total words
len(words)

In [ ]:
# Random slice of words
words[9000:9040]

In [ ]:
for w in words[9000:9040]:
    print(w, end=' ')

Build Word Counts


In [ ]:
from collections import Counter

In [ ]:
mylist = ["one", 'one', 'two']

In [ ]:
Counter(mylist)

In [ ]:
Counter(mylist).most_common(1)
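
Before building the full vocabulary in the next step, it can help to peek at the most frequent words in the corpus itself (a quick illustrative check; the exact output depends on the downloaded text8 data):

In [ ]:
# Peek at the most frequent words in the full corpus
Counter(words).most_common(5)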

Create Word Data and Vocab


In [ ]:
def create_counts(vocab_size = 50000):

    # Grab the vocab_size most common words with Counter
    vocab = Counter(words).most_common(vocab_size)

    # Keep just the words (drop the counts) and turn them into a numpy array
    vocab = np.array([word for word, _ in vocab])

    # Map each word to its integer ID; words outside the vocabulary fall back to ID 0
    dictionary = {word: code for code, word in enumerate(vocab)}
    data = np.array([dictionary.get(word, 0) for word in words])
    return data, vocab

In [ ]:
vocab_size = 50000

In [ ]:
# This may take a while
data, vocabulary = create_counts(vocab_size = vocab_size)

In [ ]:
data[0]

In [ ]:
(words[100],data[100])

In [ ]:
vocabulary.shape

In [ ]:
vocabulary[np.random.randint(0, 50000)]
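
As a quick sanity check (a small sketch, not part of the original walkthrough), we can decode a slice of the integer-encoded data back into words using the vocabulary array. Note that any word outside the top 50,000 was mapped to ID 0, so it decodes to the most frequent word rather than a true "unknown" token.

In [ ]:
# Illustrative sanity check: map integer IDs back to words
' '.join(vocabulary[idx] for idx in data[9000:9010])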

Function for Batches

Taken directly from the word2vec_basic.py example in the TensorFlow docs.


In [ ]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window forward one word for the next group of skips
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [ ]:
data_index = 0
batch, labels = generate_batch(8, 2, 1)

In [ ]:
batch

In [ ]:
labels
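
To see what the skip-gram training pairs actually look like (an illustrative check using the batch and labels generated above), decode each (center word, context word) pair with the vocabulary array:

In [ ]:
# Decode the (center, context) training pairs in this small batch
for center, context in zip(batch, labels[:, 0]):
    print(vocabulary[center], '->', vocabulary[context])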

Constants


In [ ]:
# Size of the batch
batch_size = 32

# Dimension of embedding vector
embedding_size = 150

# How many words to consider left and right (the bigger, the longer the training)
skip_window = 1       

# How many times to reuse an input to generate a label
num_skips = 2

In [ ]:
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.

# Random set of words to evaluate similarity on.
valid_size = 16   

# Only pick dev samples in the head of the distribution.
valid_window = 100  
valid_examples = np.random.choice(valid_window, valid_size, replace = False)

# Number of negative examples to sample.
num_sampled = 64   

# Model Learning Rate
learning_rate = 0.01

# How many words in vocab
vocabulary_size = 50000
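
Since the validation IDs are drawn from the low end of the vocabulary (i.e. the most frequent words), we can peek at which words were actually chosen (illustrative only, assuming the vocabulary array built earlier):

In [ ]:
# Which words will we track for similarity checks?
vocabulary[valid_examples]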

TensorFlow Placeholders and Constants


In [ ]:
tf.reset_default_graph()

# Input data.
train_inputs = tf.placeholder(tf.int32, shape = [None])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])
valid_dataset = tf.constant(valid_examples, 
                            dtype = tf.int32)

Variables


In [ ]:
# Look up embeddings for inputs.
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)

In [ ]:
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
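
tf.nn.embedding_lookup simply selects one row of the embedding matrix per integer ID in train_inputs, so embed has shape [batch, embedding_size] (the batch dimension is None because the placeholder shape is [None]). Printing the tensor confirms this:

In [ ]:
# One 150-dimensional row of `embeddings` per input word ID
embed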

NCE Loss


In [ ]:
# Construct the variables for the NCE loss
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev = 1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [ ]:
# Compute the average NCE loss for the batch.
# tf.nn.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights = nce_weights,
                   biases = nce_biases,
                   labels = train_labels,
                   inputs = embed,
                   num_sampled = num_sampled,
                   num_classes = vocabulary_size))

Optimizer


In [ ]:
# Construct the Adam optimizer with the learning rate defined above
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
trainer = optimizer.minimize(loss)

In [ ]:
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 
                             axis = 1, 
                             keepdims = True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, 
                       normalized_embeddings, 
                       transpose_b = True)

In [ ]:
# Add variable initializer.
init = tf.global_variables_initializer()

Session


In [ ]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.9)

In [ ]:
# Usually needs to be quite large to get good results, 
# training takes a long time!
num_steps = 200001

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    sess.run(init)
    average_loss = 0
    for step in range(num_steps):
         
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

        # We perform one update step by evaluating the training op (including it
        # in the list of returned values for session.run())
        _, loss_val = sess.run([trainer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 1000 == 0:
            if step > 0:
                average_loss /= 1000
            # The average loss is an estimate of the loss over the last 1000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

       

    final_embeddings = normalized_embeddings.eval()
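
With training done, a quick way to eyeball the result (a sketch, not part of the original walkthrough) is to print nearest neighbors for a few of the validation words. Because final_embeddings is already L2-normalized, a plain dot product gives the cosine similarity.

In [ ]:
# Nearest neighbors by cosine similarity (final_embeddings is already normalized)
for idx in valid_examples[:5]:
    sims = final_embeddings.dot(final_embeddings[idx])
    nearest = (-sims).argsort()[1:9]  # skip the first hit, which is the word itself
    print(vocabulary[idx], '->', ', '.join(vocabulary[nearest]))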

Visualizing Results


In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
def plot_with_labels(low_dim_embs, labels):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  #in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy = (x, y),
                     xytext = (5, 2),
                     textcoords = 'offset points',
                     ha = 'right',
                     va = 'bottom')

TSNE

Dimensionality reduction of the embeddings to 2-D vectors (down from 150 dimensions). This takes a while.


In [ ]:
type(final_embeddings)

In [ ]:
final_embeddings.shape

In [ ]:
from sklearn.manifold import TSNE
tsne = TSNE(perplexity = 30, 
            n_components = 2, 
            init = 'pca', 
            n_iter = 5000)

In [ ]:
plot_only = 2000
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])

In [ ]:
labels = [vocabulary[i] for i in range(plot_only)]

In [ ]:
low_dim_embs.shape

In [ ]:
plot_with_labels(low_dim_embs, labels)

In [ ]:
plot_with_labels(low_dim_embs, labels)
plt.xlim(-10,10)
plt.ylim(-10,10)

In [ ]:
np.save('trained_embeddings_200k_steps', final_embeddings)
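
If you want to reuse the trained vectors later without re-training (a minimal sketch), they can be loaded straight back with np.load; note that np.save appends the .npy extension automatically:

In [ ]:
# Reload the saved embedding matrix
loaded_embeddings = np.load('trained_embeddings_200k_steps.npy')
loaded_embeddings.shape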

Great Job!