Note to Self: Stop doing it "quick 'n' dirty"; take the time and use best practices.

Vector Representations of Words

Train a word2vec skip-gram model on the Text8 dataset


In [14]:
import numpy as np 
import os 
import tensorflow as tf 
import matplotlib.pyplot as plt
import zipfile
from six.moves.urllib.request import urlretrieve
import collections
import random 
import math
from sklearn.manifold import TSNE
import matplotlib.pylab

%matplotlib inline
plt.style.use('ggplot')

1. Let's Prepare the Data Set

1.1 Download the Data


In [2]:
url = 'http://mattmahoney.net/dc/'

def try_download(filename, expected_bytes):
    # Download the file if it is not already present,
    # then check that it has the expected size.
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)

    # Grab the stats about the file.
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified file: {}'.format(filename))
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify the file: {}!'.format(filename))

    return filename

filename = try_download('text8.zip', 31344016)


Found and verified file: text8.zip
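
A size check catches a truncated download but not a corrupted one. As a stricter check (a sketch only: the known-good digest is not reproduced here, so the result has to be compared by hand against a published checksum if one is available), the archive's SHA-256 could be computed like this:

In [ ]:
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file in chunks so it does not have to fit in memory at once.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

print(sha256_of(filename))  # compare against a known-good digest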

1.2 Unzip the Data and read it into RAM


In [3]:
def read_data(filename):
    # Extract the first file enclosed in the zip as a list of words
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data Size: {}'.format(len(words)))
print(words[:20])


Data Size: 17005207
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']

1.3 Replace rare words with UNK token


In [4]:
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    # Keep the (vocabulary_size - 1) most frequent words and their counts;
    # everything else will be mapped to the UNK token.
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Assign each kept word an integer ID in order of decreasing frequency.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    # Translate the corpus into a list of IDs, counting out-of-vocabulary words.
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, reverse_dictionary, dictionary


data, count, reverse_dictionary, dictionary = build_dataset(words)

print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3084, 12, 6, 195, 2, 3136, 46, 59, 156]
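
`dictionary` maps each kept word to an integer ID (the most frequent words get the smallest IDs) and `reverse_dictionary` inverts that mapping; anything outside the 50,000-word vocabulary falls back to index 0, the UNK token. A quick sketch of encoding and decoding an arbitrary sentence with these two structures (the sentence itself is just an example):

In [ ]:
sentence = 'anarchism is a term of abuse xyzzyplugh'.split()

# Out-of-vocabulary words map to index 0, i.e. the UNK token.
encoded = [dictionary.get(word, 0) for word in sentence]
decoded = [reverse_dictionary[index] for index in encoded]

print(encoded)
print(decoded)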

1.4 Function to generate a training batch for the skip-gram model


In [5]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window, target, skip_window ]
    buffer = collections.deque(maxlen=span)

    # Fill the buffer with the first `span` word IDs.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Sample num_skips distinct context positions around the center.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'a', 'originated', 'term', 'as', 'a', 'of']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['anarchism', 'a', 'term', 'originated', 'as', 'originated', 'of', 'term']
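
As a quick sanity check on `generate_batch` (a sketch, not part of the original pipeline): with `data_index` reset to 0, the i-th group of `num_skips` batch entries should all hold the word at corpus position `skip_window + i`, and each label should come from within `skip_window` positions of that center word:

In [ ]:
data_index = 0
num_skips, skip_window = 2, 1
batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)

for i in range(len(batch)):
    center_pos = skip_window + i // num_skips   # position of the center word in `data`
    window = data[center_pos - skip_window : center_pos + skip_window + 1]
    assert batch[i] == data[center_pos]         # batch entry is the center word
    assert labels[i, 0] in window               # label lies inside the context window
print('generate_batch sanity check passed')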

2. Modelling

2.1 Define the Graph


In [8]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
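
The `similarity` tensor above is just the cosine similarity between each validation word and every row of the (row-normalized) embedding matrix. The same computation in plain numpy, with a random matrix standing in for the learned embeddings (a sketch for illustration only):

In [ ]:
emb = np.random.uniform(-1.0, 1.0, size=(vocabulary_size, embedding_size)).astype(np.float32)

# Normalize each row to unit length; a dot product then gives a cosine similarity.
normalized = emb / np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
cosine_sim = np.dot(normalized[valid_examples], normalized.T)

print(cosine_sim.shape)  # (valid_size, vocabulary_size)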

2.2 Training the Model


In [9]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    '''
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
    '''
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step 0: 7.825277
Average loss at step 2000: 4.853410
Average loss at step 4000: 4.159984
Average loss at step 6000: 4.026088
Average loss at step 8000: 3.873921
Average loss at step 10000: 3.871650
Average loss at step 12000: 3.787697
Average loss at step 14000: 3.649309
Average loss at step 16000: 3.572261
Average loss at step 18000: 3.708147
Average loss at step 20000: 3.682890
Average loss at step 22000: 3.656179
Average loss at step 24000: 3.650971
Average loss at step 26000: 3.667479
Average loss at step 28000: 3.613138
Average loss at step 30000: 3.679241
Average loss at step 32000: 3.619907
Average loss at step 34000: 3.638377
Average loss at step 36000: 3.477542
Average loss at step 38000: 3.619985
Average loss at step 40000: 3.575888
Average loss at step 42000: 3.574001
Average loss at step 44000: 3.585221
Average loss at step 46000: 3.544047
Average loss at step 48000: 3.536453
Average loss at step 50000: 3.516057
Average loss at step 52000: 3.555631
Average loss at step 54000: 3.576334
Average loss at step 56000: 3.546787
Average loss at step 58000: 3.565790
Average loss at step 60000: 3.501998
Average loss at step 62000: 3.300880
Average loss at step 64000: 3.553130
Average loss at step 66000: 3.525501
Average loss at step 68000: 3.489125
Average loss at step 70000: 3.538499
Average loss at step 72000: 3.496172
Average loss at step 74000: 3.492016
Average loss at step 76000: 3.448916
Average loss at step 78000: 3.494678
Average loss at step 80000: 3.517780
Average loss at step 82000: 3.551564
Average loss at step 84000: 3.517640
Average loss at step 86000: 3.485559
Average loss at step 88000: 3.492768
Average loss at step 90000: 3.509365
Average loss at step 92000: 3.436369
Average loss at step 94000: 3.467226
Average loss at step 96000: 3.449503
Average loss at step 98000: 3.425100
Average loss at step 100000: 3.490701
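
The nearest-neighbour report is commented out inside the training loop to avoid the slowdown it causes; the same kind of query can be run once, after training, directly against `final_embeddings` (a sketch reusing the structures built earlier; the query words are arbitrary examples):

In [ ]:
def nearest_neighbors(word, top_k=8):
    # final_embeddings is row-normalized, so a dot product gives cosine similarity.
    sims = np.dot(final_embeddings, final_embeddings[dictionary[word]])
    nearest = (-sims).argsort()[1:top_k + 1]   # drop position 0 of the sort: the word itself
    return [reverse_dictionary[i] for i in nearest]

for word in ['three', 'american', 'war']:
    print('Nearest to {}: {}'.format(word, ', '.join(nearest_neighbors(word))))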

In [10]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [17]:
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  plt.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)
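
To reuse the trained vectors outside this notebook, the embedding matrix and the vocabulary can be written to disk (a sketch; the file names are placeholders):

In [ ]:
np.save('text8_embeddings.npy', final_embeddings)

with open('text8_vocab.txt', 'w') as f:
    for i in range(vocabulary_size):
        f.write(reverse_dictionary[i] + '\n')

# Later: embeddings = np.load('text8_embeddings.npy')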



In [ ]: