Deep Learning

Assignment 5

The goal of this assignment is to train a skip-gram model over Text8 data.


In [37]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

Download the data from the source website if necessary.


In [38]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

Read the data into a list of words. Uncompressed, it is about 100MB of text containing roughly 17 million words.


In [39]:
def read_data(filename):
  """Extract the first file in the zip archive as a list of words."""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
  
words = read_data(filename)
print('Data size %d' % len(words))


Data size 17005207

Build the dictionary and replace rare words with UNK token.


In [40]:
vocabulary_size = 50000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
print('Reverse dict', list(reverse_dictionary[d] for d in data[:10]))
del words  # Hint to reduce memory.


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156]
Reverse dict ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']

Function to generate a training batch for the skip-gram model.


In [43]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'a', 'originated', 'term', 'as', 'of', 'a']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['a', 'originated', 'term', 'anarchism', 'as', 'of', 'originated', 'term']

Train a skip-gram model.


In [44]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine similarity:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
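
Because the embedding rows are L2-normalized, the matmul above is exactly a cosine similarity, and the "nearest" words printed during training are just the highest-scoring columns of each row. A tiny, hypothetical numpy illustration of the same idea (toy vectors, not taken from the model):

# toy "embeddings": three 2-d vectors, normalized row-wise
v = np.array([[1.0, 0.0], [0.8, 0.6], [0.0, 1.0]])
v = v / np.sqrt((v ** 2).sum(axis=1, keepdims=True))
sim = v.dot(v[0])            # cosine similarity of every row with row 0
print((-sim).argsort())      # row indices sorted from most to least similar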

In [45]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step 0: 8.007317
Nearest to seven: dayan, ean, professes, interactivity, revive, mystics, alluded, strayed,
Nearest to after: minix, begins, functor, invades, slower, hindustan, denmark, socialists,
Nearest to first: slovene, franchisees, effect, therapy, offence, nba, authored, goldman,
Nearest to can: walser, uptime, liquidity, polg, otranto, homophonic, hom, utilised,
Nearest to state: intruding, lawmaking, hittite, islamist, shielding, consilience, orwell, haley,
Nearest to zero: shikoku, anacreon, vessel, neumann, gem, reinstating, reassert, manages,
Nearest to about: yahweh, temptation, appalachian, depressants, anthems, redundantly, merrill, dune,
Nearest to not: warlike, jaya, attributed, chukchi, cries, webzine, indecision, ule,
Nearest to that: health, mol, importing, illegality, epicycle, hingis, vanquished, andriessen,
Nearest to been: graphene, detainee, rrna, vowel, upheavals, pda, keeping, boycotted,
Nearest to was: pray, neologism, johannesburg, supranationalism, entrepot, earhart, biomes, influences,
Nearest to united: watchmen, face, ramsey, infects, collegiate, geologically, abounds, functionals,
Nearest to with: missiles, commercialization, coleman, displace, ferromagnetism, admiralty, mediating, pendants,
Nearest to time: wields, prizes, rossetti, pavlov, definiteness, thrusts, eee, emphasise,
Nearest to people: southside, walkie, lin, lightbulb, encapsulated, toland, held, telco,
Nearest to between: lcms, plan, repeats, transcriptase, bankruptcies, repatriation, stabilizing, oppress,
Average loss at step 2000: 4.359738
Average loss at step 4000: 3.866328
Average loss at step 6000: 3.787311
Average loss at step 8000: 3.688992
Average loss at step 10000: 3.618184
Nearest to seven: eight, six, nine, four, three, five, zero, two,
Nearest to after: during, in, eine, inject, livers, bix, hindustan, chambers,
Nearest to first: franchisees, slovene, own, therapy, doublet, deathmatch, joined, bellows,
Nearest to can: could, may, will, would, homophonic, must, sherbrooke, hom,
Nearest to state: premium, intruding, coleco, calculations, lawmaking, filking, overestimated, genus,
Nearest to zero: nine, seven, eight, six, five, four, three, two,
Nearest to about: redundantly, yahweh, depressants, cardoso, merrill, recognisable, simplest, temptation,
Nearest to not: also, they, warlike, bankrupt, to, often, chukchi, majapahit,
Nearest to that: which, ecclesiastica, aon, oleg, it, disrupted, could, health,
Nearest to been: graphene, be, by, chipmunks, strident, earthquake, detainee, was,
Nearest to was: is, were, be, by, has, had, are, jabber,
Nearest to united: infects, darmstadt, geologically, watchmen, face, ramsey, ghent, caller,
Nearest to with: in, for, by, including, on, praying, defection, dominus,
Nearest to time: wields, eisner, base, prizes, snorri, if, eee, study,
Nearest to people: held, southside, decins, telco, remembrance, toland, barthes, walkie,
Nearest to between: plan, repeats, stabilizing, lcms, retorted, transcriptase, allure, jasper,
Average loss at step 12000: 3.603925
Average loss at step 14000: 3.566786
Average loss at step 16000: 3.411347
Average loss at step 18000: 3.456045
Average loss at step 20000: 3.540293
Nearest to seven: eight, five, six, four, three, nine, zero, two,
Nearest to after: during, from, eine, into, in, through, inject, livers,
Nearest to first: last, same, franchisees, second, deathmatch, johore, joined, own,
Nearest to can: may, could, will, would, must, to, should, veer,
Nearest to state: premium, genus, intruding, orwell, islamist, lawmaking, fleury, cascade,
Nearest to zero: five, six, seven, four, three, nine, eight, two,
Nearest to about: cardoso, yahweh, interned, redundantly, rinzai, buber, simplest, labouring,
Nearest to not: they, it, warlike, also, attributed, bankrupt, still, to,
Nearest to that: which, aon, gradients, but, what, shades, motivate, ecclesiastica,
Nearest to been: be, graphene, was, become, paraphrased, provoke, locks, by,
Nearest to was: is, were, has, had, when, became, be, wipo,
Nearest to united: darmstadt, infects, geologically, ramsey, ghent, cairns, caller, domitian,
Nearest to with: between, against, in, by, or, consequentialist, including, for,
Nearest to time: oversized, wields, nation, if, study, eisner, eee, vagus,
Nearest to people: held, southside, areas, decins, other, mace, remembrance, krona,
Nearest to between: plan, with, to, retorted, against, wooded, transcriptase, in,
Average loss at step 22000: 3.502595
Average loss at step 24000: 3.494614
Average loss at step 26000: 3.481209
Average loss at step 28000: 3.480234
Average loss at step 30000: 3.501897
Nearest to seven: eight, nine, six, four, five, three, one, zero,
Nearest to after: during, before, when, from, eine, in, if, into,
Nearest to first: last, second, same, building, franchisees, final, present, joined,
Nearest to can: may, could, will, would, must, might, should, cannot,
Nearest to state: premium, genus, inclined, eudes, ample, orwell, intruding, toru,
Nearest to zero: five, seven, eight, three, six, four, nine, two,
Nearest to about: redundantly, cardoso, simplest, labouring, yahweh, interned, last, hypocritical,
Nearest to not: they, warlike, still, it, majapahit, attributed, often, there,
Nearest to that: which, what, but, collated, ecclesiastica, aon, this, however,
Nearest to been: become, be, graphene, was, were, by, locks, recurrent,
Nearest to was: is, were, had, has, became, been, when, being,
Nearest to united: darmstadt, geologically, ramsey, infects, asian, ghent, domitian, cairns,
Nearest to with: between, including, against, in, for, by, under, remarque,
Nearest to time: nation, base, oversized, arabidopsis, if, wields, case, das,
Nearest to people: countries, areas, decins, southside, states, mace, krona, remembrance,
Nearest to between: with, plan, against, inspirations, wooded, in, elamites, stabilizing,
Average loss at step 32000: 3.502155
Average loss at step 34000: 3.496641
Average loss at step 36000: 3.453287
Average loss at step 38000: 3.300613
Average loss at step 40000: 3.426027
Nearest to seven: nine, six, eight, five, four, three, zero, one,
Nearest to after: before, during, when, from, roommates, lettering, through, contractions,
Nearest to first: last, second, same, next, final, reverence, present, vera,
Nearest to can: could, may, will, would, must, should, cannot, might,
Nearest to state: premium, government, genus, demigod, eudes, toru, rehabilitated, orwell,
Nearest to zero: seven, five, nine, six, eight, three, four, two,
Nearest to about: yahweh, simplest, interned, truso, redundantly, labouring, cardoso, on,
Nearest to not: they, it, still, only, warlike, linseed, often, never,
Nearest to that: which, what, this, gradients, however, where, because, collated,
Nearest to been: become, be, were, graphene, was, paraphrased, recurrent, locks,
Nearest to was: is, became, were, has, had, did, been, when,
Nearest to united: darmstadt, ramsey, geologically, asian, ghent, infects, domitian, seventeen,
Nearest to with: between, including, against, railtrack, under, using, achaeans, dominus,
Nearest to time: base, nation, oversized, way, enamoured, vagus, case, proclamation,
Nearest to people: tarn, decins, countries, remembrance, man, children, southside, sixtus,
Nearest to between: with, plan, against, into, inspirations, wooded, within, retorted,
Average loss at step 42000: 3.436612
Average loss at step 44000: 3.453883
Average loss at step 46000: 3.447457
Average loss at step 48000: 3.351052
Average loss at step 50000: 3.382114
Nearest to seven: six, eight, nine, four, five, three, one, zero,
Nearest to after: before, during, when, from, through, lettering, while, into,
Nearest to first: last, second, same, next, third, final, johore, fourth,
Nearest to can: could, may, will, would, must, should, cannot, might,
Nearest to state: premium, genus, demigod, government, rehabilitated, eudes, ample, minima,
Nearest to zero: seven, six, eight, four, five, nine, three, two,
Nearest to about: interned, lems, yahweh, sounded, simplest, truso, last, merrill,
Nearest to not: still, it, never, they, usually, frisbee, attributed, almost,
Nearest to that: which, what, however, collated, where, vein, aon, gradients,
Nearest to been: become, be, graphene, was, paraphrased, already, were, recurrent,
Nearest to was: is, were, has, became, had, be, been, by,
Nearest to united: darmstadt, asian, ramsey, geologically, ghent, landlord, domitian, seventeen,
Nearest to with: between, by, including, against, gavrilo, under, inoculated, digitally,
Nearest to time: year, season, way, tiananmen, enamoured, case, if, nation,
Nearest to people: children, countries, men, decins, areas, players, mace, individuals,
Nearest to between: with, against, into, wooded, in, plan, including, within,
Average loss at step 52000: 3.442596
Average loss at step 54000: 3.424637
Average loss at step 56000: 3.440898
Average loss at step 58000: 3.396812
Average loss at step 60000: 3.393040
Nearest to seven: four, eight, six, five, nine, three, zero, two,
Nearest to after: before, during, from, while, when, through, lettering, without,
Nearest to first: last, second, next, same, third, final, only, fourth,
Nearest to can: could, may, would, will, must, cannot, should, might,
Nearest to state: premium, government, demigod, rehabilitated, ample, eudes, student, genus,
Nearest to zero: five, four, eight, seven, six, nine, three, two,
Nearest to about: interned, on, sounded, yahweh, truso, lems, telling, over,
Nearest to not: still, they, never, warlike, now, usually, it, almost,
Nearest to that: which, what, this, superscript, however, gradients, aon, where,
Nearest to been: become, be, was, were, recurrent, graphene, had, already,
Nearest to was: is, had, became, were, has, been, be, when,
Nearest to united: darmstadt, ramsey, asian, domitian, geologically, landlord, usa, ghent,
Nearest to with: between, including, by, heisei, cantatas, coptic, into, camillo,
Nearest to time: enamoured, season, case, nation, year, oversized, base, period,
Nearest to people: children, men, those, women, tarn, players, countries, millions,
Nearest to between: with, within, wooded, into, against, including, plan, inspirations,
Average loss at step 62000: 3.240554
Average loss at step 64000: 3.248923
Average loss at step 66000: 3.403711
Average loss at step 68000: 3.394813
Average loss at step 70000: 3.360279
Nearest to seven: eight, six, nine, four, five, three, zero, one,
Nearest to after: before, during, when, while, through, without, from, until,
Nearest to first: last, second, next, third, same, final, fourth, original,
Nearest to can: may, could, will, would, must, cannot, should, might,
Nearest to state: rehabilitated, government, demigod, premium, creationism, states, omens, eudes,
Nearest to zero: five, four, eight, six, seven, nine, three, two,
Nearest to about: over, interned, on, phonemically, telling, that, lems, sounded,
Nearest to not: still, never, generally, now, warlike, almost, illnesses, they,
Nearest to that: which, however, this, what, gradients, where, aon, typically,
Nearest to been: become, be, was, were, had, paraphrased, already, graphene,
Nearest to was: is, has, had, were, became, been, be, when,
Nearest to united: darmstadt, asian, domitian, ramsey, apathetic, geologically, tentacle, clovis,
Nearest to with: between, including, remarque, hexafluoride, by, industrialized, rajiv, coptic,
Nearest to time: season, enamoured, year, way, case, period, base, nation,
Nearest to people: men, children, women, tarn, students, eureka, members, players,
Nearest to between: with, within, wooded, loads, against, into, among, in,
Average loss at step 72000: 3.373470
Average loss at step 74000: 3.347211
Average loss at step 76000: 3.310390
Average loss at step 78000: 3.351412
Average loss at step 80000: 3.379233
Nearest to seven: six, eight, four, nine, five, three, one, zero,
Nearest to after: before, during, when, until, without, while, despite, within,
Nearest to first: last, second, third, next, same, final, musicbrainz, only,
Nearest to can: could, may, will, would, must, cannot, should, might,
Nearest to state: government, rehabilitated, premium, city, states, omens, creationism, demigod,
Nearest to zero: seven, five, four, eight, six, three, nine, pmid,
Nearest to about: telling, interned, approximately, merrill, unconsciousness, lems, yahweh, over,
Nearest to not: still, never, actually, generally, warlike, attributed, almost, thant,
Nearest to that: which, however, where, vein, this, aon, jamo, banisteriopsis,
Nearest to been: become, be, were, already, was, graphene, paraphrased, had,
Nearest to was: is, became, were, has, had, be, been, apteryx,
Nearest to united: darmstadt, ramsey, domitian, clovis, tentacle, usa, consonances, apathetic,
Nearest to with: between, in, including, coptic, rajiv, mere, demeanor, by,
Nearest to time: year, season, period, enamoured, nation, proclamation, guernica, times,
Nearest to people: men, children, women, students, members, tarn, man, eureka,
Nearest to between: with, within, among, in, into, against, elamites, inspirations,
Average loss at step 82000: 3.406134
Average loss at step 84000: 3.409493
Average loss at step 86000: 3.390149
Average loss at step 88000: 3.351411
Average loss at step 90000: 3.363954
Nearest to seven: six, eight, five, nine, four, three, one, zero,
Nearest to after: before, during, when, while, until, without, through, within,
Nearest to first: last, second, next, third, same, rte, final, fourth,
Nearest to can: could, may, will, must, would, cannot, should, might,
Nearest to state: government, city, creationism, rehabilitated, states, premium, acclamation, zubrin,
Nearest to zero: five, eight, seven, six, nine, four, three, pmid,
Nearest to about: interned, on, over, telling, phonemically, lems, unconsciousness, courteous,
Nearest to not: still, actually, generally, snacks, almost, never, chukchi, they,
Nearest to that: which, however, collated, aon, what, hostage, vein, zapatista,
Nearest to been: become, be, already, was, were, paraphrased, recurrent, stanhope,
Nearest to was: is, became, had, were, has, be, been, cowpox,
Nearest to united: darmstadt, usa, ramsey, asian, consonances, domitian, plein, clovis,
Nearest to with: between, in, recoup, into, including, from, rajiv, by,
Nearest to time: season, enamoured, oversized, year, period, appointments, title, tf,
Nearest to people: women, children, man, men, students, tarn, players, eureka,
Nearest to between: with, within, in, among, into, against, from, through,
Average loss at step 92000: 3.399632
Average loss at step 94000: 3.248714
Average loss at step 96000: 3.362181
Average loss at step 98000: 3.242620
Average loss at step 100000: 3.357733
Nearest to seven: eight, six, four, five, nine, three, two, zero,
Nearest to after: before, during, when, without, while, until, through, at,
Nearest to first: last, second, next, third, final, fourth, same, previous,
Nearest to can: may, could, will, must, would, cannot, should, might,
Nearest to state: government, city, rehabilitated, zubrin, states, eastward, burger, premium,
Nearest to zero: five, eight, six, four, seven, nine, three, pmid,
Nearest to about: unconsciousness, telling, mukherjee, interned, ayling, yahweh, blessings, lems,
Nearest to not: never, still, actually, almost, chukchi, warlike, legally, generally,
Nearest to that: which, however, what, who, aon, superscript, pacing, when,
Nearest to been: become, be, already, was, paraphrased, recurrent, being, newly,
Nearest to was: is, became, has, had, were, been, becomes, descriptors,
Nearest to united: darmstadt, domitian, consonances, usa, asian, clovis, plein, nato,
Nearest to with: between, including, into, in, when, cavalier, efnet, hexafluoride,
Nearest to time: season, year, oversized, disruption, toole, times, disarray, way,
Nearest to people: children, men, players, women, man, students, tarn, jews,
Nearest to between: with, within, among, against, into, through, inspirations, hexafluoride,

In [47]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [48]:
%matplotlib inline

def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)



Problem

An alternative to skip-gram is another Word2Vec model called CBOW (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.
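
The cells below take a simplified route: they reuse generate_batch and swap the inputs and labels, so each context word individually predicts its center word. A fuller CBOW batch generator might instead hand the whole context window to the model so that the looked-up embeddings can be summed or averaged (e.g. with tf.reduce_mean over axis 1 after tf.nn.embedding_lookup). The sketch below is only a suggestion and is not used by the graph defined above; feeding it would also require a train_dataset placeholder of shape [batch_size, 2 * skip_window].

# Sketch of a CBOW batch generator (not part of the original notebook).
# Each row of `context` holds the 2 * skip_window words around one center word,
# and `labels` holds that center word.
def generate_cbow_batch(batch_size, skip_window):
  global data_index
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  context = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size):
    # every position in the window except the center becomes context
    context[i, :] = [buffer[j] for j in range(span) if j != skip_window]
    labels[i, 0] = buffer[skip_window]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return context, labels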



In [52]:
# For CBOW we switch batch output and context label:

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    # NEW: switch batch and labels around!
    # batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    labels, batch = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    # NEW: reshaping needs to happen with batch now as that is a (batch_size,1) vector
    #print('    batch:', [reverse_dictionary[bi] for bi in batch])
    #print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
    print('    batch:', [reverse_dictionary[bi] for bi in batch.reshape(8)])
    print('    labels:', [reverse_dictionary[li] for li in labels])


with num_skips = 2 and skip_window = 1:
    batch: ['as', 'anarchism', 'a', 'originated', 'term', 'as', 'a', 'of']
    labels: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']

with num_skips = 4 and skip_window = 2:
    batch: ['originated', 'term', 'a', 'anarchism', 'originated', 'term', 'as', 'of']
    labels: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']

In [54]:
batch_size = 128
num_skips = 2
skip_window = 1

num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    # NEW: switch batch and labels around, as in the cell above, so the context
    # words become the inputs and the center words the labels:
    batch_labels, batch_data = generate_batch(batch_size, num_skips, skip_window)
    # NEW: Need to explicitly reshape dataset and labels following the switch:
    #feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    feed_dict = {train_dataset : batch_data.reshape(batch_size), train_labels : batch_labels.reshape(batch_size, 1)}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step 0: 8.077976
Nearest to seven: cimbri, class, cats, sheaf, amit, angelica, michaux, jia,
Nearest to after: humorists, eventual, weighed, radiotelephone, operated, jungles, dai, supernovae,
Nearest to first: safavid, terminal, openness, sailboat, aulus, rests, medulla, emperors,
Nearest to can: disclaimer, rollo, genes, israelis, weasel, toads, angels, methodists,
Nearest to state: composer, warcraft, npd, goodbye, handwritten, patsy, galois, artform,
Nearest to zero: citizen, flocked, rcaf, compensatory, dangers, probability, cade, teborg,
Nearest to about: arexx, suppressing, dances, svc, macdonald, satisfying, wright, marco,
Nearest to not: peacetime, criticized, deputies, erection, jahn, psychic, madame, counterrevolutionary,
Nearest to that: zach, responding, melancholic, aliases, dilemma, archibald, commander, flick,
Nearest to been: warren, mehr, mexicana, talks, abbreviate, ich, banister, storm,
Nearest to was: paddy, poetically, dicke, farrell, ghent, cope, northern, transwomen,
Nearest to united: mints, dalmatian, tong, kashubians, si, inclusive, bazin, militiamen,
Nearest to with: diffuses, conveying, valencian, austrofascism, emanated, farouk, pusan, stinging,
Nearest to time: doses, completion, jekyll, proletarians, belarusian, motif, multiprocessor, floats,
Nearest to people: week, slams, bottleneck, tley, lder, hometown, recreational, tong,
Nearest to between: ahasuerus, laminated, qi, obliquely, fernand, sympathy, mixolydian, bombs,
Average loss at step 2000: 4.362734
Average loss at step 4000: 3.863462
Average loss at step 6000: 3.791149
Average loss at step 8000: 3.693276
Average loss at step 10000: 3.615714
Nearest to seven: six, eight, four, five, nine, three, two, zero,
Nearest to after: before, humorists, on, in, jamaicans, with, takeovers, weighed,
Nearest to first: unjustly, alluding, safavid, terminal, mandeville, frescoes, aulus, invalidate,
Nearest to can: may, must, sidecar, would, will, wanna, allege, testing,
Nearest to state: goodbye, grilled, nitride, artform, galois, bunch, serialism, austerlitz,
Nearest to zero: six, nine, five, eight, seven, four, three, two,
Nearest to about: svc, inspected, itu, dances, communist, fairness, arexx, nage,
Nearest to not: it, still, they, we, yahweh, to, adenylate, peacetime,
Nearest to that: which, it, responding, never, this, aliases, same, cal,
Nearest to been: was, pharisees, speght, warren, tip, mehr, by, sean,
Nearest to was: is, had, were, has, are, elixir, did, by,
Nearest to united: mints, si, hume, tong, dalmatian, remixes, lack, legend,
Nearest to with: in, into, by, brzezinski, while, after, lever, uplifted,
Nearest to time: doses, jekyll, completion, technician, jdk, corridors, janet, unfounded,
Nearest to people: sunglasses, discredited, grit, hometown, quadrilateral, hauling, bottleneck, misr,
Nearest to between: ahasuerus, in, obliquely, photographer, lustrous, against, statutory, bombs,
Average loss at step 12000: 3.604235
Average loss at step 14000: 3.569135
Average loss at step 16000: 3.409211
Average loss at step 18000: 3.453756
Average loss at step 20000: 3.535929
Nearest to seven: eight, six, five, four, nine, three, two, zero,
Nearest to after: before, for, when, from, during, require, elixir, if,
Nearest to first: rearden, unjustly, second, funnel, frescoes, last, main, manaus,
Nearest to can: may, must, would, will, could, should, might, sidecar,
Nearest to state: artform, goodbye, bunch, grilled, subsumed, period, national, sites,
Nearest to zero: five, six, seven, four, three, nine, eight, two,
Nearest to about: discontinued, itu, fairness, inspected, off, svc, macdonald, hellenic,
Nearest to not: still, adenylate, almost, it, we, also, to, they,
Nearest to that: which, but, what, rtp, same, invaluable, this, responding,
Nearest to been: be, pharisees, slick, was, tip, baptismal, were, by,
Nearest to was: is, were, be, has, had, did, are, sami,
Nearest to united: si, remixes, mints, hume, lack, roofed, tong, beyond,
Nearest to with: in, between, by, using, while, and, miko, clout,
Nearest to time: doses, jekyll, technician, jdk, corridors, misunderstandings, year, way,
Nearest to people: grit, hauling, years, hometown, zygote, oxidation, airflow, chipmunks,
Nearest to between: from, against, in, ahasuerus, with, obliquely, corticosteroids, donating,
Average loss at step 22000: 3.502499
Average loss at step 24000: 3.491139
Average loss at step 26000: 3.483423
Average loss at step 28000: 3.478860
Average loss at step 30000: 3.504298
Nearest to seven: eight, six, nine, five, four, three, two, zero,
Nearest to after: before, during, when, from, for, following, mexican, takeovers,
Nearest to first: second, last, next, rearden, unjustly, manaus, main, sulawesi,
Nearest to can: may, would, could, will, must, should, might, to,
Nearest to state: bunch, artform, period, city, grilled, goodbye, googol, adverts,
Nearest to zero: five, eight, four, seven, six, nine, three, two,
Nearest to about: discontinued, fairness, off, itu, macdonald, over, proclaiming, artifacts,
Nearest to not: still, almost, adenylate, they, yahweh, always, generally, never,
Nearest to that: which, what, this, but, where, sinauer, crist, rtp,
Nearest to been: be, become, was, pharisees, were, slick, tip, baptismal,
Nearest to was: is, had, were, has, be, became, been, condense,
Nearest to united: beyond, legend, mints, remixes, hume, pronged, lack, roofed,
Nearest to with: between, in, including, while, for, using, when, by,
Nearest to time: doses, surely, jekyll, corridors, year, jdk, troughs, sierpinski,
Nearest to people: hauling, years, grit, airflow, those, shorty, oxidation, quadrilateral,
Nearest to between: with, against, from, ahasuerus, corticosteroids, in, macrobiotic, on,
Average loss at step 32000: 3.496633
Average loss at step 34000: 3.495910
Average loss at step 36000: 3.452565
Average loss at step 38000: 3.304704
Average loss at step 40000: 3.426030
Nearest to seven: eight, six, nine, five, four, three, two, one,
Nearest to after: before, during, when, despite, through, numismatic, mexican, from,
Nearest to first: second, next, last, unjustly, manaus, name, sulawesi, outsold,
Nearest to can: may, will, would, could, must, should, might, cannot,
Nearest to state: city, artform, bunch, grilled, bumps, goodbye, period, wordperfect,
Nearest to zero: five, seven, six, nine, eight, four, two, three,
Nearest to about: over, discontinued, nicomedia, age, approximately, nicephorus, buson, kwan,
Nearest to not: almost, still, never, it, always, adenylate, often, entertained,
Nearest to that: which, what, however, this, but, where, if, tincture,
Nearest to been: was, become, be, were, pharisees, slick, already, baptismal,
Nearest to was: is, became, were, has, been, had, did, be,
Nearest to united: legend, pronged, hume, beyond, remixes, hangman, lack, mints,
Nearest to with: between, while, using, including, by, in, ccds, hazardous,
Nearest to time: nodes, jekyll, nightfall, corridors, year, doses, holder, surely,
Nearest to people: hauling, men, chipmunks, hometown, children, those, shorty, airflow,
Nearest to between: against, with, ahasuerus, from, among, corticosteroids, into, across,
Average loss at step 42000: 3.433749
Average loss at step 44000: 3.454694
Average loss at step 46000: 3.446434
Average loss at step 48000: 3.355413
Average loss at step 50000: 3.379580
Nearest to seven: six, eight, nine, three, four, five, zero, two,
Nearest to after: before, when, during, while, despite, if, through, without,
Nearest to first: second, next, last, third, only, fourth, sulawesi, unjustly,
Nearest to can: may, would, could, will, must, should, might, cannot,
Nearest to state: goodbye, artform, bunch, city, realise, bumps, cambridge, grilled,
Nearest to zero: eight, seven, five, four, nine, six, two, three,
Nearest to about: discontinued, over, artifacts, age, approximately, nicephorus, on, declares,
Nearest to not: almost, still, never, generally, always, jahn, usually, now,
Nearest to that: which, however, what, where, tincture, this, because, too,
Nearest to been: be, become, were, pharisees, was, already, sean, had,
Nearest to was: is, has, were, became, had, be, by, being,
Nearest to united: legend, pronged, hume, hangman, imac, roofed, debtors, excision,
Nearest to with: between, including, by, while, when, clout, using, against,
Nearest to time: year, jekyll, way, nodes, nightfall, surely, period, corridors,
Nearest to people: men, children, hauling, chipmunks, words, grit, hometown, women,
Nearest to between: with, against, across, among, including, corticosteroids, bombs, ahasuerus,
Average loss at step 52000: 3.436884
Average loss at step 54000: 3.425748
Average loss at step 56000: 3.441896
Average loss at step 58000: 3.394205
Average loss at step 60000: 3.393061
Nearest to seven: eight, six, four, five, nine, three, zero, two,
Nearest to after: before, during, when, despite, while, without, through, if,
Nearest to first: second, next, last, same, only, sulawesi, third, kilobyte,
Nearest to can: may, could, would, must, will, should, might, cannot,
Nearest to state: states, artform, city, bunch, wordperfect, government, realise, period,
Nearest to zero: five, four, seven, eight, six, nine, three, two,
Nearest to about: discontinued, over, iso, carla, nicomedia, artifacts, approximately, horseshoes,
Nearest to not: still, almost, never, usually, generally, now, jahn, always,
Nearest to that: which, what, however, this, there, it, misskelley, where,
Nearest to been: become, be, was, were, pharisees, sean, had, recently,
Nearest to was: is, became, had, were, has, be, vliw, been,
Nearest to united: legend, pronged, hangman, hume, excision, roofed, beyond, lindy,
Nearest to with: between, including, using, when, by, in, while, clout,
Nearest to time: jekyll, way, year, nodes, period, term, surely, watford,
Nearest to people: men, children, women, others, years, hauling, those, players,
Nearest to between: with, against, in, among, including, into, across, corticosteroids,
Average loss at step 62000: 3.239926
Average loss at step 64000: 3.254494
Average loss at step 66000: 3.399705
Average loss at step 68000: 3.388807
Average loss at step 70000: 3.361463
Nearest to seven: eight, six, four, nine, five, three, two, zero,
Nearest to after: before, during, despite, while, when, through, without, if,
Nearest to first: second, next, last, same, third, implements, original, fourth,
Nearest to can: may, could, would, will, must, should, might, cannot,
Nearest to state: states, government, city, bumps, artform, bunch, grilled, wordperfect,
Nearest to zero: five, four, six, eight, seven, three, nine, two,
Nearest to about: discontinued, approximately, over, carla, artifacts, nicephorus, inclusion, around,
Nearest to not: still, never, almost, now, usually, generally, jahn, therefore,
Nearest to that: which, what, this, however, supplementation, but, where, sinauer,
Nearest to been: be, become, were, was, sean, pharisees, recently, already,
Nearest to was: is, had, has, were, became, be, been, vliw,
Nearest to united: pronged, hangman, legend, hume, lindy, excision, roofed, lawful,
Nearest to with: using, including, between, clout, in, when, containing, while,
Nearest to time: jekyll, period, year, way, surely, nodes, doses, term,
Nearest to people: men, women, children, others, those, individuals, players, hauling,
Nearest to between: with, from, among, against, across, in, donating, within,
Average loss at step 72000: 3.372562
Average loss at step 74000: 3.347585
Average loss at step 76000: 3.317123
Average loss at step 78000: 3.352588
Average loss at step 80000: 3.378135
Nearest to seven: eight, six, nine, five, four, three, zero, one,
Nearest to after: before, during, when, despite, while, without, through, until,
Nearest to first: second, last, next, same, third, final, only, during,
Nearest to can: could, may, will, would, must, should, cannot, might,
Nearest to state: government, city, states, bumps, hydrofoils, beast, panoramas, wordperfect,
Nearest to zero: seven, five, six, eight, three, nine, four, two,
Nearest to about: approximately, discontinued, over, artifacts, nicephorus, carla, age, inauguration,
Nearest to not: still, never, almost, generally, it, we, now, yahweh,
Nearest to that: which, however, what, this, but, sinauer, kimono, concerning,
Nearest to been: be, become, was, recently, were, sean, pharisees, had,
Nearest to was: is, became, had, were, has, been, condense, mysteriously,
Nearest to united: hangman, legend, pronged, hume, soviet, retention, avenged, excision,
Nearest to with: including, between, when, clout, in, tow, bloomberg, regatta,
Nearest to time: year, jekyll, nodes, way, ergo, period, surely, commented,
Nearest to people: men, children, women, individuals, those, others, parents, words,
Nearest to between: across, among, against, within, with, through, donating, including,
Average loss at step 82000: 3.409103
Average loss at step 84000: 3.404700
Average loss at step 86000: 3.392101
Average loss at step 88000: 3.348653
Average loss at step 90000: 3.363007
Nearest to seven: eight, six, four, nine, five, three, one, two,
Nearest to after: before, during, despite, while, when, without, through, if,
Nearest to first: second, last, next, original, same, final, sulawesi, third,
Nearest to can: could, may, must, will, should, might, would, cannot,
Nearest to state: government, city, states, bunch, bumps, indignant, artform, beast,
Nearest to zero: five, seven, nine, eight, six, four, three, two,
Nearest to about: over, discontinued, approximately, nicomedia, moyers, nicephorus, artifacts, carla,
Nearest to not: still, now, never, almost, we, they, therefore, otherwise,
Nearest to that: which, what, however, mischief, sinauer, who, concerning, trintignant,
Nearest to been: become, be, was, sean, were, recently, pharisees, previously,
Nearest to was: is, had, became, were, be, has, been, sami,
Nearest to united: hangman, legend, pronged, soviet, retention, hume, excision, diagnosing,
Nearest to with: between, including, using, in, by, without, ginette, clout,
Nearest to time: jekyll, period, year, nodes, way, ergo, commented, alberti,
Nearest to people: men, children, women, parents, individuals, airflow, christians, hauling,
Nearest to between: with, across, among, against, within, through, into, corticosteroids,
Average loss at step 92000: 3.395613
Average loss at step 94000: 3.250082
Average loss at step 96000: 3.358450
Average loss at step 98000: 3.247167
Average loss at step 100000: 3.357481
Nearest to seven: four, eight, nine, six, five, three, zero, two,
Nearest to after: before, during, despite, when, while, without, through, following,
Nearest to first: next, second, last, third, final, original, sulawesi, same,
Nearest to can: could, may, must, should, would, might, will, cannot,
Nearest to state: city, states, bumps, government, baptism, oversee, indignant, seimas,
Nearest to zero: five, four, seven, eight, six, nine, three, two,
Nearest to about: approximately, discontinued, over, around, nicephorus, min, moyers, route,
Nearest to not: still, never, almost, now, actually, t, they, usually,
Nearest to that: which, what, however, sinauer, who, this, but, supplementation,
Nearest to been: become, be, was, sean, recently, already, pharisees, were,
Nearest to was: is, became, were, has, been, had, when, sami,
Nearest to united: hangman, legend, pronged, senate, ottoman, soviet, retention, avenged,
Nearest to with: including, between, using, when, in, of, transclusion, lever,
Nearest to time: way, year, jekyll, nodes, commented, doses, period, life,
Nearest to people: men, children, women, individuals, those, parents, players, jews,
Nearest to between: across, with, among, against, through, corticosteroids, within, in,

In [55]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [56]:
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)


Additional notes

1. word2vec training batch generation

A closer look at batch handling. Each call to generate_batch produces batch_size training samples; with num_skips set to 2, every input word appears twice in the batch. The data starts with the sentence "anarchism originated as a term of abuse". Running generate_batch with skip_window of 1 and num_skips of 2 gives the standard skip-gram pairing, where each label is a context word (left or right) of the corresponding batch word:

batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
labels: ['as', 'anarchism', 'originated', 'a', 'term', 'as', 'a', 'of']
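
Each call also advances data_index by span + batch_size // num_skips words: the initial window fill consumes span words and each of the batch_size // num_skips center words consumes one more. A quick check of that arithmetic (a hypothetical snippet, not in the original notebook) for the settings used in the next cell:

# With batch_size = 8, num_skips = 2, skip_window = 1 (as in the cell below):
span = 2 * 1 + 1          # window size: [ skip_window target skip_window ]
print(span + 8 // 2)      # 7, matching the data_index trace below: 0, 7, 14, 21, 28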

In [49]:
batch_size = 8
num_skips = 2
skip_window = 1
data_index = 0
for i in range(5):
    print("--- minibatch %d ---" % i)
    print('data_index',data_index)
    batch, labels = generate_batch(batch_size, num_skips, skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (2, 1))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


--- minibatch 0 ---
data_index 0

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'originated', 'a', 'term', 'as', 'a', 'of']
--- minibatch 1 ---
data_index 7

with num_skips = 2 and skip_window = 1:
    batch: ['used', 'used', 'against', 'against', 'early', 'early', 'working', 'working']
    labels: ['first', 'against', 'early', 'used', 'against', 'working', 'early', 'class']
--- minibatch 2 ---
data_index 14

with num_skips = 2 and skip_window = 1:
    batch: ['the', 'the', 'diggers', 'diggers', 'of', 'of', 'the', 'the']
    labels: ['including', 'diggers', 'the', 'of', 'the', 'diggers', 'english', 'of']
--- minibatch 3 ---
data_index 21

with num_skips = 2 and skip_window = 1:
    batch: ['the', 'the', 'sans', 'sans', 'UNK', 'UNK', 'of', 'of']
    labels: ['sans', 'and', 'UNK', 'the', 'sans', 'of', 'the', 'UNK']
--- minibatch 4 ---
data_index 28

with num_skips = 2 and skip_window = 1:
    batch: ['whilst', 'whilst', 'the', 'the', 'term', 'term', 'is', 'is']
    labels: ['revolution', 'the', 'whilst', 'term', 'the', 'is', 'still', 'term']

2. CBOW training batch generation

For the CBOW (continuous bag of words) model, we swap the batch and the context (label), so each label now shows the center word that the corresponding context word in the batch should predict:


In [50]:
batch_size = 8
num_skips = 2
skip_window = 1
data_index = 0
for i in range(5):
    print("--- minibatch %d ---" % i)
    print('data_index',data_index)
    labels, batch = generate_batch(batch_size, num_skips, skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (2, 1))
    print('    batch:', [reverse_dictionary[bi] for bi in batch.reshape(8)])
    print('    labels:', [reverse_dictionary[li] for li in labels])


--- minibatch 0 ---
data_index 0

with num_skips = 2 and skip_window = 1:
    batch: ['as', 'anarchism', 'a', 'originated', 'as', 'term', 'a', 'of']
    labels: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
--- minibatch 1 ---
data_index 7

with num_skips = 2 and skip_window = 1:
    batch: ['against', 'first', 'early', 'used', 'working', 'against', 'class', 'early']
    labels: ['used', 'used', 'against', 'against', 'early', 'early', 'working', 'working']
--- minibatch 2 ---
data_index 14

with num_skips = 2 and skip_window = 1:
    batch: ['diggers', 'including', 'of', 'the', 'diggers', 'the', 'of', 'english']
    labels: ['the', 'the', 'diggers', 'diggers', 'of', 'of', 'the', 'the']
--- minibatch 3 ---
data_index 21

with num_skips = 2 and skip_window = 1:
    batch: ['and', 'sans', 'UNK', 'the', 'of', 'sans', 'the', 'UNK']
    labels: ['the', 'the', 'sans', 'sans', 'UNK', 'UNK', 'of', 'of']
--- minibatch 4 ---
data_index 28

with num_skips = 2 and skip_window = 1:
    batch: ['the', 'revolution', 'whilst', 'term', 'is', 'the', 'still', 'term']
    labels: ['whilst', 'whilst', 'the', 'the', 'term', 'term', 'is', 'is']

3. valid_examples

Final note: valid_examples should give you an array of 16 random indices of common words (all less than 100):


In [51]:
print(valid_examples)
print([reverse_dictionary[ex] for ex in valid_examples])


[23 61 46 53 94  8 79 38 20 54 18 86 24 65 83 78]
['seven', 'after', 'first', 'can', 'state', 'zero', 'about', 'not', 'that', 'been', 'was', 'united', 'with', 'time', 'people', 'between']