This notebook trains a Word2Vec skip-gram model on the text8 dataset. It is based on TensorFlow's basic word2vec example code, with the additional use of a SummaryWriter so that we can track training progress in TensorBoard. The same model is also available as a standalone script.


In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import time
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from sklearn.manifold import TSNE

Download the data from the source website if necessary.


In [0]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

Read the data into a list of words.


In [0]:
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
  
words = read_data(filename)
print('Data size %d' % len(words))


Data size 17005207

Build the dictionary and replace rare words with the UNK token.


In [0]:
vocabulary_size = 50000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
del words  # Hint to reduce memory.


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5244, 3084, 12, 6, 195, 2, 3136, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
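
As a quick check of the two mappings returned by build_dataset, we can round-trip a few words through dictionary and reverse_dictionary. This is an illustrative sketch only; the example words are arbitrary, and dictionary.get(word, 0) mirrors the UNK handling above, since index 0 is reserved for UNK.


In [0]:
# Illustrative round-trip through the vocabulary mappings.
for word in ['anarchism', 'the', 'qwertyuiop']:  # the last is presumably out of vocabulary
  index = dictionary.get(word, 0)  # 0 == dictionary['UNK']
  print('%s -> %d -> %s' % (word, index, reverse_dictionary[index]))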

Function to generate a training batch for the skip-gram model.


In [0]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
  data_index = 0
  batch, labels = generate_batch(
      batch_size=8, num_skips=num_skips, skip_window=skip_window)
  print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
  print('    batch:', [reverse_dictionary[bi] for bi in batch])
  print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['as', 'anarchism', 'originated', 'a', 'term', 'as', 'of', 'a']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['originated', 'anarchism', 'a', 'term', 'term', 'of', 'as', 'originated']
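
The pairing behind these examples can also be checked programmatically: each center word is repeated num_skips times in the batch, and every label is drawn from the window of skip_window words on either side of that center in the data stream. The cell below is an illustrative check only, re-running generate_batch for the first parameter setting with data_index reset to zero.


In [0]:
# Illustrative check of the generate_batch behaviour described above.
data_index = 0
skip_window, num_skips = 1, 2
batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
for i in range(8):
  group = i // num_skips
  center_pos = group + skip_window  # position of this group's center word in data
  assert batch[i] == data[center_pos]
  window = (data[center_pos - skip_window:center_pos] +
            data[center_pos + 1:center_pos + skip_window + 1])
  assert labels[i, 0] in window  # every label lies within the center's window
print('generate_batch pairs each center word with words from its window.')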

Build and train a skip-gram model.


In [0]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of a missing GPU implementation.
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                     num_sampled, vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Define info to be used by the SummaryWriter. This will let TensorBoard
  # plot loss values during the training process.
  loss_summary = tf.scalar_summary("loss", loss)
  train_summary_op = tf.merge_summary([loss_summary])

  # Add variable initializer.
  init = tf.initialize_all_variables()
  print("finished building graph.")


finished building graph.
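
Because the embedding matrix is divided by its row norms before the matmul, the similarity op above computes cosine similarity between the validation words and the whole vocabulary. The numpy cell below is a small, self-contained illustration of that identity using made-up vectors; it does not touch the graph.


In [0]:
# Made-up vectors: after L2-normalization, a plain dot product equals cosine similarity.
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
a_hat = a / np.linalg.norm(a)
b_hat = b / np.linalg.norm(b)
print(cosine, np.dot(a_hat, b_hat))  # the two printed values match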

In [0]:
# Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print("Initialized")

  # Directory in which to write summary information.
  # You can point TensorBoard to this directory via:
  # $ tensorboard --logdir=/tmp/word2vec_basic/summaries
  # TensorFlow assumes this directory already exists, so we need to create it.
  timestamp = str(int(time.time()))
  if not os.path.exists(os.path.join("/tmp/word2vec_basic",
                                     "summaries", timestamp)):
    os.makedirs(os.path.join("/tmp/word2vec_basic", "summaries", timestamp))
  # Create the SummaryWriter
  train_summary_writer = tf.train.SummaryWriter(
      os.path.join(
          "/tmp/word2vec_basic", "summaries", timestamp), session.graph)

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    # Also evaluate the training summary op.
    _, loss_val, tsummary = session.run(
        [optimizer, loss, train_summary_op],
        feed_dict=feed_dict)
    average_loss += loss_val
    # Write the evaluated summary info to the SummaryWriter. This info will
    # then show up in the TensorBoard events.
    train_summary_writer.add_summary(tsummary, step)

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()
  print("finished training.")


Initialized
Average loss at step  0 :  322.68536377
Nearest to between: inbound, frail, ground, thermochemistry, unaspirated, treachery, quickest, knuckle,
Nearest to his: samplers, celebrating, monophonic, sichuan, engravings, colleague, javan, campaigned,
Nearest to system: lagoons, oversee, impinging, wakefield, marcomanni, annals, sf, embroiled,
Nearest to to: minarchists, automate, cucumber, kornbluth, deviate, homegrown, informers, substituting,
Nearest to d: skilling, jozef, amassed, owners, dredging, pronouncement, bis, antinous,
Nearest to only: hee, wyatt, subkey, hypopituitarism, philosophie, cherry, bet, ancillary,
Nearest to this: dormitory, xxvi, yaqub, leaguers, refuted, hodge, disciplinarian, analyzing,
Nearest to of: oresund, bournemouth, carbine, ambulance, wardens, gnostics, catcher, rudy,
Nearest to i: domitian, colonel, scams, routine, connelly, contorted, hurt, decade,
Nearest to after: whigs, recruit, clarity, bijection, playmate, eucharist, knife, plot,
Nearest to war: damon, newnode, instantly, antelope, zoo, telegraphic, advaita, baraka,
Nearest to may: nostratic, kinase, ferraris, sentence, rang, lifeform, fraudulent, chart,
Nearest to eight: prokofiev, stabbed, alemannic, improve, dejima, cephalon, revealing, ordered,
Nearest to be: oriente, magnetization, websites, charterers, richland, smothers, dmu, enigma,
Nearest to up: erich, generis, vittorio, leland, horsepower, budge, anarchist, enders,
Nearest to new: selwyn, buckle, violations, abridged, deciphered, when, ddrmax, ref,
Average loss at step  2000 :  113.376254956
Average loss at step  4000 :  52.2611660848
Average loss at step  6000 :  33.3537528234
Average loss at step  8000 :  23.822204227
Average loss at step  10000 :  17.8010921574
Nearest to between: ground, reginae, cl, metres, projects, personality, sink, tackle,
Nearest to his: the, waite, rooted, victoriae, celebrating, colleague, analogue, agave,
Nearest to system: consistent, distinctive, rhetoric, grande, seed, impinging, dressing, vs,
Nearest to to: and, in, docked, cl, victoriae, abba, preparations, not,
Nearest to d: victoriae, owners, scant, reginae, combined, crust, two, orange,
Nearest to only: wyatt, philosophie, shall, reginae, roper, implicit, inches, cherry,
Nearest to this: a, the, reginae, services, thousand, firstly, categorized, authored,
Nearest to of: and, in, for, with, victoriae, pogrom, from, nine,
Nearest to i: responsible, decade, colonel, routine, orbitals, trinidad, left, montgomery,
Nearest to after: from, deposits, writing, off, plot, offering, atom, gollancz,
Nearest to war: cl, factors, heavily, beckwith, instantly, unpopular, poem, brisbane,
Nearest to may: nostratic, chart, abdali, holidays, sentence, adaptive, was, movement,
Nearest to eight: nine, reginae, gland, zero, vs, victoriae, agave, cl,
Nearest to be: dogmas, have, schopenhauer, spider, had, apollo, lunar, clear,
Nearest to up: cc, think, erich, anarchist, along, eleven, australis, ever,
Nearest to new: when, negative, deductive, arbitrary, periods, deciphered, bang, fitting,
Average loss at step  12000 :  14.0647679223
Average loss at step  14000 :  11.6428226123
Average loss at step  16000 :  9.99395846704
Average loss at step  18000 :  8.71788009775
Average loss at step  20000 :  7.90638680458
Nearest to between: ground, cosmology, from, in, reginae, cl, projects, four,
Nearest to his: the, its, waite, their, agave, victoriae, rooted, dasyprocta,
Nearest to system: oversee, dasyprocta, consistent, lagoons, annals, grande, distinctive, rhetoric,
Nearest to to: not, for, in, and, nine, docked, could, would,
Nearest to d: and, dasyprocta, b, victoriae, scant, reginae, crust, fantasy,
Nearest to only: wyatt, dasyprocta, shall, philosophie, roper, reginae, hee, tailed,
Nearest to this: a, the, which, dormitory, it, one, categorized, documentary,
Nearest to of: and, in, for, victoriae, agouti, nine, eight, dasyprocta,
Nearest to i: decade, UNK, responsible, overhead, colonel, ponce, bela, windmill,
Nearest to after: from, deposits, by, dasyprocta, yang, for, fellow, writing,
Nearest to war: cl, unpopular, serine, poem, heavily, beckwith, instantly, brisbane,
Nearest to may: nostratic, chart, adaptive, lauder, was, gigantic, nine, epistles,
Nearest to eight: nine, zero, five, six, four, seven, agouti, dasyprocta,
Nearest to be: have, was, is, dogmas, had, polyhedra, bend, by,
Nearest to up: cc, erich, think, eye, eleven, along, anarchist, nursery,
Nearest to new: when, negative, arbitrary, aboard, periods, buckle, deductive, fitting,
Average loss at step  22000 :  7.16769013131
Average loss at step  24000 :  6.91010405433
Average loss at step  26000 :  6.75107297122
Average loss at step  28000 :  6.17949648464
Average loss at step  30000 :  6.19571543956
Nearest to between: from, in, ground, cosmology, inbound, jts, abet, reginae,
Nearest to his: the, their, its, waite, s, a, rooted, agave,
Nearest to system: oversee, dasyprocta, judaea, annals, grande, lagoons, consistent, marcomanni,
Nearest to to: would, for, could, not, docked, in, can, nine,
Nearest to d: b, dasyprocta, and, victoriae, scant, reginae, abet, combined,
Nearest to only: wyatt, dasyprocta, hee, shall, roper, tailed, philosophie, reginae,
Nearest to this: a, which, it, the, dormitory, abitibi, that, vdash,
Nearest to of: in, and, for, from, victoriae, nine, dasyprocta, eight,
Nearest to i: decade, UNK, overhead, bela, responsible, ark, abet, four,
Nearest to after: from, by, for, dasyprocta, in, deposits, six, with,
Nearest to war: cl, nicolson, unpopular, shoulders, serine, damon, poem, brisbane,
Nearest to may: nostratic, will, would, can, chart, gigantic, adaptive, zero,
Nearest to eight: nine, six, seven, five, four, zero, three, agouti,
Nearest to be: have, was, is, were, by, are, bend, ung,
Nearest to up: cc, erich, think, eye, eleven, generis, provide, ernie,
Nearest to new: abitibi, when, pds, negative, deductive, aboard, periods, amalthea,
Average loss at step  32000 :  5.89053646111
Average loss at step  34000 :  5.83200742936
Average loss at step  36000 :  5.72352871478
Average loss at step  38000 :  5.27656394398
Average loss at step  40000 :  5.49755538332
Nearest to between: from, in, ground, cosmology, with, through, abet, inbound,
Nearest to his: their, its, the, waite, s, her, rooted, agave,
Nearest to system: oversee, lemmy, dasyprocta, grande, albury, annals, judaea, lagoons,
Nearest to to: would, can, could, nine, not, for, mishnayot, victoriae,
Nearest to d: b, UNK, dasyprocta, scant, victoriae, and, waterfalls, bis,
Nearest to only: wyatt, dasyprocta, hee, roper, shall, tailed, reginae, abet,
Nearest to this: the, it, which, that, a, dormitory, vdash, abitibi,
Nearest to of: in, for, victoriae, dasyprocta, agouti, abet, and, eight,
Nearest to i: UNK, decade, ii, bela, you, three, overhead, ark,
Nearest to after: from, recitative, dasyprocta, by, four, before, six, lemmy,
Nearest to war: cl, nicolson, unpopular, shoulders, brisbane, serine, damon, poem,
Nearest to may: can, will, would, nostratic, chart, gigantic, must, zero,
Nearest to eight: nine, six, seven, five, four, zero, three, agouti,
Nearest to be: have, were, is, was, by, are, bend, as,
Nearest to up: cc, recitative, provide, them, think, erich, eye, dresden,
Nearest to new: abitibi, pds, amalthea, deductive, negative, when, igor, fitting,
Average loss at step  42000 :  5.31733204591
Average loss at step  44000 :  5.29860926902
Average loss at step  46000 :  5.28137998092
Average loss at step  48000 :  5.00739693534
Average loss at step  50000 :  5.16847843921
Nearest to between: from, with, in, ground, cosmology, through, critic, to,
Nearest to his: their, its, the, her, s, waite, rooted, agave,
Nearest to system: oversee, grande, lemmy, lagoons, dasyprocta, consistent, judaea, antibody,
Nearest to to: could, would, can, manipulated, not, cloaca, docked, mishnayot,
Nearest to d: b, UNK, dasyprocta, scant, waterfalls, victoriae, bis, crust,
Nearest to only: wyatt, roper, dasyprocta, hee, tailed, shall, shaw, merchants,
Nearest to this: which, it, the, that, abitibi, agouti, one, nguni,
Nearest to of: and, in, victoriae, for, nine, dasyprocta, agouti, seven,
Nearest to i: you, three, we, ii, solicitation, sickness, bela, abdulaziz,
Nearest to after: from, before, four, six, recitative, by, dasyprocta, when,
Nearest to war: cl, nicolson, unpopular, shoulders, damon, brisbane, serine, thibetanus,
Nearest to may: can, will, would, nostratic, must, chart, could, gigantic,
Nearest to eight: six, nine, seven, five, four, three, zero, agouti,
Nearest to be: have, was, is, were, are, by, bend, been,
Nearest to up: cc, them, dresden, provide, eye, recitative, oscillators, erich,
Nearest to new: abitibi, pds, deductive, negative, aboard, fitting, amalthea, igor,
Average loss at step  52000 :  5.17127143049
Average loss at step  54000 :  5.13998410141
Average loss at step  56000 :  5.0575108695
Average loss at step  58000 :  5.0854887284
Average loss at step  60000 :  4.93594445455
Nearest to between: from, with, in, ground, cosmology, through, critic, granularity,
Nearest to his: their, its, her, the, s, waite, agave, rooted,
Nearest to system: dasyprocta, oversee, lemmy, grande, microcebus, albury, saguinus, wct,
Nearest to to: would, not, could, can, wct, for, docked, nine,
Nearest to d: b, dasyprocta, UNK, scant, bis, victoriae, waterfalls, six,
Nearest to only: roper, wyatt, hee, dasyprocta, cebus, callithrix, shall, tailed,
Nearest to this: which, it, that, the, wct, there, tamarin, one,
Nearest to of: for, dasyprocta, tamarin, and, in, agouti, victoriae, ssbn,
Nearest to i: you, ii, UNK, we, sickness, bela, solicitation, m,
Nearest to after: before, from, when, recitative, five, four, dasyprocta, six,
Nearest to war: cl, nicolson, shoulders, damon, unpopular, brisbane, serine, utraquists,
Nearest to may: can, would, will, must, could, nostratic, might, chart,
Nearest to eight: six, nine, seven, five, four, zero, three, dasyprocta,
Nearest to be: have, was, were, been, by, are, is, bend,
Nearest to up: cc, them, him, eye, dresden, oscillators, provide, recitative,
Nearest to new: deductive, abitibi, negative, pds, aboard, periods, fitting, amalthea,
Average loss at step  62000 :  4.7950760119
Average loss at step  64000 :  4.78206047785
Average loss at step  66000 :  4.96684676468
Average loss at step  68000 :  4.91816470301
Average loss at step  70000 :  4.77152032399
Nearest to between: with, from, ground, in, through, critic, jts, cosmology,
Nearest to his: their, its, her, the, waite, agave, rooted, s,
Nearest to system: oversee, grande, dasyprocta, thaler, microcebus, lemmy, albury, antibody,
Nearest to to: would, can, could, docked, wct, must, for, not,
Nearest to d: b, UNK, bis, scant, dasyprocta, seven, waterfalls, leigh,
Nearest to only: roper, wyatt, dasyprocta, thaler, but, hee, tailed, callithrix,
Nearest to this: which, it, the, that, wct, celled, one, there,
Nearest to of: for, dasyprocta, tamarin, victoriae, agouti, aveiro, callithrix, or,
Nearest to i: UNK, ii, you, we, g, sickness, bela, solicitation,
Nearest to after: before, when, from, until, four, in, for, five,
Nearest to war: cl, shoulders, nicolson, unpopular, serine, damon, brisbane, thaler,
Nearest to may: can, would, will, must, could, might, should, nostratic,
Nearest to eight: nine, six, seven, five, four, zero, three, agouti,
Nearest to be: have, been, were, are, by, was, is, bend,
Nearest to up: them, cc, thaler, him, eye, oscillators, dresden, provide,
Nearest to new: deductive, abitibi, pds, negative, aboard, whirl, amalthea, igor,
Average loss at step  72000 :  4.80108989704
Average loss at step  74000 :  4.77867026335
Average loss at step  76000 :  4.87726270449
Average loss at step  78000 :  4.79176552665
Average loss at step  80000 :  4.8272833041
Nearest to between: with, from, in, ground, cegep, through, cosmology, abet,
Nearest to his: their, her, its, the, waite, s, your, rooted,
Nearest to system: grande, oversee, dasyprocta, thaler, microcebus, antibody, albury, lemmy,
Nearest to to: would, could, can, docked, mahor, beeb, wct, clodius,
Nearest to d: b, UNK, bis, scant, dasyprocta, p, waterfalls, ubiquity,
Nearest to only: roper, dasyprocta, but, wyatt, thaler, callithrix, vec, saguinus,
Nearest to this: which, it, the, wct, that, cegep, what, nguni,
Nearest to of: in, nine, agouti, victoriae, tamarin, dasyprocta, including, eight,
Nearest to i: you, ii, g, UNK, we, sickness, bela, lexicographer,
Nearest to after: before, when, from, during, until, by, in, five,
Nearest to war: cl, shoulders, nicolson, unpopular, clodius, damon, serine, utraquists,
Nearest to may: can, would, will, must, could, might, should, gigantic,
Nearest to eight: six, seven, nine, five, four, zero, three, agouti,
Nearest to be: been, have, were, was, is, are, by, being,
Nearest to up: them, cc, him, thaler, oscillators, eye, dresden, recitative,
Nearest to new: deductive, pds, abitibi, amalthea, whirl, negative, periods, dasyprocta,
Average loss at step  82000 :  4.79888994157
Average loss at step  84000 :  4.78643635249
Average loss at step  86000 :  4.7426726377
Average loss at step  88000 :  4.69790493059
Average loss at step  90000 :  4.75021843386
Nearest to between: with, from, in, through, scalia, ground, cegep, abet,
Nearest to his: their, her, its, the, waite, your, agave, s,
Nearest to system: dasyprocta, tamias, thaler, microcebus, grande, oversee, albury, lemmy,
Nearest to to: would, can, docked, mahor, could, beeb, nine, cloaca,
Nearest to d: b, six, UNK, bis, dasyprocta, p, scant, leigh,
Nearest to only: dasyprocta, roper, thaler, wyatt, callithrix, shall, but, saguinus,
Nearest to this: which, it, the, wct, that, cegep, what, itself,
Nearest to of: in, for, tamarin, dasyprocta, victoriae, including, agouti, callithrix,
Nearest to i: you, g, ii, we, UNK, m, sickness, l,
Nearest to after: before, when, during, until, five, from, dasyprocta, recitative,
Nearest to war: cl, shoulders, nicolson, unpopular, calypso, dissipated, brisbane, serine,
Nearest to may: can, would, will, must, could, might, should, gigantic,
Nearest to eight: seven, six, five, nine, four, zero, three, cegep,
Nearest to be: been, have, was, were, is, refer, are, being,
Nearest to up: them, him, cc, thaler, eye, oscillators, variance, philippi,
Nearest to new: pds, abitibi, amalthea, whirl, deductive, negative, aboard, thibetanus,
Average loss at step  92000 :  4.71757662702
Average loss at step  94000 :  4.63263672411
Average loss at step  96000 :  4.72182215083
Average loss at step  98000 :  4.62883195364
Average loss at step  100000 :  4.68067348731
Nearest to between: with, from, through, ground, in, scalia, cegep, granularity,
Nearest to his: their, her, its, the, waite, your, s, my,
Nearest to system: grande, dasyprocta, tamias, microcebus, oversee, thaler, albury, antibody,
Nearest to to: could, would, can, must, mahor, beeb, victoriae, wct,
Nearest to d: b, dasyprocta, scant, bis, six, p, victoriae, waterfalls,
Nearest to only: roper, dasyprocta, thaler, callithrix, reginae, saguinus, wyatt, wct,
Nearest to this: which, it, the, that, what, wct, itself, cegep,
Nearest to of: nine, including, tamarin, victoriae, in, eight, dasyprocta, agouti,
Nearest to i: you, we, ii, g, sickness, m, they, lexicographer,
Nearest to after: before, when, during, until, from, following, five, dasyprocta,
Nearest to war: cl, shoulders, unpopular, nicolson, serine, calypso, dissipated, utraquists,
Nearest to may: can, would, will, could, must, might, should, gigantic,
Nearest to eight: seven, nine, six, five, four, zero, three, agouti,
Nearest to be: been, have, were, are, was, by, is, refer,
Nearest to up: them, him, cc, thaler, eye, provide, oscillators, variance,
Nearest to new: pds, deductive, amalthea, abitibi, dogma, whirl, negative, aboard,
finished training.
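
Once training has finished, final_embeddings (whose rows are already L2-normalized) can be queried directly with numpy to find nearest neighbors for any word in the vocabulary. The cell below is an illustrative sketch; the query word is arbitrary.


In [0]:
# Illustrative nearest-neighbor lookup on the trained, normalized embeddings.
query = 'three'  # arbitrary in-vocabulary query word
query_id = dictionary[query]
# Rows of final_embeddings have unit length, so a dot product is cosine similarity.
sims = np.dot(final_embeddings, final_embeddings[query_id])
top_k = 8
nearest = (-sims).argsort()[1:top_k + 1]  # skip the query word itself
print('Nearest to %s:' % query, [reverse_dictionary[i] for i in nearest])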

In [0]:
# Visualize the embeddings.
# Note: matplotlib.pyplot is imported as plt in the next cell, before this
# function is first called.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)

In [0]:
try:
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn and matplotlib to visualize embeddings.")



In [ ]: