In [4]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic word2vec example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'

In [20]:
def baseconvert(n, base):
    """Convert a non-negative integer n to its equivalent in another base (2-36)."""

    digits = "0123456789abcdefghijklmnopqrstuvwxyz"

    try:
        n = int(n)
        base = int(base)
    except (TypeError, ValueError):
        return ""

    if n < 0 or base < 2 or base > 36:
        return ""

    s = ""
    while True:
        r = n % base
        s = digits[r] + s
        n = n // base  # floor division; plain / yields a float under future division
        if n == 0:
            break

    return s
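
A quick sanity check (a hypothetical cell, not part of the original run): Python's built-in int(s, base) should invert baseconvert for valid inputs.

In [ ]:
# Hypothetical check: int(s, base) parses the string back to the original n.
for n, base in [(255, 16), (10, 2), (12345, 36)]:
    s = baseconvert(n, base)
    assert int(s, base) == n
    print(n, base, '->', s)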

In [5]:
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

In [6]:
# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


Data size 17005207

In [7]:
vocabulary


Out[7]:
['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 ...]

In [8]:
def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
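
To make the mappings concrete (a hypothetical cell, not in the original run), a few words can be pushed through dictionary and back through reverse_dictionary:

In [ ]:
# Hypothetical round trip: dictionary maps word -> id, reverse_dictionary id -> word.
sample = ['anarchism', 'the', 'of']
ids = [dictionary.get(w, 0) for w in sample]  # 0 is the UNK id
print(ids, [reverse_dictionary[i] for i in ids])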

In [9]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    if data_index == len(data):
      buffer.extend(data[:span])  # deque does not support slice assignment
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words at the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


3081 originated -> 12 as
3081 originated -> 5234 anarchism
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 6 a
195 term -> 2 of

In [10]:
batch


Out[10]:
array([3081, 3081,   12,   12,    6,    6,  195,  195])
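
The same generator can be exercised with a wider window (a hypothetical cell, not in the original run); with skip_window=2 each center word can pair with up to four neighbors, and num_skips controls how many of them are sampled:

In [ ]:
# Hypothetical call: num_skips=4 draws all four context words in the window.
batch2, labels2 = generate_batch(batch_size=8, num_skips=4, skip_window=2)
for i in range(8):
  print(batch2[i], reverse_dictionary[batch2[i]],
        '->', labels2[i, 0], reverse_dictionary[labels2[i, 0]])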

In [11]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.
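
A quick peek at the validation set (a hypothetical cell, not in the original run) confirms the sampled ids fall in the frequent head of the vocabulary:

In [ ]:
# Hypothetical peek: valid_examples are random ids drawn from the 100 most
# frequent words, so every one maps to a common word.
print(valid_examples)
print([reverse_dictionary[i] for i in valid_examples])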

In [12]:
graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.global_variables_initializer()
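
The similarity op above is plain cosine similarity between unit-normalized embedding rows; here is a NumPy sketch of the same computation (illustrative only, not part of the graph):

In [ ]:
# NumPy sketch mirroring the graph's normalized matmul: rows of the result
# are cosine similarities between the chosen ids and every embedding.
def cosine_similarity(emb, ids):
  normalized = emb / np.sqrt((emb ** 2).sum(axis=1, keepdims=True))
  return normalized[ids].dot(normalized.T)  # shape: (len(ids), vocabulary_size)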

In [13]:
# Step 5: Begin training.
num_steps = 100001

In [14]:
with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step  0 :  253.553253174
Nearest to also: pike, straddle, advantage, symplectic, opens, graphically, mitford, biochemists,
Nearest to UNK: signified, clarence, bsp, effective, surfing, spelled, step, gamma,
Nearest to the: wr, dio, trinidad, supers, beaufort, cession, casey, might,
Nearest to may: nicknames, crook, flaccus, seuss, equilibria, psychomotor, beetle, toho,
Nearest to five: despotic, stoney, illicit, banana, determinants, rabbani, countess, historically,
Nearest to world: adversary, komm, unmodified, freising, antisemitism, decree, invalidate, laurents,
Nearest to often: awe, implicated, plebiscite, carotene, lockout, champaign, debug, modulator,
Nearest to of: passwords, unsafe, exhorted, recent, incompressible, pusan, whatsoever, andriessen,
Nearest to time: allure, lashes, jesse, struggled, testaments, logudorese, shortages, actuated,
Nearest to by: tora, fixation, marcomanni, publicly, fondness, gamemaster, ennis, enclosed,
Nearest to over: lr, apoptotic, hesychastic, exploiting, mandrakesoft, celluloid, greece, spool,
Nearest to there: prodigious, thiele, inflate, refugee, treatment, motor, tarnished, belarusians,
Nearest to is: firenze, awarded, pictish, minuet, wootz, uber, sidney, restoration,
Nearest to can: latin, categorised, suction, diffeomorphic, assisted, musab, taking, previous,
Nearest to be: catarina, rad, wangenheim, aoe, pretender, spacey, angles, dtds,
Nearest to united: industries, lawgiver, weaned, booed, predictably, mexicanus, front, presupposes,
Average loss at step  2000 :  113.525471392
Average loss at step  4000 :  53.0324085891
Average loss at step  6000 :  33.3160460205
Average loss at step  8000 :  23.7365244334
Average loss at step  10000 :  17.7732049627
Nearest to also: camera, yardage, looks, symplectic, paintings, intensely, advantage, opens,
Nearest to UNK: and, archie, gb, lymphoma, the, one, finalist, mathbf,
Nearest to the: a, UNK, his, of, hieroglyphic, and, its, one,
Nearest to may: young, passive, nicknames, fao, practical, decree, gb, places,
Nearest to five: zero, nine, eight, three, molinari, one, huns, vocals,
Nearest to world: archie, vocals, antisemitism, decree, territory, adversary, fugitive, nsu,
Nearest to often: shells, virtue, plebiscite, quotes, personality, mechanization, hydronium, modulator,
Nearest to of: and, in, the, eight, to, tissue, heavier, for,
Nearest to time: gb, manner, gone, struggled, tarkovsky, lyre, savannah, fresnel,
Nearest to by: in, tora, with, and, was, archie, tissue, as,
Nearest to over: greece, agave, eden, celluloid, main, alright, normally, thinking,
Nearest to there: mya, durango, offer, motor, roosevelt, treatment, refugee, institute,
Nearest to is: are, was, as, in, phi, has, frames, and,
Nearest to can: taking, spent, categorised, latin, suction, shape, assisted, begins,
Nearest to be: is, vilayet, molinari, felt, phi, innumerable, adult, guinea,
Nearest to united: mathbf, front, industries, versus, recorded, ready, vocals, phi,
Average loss at step  12000 :  14.072145799
Average loss at step  14000 :  11.7931788908
Average loss at step  16000 :  9.6314672631
Average loss at step  18000 :  8.40073504961
Average loss at step  20000 :  8.16026431
Nearest to also: camera, intensely, cuban, looks, symplectic, often, yardage, scribe,
Nearest to UNK: operatorname, bckgr, dasyprocta, peptide, two, archie, circ, gb,
Nearest to the: a, his, one, its, circ, their, this, s,
Nearest to may: nicknames, young, wrestling, ludwig, passive, noir, trough, marking,
Nearest to five: eight, nine, zero, seven, six, three, two, four,
Nearest to world: antisemitism, archie, vocals, decree, compiler, adversary, chalker, territory,
Nearest to often: UNK, shells, also, swim, quotes, hydronium, virtue, plebiscite,
Nearest to of: and, in, for, with, eight, zero, at, from,
Nearest to time: circ, manner, operatorname, gb, ducas, testaments, struggled, jesse,
Nearest to by: in, was, with, for, is, from, and, as,
Nearest to over: circ, agave, greece, three, main, for, blind, eden,
Nearest to there: mya, he, durango, prodigious, roosevelt, they, refugee, motor,
Nearest to is: was, are, has, by, as, were, circ, and,
Nearest to can: categorised, spent, to, taking, assisted, planes, shape, latin,
Nearest to be: is, have, was, vilayet, were, grapheme, molinari, underwater,
Nearest to united: mathbf, versus, front, ready, industries, recorded, circ, and,
Average loss at step  22000 :  6.99858245897
Average loss at step  24000 :  6.92992219472
Average loss at step  26000 :  6.67232291806
Average loss at step  28000 :  6.40561693621
Average loss at step  30000 :  5.89894958699
Nearest to also: often, which, camera, abet, not, cuban, intensely, looks,
Nearest to UNK: dasyprocta, abet, operatorname, peptide, agouti, gb, two, circ,
Nearest to the: its, their, a, circ, his, one, this, birkenau,
Nearest to may: should, nicknames, would, seven, wrestling, marking, noir, ludwig,
Nearest to five: eight, six, seven, zero, four, nine, three, two,
Nearest to world: antisemitism, archie, decree, compiler, adversary, vocals, abet, chalker,
Nearest to often: also, shells, swim, awe, sponsors, identified, hydronium, quotes,
Nearest to of: in, and, for, from, with, eight, at, gum,
Nearest to time: manner, circ, operatorname, ducas, testaments, gb, xx, jesse,
Nearest to by: with, in, was, from, for, and, is, as,
Nearest to over: main, agave, for, circ, apoptotic, three, greece, blind,
Nearest to there: they, he, it, mya, durango, these, refugee, prodigious,
Nearest to is: was, are, has, were, circ, as, by, be,
Nearest to can: to, categorised, would, spent, taking, planes, could, hispanic,
Nearest to be: is, have, was, were, khz, by, been, are,
Nearest to united: versus, mathbf, ready, front, amalthea, and, lineages, industries,
Average loss at step  32000 :  5.95941952085
Average loss at step  34000 :  5.72331445134
Average loss at step  36000 :  5.76381445503
Average loss at step  38000 :  5.51742550099
Average loss at step  40000 :  5.25644893974
Nearest to also: often, which, sometimes, still, abet, now, not, camera,
Nearest to UNK: four, abet, operatorname, dasyprocta, agouti, three, seven, peptide,
Nearest to the: its, his, a, their, circ, this, one, birkenau,
Nearest to may: should, would, will, nicknames, can, eight, cannot, noir,
Nearest to five: six, eight, four, three, seven, zero, nine, two,
Nearest to world: antisemitism, decree, archie, compiler, adversary, vocals, abet, gulf,
Nearest to often: also, awe, shells, swim, not, who, identified, sometimes,
Nearest to of: in, and, for, circ, zero, eight, gum, from,
Nearest to time: manner, circ, operatorname, xx, testaments, ducas, rang, jesse,
Nearest to by: was, with, in, and, is, circ, for, archie,
Nearest to over: lr, circ, agave, main, apoptotic, thinking, adherent, greece,
Nearest to there: they, it, he, mya, these, now, that, durango,
Nearest to is: was, are, has, were, circ, managing, by, as,
Nearest to can: would, will, could, to, categorised, spent, must, may,
Nearest to be: have, was, were, by, is, been, khz, are,
Nearest to united: recitative, mathbf, versus, ready, amalthea, lineages, front, stacy,
Average loss at step  42000 :  5.3820665319
Average loss at step  44000 :  5.26191980755
Average loss at step  46000 :  5.20956753623
Average loss at step  48000 :  5.23812037492
Average loss at step  50000 :  4.97161613584
Nearest to also: which, often, still, usually, now, sometimes, not, who,
Nearest to UNK: kapoor, abet, dasyprocta, five, agouti, operatorname, four, recitative,
Nearest to the: their, its, a, his, this, birkenau, kapoor, circ,
Nearest to may: would, should, will, can, must, cannot, nicknames, could,
Nearest to five: four, six, three, eight, seven, kapoor, zero, two,
Nearest to world: decree, antisemitism, archie, compiler, adversary, beginners, vocals, gulf,
Nearest to often: also, kapoor, awe, not, sometimes, shells, swim, usually,
Nearest to of: in, for, eight, from, circ, and, nine, albury,
Nearest to time: manner, circ, operatorname, testaments, xx, kapoor, ducas, rang,
Nearest to by: was, with, be, in, and, is, into, for,
Nearest to over: lr, agave, circ, three, main, thinking, apoptotic, for,
Nearest to there: it, they, he, franchisee, mya, now, these, usually,
Nearest to is: was, has, are, were, eight, by, wideawake, four,
Nearest to can: would, will, could, must, may, should, to, cannot,
Nearest to be: have, by, been, was, were, is, khz, are,
Nearest to united: recitative, mathbf, viridian, predictably, ready, front, versus, kapoor,
Average loss at step  52000 :  5.02900808537
Average loss at step  54000 :  5.19085570538
Average loss at step  56000 :  5.03335100329
Average loss at step  58000 :  5.05933012307
Average loss at step  60000 :  4.95613686156
Nearest to also: which, often, now, usually, still, sometimes, who, abet,
Nearest to UNK: kapoor, ursus, dasyprocta, operatorname, abet, agouti, wct, circ,
Nearest to the: its, their, pulau, a, his, kapoor, circ, wct,
Nearest to may: would, can, should, will, must, could, might, cannot,
Nearest to five: six, four, eight, three, seven, nine, kapoor, zero,
Nearest to world: thibetanus, decree, antisemitism, archie, compiler, ursus, adversary, gulf,
Nearest to often: also, sometimes, usually, kapoor, awe, not, generally, now,
Nearest to of: in, for, nine, and, circ, eight, five, ursus,
Nearest to time: manner, circ, operatorname, ducas, xx, testaments, kapoor, rang,
Nearest to by: was, be, with, ursus, is, in, been, as,
Nearest to over: lr, agave, circ, thinking, three, adherent, after, main,
Nearest to there: it, they, he, franchisee, mya, now, these, usually,
Nearest to is: was, are, has, circ, ursus, wideawake, kapoor, dasyprocta,
Nearest to can: would, will, could, may, must, should, cannot, to,
Nearest to be: have, been, by, was, were, khz, are, ursus,
Nearest to united: recitative, mathbf, kapoor, viridian, predictably, front, versus, lineages,
Average loss at step  62000 :  4.99800972199
Average loss at step  64000 :  4.85762190235
Average loss at step  66000 :  4.59882547861
Average loss at step  68000 :  4.97043374181
Average loss at step  70000 :  4.89273414469
Nearest to also: which, often, now, usually, still, sometimes, microcebus, who,
Nearest to UNK: kapoor, ursus, operatorname, dasyprocta, agouti, seven, four, thaler,
Nearest to the: their, its, this, his, a, pulau, kapoor, circ,
Nearest to may: can, would, will, should, could, must, might, cannot,
Nearest to five: four, six, eight, three, seven, nine, zero, two,
Nearest to world: decree, thibetanus, archie, antisemitism, ursus, gulf, adversary, compiler,
Nearest to often: also, usually, sometimes, kapoor, now, generally, not, awe,
Nearest to of: for, and, circ, in, gnat, wct, gum, ursus,
Nearest to time: manner, circ, testaments, xx, operatorname, ducas, microcebus, cycled,
Nearest to by: was, be, ursus, thaler, with, archie, into, in,
Nearest to over: lr, agave, circ, thinking, three, adherent, main, apoptotic,
Nearest to there: it, they, he, now, franchisee, these, mya, usually,
Nearest to is: was, are, has, circ, managing, watterson, ursus, wideawake,
Nearest to can: will, would, could, may, must, should, cannot, might,
Nearest to be: been, have, by, were, khz, are, is, was,
Nearest to united: recitative, mathbf, predictably, kapoor, lineages, viridian, front, xxii,
Average loss at step  72000 :  4.75785528028
Average loss at step  74000 :  4.81776275086
Average loss at step  76000 :  4.730286412
Average loss at step  78000 :  4.80672069532
Average loss at step  80000 :  4.79440665197
Nearest to also: often, which, still, usually, now, sometimes, chiuchow, rasterization,
Nearest to UNK: kapoor, ursus, abet, dasyprocta, four, six, busan, seven,
Nearest to the: their, its, pulau, a, kapoor, circ, ursus, his,
Nearest to may: can, would, will, should, must, might, could, cannot,
Nearest to five: four, six, seven, three, eight, zero, nine, two,
Nearest to world: decree, thibetanus, antisemitism, gulf, archie, ursus, uncountably, compiler,
Nearest to often: also, usually, sometimes, generally, now, kapoor, not, awe,
Nearest to of: in, circ, ursus, busan, for, from, nine, agouti,
Nearest to time: manner, operatorname, circ, microcebus, pontificia, kapoor, rang, ducas,
Nearest to by: was, be, with, into, thaler, ursus, through, archie,
Nearest to over: lr, agave, circ, busan, thinking, apoptotic, main, liked,
Nearest to there: it, they, he, now, these, usually, franchisee, pontificia,
Nearest to is: was, are, has, managing, circ, ursus, wideawake, makes,
Nearest to can: could, will, would, may, must, cannot, should, might,
Nearest to be: been, have, were, by, khz, was, is, ursus,
Nearest to united: university, recitative, predictably, front, lineages, sensual, xxii, viridian,
Average loss at step  82000 :  4.78076984107
Average loss at step  84000 :  4.76875169909
Average loss at step  86000 :  4.77049468791
Average loss at step  88000 :  4.7566210072
Average loss at step  90000 :  4.73579098916
Nearest to also: often, which, usually, now, still, sometimes, chiuchow, rasterization,
Nearest to UNK: kapoor, ursus, operatorname, agouti, dasyprocta, abet, callithrix, wct,
Nearest to the: its, a, their, pulau, this, his, wct, kapoor,
Nearest to may: can, would, will, should, could, might, must, cannot,
Nearest to five: four, seven, eight, six, three, nine, zero, kapoor,
Nearest to world: thibetanus, decree, gulf, antisemitism, uncountably, archie, compiler, ursus,
Nearest to often: usually, sometimes, also, generally, now, not, kapoor, widely,
Nearest to of: in, following, and, eight, ursus, six, circ, including,
Nearest to time: manner, ducas, xx, cycled, pontificia, microcebus, circ, operatorname,
Nearest to by: was, be, ursus, as, thaler, through, in, with,
Nearest to over: lr, hesychastic, agave, circ, about, liked, thinking, main,
Nearest to there: it, they, he, now, usually, these, pontificia, franchisee,
Nearest to is: was, has, are, circ, managing, wideawake, be, ursus,
Nearest to can: will, may, could, would, must, cannot, should, might,
Nearest to be: been, have, were, by, is, are, khz, was,
Nearest to united: recitative, university, predictably, front, sensual, xxii, lineages, viridian,
Average loss at step  92000 :  4.68671517932
Average loss at step  94000 :  4.72344186556
Average loss at step  96000 :  4.6911469897
Average loss at step  98000 :  4.59995775843
Average loss at step  100000 :  4.70548725915
Nearest to also: often, which, usually, still, now, sometimes, chiuchow, who,
Nearest to UNK: kapoor, dasyprocta, ursus, cebus, operatorname, abet, agouti, thaler,
Nearest to the: pulau, its, their, a, this, kapoor, ursus, circ,
Nearest to may: can, would, will, should, could, might, must, cannot,
Nearest to five: four, three, seven, six, eight, zero, nine, kapoor,
Nearest to world: thibetanus, gulf, decree, archie, ursus, compiler, antisemitism, uncountably,
Nearest to often: usually, sometimes, also, generally, now, widely, kapoor, not,
Nearest to of: and, circ, including, in, eight, ursus, busan, wct,
Nearest to time: manner, ducas, microcebus, pontificia, cycled, operatorname, circ, kapoor,
Nearest to by: be, was, ursus, as, during, with, thaler, archie,
Nearest to over: lr, hesychastic, agave, circ, about, thinking, liked, busan,
Nearest to there: it, they, he, now, usually, these, which, still,
Nearest to is: was, has, are, circ, be, ursus, wideawake, became,
Nearest to can: will, could, would, may, must, cannot, should, might,
Nearest to be: been, have, by, were, is, was, khz, are,
Nearest to united: recitative, university, predictably, front, xxii, sensual, lineages, stacy,
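
Because the rows of final_embeddings are unit-normalized, nearest neighbors of any vocabulary word can be queried with a single dot product (a hypothetical helper, not in the original notebook):

In [ ]:
# Hypothetical helper: cosine similarity reduces to a dot product on
# unit-norm rows; skip index 0 of the sort, which is the word itself.
def nearest(word, top_k=8):
  sims = final_embeddings.dot(final_embeddings[dictionary[word]])
  return [reverse_dictionary[i] for i in (-sims).argsort()[1:top_k + 1]]

print(nearest('five'))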

In [21]:
# Step 6: Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)

try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
