Deep Learning

Assignment 5

The goal of this assignment is to train a Word2Vec skip-gram model over Text8 data.


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

Download the data from the source website if necessary.


In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

Read the data into a string.


In [3]:
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    # f.namelist() returns a list of the names of the files in the zip archive.
    data = tf.compat.as_str(f.read(f.namelist()[0])).split() # tf.compat.as_str() converts the input to a string
    # str.split() returns a list of words split on whitespace (the whitespace itself is discarded)
  return data
  
words = read_data(filename)
print('Data size %d' % len(words))


Data size 17005207

Build the dictionary and replace rare words with UNK (unknown) token.

Here, rare words are those that fall outside the 50,000 most frequent words in the dataset; every such word is mapped to UNK.
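
As a quick illustration of the counting step used below, collections.Counter(...).most_common(n) returns the n most frequent items as (word, count) pairs (toy list here, not the real corpus):

import collections

toy = ['the', 'of', 'the', 'a', 'the', 'of']
print(collections.Counter(toy).most_common(2))   # [('the', 3), ('of', 2)]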


In [7]:
vocabulary_size = 50000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  # Counter(words).most_common(n) returns a list of the n most common words in `words`
  # as (word, count) tuples, ordered from most to least frequent.
  dictionary = dict() # empty dictionary, equivalent to dictionary = {}
  for word, _ in count:
    dictionary[word] = len(dictionary)
  # since count is ordered from most to least common, len(dictionary) at insertion time is each word's frequency rank
  data = list() # data = []
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:  # word is not among the 50,000 most common
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
    # data is the corpus with each word replaced by its vocabulary index (0 for UNK)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  # dictionary.values() returns a view of the dictionary's values.
  # zip(): see the note below this cell.
  # reverse_dictionary maps indices back to words (keys and values swapped).
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5241, 3081, 12, 6, 195, 2, 3135, 46, 59, 156]

zip() function: takes two or more iterables and merges them element-wise into tuples, stopping at the end of the shortest one.
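
A minimal illustration of how zip() and dict(zip(...)) produce the reverse dictionary (toy values, not the real vocabulary):

d = {'UNK': 0, 'the': 1, 'of': 2}
pairs = list(zip(d.values(), d.keys()))   # [(0, 'UNK'), (1, 'the'), (2, 'of')]
reverse_d = dict(pairs)                   # {0: 'UNK', 1: 'the', 2: 'of'}
print(reverse_d[1])                       # 'the'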

Function to generate a training batch for the skip-gram model.


In [5]:
data_index = 0

# window layout: [ skip_window words | center word | skip_window words ]
# batch_size: 8 in the demo below; the number of (center, context) training pairs produced per call.
# num_skips: 2 (or 4); how many context words are sampled for each center word, i.e. how many times
#            a center word is reused as an input. batch_size % num_skips must equal 0.
# skip_window: 1 (or 2); the number of words to consider on each side of the center word, which is
#              also the index of the center word inside the span buffer.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target is at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid: # resample until target is neither the center word nor an already-used context word
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]]) # the first 8 words of the corpus

for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0 # generate each demo batch from the start of the data
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['as', 'anarchism', 'a', 'originated', 'as', 'term', 'of', 'a']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['a', 'originated', 'anarchism', 'term', 'as', 'term', 'of', 'originated']

Train a skip-gram model.


In [6]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left or right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Number of random words to evaluate similarity on.
valid_window = 100 # Validation samples are drawn only from the valid_window most frequent words.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample in sampled softmax


# graph: words -> embeddings -> words
# 1st layer: tf.nn.embedding_lookup
# 2nd layer: tf.nn.sampled_softmax_loss
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # tf.nn.embedding_lookup() maps each word ID to its row of the embeddings matrix (a one-to-one lookup);
  # the embedding vectors themselves are initialized uniformly at random.
  # tf.random_uniform(shape, minval, maxval)
  # we have vocabulary_size = 50000 words
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  # norm has shape (50000, 1)
  normalized_embeddings = embeddings / norm
  # embeddings / norm is equivalent to tf.div, an element-wise (broadcast) operation
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
  # valid_embeddings holds the normalized vectors of the 16 validation words;
  # normalized_embeddings holds the normalized vectors of all 50000 words.
  # The matrix product therefore gives the inner product of each validation word's vector
  # with every vocabulary word's vector. Since the rows are unit-length, the inner product
  # equals the cosine similarity: A.B = |A|*|B|*cos(A,B).
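
The same similarity computation can be checked outside the graph with a few lines of NumPy. This is a sketch with a toy matrix (not the trained embeddings) that mirrors the norm, normalized_embeddings, similarity, and argsort steps used above and in the training loop below:

import numpy as np

emb = np.random.uniform(-1.0, 1.0, size=(10, 4))          # toy "embeddings": 10 words, 4 dimensions
norm = np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
normalized = emb / norm                                    # every row now has unit length
valid_ids = np.array([0, 3])                               # pretend these are the validation words
sim = normalized[valid_ids] @ normalized.T                 # (2, 10) matrix of cosine similarities
nearest = (-sim[0, :]).argsort()[1:4]                      # 3 nearest neighbours of word 0, skipping itself
print(nearest)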

In [7]:
print(similarity.get_shape())
print(valid_embeddings.get_shape())
print(normalized_embeddings.get_shape())
train_dataset
batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)
type(batch_data)


(16, 50000)
(16, 128)
(50000, 128)
Out[7]:
numpy.ndarray

In [8]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    # batch_data contains word ranks, which also serve as the word IDs
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      #print(type(sim)) #-> numpy.ndarray
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        # sim[i, :] holds the similarities between the i-th validation word and every word in the vocabulary.
        # argsort() returns the indices that would sort the (negated) array, i.e. most similar first.
        # [1:top_k+1] skips index 0, which is the word itself (similarity 1.0).
        # The indices are word IDs, i.e. frequency ranks.
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
          # append close_word to the log string
        print(log)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step 0: 7.912311
Nearest to however: shipwreck, ferus, walks, lakoff, propagate, wim, signature, sap,
Nearest to after: subordinate, embarrassment, supranational, malpractice, permanence, seabirds, arcadius, variste,
Nearest to about: overrun, whimsical, acquires, monorails, cyprian, goldsmiths, suppliers, dominions,
Nearest to they: despotism, durability, entails, basketball, unhappy, usaid, airplay, hummel,
Nearest to not: asserted, qadhafi, herndon, advance, sucker, neberg, limited, rajonas,
Nearest to zero: lata, lumpur, killian, brahmin, agitation, isn, paradiso, unbelief,
Nearest to it: telekom, chrono, neoclassical, hymnody, classicists, lemurs, constructed, gutierrez,
Nearest to i: disadvantageous, songwriting, glick, specialise, colombo, necromancer, gardnerian, winnie,
Nearest to with: finish, bead, cyanides, discussed, mill, dartmouth, nosferatu, prohibitions,
Nearest to only: cheats, bane, handhelds, hurry, claudian, lazuli, emergence, telstra,
Nearest to years: wages, confirmed, groupoid, choice, calculates, balaguer, alto, intron,
Nearest to no: snowflake, iverson, surfers, dijon, cursing, museum, xxxiii, announced,
Nearest to into: burdens, delta, logan, greenish, fantasia, lowell, pomona, wavell,
Nearest to had: scrolling, freestyle, geri, heteronyms, creeds, jana, jacketed, curvature,
Nearest to six: worshippers, tyndall, navigable, fractures, hypothesizing, deficiency, mehmet, sidecar,
Nearest to other: fermionic, domitia, deleted, enduring, nightfall, umi, hellish, garbage,
Average loss at step 2000: 4.360159
Average loss at step 4000: 3.865892
Average loss at step 6000: 3.790551
Average loss at step 8000: 3.681730
Average loss at step 10000: 3.608492
Nearest to however: propagate, shipwreck, tfl, solicit, ni, but, darmstadt, ferus,
Nearest to after: shamash, supranational, canned, before, coincidental, cob, seabirds, vaud,
Nearest to about: cyprian, acquires, overrun, layoffs, woensel, dion, availability, aceh,
Nearest to they: he, we, despotism, who, lech, she, there, tile,
Nearest to not: it, also, mozart, etiology, capra, atrophy, zaman, rajonas,
Nearest to zero: nine, five, seven, eight, six, three, four, two,
Nearest to it: he, this, there, which, not, she, feodor, you,
Nearest to i: songwriting, disadvantageous, glick, freight, djing, el, coolant, necromancer,
Nearest to with: in, between, rios, of, by, for, respective, on,
Nearest to only: bane, cheats, hoard, games, pesky, yaum, claudian, icmp,
Nearest to years: alcal, incredibly, ace, urquhart, confirmed, renarrative, kbit, female,
Nearest to no: snowflake, recapture, overman, iverson, nitro, alienating, prescribe, announced,
Nearest to into: burdens, from, delta, affluence, logan, single, fantasia, censorware,
Nearest to had: has, have, was, introduced, discriminatory, reassigned, were, resume,
Nearest to six: five, seven, eight, four, three, nine, two, zero,
Nearest to other: fermionic, malice, cond, enduring, flugelhorn, bhangra, stead, anthropology,
Average loss at step 12000: 3.608212
Average loss at step 14000: 3.569944
Average loss at step 16000: 3.409416
Average loss at step 18000: 3.456980
Average loss at step 20000: 3.544935
Nearest to however: but, propagate, tfl, darmstadt, shipwreck, karamanlis, let, solicit,
Nearest to after: shamash, before, when, during, from, supranational, for, canned,
Nearest to about: ioan, cyprian, phimosis, ecumenical, acquires, damp, monorails, dhea,
Nearest to they: he, we, there, who, despotism, you, it, she,
Nearest to not: also, it, they, etiology, capra, replenished, strongly, there,
Nearest to zero: five, seven, six, three, four, eight, nine, two,
Nearest to it: he, this, there, she, which, they, you, not,
Nearest to i: ii, songwriting, we, cm, ciii, prentice, disadvantageous, they,
Nearest to with: between, in, into, fourteenth, for, respective, melchior, by,
Nearest to only: cheats, conversational, presque, falsificationism, centimeter, sandro, really, games,
Nearest to years: days, kbit, urquhart, incredibly, alcal, cheerful, till, profession,
Nearest to no: iverson, snowflake, alienating, funk, recapture, amides, susima, generally,
Nearest to into: from, affluence, single, through, with, burdens, delta, logan,
Nearest to had: has, have, was, were, when, pls, having, agnostic,
Nearest to six: eight, four, seven, nine, five, three, zero, two,
Nearest to other: many, some, stead, fermionic, carlos, carolyn, malice, are,
Average loss at step 22000: 3.498957
Average loss at step 24000: 3.491244
Average loss at step 26000: 3.485455
Average loss at step 28000: 3.480572
Average loss at step 30000: 3.502394
Nearest to however: but, propagate, where, they, tfl, darmstadt, although, that,
Nearest to after: before, during, when, shamash, until, for, from, disavowed,
Nearest to about: cyprian, ioan, acquires, relocate, inside, ulrike, attu, euskal,
Nearest to they: we, he, there, who, it, you, she, not,
Nearest to not: they, probably, there, it, still, this, atrophy, capra,
Nearest to zero: five, seven, eight, six, four, three, nine, two,
Nearest to it: he, she, there, this, they, which, also, not,
Nearest to i: ii, we, cm, you, songwriting, el, iii, they,
Nearest to with: between, wellesley, in, by, when, respective, including, for,
Nearest to only: games, tanakh, savanna, gotlanders, grandsons, gollum, really, anderson,
Nearest to years: days, months, kbit, urquhart, times, cheerful, kyoto, bengals,
Nearest to no: any, amides, iverson, it, snowflake, gv, alienating, a,
Nearest to into: from, through, affluence, logan, in, with, rbis, burdens,
Nearest to had: has, have, was, were, having, altruists, since, is,
Nearest to six: eight, four, seven, nine, five, three, two, zero,
Nearest to other: various, melinda, many, those, some, such, are, nylon,
Average loss at step 32000: 3.500690
Average loss at step 34000: 3.496553
Average loss at step 36000: 3.455496
Average loss at step 38000: 3.305411
Average loss at step 40000: 3.425501
Nearest to however: but, propagate, that, though, although, they, kettering, darmstadt,
Nearest to after: before, shamash, during, viscous, when, censorial, from, cob,
Nearest to about: ioan, acquires, attu, relocate, antagonist, ulrike, monorails, cyprian,
Nearest to they: we, he, you, there, it, not, she, i,
Nearest to not: they, it, still, probably, often, vassar, widely, capra,
Nearest to zero: seven, five, eight, two, six, nine, three, four,
Nearest to it: he, she, there, this, they, still, which, not,
Nearest to i: ii, we, you, cm, they, he, t, terrier,
Nearest to with: between, rios, by, semi, clavell, when, fairs, fourteenth,
Nearest to only: savanna, grandsons, gotlanders, carmelite, vp, gollum, hygiene, really,
Nearest to years: days, months, times, urquhart, kbit, jewishencyclopedia, kyoto, cheerful,
Nearest to no: any, amides, nitro, susima, snowflake, clanking, imparted, iverson,
Nearest to into: from, through, logan, back, affluence, delta, rbis, within,
Nearest to had: has, have, was, were, having, since, been, ferruccio,
Nearest to six: seven, eight, four, five, nine, three, two, one,
Nearest to other: various, those, some, hunting, gaeltacht, individualists, enormous, melinda,
Average loss at step 42000: 3.435364
Average loss at step 44000: 3.447884
Average loss at step 46000: 3.453778
Average loss at step 48000: 3.350678
Average loss at step 50000: 3.379875
Nearest to however: but, although, though, that, when, while, where, which,
Nearest to after: before, when, during, while, shamash, for, if, loathing,
Nearest to about: ulrike, relocate, antagonist, gheg, acquires, ioan, whole, bia,
Nearest to they: he, we, there, you, she, it, who, not,
Nearest to not: they, still, vassar, now, generally, subgroups, who, atrophy,
Nearest to zero: eight, seven, five, six, four, three, two, nine,
Nearest to it: he, she, there, this, they, still, now, promotes,
Nearest to i: we, ii, you, cm, they, t, tansley, terrier,
Nearest to with: between, fourteenth, darya, wellesley, clavell, against, hygienic, while,
Nearest to only: really, carmelite, always, savanna, grandsons, lip, radially, scientifically,
Nearest to years: days, months, times, ways, kbit, centuries, urquhart, attract,
Nearest to no: any, peabody, gv, amides, nothing, alienating, susima, quantify,
Nearest to into: through, from, back, logan, within, across, around, delta,
Nearest to had: has, have, was, were, having, been, since, sens,
Nearest to six: eight, seven, four, five, nine, three, two, zero,
Nearest to other: various, many, different, some, hunting, those, including, malice,
Average loss at step 52000: 3.437261
Average loss at step 54000: 3.425769
Average loss at step 56000: 3.436059
Average loss at step 58000: 3.398487
Average loss at step 60000: 3.395781
Nearest to however: but, although, though, which, that, when, despite, while,
Nearest to after: before, when, during, shamash, without, while, despite, viscous,
Nearest to about: ulrike, relocate, ioan, antagonist, over, gheg, whole, coronary,
Nearest to they: we, there, you, he, she, i, it, cumbria,
Nearest to not: still, now, probably, atrophy, they, who, usually, we,
Nearest to zero: five, seven, four, six, eight, three, nine, two,
Nearest to it: he, she, there, this, which, still, they, what,
Nearest to i: we, ii, you, t, cm, they, tansley, iii,
Nearest to with: between, fourteenth, into, while, inelastic, when, wellesley, payoffs,
Nearest to only: really, always, first, journeyman, scientifically, grandsons, lip, pontus,
Nearest to years: days, months, times, centuries, decades, minutes, year, urquhart,
Nearest to no: any, peabody, a, gv, nothing, alienating, quantify, cognates,
Nearest to into: from, through, within, logan, across, back, with, around,
Nearest to had: has, have, was, were, having, been, never, subsequently,
Nearest to six: eight, four, five, nine, seven, three, zero, one,
Nearest to other: various, different, many, some, those, hunting, yak, more,
Average loss at step 62000: 3.243026
Average loss at step 64000: 3.256715
Average loss at step 66000: 3.398159
Average loss at step 68000: 3.397893
Average loss at step 70000: 3.356833
Nearest to however: but, although, though, that, when, where, while, which,
Nearest to after: before, during, when, while, viscous, without, mauryan, until,
Nearest to about: ulrike, relocate, asparagus, over, antagonist, approximately, transpired, remedied,
Nearest to they: we, there, he, you, she, it, diatomaceous, cumbria,
Nearest to not: still, now, generally, probably, never, usually, frequently, also,
Nearest to zero: five, six, four, eight, seven, two, three, nine,
Nearest to it: he, she, there, this, they, still, samsara, which,
Nearest to i: we, ii, you, cm, tansley, licking, t, bacall,
Nearest to with: between, wellesley, including, fourteenth, while, into, when, without,
Nearest to only: always, exactly, grandsons, really, never, lip, not, avercamp,
Nearest to years: days, months, decades, centuries, times, minutes, year, urquhart,
Nearest to no: any, significant, quantify, peabody, funk, than, cognates, periodically,
Nearest to into: from, through, within, logan, with, across, back, around,
Nearest to had: has, have, was, were, having, been, pls, recently,
Nearest to six: eight, seven, nine, four, five, three, two, zero,
Nearest to other: various, different, many, benzene, including, spur, some, hunting,
Average loss at step 72000: 3.371886
Average loss at step 74000: 3.346062
Average loss at step 76000: 3.315682
Average loss at step 78000: 3.349079
Average loss at step 80000: 3.376295
Nearest to however: although, but, that, though, while, when, where, they,
Nearest to after: before, when, during, without, while, until, despite, viscous,
Nearest to about: approximately, ulrike, coolidge, over, remedied, relocate, least, antagonist,
Nearest to they: we, he, you, there, she, it, cumbria, these,
Nearest to not: still, now, generally, usually, it, we, vassar, probably,
Nearest to zero: five, seven, four, six, eight, three, nine, two,
Nearest to it: he, she, there, this, they, we, still, itself,
Nearest to i: ii, we, you, iii, tansley, t, cm, iv,
Nearest to with: between, wellesley, payoffs, in, fourteenth, when, including, into,
Nearest to only: grandsons, always, best, exactly, lip, really, avercamp, savanna,
Nearest to years: days, months, decades, minutes, times, centuries, year, ways,
Nearest to no: any, peabody, quantify, nothing, humanist, alienating, little, significant,
Nearest to into: through, from, within, across, logan, back, during, with,
Nearest to had: have, has, was, were, having, been, fled, never,
Nearest to six: eight, four, seven, five, three, nine, two, zero,
Nearest to other: various, hunting, others, potent, including, different, many, some,
Average loss at step 82000: 3.405456
Average loss at step 84000: 3.411629
Average loss at step 86000: 3.387811
Average loss at step 88000: 3.354794
Average loss at step 90000: 3.360676
Nearest to however: but, although, though, that, while, when, insufficiently, where,
Nearest to after: before, during, when, while, without, until, despite, from,
Nearest to about: coolidge, antagonist, ulrike, relocate, over, around, regarding, asparagus,
Nearest to they: we, you, he, there, she, it, but, cumbria,
Nearest to not: still, strongly, now, we, nor, grotto, vassar, belgrano,
Nearest to zero: five, eight, six, seven, four, two, three, nine,
Nearest to it: he, she, there, they, this, therefore, itself, often,
Nearest to i: ii, we, you, t, iii, newman, tansley, iv,
Nearest to with: between, in, by, including, wellesley, wet, into, fourteenth,
Nearest to only: grandsons, always, really, either, exactly, lip, avercamp, no,
Nearest to years: days, months, decades, minutes, centuries, year, times, hours,
Nearest to no: any, peabody, little, nothing, significant, quantify, cognates, periodically,
Nearest to into: through, from, across, within, around, back, logan, during,
Nearest to had: has, have, was, having, were, decided, adhered, fled,
Nearest to six: eight, seven, five, four, nine, three, two, zero,
Nearest to other: various, individual, others, potent, hunting, genevieve, different, including,
Average loss at step 92000: 3.398688
Average loss at step 94000: 3.256711
Average loss at step 96000: 3.356798
Average loss at step 98000: 3.242885
Average loss at step 100000: 3.357523
Nearest to however: although, but, though, that, where, when, and, which,
Nearest to after: before, when, during, without, while, despite, until, loathing,
Nearest to about: relocate, ulrike, around, energetic, approximately, on, nicopolis, coronary,
Nearest to they: we, he, there, you, she, it, i, cumbria,
Nearest to not: still, never, now, generally, strongly, nor, actually, also,
Nearest to zero: five, six, eight, four, seven, two, nine, three,
Nearest to it: he, she, this, there, they, bolland, which, often,
Nearest to i: we, you, ii, iii, they, t, tansley, newman,
Nearest to with: between, including, into, using, makes, in, fourteenth, within,
Nearest to only: really, still, ves, lip, exactly, carmelite, stemmed, storm,
Nearest to years: days, months, decades, centuries, minutes, year, times, weeks,
Nearest to no: any, peabody, nothing, lip, quantify, cognates, significant, taxed,
Nearest to into: through, from, within, across, in, back, logan, during,
Nearest to had: has, have, was, having, were, since, attempted, fled,
Nearest to six: seven, four, eight, nine, five, three, two, zero,
Nearest to other: various, others, hunting, honors, including, genevieve, individual, spur,

In [9]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

In [10]:
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)


/usr/lib/python3/dist-packages/matplotlib/collections.py:549: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':

Problem

An alternative to skip-gram is another Word2Vec model called CBOW (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.
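
Before the full TensorFlow implementation below, here is a minimal NumPy sketch of the CBOW idea. The only change from skip-gram is that the input is the average (or sum) of the context embeddings rather than a single word's embedding (toy sizes and hypothetical variable names, not the code used later):

import numpy as np

vocab, dim = 1000, 16
embeddings = np.random.uniform(-1.0, 1.0, size=(vocab, dim))

context_ids = np.array([12, 7, 45])            # IDs of the context words around the target
h = embeddings[context_ids].mean(axis=0)       # CBOW input: average of the context vectors

softmax_w = np.random.randn(vocab, dim) * 0.1  # output weights, one row per vocabulary word
logits = softmax_w @ h                         # a score for every word in the vocabulary
predicted = int(np.argmax(logits))             # the model's guess for the missing word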


CBOW Model


In [130]:
# Recap of the data structures built earlier:
# original words: [word1, word2, word3, word4, ...]; say word3 is UNK.
# data: [rank_of_word1, rank_of_word2, 0_for_UNK, rank_of_word4, ...]
# count: [['UNK', unk_count], ('most_frequent_word', count), ('second_most_frequent_word', count), ...]
# dictionary: {'UNK': 0, 'most_frequent_word': 1, ..., 'word': rank, ...}
# reverse_dictionary: {0: 'UNK', 1: 'most_frequent_word', ...} (dictionary with keys and values swapped)
data_index = 0

def stuff_buffer(window_size):
    global data_index
    buffer = collections.deque(maxlen=window_size)
    for _ in range(window_size):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return buffer

def CBOW_generate_batch(batch_size, window_size):
    assert batch_size % window_size == 0
    batch_input = list()
    batch_target = list()
    input_nm = list()
    for _ in range(batch_size//window_size):
        buffer = stuff_buffer(window_size)
        for _ in range(window_size):
            input_size = random.randint(2,window_size-1)
            input_nm.append(input_size)
            # use at least two context words to predict a target,
            # and at most window_size-1 context words to predict the remaining one.
            input_pos = random.randint(0,window_size-(input_size+1))
            # treat input_size+1 consecutive words as a block, then pick the target from inside it
            target_pos = random.randint(input_pos,input_pos+input_size)
            for i in range(input_pos,input_pos+(input_size+1)):
                if i == target_pos:
                    batch_target.append(buffer[target_pos])
                else:
                    batch_input.append(buffer[i])
    return batch_input, batch_target, input_nm

def CBOW_generate_batch2(batch_size, window_size):
    input_ = np.ndarray(shape=(batch_size*(window_size-1)), dtype=np.int32)
    target_ = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    batch_input = list()
    batch_target = list()
    block_pos = 0
    for _ in range(batch_size):
        #print('block_pos',block_pos)
        buffer = stuff_buffer(window_size)
        target_pos = random.randint(0,window_size-1)
        #print('target_pos',target_pos)
        for i in range(block_pos,block_pos+window_size):
            #print('i%window_size',i%window_size)
            if target_pos == i%window_size:
                batch_target.append(buffer[target_pos])
                #print('add target')
            else:
                batch_input.append(buffer[i%window_size])
                #print('add input')
        block_pos += window_size
        #print('--------------')
    input_ = np.asarray(batch_input)
    target_ = np.asarray(batch_target).reshape([-1,1])
    return input_, target_

for batch_size, window_size in [(3,3)]:
    #input_, target_, nm_ = CBOW_generate_batch(8,4)
    input_, target_ = CBOW_generate_batch2(batch_size,window_size)
    input_d = [reverse_dictionary[rank] for rank in input_]
    target_d = [reverse_dictionary[rank] for rank in target_.reshape(3)]
    print('data: ',[reverse_dictionary[rank] for rank in data[:9]])
    print('input: ',input_d)
    print('target: ',target_d)
    #print('number of input block:', nm_)


data:  ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used']
input:  ['anarchism', 'originated', 'a', 'of', 'abuse', 'first']
target:  ['as', 'term', 'used']

Train a CBOW model.


In [143]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
window_size = 4 # Words read per CBOW block: window_size-1 context words plus 1 target.
valid_size = 16 # Number of random words to evaluate similarity on.
valid_window = 100 # Validation samples are drawn only from the valid_window most frequent words.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample in sampled softmax


# graph: context words -> averaged embeddings -> target word
# 1st layer: tf.nn.embedding_lookup, followed by averaging the context embeddings
# 2nd layer: a fully connected softmax layer, handled inside tf.nn.sampled_softmax_loss
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  input_dataset = tf.placeholder(tf.int32,shape=[batch_size*(window_size-1)])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # tf.nn.embedding_lookup() maps each word ID to its row of the embeddings matrix (a one-to-one lookup);
  # the embedding vectors themselves are initialized uniformly at random.
  # tf.random_uniform(shape, minval, maxval)
  # we have vocabulary_size = 50000 words
  softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                    stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  train_dataset = tf.Variable(tf.zeros([batch_size])) # unused here; leftover from the skip-gram graph
    
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, input_dataset)
  # Reshape to (batch_size, window_size-1, embedding_size) and average the context
  # embeddings of each example to form the CBOW input vector.
  matrix_embed = tf.reshape(embed,[batch_size,window_size-1,embedding_size])
  input_embed = tf.reduce_sum(matrix_embed,1)/(window_size-1)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, input_embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(0.5).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  # norm has shape (50000, 1)
  normalized_embeddings = embeddings / norm
  # embeddings / norm is equivalent to tf.div, an element-wise (broadcast) operation
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
  # valid_embeddings holds the normalized vectors of the 16 validation words;
  # normalized_embeddings holds the normalized vectors of all 50000 words.
  # The matrix product therefore gives the inner product of each validation word's vector
  # with every vocabulary word's vector. Since the rows are unit-length, the inner product
  # equals the cosine similarity: A.B = |A|*|B|*cos(A,B).

In [178]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    input_, target_ = CBOW_generate_batch2(batch_size,window_size)
    #print(input_.shape)
    #print(target_.shape)
    feed_dict = {input_dataset:input_,train_labels:target_}
    # input_ and target_ contain word ranks, which also serve as the word IDs
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      #print(type(sim)) #-> numpy.ndarray
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        # sim[i, :] holds the similarities between the i-th validation word and every word in the vocabulary.
        # argsort() returns the indices that would sort the (negated) array, i.e. most similar first.
        # [1:top_k+1] skips index 0, which is the word itself (similarity 1.0).
        # The indices are word IDs, i.e. frequency ranks.
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
          # append close_word to the log string
        print(log)
  final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step 0: 7.467328
Nearest to world: elsevier, spotter, minarchists, infielder, commercialized, wally, taxa, forearm,
Nearest to there: abiotic, vocabularies, gravely, thr, deadites, kobe, eigenvalues, airmen,
Nearest to see: archives, splines, extremophiles, shifts, withdrawing, meinhof, larger, prostaglandin,
Nearest to and: poset, actinidia, creatively, topics, altruistic, peseta, pechenegs, ning,
Nearest to which: hammer, trusts, epistemic, colloquially, snooker, triumphal, umi, themis,
Nearest to on: licinius, multihull, degenerate, homesick, burckhardt, vultures, moabite, hmong,
Nearest to been: falsified, revealed, inflicted, judge, falk, occur, cartooning, ultrasound,
Nearest to the: days, maneuvers, bub, chart, individuals, insurgency, alison, outgrowth,
Nearest to from: incorrect, modalities, jb, incurable, theoretician, could, pluralistic, shenouda,
Nearest to but: heinlein, mossi, electrophysiology, arion, authored, aligned, apg, bitmaps,
Nearest to were: murray, ovule, exhaust, ssel, released, paratroopers, portrayals, campaigners,
Nearest to b: wroc, torrey, motet, winners, prompt, hazard, emergent, regulated,
Nearest to other: bolsheviks, amazonian, inhabitant, hellenic, pagan, uhf, hydrocodone, discussion,
Nearest to system: admirers, gila, tarnished, anywhere, whitewater, perverse, hearted, deserved,
Nearest to th: taxed, embodiment, massacre, earthman, antiderivative, lansdowne, mayhew, blacksmith,
Nearest to his: cliches, remake, wilkins, sudetenland, ordinarily, questioner, forti, leonard,
Average loss at step 2000: 4.393222
Average loss at step 4000: 3.900583
Average loss at step 6000: 3.767956
Average loss at step 8000: 3.717157
Average loss at step 10000: 3.682990
Nearest to world: galaxy, twentieth, overuse, rio, forearm, urdu, infielder, carbonated,
Nearest to there: given, kobe, it, thr, dog, gravely, sectors, superpower,
Nearest to see: archives, splines, taboos, projecting, expecting, shifts, perfect, chassis,
Nearest to and: or, poset, wagers, altruistic, from, but, with, peseta,
Nearest to which: that, jennifer, piercer, snooker, hammer, colloquially, cajun, toxicity,
Nearest to on: in, at, into, predynastic, homesick, vultures, tractor, rebranded,
Nearest to been: occur, falsified, clade, narrative, revealed, ayin, hom, saxo,
Nearest to the: its, a, their, an, this, despised, sarmatian, any,
Nearest to from: however, nibble, in, deuteronomic, and, jb, christine, shenouda,
Nearest to but: amigas, and, transformation, newspaper, aligned, ethnicities, they, lye,
Nearest to were: are, was, streamlined, degenerative, liberating, exhaust, ovule, irresponsibility,
Nearest to b: regulated, prompt, hazard, torrey, northumbria, winners, motet, axiomatic,
Nearest to other: hellenic, eastern, bolsheviks, bowed, manu, mandaic, trial, chorus,
Nearest to system: gila, admirers, accident, tarnished, affleck, anywhere, oy, amelia,
Nearest to th: earthman, undercarriage, embodiment, decimal, tartarus, rump, present, genesee,
Nearest to his: s, her, their, photosphere, remake, forti, electricity, vis,
Average loss at step 12000: 3.606360
Average loss at step 14000: 3.491843
Average loss at step 16000: 3.578378
Average loss at step 18000: 3.532221
Average loss at step 20000: 3.515618
Nearest to world: overuse, forearm, dacko, twentieth, usurpation, galaxy, carbonated, spotter,
Nearest to there: it, kobe, given, still, barbarism, sectors, kongo, they,
Nearest to see: splines, archives, meinhof, rutskoy, gamemasters, projecting, autopsy, pl,
Nearest to and: or, peseta, topics, atalh, altruistic, abolishing, while, sail,
Nearest to which: that, this, piercer, jennifer, toxicity, triumphal, transport, hammer,
Nearest to on: in, piraeus, discordians, predynastic, vultures, rebranded, suppressing, through,
Nearest to been: occur, become, falsified, narrative, ayin, clade, magazine, saxo,
Nearest to the: its, their, a, our, this, bela, dunes, his,
Nearest to from: nibble, jb, however, ammon, omim, christine, deuteronomic, liege,
Nearest to but: amigas, centimetre, dead, animosity, castille, and, elder, ethnicities,
Nearest to were: are, was, streamlined, liberating, degenerative, angola, chandragupta, theater,
Nearest to b: d, kaifu, cyclonic, torrey, prompt, regulated, winners, hazard,
Nearest to other: hellenic, eastern, chorus, manu, mandaic, bolsheviks, various, classics,
Nearest to system: admirers, accident, gila, vara, amelia, albedo, affleck, tarnished,
Nearest to th: embodiment, earthman, undercarriage, workgroup, tartarus, genesee, decimal, rump,
Nearest to his: her, their, s, the, codon, its, photosphere, him,
Average loss at step 22000: 3.529031
Average loss at step 24000: 3.473337
Average loss at step 26000: 3.505110
Average loss at step 28000: 3.508207
Average loss at step 30000: 3.456926
Nearest to world: dacko, overuse, galaxy, usurpation, forearm, carbonated, declaration, spotter,
Nearest to there: it, still, kobe, given, they, storming, barbarism, generally,
Nearest to see: splines, archives, rutskoy, pl, atalh, meinhof, projecting, gil,
Nearest to and: or, but, topics, peseta, while, leland, squad, wagers,
Nearest to which: that, this, toxicity, transport, when, breeder, also, piercer,
Nearest to on: through, vultures, piraeus, discordians, interventionism, predynastic, upon, cistern,
Nearest to been: become, falsified, occur, narrative, ayin, clade, was, be,
Nearest to the: its, a, any, their, our, this, dunes, handedness,
Nearest to from: nibble, jb, omim, ammon, deuteronomic, through, stout, tights,
Nearest to but: however, and, amigas, although, hardwoods, textbook, ethnicities, castille,
Nearest to were: are, was, streamlined, degenerative, liberating, being, have, angola,
Nearest to b: d, torrey, runways, prompt, winners, kaifu, hazard, cyclonic,
Nearest to other: various, hellenic, these, chorus, manu, classics, mandaic, sofie,
Nearest to system: gila, admirers, accident, vara, unfair, amelia, affleck, fossils,
Nearest to th: embodiment, workgroup, earthman, undercarriage, subspace, present, genesee, tartarus,
Nearest to his: her, their, s, its, him, photosphere, codon, mistress,
Average loss at step 32000: 3.495057
Average loss at step 34000: 3.445962
Average loss at step 36000: 3.421649
Average loss at step 38000: 3.368012
Average loss at step 40000: 3.395115
Nearest to world: dacko, usurpation, overuse, forearm, declaration, galaxy, lamas, carbonated,
Nearest to there: it, they, still, kobe, given, storming, confirmed, unclear,
Nearest to see: splines, archives, meinhof, occidental, hengest, gamemasters, sulla, atalh,
Nearest to and: or, but, while, migrate, fetch, irritant, peseta, squad,
Nearest to which: that, this, transport, also, when, humbert, toxicity, oldcastle,
Nearest to on: upon, through, piraeus, newsletter, vultures, cistern, in, mesogens,
Nearest to been: become, ayin, falsified, narrative, clade, occur, was, be,
Nearest to the: its, our, their, dunes, bela, his, multidimensional, a,
Nearest to from: nibble, jb, fragile, through, liege, deuteronomic, christine, tights,
Nearest to but: however, although, and, textbook, amigas, castille, though, microbial,
Nearest to were: are, was, have, had, degenerative, being, liberating, seuss,
Nearest to b: d, torrey, kaifu, cyclonic, winners, runways, philanthropist, prompt,
Nearest to other: various, hellenic, these, different, chorus, sofie, mandaic, manu,
Nearest to system: gila, unfair, admirers, accident, tablature, formal, fossils, data,
Nearest to th: embodiment, workgroup, earthman, subspace, undercarriage, st, present, tartarus,
Nearest to his: her, their, s, him, its, your, himself, photosphere,
Average loss at step 42000: 3.388097
Average loss at step 44000: 3.387375
Average loss at step 46000: 3.224628
Average loss at step 48000: 3.359205
Average loss at step 50000: 3.343150
Nearest to world: dacko, usurpation, overuse, declaration, forearm, galaxy, lamas, palacio,
Nearest to there: it, they, still, unclear, confirmed, stupid, storming, raised,
Nearest to see: splines, archives, meinhof, occidental, sulla, gamemasters, atalh, hengest,
Nearest to and: or, while, but, cadbury, atalh, altruistic, hospitalization, squad,
Nearest to which: that, this, also, when, where, toxicity, humbert, transport,
Nearest to on: upon, through, piraeus, cistern, vultures, cartridge, in, newsletter,
Nearest to been: become, ayin, narrative, clade, falsified, be, occur, saxo,
Nearest to the: its, any, a, our, electromotive, every, dunes, their,
Nearest to from: nibble, jb, shenouda, fragile, deuteronomic, christine, liege, through,
Nearest to but: however, although, though, and, doing, while, because, amigas,
Nearest to were: are, was, degenerative, had, chandragupta, mucous, have, being,
Nearest to b: d, torrey, kaifu, runways, cyclonic, neoconservatism, oscillator, philanthropist,
Nearest to other: various, these, hellenic, different, manu, sofie, mimeograph, chorus,
Nearest to system: gila, unfair, accident, tablature, amelia, pegbox, windows, affleck,
Nearest to th: embodiment, workgroup, twentieth, earthman, subspace, undercarriage, nd, st,
Nearest to his: her, their, s, its, your, him, my, himself,
Average loss at step 52000: 3.286163
Average loss at step 54000: 3.334660
Average loss at step 56000: 3.304453
Average loss at step 58000: 3.337852
Average loss at step 60000: 3.331277
Nearest to world: usurpation, dacko, overuse, lamas, galaxy, ldapv, scarlet, declaration,
Nearest to there: it, they, unclear, longer, still, defenders, stupid, raised,
Nearest to see: splines, archives, meinhof, sulla, gil, hengest, occidental, gamemasters,
Nearest to and: or, but, while, squad, irritant, spines, gertrude, altruistic,
Nearest to which: that, this, where, also, toxicity, always, these, usually,
Nearest to on: upon, through, piraeus, cistern, against, newsletter, predynastic, suppressing,
Nearest to been: become, be, ayin, narrative, was, clade, falsified, gone,
Nearest to the: their, its, this, our, a, any, dunes, his,
Nearest to from: nibble, through, liege, fragile, shenouda, jb, deuteronomic, ammon,
Nearest to but: however, although, and, though, while, because, amigas, reached,
Nearest to were: are, was, had, being, mucous, degenerative, chandragupta, heredity,
Nearest to b: d, torrey, neoconservatism, cyclonic, runways, kaifu, separate, cyanide,
Nearest to other: various, different, hellenic, these, others, mimeograph, chorus, giambattista,
Nearest to system: gila, systems, unfair, tablature, condorcet, windows, perverse, pegbox,
Nearest to th: embodiment, twentieth, nd, workgroup, st, subspace, clockmaker, taxed,
Nearest to his: her, their, its, s, your, my, him, himself,
Average loss at step 62000: 3.309377
Average loss at step 64000: 3.322316
Average loss at step 66000: 3.339799
Average loss at step 68000: 3.284195
Average loss at step 70000: 3.285799
Nearest to world: overuse, usurpation, dacko, galaxy, scarlet, lamas, neck, declaration,
Nearest to there: it, they, unclear, longer, raised, stupid, confirmed, storming,
Nearest to see: splines, sulla, archives, retribution, gil, occidental, succeeds, atalh,
Nearest to and: including, while, but, or, although, atalh, peseta, dirichlet,
Nearest to which: that, this, where, these, toxicity, what, also, humbert,
Nearest to on: upon, through, piraeus, mesogens, cistern, cartridge, against, violates,
Nearest to been: become, ayin, was, narrative, be, falsified, clade, previously,
Nearest to the: its, their, our, electromotive, bub, churchman, dunes, majestic,
Nearest to from: nibble, fragile, through, deuteronomic, liege, christine, jb, into,
Nearest to but: however, although, though, because, and, while, microbial, plea,
Nearest to were: are, was, had, being, have, degenerative, chandragupta, mucous,
Nearest to b: d, torrey, cyclonic, kaifu, pled, neoconservatism, separate, runways,
Nearest to other: various, hellenic, different, these, mimeograph, chorus, others, classics,
Nearest to system: systems, gila, convention, tablature, unfair, condorcet, pegbox, fossils,
Nearest to th: twentieth, nd, embodiment, workgroup, nineteenth, st, present, clockmaker,
Nearest to his: her, their, my, its, your, s, him, himself,
Average loss at step 72000: 3.255667
Average loss at step 74000: 3.272760
Average loss at step 76000: 3.274036
Average loss at step 78000: 3.254915
Average loss at step 80000: 3.123782
Nearest to world: overuse, usurpation, tricity, dacko, galaxy, tab, lamas, ldapv,
Nearest to there: it, longer, they, unclear, raised, stupid, confirmed, discussing,
Nearest to see: splines, sulla, occidental, gil, archives, includes, retribution, gamemasters,
Nearest to and: or, but, while, partnering, spines, broadleaf, sewage, peseta,
Nearest to which: that, this, where, when, also, toxicity, gale, although,
Nearest to on: upon, through, piraeus, mesogens, cistern, violates, duplicates, against,
Nearest to been: become, ayin, narrative, be, gone, oceanographic, come, dislocations,
Nearest to the: our, their, its, dunes, his, any, a, bela,
Nearest to from: nibble, fragile, through, capricornus, deuteronomic, liege, into, christine,
Nearest to but: however, although, and, though, because, while, microbial, dimensionless,
Nearest to were: are, was, had, being, mucous, degenerative, chandragupta, heredity,
Nearest to b: d, kaifu, torrey, pled, cyclonic, agm, chocolates, philanthropist,
Nearest to other: various, hellenic, different, these, mimeograph, chorus, numerous, others,
Nearest to system: systems, gila, convention, condorcet, fossils, tablature, pegbox, modification,
Nearest to th: twentieth, embodiment, nd, nineteenth, workgroup, subspace, st, taxed,
Nearest to his: her, their, s, my, its, your, him, himself,
Average loss at step 82000: 3.234319
Average loss at step 84000: 3.209216
Average loss at step 86000: 3.209563
Average loss at step 88000: 3.247112
Average loss at step 90000: 3.207671
Nearest to world: usurpation, overuse, dacko, tab, lamas, galaxy, winning, scarlet,
Nearest to there: it, they, unclear, longer, stupid, raised, cyanide, confirmed,
Nearest to see: splines, includes, gil, sulla, occidental, hengest, retribution, gamemasters,
Nearest to and: or, but, while, although, cadbury, leland, peseta, philosophiae,
Nearest to which: that, this, where, what, also, these, toxicity, declarative,
Nearest to on: upon, through, piraeus, mesogens, elegant, duplicates, multihull, in,
Nearest to been: become, be, narrative, ayin, gone, was, oceanographic, occurred,
Nearest to the: its, dunes, their, our, electromotive, any, catalonia, complexes,
Nearest to from: nibble, fragile, tights, capricornus, through, jb, liege, deuteronomic,
Nearest to but: however, although, though, and, because, microbial, while, hardwoods,
Nearest to were: are, was, had, mucous, being, degenerative, heredity, weld,
Nearest to b: d, torrey, kaifu, pled, cyclonic, grant, separate, chocolates,
Nearest to other: various, different, hellenic, these, certain, sledge, classics, numerous,
Nearest to system: systems, gila, convention, condorcet, fossils, branch, modification, mode,
Nearest to th: twentieth, nd, nineteenth, embodiment, st, subspace, workgroup, rd,
Nearest to his: her, their, my, its, your, s, him, himself,
Average loss at step 92000: 3.238388
Average loss at step 94000: 3.267902
Average loss at step 96000: 3.198015
Average loss at step 98000: 3.247550
Average loss at step 100000: 3.224793
Nearest to world: overuse, galaxy, dacko, tab, usurpation, scarlet, palacio, neck,
Nearest to there: they, it, unclear, understated, longer, raised, cyanide, stupid,
Nearest to see: includes, sulla, splines, hengest, gil, retribution, gamemasters, occidental,
Nearest to and: or, but, while, cadbury, including, philosophiae, leland, peseta,
Nearest to which: that, this, where, toxicity, these, what, always, sanskrit,
Nearest to on: upon, through, piraeus, cistern, mesogens, violates, against, duplicates,
Nearest to been: become, ayin, was, be, gone, narrative, oceanographic, occurred,
Nearest to the: its, our, dunes, this, their, a, each, majestic,
Nearest to from: nibble, fragile, tights, liege, through, deuteronomic, ammon, jb,
Nearest to but: however, although, though, because, and, microbial, while, than,
Nearest to were: are, was, being, mucous, had, weld, have, tri,
Nearest to b: d, torrey, separate, pled, neoconservatism, chocolates, runways, cyclonic,
Nearest to other: various, different, hellenic, these, numerous, others, mimeograph, sledge,
Nearest to system: systems, gila, condorcet, convention, modification, branch, mode, tablature,
Nearest to th: twentieth, nd, nineteenth, st, embodiment, subspace, rd, taxed,
Nearest to his: her, their, my, its, your, him, himself, s,
Input word1: you
Input word2: can
Input word3: eat
['eat', 'will', 'would']
Input word1: we
Input word2: so
Input word3: much
['much', 'you', 'they']
Input word1: can
Input word2: make
Input word3: dinner
['may', 'could', 'will']
Input word1: just
Input word2: for
Input word3: fun
['fun', 'intricate', 'puritans']
Input word1: 
Input word2: 
Input word3: 
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-178-46dd2e931fac> in <module>()
     43       word2 = input('Input word2: ')
     44       word3 = input('Input word3: ')
---> 45       eval_words = [dictionary[word] for word in [word1,word2,word3]]
     46       embed = tf.nn.embedding_lookup(embeddings, eval_words)
     47       matrix_embed = tf.reshape(embed,[-1,window_size-1,embedding_size])

<ipython-input-178-46dd2e931fac> in <listcomp>(.0)
     43       word2 = input('Input word2: ')
     44       word3 = input('Input word3: ')
---> 45       eval_words = [dictionary[word] for word in [word1,word2,word3]]
     46       embed = tf.nn.embedding_lookup(embeddings, eval_words)
     47       matrix_embed = tf.reshape(embed,[-1,window_size-1,embedding_size])

KeyError: ''
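
The KeyError above comes from submitting an empty string: '' is not a key in dictionary. A simple guard (a sketch assuming the dictionary built earlier, where index 0 is UNK) would replace the direct lookup with:

word = input('Input word1: ').strip()
index = dictionary.get(word, dictionary['UNK'])   # fall back to UNK for empty or unknown input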

In [179]:
num_points = 400

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [reverse_dictionary[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)


/usr/lib/python3/dist-packages/matplotlib/collections.py:549: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':