In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.python.client import device_lib

%matplotlib inline

In [5]:
local_device_protos = device_lib.list_local_devices()
[x.name for x in local_device_protos if x.device_type == 'GPU']


Out[5]:
['/gpu:0']

In [6]:
path_to_data = "../../data/text8/text8"

with open(path_to_data) as f:
    words = tf.compat.as_str(f.read()).split()
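
The corpus is read here from a pre-extracted local copy of text8. If that file is not on disk, a fetch-and-extract step along the lines of the original word2vec tutorial can be used instead; the sketch below reuses the os/zipfile/urllib imports from the first cell, and the URL and expected byte count are assumptions carried over from that tutorial.

In [ ]:
# Sketch (assumption): download text8.zip if it is missing, then read the words
# directly out of the archive instead of a pre-extracted file.
def maybe_download(url='http://mattmahoney.net/dc/text8.zip',
                   archive='text8.zip', expected_bytes=31344016):
    if not os.path.exists(archive):
        archive, _ = urllib.request.urlretrieve(url, archive)
    if os.stat(archive).st_size != expected_bytes:
        raise Exception('Unexpected size for %s; the download may be corrupt.' % archive)
    return archive

# with zipfile.ZipFile(maybe_download()) as z:
#     words = tf.compat.as_str(z.read(z.namelist()[0])).split()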

In [7]:
print('Data size', len(words))


Data size 17005207

In [8]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()

    for word, _ in count:
        dictionary[word] = len(dictionary)
  
    data = list()
    unk_count = 0
  
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    
    return data, count, dictionary, reverse_dictionary

In [9]:
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5240, 3084, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
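
build_dataset assigns IDs in descending order of frequency, so UNK is 0, 'the' is 1, 'of' is 2, and so on, exactly as the counts and the sample above show; reverse_dictionary inverts that mapping. A quick sanity check:

In [ ]:
# Sanity check: IDs follow descending frequency and the two maps invert each other.
print(dictionary['the'], reverse_dictionary[1])   # 1 the
print(dictionary['UNK'], reverse_dictionary[0])   # 0 UNK
assert all(reverse_dictionary[dictionary[w]] == w for w in ('the', 'of', 'anarchism'))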

In [10]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
  
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
  
    for i in range(batch_size // num_skips):
        target = skip_window  # start at the buffer's center (the input word's position)
        targets_to_avoid = [skip_window]  # never pick the center itself as context
    
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
           
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
            
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
        
    return batch, labels

In [11]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


3084 originated -> 12 as
3084 originated -> 5240 anarchism
12 as -> 3084 originated
12 as -> 6 a
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a
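
With num_skips=2 and skip_window=1, each group of two rows shares one center word, and its two labels are exactly the left and right neighbours of that center, which is what the pairs above show. The checks below verify this against data directly; they assume data_index was 0 before the demo cell ran (it is reset to 0 where it is defined), so re-running that cell first would break them.

In [ ]:
# Structural checks on the batch generated above (first call after data_index = 0):
# group k uses data[k + 1] as its center, repeated num_skips times, and its labels
# are the words immediately to the left and right of that center.
for k in range(4):
    center = k + 1
    assert batch[2 * k] == batch[2 * k + 1] == data[center]
    assert {labels[2 * k, 0], labels[2 * k + 1, 0]} == {data[center - 1], data[center + 1]}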

In [12]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

In [13]:
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.
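
Because the IDs were assigned by frequency, every index below valid_window = 100 is a very common word; printing the sampled set makes the "Nearest to ..." blocks in the training log easier to read (the exact words differ from run to run, since the sample is random).

In [ ]:
# Show which frequent words were sampled as validation examples (varies per run).
print([reverse_dictionary[int(i)] for i in valid_examples])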

In [14]:
graph = tf.Graph()

In [15]:
with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                         biases=nce_biases,
                         labels=train_labels,
                         inputs=embed,
                         num_sampled=num_sampled,
                         num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    # (tf.initialize_all_variables is the pre-1.0 name; later TF 1.x releases
    # call this tf.global_variables_initializer.)
    init = tf.initialize_all_variables()
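
tf.nn.nce_loss sidesteps a full 50,000-way softmax by drawing num_sampled noise words per batch and training the model to tell the true context word from the noise. As an aside, tf.nn.sampled_softmax_loss takes the same weights/biases/labels/inputs arguments and is another common stand-in for the full softmax; the sketch below only illustrates the swap and is not used in the training run that follows.

In [ ]:
# Alternative loss (illustration only, not used below): sampled softmax accepts the
# same arguments as tf.nn.nce_loss and also avoids the full-vocabulary softmax.
with graph.as_default():
    sampled_loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=nce_weights,
                                   biases=nce_biases,
                                   labels=train_labels,
                                   inputs=embed,
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))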

In [16]:
# Step 5: Begin training.
num_steps = 100001

In [17]:
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()).
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
                
    final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step  0 :  335.014160156
Nearest to and: bayard, mysticism, holiness, pacino, quod, fittingly, stand, falling,
Nearest to often: sponsoring, endowed, mamluks, whistling, surprisingly, stripe, unquestioned, remedy,
Nearest to at: very, quite, codec, preis, jolie, cronos, satisfiability, guant,
Nearest to used: heinlein, seleucid, fibrous, fethry, kantian, jeroboam, rubenstein, perpetuation,
Nearest to if: deutschlands, suo, busiest, impurity, tupper, bottles, recurrence, mitigating,
Nearest to as: inaction, wallachia, cone, surplus, swung, lederman, shimazu, successors,
Nearest to would: renewal, prejudicial, sforza, criticisms, endosymbionts, harold, evaluate, scattering,
Nearest to are: poinsot, tuff, cookbook, dovich, onlookers, memel, ingersoll, codifying,
Nearest to up: healing, amerindians, kashubians, light, lamm, evaluation, jaina, keefe,
Nearest to has: belgians, albury, chron, attendances, characteristic, baja, straddling, turnout,
Nearest to however: eval, mansur, hacking, aragon, interaction, rightness, voight, metonic,
Nearest to in: genuinely, choreography, paula, censoring, jervis, mcveigh, sothoth, perturbed,
Nearest to states: halld, finds, opold, nickname, onetime, hussein, linebackers, endlessly,
Nearest to united: birmingham, establishes, saskatoon, iscariot, delusions, trivium, wangenheim, inwo,
Nearest to other: collingwood, liberties, zaman, cowper, futurism, portillo, fanciful, debris,
Nearest to this: christadelphians, maclaurin, khrushchev, arkham, anorexia, helpers, watcher, encoder,
Average loss at step  2000 :  114.169114818
Average loss at step  4000 :  52.6105252609
Average loss at step  6000 :  33.3022174361
Average loss at step  8000 :  23.1612688984
Average loss at step  10000 :  17.8339090509
Nearest to and: in, of, phi, for, gland, UNK, one, to,
Nearest to often: psi, genetic, cl, pharaohs, yield, surprisingly, fails, alchemists,
Nearest to at: in, where, gabriel, giving, gland, appellant, phi, and,
Nearest to used: austin, victoriae, vs, cosmos, psi, reginae, kantian, changing,
Nearest to if: ridge, phi, exodus, dean, corner, busiest, look, item,
Nearest to as: phi, in, and, agave, rainy, is, from, reginae,
Nearest to would: gland, vs, harold, evaluate, cut, renewal, continues, cl,
Nearest to are: is, were, and, honey, homomorphism, mood, have, linguistics,
Nearest to up: healing, light, melbourne, es, evaluation, health, amerindians, innocence,
Nearest to has: was, phi, whilst, standards, notice, violence, demographics, characteristic,
Nearest to however: homomorphism, succeeds, aragon, prevalent, homer, victoriae, naturalization, gollancz,
Nearest to in: and, of, from, on, with, by, for, euclidean,
Nearest to states: linebackers, one, finds, austin, vs, victoriae, interested, reginae,
Nearest to united: without, birmingham, establishes, prior, anglicanism, subdivisions, size, powerful,
Nearest to other: collingwood, liberties, methanol, three, weaker, movement, clumsy, russia,
Nearest to this: a, gb, phi, brownian, the, defeated, enabled, alchemists,
Average loss at step  12000 :  14.0087529283
Average loss at step  14000 :  11.7601765676
Average loss at step  16000 :  9.93130928016
Average loss at step  18000 :  8.70055134237
Average loss at step  20000 :  7.83882313907
Nearest to and: or, in, dasyprocta, phi, agouti, gland, UNK, of,
Nearest to often: which, psi, yield, endowed, pharaohs, palaeologus, gst, fails,
Nearest to at: in, and, where, on, gabriel, dasyprocta, amigas, gland,
Nearest to used: apatosaurus, victoriae, cosmos, vs, austin, satsuma, psi, jeroboam,
Nearest to if: argo, when, was, phi, fa, teleological, triple, busiest,
Nearest to as: phi, and, in, imran, for, dasyprocta, is, by,
Nearest to would: to, can, gland, evaluate, vs, outwards, harold, renewal,
Nearest to are: were, is, was, and, have, truetype, dasyprocta, in,
Nearest to up: healing, argo, melbourne, dasyprocta, light, es, amerindians, health,
Nearest to has: was, is, had, have, cache, whilst, notice, phi,
Nearest to however: dasyprocta, aragon, homomorphism, chick, prevalent, from, only, homer,
Nearest to in: and, of, on, from, dasyprocta, for, at, nine,
Nearest to states: hussein, linebackers, finds, nickname, vs, victoriae, dasyprocta, reginae,
Nearest to united: without, birmingham, octavius, cranmer, establishes, subdivisions, clashing, prior,
Nearest to other: collingwood, methanol, therein, clumsy, eager, three, movement, UNK,
Nearest to this: the, a, brownian, gb, dasyprocta, phi, which, his,
Average loss at step  22000 :  7.23760149932
Average loss at step  24000 :  7.07596875465
Average loss at step  26000 :  6.64265816331
Average loss at step  28000 :  6.18477435422
Average loss at step  30000 :  6.19468612206
Nearest to and: or, in, dasyprocta, phi, agouti, of, from, but,
Nearest to often: which, also, wernicke, psi, yield, fails, cl, palaeologus,
Nearest to at: in, and, on, where, dasyprocta, agouti, phi, gland,
Nearest to used: apatosaurus, cosmos, victoriae, satsuma, vs, jeroboam, austin, leipzig,
Nearest to if: when, argo, was, triomphe, is, akita, phi, triple,
Nearest to as: phi, imran, for, and, dasyprocta, is, by, ashmore,
Nearest to would: to, can, could, will, and, should, may, gland,
Nearest to are: were, is, have, was, dasyprocta, by, truetype, be,
Nearest to up: healing, him, argo, melbourne, dasyprocta, light, es, health,
Nearest to has: was, had, is, have, cache, whilst, were, notice,
Nearest to however: dasyprocta, and, aragon, homomorphism, from, that, prevalent, only,
Nearest to in: and, on, of, at, from, by, nine, for,
Nearest to states: hussein, nickname, finds, linebackers, victoriae, vs, household, dasyprocta,
Nearest to united: birmingham, subdivisions, without, octavius, cranmer, establishes, clashing, anglicanism,
Nearest to other: collingwood, methanol, therein, dasyprocta, modern, clumsy, eager, movement,
Nearest to this: the, which, a, it, dasyprocta, phi, gb, agp,
Average loss at step  32000 :  5.85833043373
Average loss at step  34000 :  5.82648939252
Average loss at step  36000 :  5.67319590318
Average loss at step  38000 :  5.25065627742
Average loss at step  40000 :  5.44992486489
Nearest to and: or, dasyprocta, agouti, but, seven, in, UNK, six,
Nearest to often: which, also, still, psi, were, cl, that, wernicke,
Nearest to at: in, on, dasyprocta, where, agouti, and, from, gland,
Nearest to used: victoriae, apatosaurus, cosmos, woodstock, animistic, chile, austin, satsuma,
Nearest to if: when, argo, triomphe, phi, akita, drop, is, triple,
Nearest to as: phi, dasyprocta, imran, by, is, when, ashmore, and,
Nearest to would: can, will, could, to, may, should, gland, prejudicial,
Nearest to are: were, is, have, was, but, dasyprocta, be, altenberg,
Nearest to up: amerindians, healing, him, argo, dasyprocta, melbourne, light, arianespace,
Nearest to has: had, was, have, is, cache, whilst, were, assur,
Nearest to however: nuke, that, dasyprocta, aragon, only, and, homomorphism, prevalent,
Nearest to in: on, and, at, during, dasyprocta, from, of, agouti,
Nearest to states: hussein, nickname, finds, sys, linebackers, household, victoriae, dasyprocta,
Nearest to united: subdivisions, birmingham, octavius, cranmer, without, establishes, clashing, anglicanism,
Nearest to other: modern, collingwood, zaman, three, therein, dasyprocta, methanol, many,
Nearest to this: which, the, it, dasyprocta, phi, that, a, gb,
Average loss at step  42000 :  5.32151976061
Average loss at step  44000 :  5.28189440691
Average loss at step  46000 :  5.28836487234
Average loss at step  48000 :  5.00656609607
Average loss at step  50000 :  5.15721595621
Nearest to and: or, thibetanus, but, dasyprocta, agouti, phi, in, seven,
Nearest to often: which, also, still, generally, commonly, cl, psi, fails,
Nearest to at: in, on, dasyprocta, where, agouti, and, phi, gland,
Nearest to used: victoriae, apatosaurus, cosmos, known, woodstock, solicitation, satsuma, chile,
Nearest to if: when, argo, is, phi, triomphe, cavern, altenberg, drop,
Nearest to as: imran, phi, dasyprocta, when, thibetanus, for, by, ashmore,
Nearest to would: will, can, could, may, to, should, gland, halakhic,
Nearest to are: were, is, have, be, was, dasyprocta, altenberg, eight,
Nearest to up: amerindians, him, healing, argo, dasyprocta, light, melbourne, battlefield,
Nearest to has: had, was, is, have, cache, whilst, phi, assur,
Nearest to however: that, nuke, dasyprocta, but, homomorphism, aragon, only, agouti,
Nearest to in: at, during, on, from, and, under, of, thibetanus,
Nearest to states: hussein, nickname, finds, sys, linebackers, abhidharma, victoriae, household,
Nearest to united: subdivisions, octavius, cranmer, birmingham, clashing, without, establishes, saskatoon,
Nearest to other: three, many, modern, thibetanus, therein, zaman, clumsy, methanol,
Nearest to this: which, the, it, dasyprocta, agp, phi, that, gb,
Average loss at step  52000 :  5.17330951679
Average loss at step  54000 :  5.09720219982
Average loss at step  56000 :  5.05310117221
Average loss at step  58000 :  5.10586823809
Average loss at step  60000 :  4.95870176995
Nearest to and: or, but, thibetanus, agouti, dasyprocta, microcebus, tamarin, callithrix,
Nearest to often: also, which, still, commonly, generally, now, that, today,
Nearest to at: in, on, tamarin, dasyprocta, where, and, agouti, saguinus,
Nearest to used: victoriae, solicitation, apatosaurus, woodstock, known, michelob, animistic, chile,
Nearest to if: when, argo, is, callithrix, cavern, altenberg, drop, phi,
Nearest to as: imran, dasyprocta, when, thibetanus, callithrix, phi, in, wallachia,
Nearest to would: will, could, can, may, should, to, must, might,
Nearest to are: were, is, have, be, sdp, dasyprocta, cebus, spicy,
Nearest to up: him, amerindians, healing, argo, dasyprocta, battlefield, wct, sdi,
Nearest to has: had, have, was, is, wct, cache, whilst, tamarin,
Nearest to however: that, nuke, michelob, dasyprocta, but, only, transgenic, cebus,
Nearest to in: at, during, on, and, thibetanus, from, dasyprocta, callithrix,
Nearest to states: hussein, nickname, finds, callithrix, sys, abhidharma, glossary, household,
Nearest to united: subdivisions, cranmer, octavius, clashing, birmingham, microsite, without, southern,
Nearest to other: many, modern, tamarin, therein, thibetanus, three, callithrix, dasyprocta,
Nearest to this: which, it, the, that, dasyprocta, agp, phi, gb,
Average loss at step  62000 :  4.79535409725
Average loss at step  64000 :  4.80929700851
Average loss at step  66000 :  4.967029405
Average loss at step  68000 :  4.90107219601
Average loss at step  70000 :  4.7602586025
Nearest to and: or, thibetanus, but, microcebus, agouti, dasyprocta, tamarin, callithrix,
Nearest to often: commonly, also, still, generally, which, now, sometimes, that,
Nearest to at: in, on, tamarin, dasyprocta, three, where, saguinus, agouti,
Nearest to used: victoriae, known, solicitation, woodstock, michelob, animistic, chile, apatosaurus,
Nearest to if: when, argo, callithrix, is, altenberg, drop, cavern, until,
Nearest to as: imran, dasyprocta, dinar, when, thibetanus, in, callithrix, is,
Nearest to would: will, can, could, may, should, to, might, must,
Nearest to are: were, is, have, be, dinar, while, cebus, was,
Nearest to up: him, amerindians, argo, thaler, healing, down, dasyprocta, them,
Nearest to has: had, have, was, is, wct, whilst, tamarin, cache,
Nearest to however: but, michelob, that, nuke, dasyprocta, although, agouti, thibetanus,
Nearest to in: during, at, on, dasyprocta, thibetanus, callithrix, since, microcebus,
Nearest to states: hussein, nickname, finds, callithrix, abhidharma, sys, dasyprocta, victoriae,
Nearest to united: subdivisions, cranmer, clashing, octavius, birmingham, microsite, without, tender,
Nearest to other: many, some, dinar, modern, tamarin, thaler, others, thibetanus,
Nearest to this: which, it, the, that, dasyprocta, agp, phi, gb,
Average loss at step  72000 :  4.79033533561
Average loss at step  74000 :  4.77651757854
Average loss at step  76000 :  4.86675146759
Average loss at step  78000 :  4.80461449349
Average loss at step  80000 :  4.80664194095
Nearest to and: or, thibetanus, tamarin, agouti, iit, callithrix, dasyprocta, but,
Nearest to often: commonly, also, still, generally, sometimes, which, now, usually,
Nearest to at: in, on, tamarin, dasyprocta, during, three, agouti, iit,
Nearest to used: known, victoriae, solicitation, michelob, iit, involved, woodstock, chile,
Nearest to if: when, callithrix, altenberg, argo, cavern, drop, until, spilling,
Nearest to as: imran, when, dasyprocta, dinar, vec, thibetanus, callithrix, michelob,
Nearest to would: will, can, may, could, should, might, to, must,
Nearest to are: were, is, have, be, while, altenberg, vec, include,
Nearest to up: him, amerindians, them, down, healing, argo, thaler, battlefield,
Nearest to has: had, have, was, is, wct, whilst, having, assur,
Nearest to however: but, that, michelob, nuke, dasyprocta, although, when, while,
Nearest to in: during, at, thibetanus, on, under, since, callithrix, dasyprocta,
Nearest to states: hussein, nickname, iit, sys, abhidharma, callithrix, finds, verizon,
Nearest to united: subdivisions, cranmer, birmingham, clashing, southern, octavius, tender, microsite,
Nearest to other: many, some, others, modern, dinar, various, tamarin, thibetanus,
Nearest to this: which, it, the, dasyprocta, agp, that, phi, gb,
Average loss at step  82000 :  4.81088560724
Average loss at step  84000 :  4.77382646096
Average loss at step  86000 :  4.75934794199
Average loss at step  88000 :  4.67697274017
Average loss at step  90000 :  4.74235904837
Nearest to and: or, but, thibetanus, agouti, dasyprocta, tamarin, while, microcebus,
Nearest to often: commonly, also, generally, still, sometimes, usually, now, which,
Nearest to at: in, on, tamarin, dasyprocta, during, three, gabriel, saguinus,
Nearest to used: known, victoriae, solicitation, michelob, iit, involved, sinus, found,
Nearest to if: when, is, altenberg, callithrix, drop, argo, microcebus, can,
Nearest to as: imran, when, dasyprocta, dinar, thibetanus, vec, phi, wallachia,
Nearest to would: will, can, may, could, should, might, must, to,
Nearest to are: were, is, have, be, while, include, altenberg, sdp,
Nearest to up: him, down, them, amerindians, argo, healing, battlefield, thaler,
Nearest to has: had, have, was, is, having, wct, whilst, tamarin,
Nearest to however: but, that, michelob, dasyprocta, nuke, although, when, while,
Nearest to in: during, at, under, and, thibetanus, on, since, nine,
Nearest to states: hussein, nickname, iit, sys, abhidharma, callithrix, lar, finds,
Nearest to united: subdivisions, cranmer, southern, clashing, tender, octavius, birmingham, trivium,
Nearest to other: many, some, others, various, modern, dinar, three, tamarin,
Nearest to this: it, which, the, dasyprocta, agp, some, phi, gb,
Average loss at step  92000 :  4.70350556457
Average loss at step  94000 :  4.62097262824
Average loss at step  96000 :  4.72830743539
Average loss at step  98000 :  4.62341887021
Average loss at step  100000 :  4.67902187061
Nearest to and: or, but, microcebus, thibetanus, agouti, dasyprocta, while, tamarin,
Nearest to often: commonly, generally, usually, also, sometimes, still, now, widely,
Nearest to at: in, on, during, dasyprocta, tamarin, saguinus, agouti, gabriel,
Nearest to used: known, victoriae, solicitation, michelob, iit, found, involved, thibetanus,
Nearest to if: when, altenberg, callithrix, while, is, though, drop, where,
Nearest to as: imran, when, dasyprocta, dinar, thibetanus, callithrix, vec, wallachia,
Nearest to would: will, may, can, could, should, might, must, to,
Nearest to are: were, is, have, while, be, include, altenberg, sdp,
Nearest to up: him, them, down, amerindians, argo, out, thaler, battlefield,
Nearest to has: had, have, was, is, wct, having, tamarin, johansen,
Nearest to however: but, that, michelob, dasyprocta, although, while, nuke, and,
Nearest to in: during, at, under, since, on, throughout, from, callithrix,
Nearest to states: hussein, nickname, sys, iit, abhidharma, countries, verizon, lar,
Nearest to united: subdivisions, southern, cranmer, octavius, clashing, birmingham, tender, trivium,
Nearest to other: many, some, others, various, dinar, different, tamarin, thibetanus,
Nearest to this: it, which, the, dasyprocta, agp, that, some, phi,
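
Since final_embeddings is row-normalised, nearest neighbours for any in-vocabulary word can be read off with a single dot product in NumPy, mirroring the similarity op inside the graph; the query word below is just an example.

In [ ]:
# Query nearest neighbours of an arbitrary word from the trained, row-normalised
# embeddings (the query word is only an example).
def nearest(word, top_k=8):
    vec = final_embeddings[dictionary[word]]
    sims = final_embeddings.dot(vec)          # cosine similarity: rows are unit-norm
    best = (-sims).argsort()[1:top_k + 1]     # skip the word itself
    return [reverse_dictionary[int(i)] for i in best]

print(nearest('three'))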

In [18]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)

    

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)
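
plot_with_labels writes the t-SNE figure to tsne.png. If the embeddings themselves are needed later, they can be persisted alongside the vocabulary; the file names below are only examples.

In [ ]:
# Persist the trained embeddings and vocabulary for later reuse (file names are examples).
np.save('text8_embeddings.npy', final_embeddings)
with open('text8_vocab.txt', 'w') as f:
    for i in xrange(vocabulary_size):
        f.write(reverse_dictionary[i] + '\n')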



In [ ]: