In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.python.client import device_lib

%matplotlib inline

In [2]:
local_device_protos = device_lib.list_local_devices()
[x.name for x in local_device_protos if x.device_type == 'GPU']


Out[2]:
['/gpu:0']

In [3]:
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)


Found and verified text8.zip

In [4]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    
    return data

words = read_data(filename)
print('Data size', len(words))


Data size 17005207

In [5]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()

    for word, _ in count:
        dictionary[word] = len(dictionary)
  
    data = list()
    unk_count = 0
  
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    
    return data, count, dictionary, reverse_dictionary

In [6]:
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3082, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']

In [7]:
data_index = 0

In [8]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
  
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
    
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
        
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
    
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
  
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
   
    return batch, labels

In [9]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


3082 originated -> 12 as
3082 originated -> 5239 anarchism
12 as -> 6 a
12 as -> 3082 originated
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of

In [10]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

In [11]:
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

In [12]:
graph = tf.Graph()

In [13]:
with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/gpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                         biases=nce_biases,
                         labels=train_labels,
                         inputs=embed,
                         num_sampled=num_sampled,
                         num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.initialize_all_variables()


WARNING:tensorflow:From <ipython-input-13-11e9672ed0e1>:44: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.

In [14]:
num_steps = 100001

In [ ]:
with tf.Session(config=tf.ConfigProto(log_device_placement=True), graph=graph) as session:
      # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
                
    final_embeddings = normalized_embeddings.eval()


Initialized
Average loss at step  0 :  259.940246582
Nearest to that: tremblay, utilizing, enchanted, dismantle, dinosaur, exception, priors, myoglobin,
Nearest to may: orchard, meyers, leno, druidry, peritoneum, usable, pauses, roleplaying,
Nearest to its: results, scalia, vigour, adonijah, berman, foibles, humber, approximation,
Nearest to more: buckyballs, botanical, mitosis, hendrik, linemen, win, primitives, superpowers,
Nearest to UNK: ooze, aafc, ntfs, gretzky, highschool, partitioning, donelson, schoolmaster,
Nearest to they: steering, congreso, darker, provisions, yerushalayim, fisc, realm, resonance,
Nearest to d: wainwright, toonopedia, tinbergen, shamans, imitate, dagestan, cabinda, adversus,
Nearest to during: romagna, motions, zigzag, swanson, tsim, kaplan, widest, rupp,
Nearest to often: offshoot, snacks, corvinus, elected, overdose, fremen, ducktales, mohenjo,
Nearest to between: wladislaus, propto, quest, nikaya, unconnected, branching, ailey, bats,
Nearest to state: lazarus, uncountably, coupon, ect, circe, gabonese, louisville, practise,
Nearest to for: ikea, domes, fpss, amazonas, atoll, expresses, microscopic, buddhist,
Nearest to see: thera, gallardo, periodic, pyrenees, burner, crtc, midrashic, intrigued,
Nearest to no: matured, suprarenal, lazarus, line, bronx, eyepiece, beggar, forgive,
Nearest to such: homily, seventies, whore, candu, sla, aleinu, mov, wednesday,
Nearest to after: recognizing, concurrency, kenya, nebulous, parish, gresham, doe, wiccans,
Average loss at step  2000 :  113.381620845
Average loss at step  4000 :  52.7048133955
Average loss at step  6000 :  33.4920249097
Average loss at step  8000 :  23.6978426812
Average loss at step  10000 :  17.7604874439
Nearest to that: and, archie, didn, ry, enchanted, released, trouble, fins,
Nearest to may: pharaohs, usually, victoria, zero, probably, nine, malignancies, vs,
Nearest to its: the, surrendered, approximation, berman, zermelo, his, results, address,
Nearest to more: appointed, linemen, win, hotel, put, zone, mathbf, botanical,
Nearest to UNK: one, and, mathbf, linebackers, archie, the, a, agave,
Nearest to they: he, realm, strains, aeneas, zero, it, clan, settle,
Nearest to d: toonopedia, reported, cabinda, ads, clarke, propaganda, bills, civilization,
Nearest to during: motions, reginae, newly, all, sheridan, bn, parks, festivals,
Nearest to often: inspiration, languages, elected, casualties, ve, graph, mystery, cyanide,
Nearest to between: quest, subject, effect, ufo, supplies, quantum, proving, onto,
Nearest to state: practise, introduced, objections, aires, gabonese, agave, wire, sherman,
Nearest to for: in, of, with, and, to, antimatter, structural, from,
Nearest to see: six, periodic, outsider, well, finalist, pyrenees, syphilis, garlic,
Nearest to no: line, cardinality, class, dim, reason, beliefs, massachusetts, again,
Nearest to such: contribution, jung, archie, homily, religion, word, senses, seventies,
Nearest to after: from, sets, kenya, hoare, emergency, constantinople, in, bilingual,
Average loss at step  12000 :  14.0305621388
Average loss at step  14000 :  11.6425184925
Average loss at step  16000 :  10.0025892993
Average loss at step  18000 :  8.43416487277
Average loss at step  20000 :  8.01691550207
Nearest to that: and, which, operatorname, anchoring, not, agouti, aoc, impressions,
Nearest to may: victoria, usually, scriptures, peritoneum, circ, pharaohs, to, would,
Nearest to its: the, his, their, origen, ignatius, agouti, marry, tyrant,
Nearest to more: appointed, primitives, agouti, frying, win, linemen, zone, explodes,
Nearest to UNK: agouti, dasyprocta, and, one, operatorname, linebackers, archie, three,
Nearest to they: he, it, realm, dasyprocta, there, who, strains, settle,
Nearest to d: and, toonopedia, sargon, one, clarke, propaganda, ads, m,
Nearest to during: motions, and, in, dasyprocta, reginae, newly, sheridan, all,
Nearest to often: antoninus, operatorname, inspiration, and, languages, hello, graph, ve,
Nearest to between: of, for, agouti, quest, ufo, quantum, proving, subject,
Nearest to state: apatosaurus, practise, agouti, dasyprocta, funnel, aires, operatorname, hijacking,
Nearest to for: in, of, with, and, from, agouti, as, to,
Nearest to see: six, periodic, pyrenees, waite, outsider, operatorname, burner, is,
Nearest to no: libby, not, anchoring, line, it, agouti, this, shirkuh,
Nearest to such: homily, contribution, senses, farmed, jung, seventies, tempe, religion,
Nearest to after: from, sets, constantinople, hoare, kenya, in, for, nicobar,
Average loss at step  22000 :  6.99141346753
Average loss at step  24000 :  6.90114425349
Average loss at step  26000 :  6.79805140388
Average loss at step  28000 :  6.31811893368
Average loss at step  30000 :  5.93085589266
Nearest to that: which, tonnage, operatorname, agouti, anchoring, trapezohedron, aoc, also,
Nearest to may: can, would, could, usually, victoria, scriptures, circ, to,
Nearest to its: the, their, his, arin, agouti, marry, ignatius, origen,
Nearest to more: appointed, primitives, agouti, win, considered, linemen, frying, explodes,
Nearest to UNK: agouti, dasyprocta, operatorname, bcl, four, three, archie, two,
Nearest to they: he, it, there, who, realm, settle, pear, not,
Nearest to d: b, toonopedia, sargon, and, american, clarke, m, r,
Nearest to during: in, motions, and, dasyprocta, sheridan, reginae, newly, all,
Nearest to often: antoninus, operatorname, inspiration, now, zero, languages, graph, it,
Nearest to between: for, with, agouti, ufo, cordoba, over, seven, quest,
Nearest to state: apatosaurus, practise, abet, agouti, hijacking, funnel, gabonese, af,
Nearest to for: with, in, and, of, from, agouti, to, heinz,
Nearest to see: burner, six, periodic, hodge, pyrenees, waite, is, adapa,
Nearest to no: it, suprarenal, not, anchoring, libby, agouti, this, line,
Nearest to such: homily, tempe, farmed, jung, well, senses, contribution, textures,
Nearest to after: from, with, in, for, hoare, kenya, agouti, sets,
Average loss at step  32000 :  5.9782226367
Average loss at step  34000 :  5.69418454897
Average loss at step  36000 :  5.76266445196
Average loss at step  38000 :  5.48095271897
Average loss at step  40000 :  5.27526280099
Nearest to that: which, this, operatorname, but, agouti, tonnage, however, it,
Nearest to may: can, would, could, will, scriptures, zero, victoria, usually,
Nearest to its: their, his, the, agouti, arin, a, reconstruction, replicating,
Nearest to more: appointed, primitives, contended, considered, agouti, most, recycling, win,
Nearest to UNK: agouti, dasyprocta, four, operatorname, three, vma, abandonware, seven,
Nearest to they: he, it, there, who, not, you, settle, realm,
Nearest to d: b, toonopedia, m, sargon, r, prequel, clarke, eight,
Nearest to during: in, motions, romagna, dasyprocta, and, newly, sheridan, reginae,
Nearest to often: antoninus, now, also, operatorname, usually, inspiration, zero, barney,
Nearest to between: in, agouti, with, over, cordoba, ufo, about, for,
Nearest to state: apatosaurus, abet, agouti, practise, recitative, dasyprocta, hijacking, gabonese,
Nearest to for: of, with, heinz, in, or, from, and, operatorname,
Nearest to see: burner, periodic, hodge, waite, pyrenees, outsider, merrill, adapa,
Nearest to no: it, libby, matured, anchoring, suprarenal, a, not, agouti,
Nearest to such: well, homily, tempe, textures, known, senses, jung, farmed,
Nearest to after: from, in, before, with, three, agouti, hoare, five,

In [16]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)

    
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)