In [1]:
import collections
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
print ("Packages loaded.")
In [2]:
# Configuration
batch_size = 20
embedding_size = 2 # This is just for visualization
num_sampled = 15 # Number of negative examples to sample.
In [3]:
# Sample sentences
sentences = ["the quick brown fox jumped over the lazy dog",
"I love cats and dogs",
"we all love cats and dogs",
"cats and dogs are great",
"sung likes cats",
"she loves dogs",
"cats can be very independent",
"cats are great companions when they want to be",
"cats are playful",
"cats are natural hunters",
"It's raining cats and dogs",
"dogs and cats love sung"]
# 'sentences' is 'list'
print ("'sentences' is %s and length is %d."
% (type(sentences), len(sentences)))
In [4]:
words = " ".join(sentences).split()
print ("'words' is %s and length is %d." % (type(words), len(words)))
print (words)
In [5]:
count = collections.Counter(words).most_common()
print ("'count' is %s and length is %d." % (type(count), len(count)))
print (("Word count of top five is %s") % (count[:5]))
print (count)
In [6]:
print (words[0:5])
print (count[0:3])
In [7]:
rdic = [i[0] for i in count] #reverse dic, idx -> word
dic = {w: i for i, w in enumerate(rdic)} #dic, word -> id
voc_size = len(dic) # Number of vocabulary
print ("'rdic' is %s and length is %d." % (type(rdic), len(rdic)))
print ("'dic' is %s and length is %d." % (type(dic), len(dic)))
In [8]:
print (rdic)
In [9]:
print (dic)
In [10]:
print (rdic[0])
print (dic['cats'])
In [11]:
data = [dic[word] for word in words]
print ("'data' is %s and length is %d." % (type(data), len(data)))
print('Sample data: numbers: %s / words: %s'% (data[:10], [rdic[t] for t in data[:10]]))
In [12]:
# See what's in the data
print (data)
In [13]:
# ([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox),
cbow_pairs = [];
for i in range(1, len(data)-1) :
cbow_pairs.append([[data[i-1], data[i+1]], data[i]]);
print('Context pairs: %s' % (cbow_pairs[:10]))
In [14]:
print ("'cbow_pairs' is %s and length is %d."
% (type(cbow_pairs), len(cbow_pairs)))
In [15]:
# (quick, the), (quick, brown), (brown, quick), (brown, fox), ...
skip_gram_pairs = [];
for c in cbow_pairs:
skip_gram_pairs.append([c[1], c[0][0]])
skip_gram_pairs.append([c[1], c[0][1]])
print ("'skip_gram_pairs' is %s and length is %d."
% (type(skip_gram_pairs), len(skip_gram_pairs)))
print('skip-gram pairs', skip_gram_pairs[:5])
In [16]:
def generate_batch(size):
assert size < len(skip_gram_pairs)
x_data=[]
y_data = []
r = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
for i in r:
x_data.append(skip_gram_pairs[i][0]) # n dim
y_data.append([skip_gram_pairs[i][1]]) # n, 1 dim
return x_data, y_data
# generate_batch test
print ('Batches (x, y)', generate_batch(3))
In [17]:
# Input data
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# need to shape [batch_size, 1] for nn.nce_loss
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# missing GPU implementation?
with tf.device('/cpu:0'):
# Look up embeddings for inputs.
embeddings = tf.Variable(
tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs) # lookup table
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))
# Compute the average NCE loss for the batch.
loss = tf.reduce_mean(
tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
num_sampled, voc_size))
# Use the adam optimizer
train_op = tf.train.AdamOptimizer(0.01).minimize(loss)
print ("Network ready")
In [18]:
# Launch the graph in a session
with tf.Session() as sess:
# Initializing all variables
tf.initialize_all_variables().run()
for step in range(3000):
batch_inputs, batch_labels = generate_batch(batch_size)
_, loss_val = sess.run([train_op, loss],
feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
if step % 500 == 0:
print("Loss at %d: %.5f" % (step, loss_val))
# Report the loss
# Final embeddings are ready for you to use.
# Need to normalize for practical use
trained_embeddings = embeddings.eval()
In [19]:
trained_embeddings.shape
Out[19]:
In [20]:
# Show word2vec if dim is 2
if trained_embeddings.shape[1] == 2:
labels = rdic[:20] # Show top 20 words
for i, label in enumerate(labels):
x, y = trained_embeddings[i,:]
plt.scatter(x, y)
plt.annotate(label, xy=(x, y), xytext=(5, 2),
textcoords='offset points', ha='right', va='bottom')
plt.show()