In this notebook I followed along with Aneesh Joshi's blog post on word2vec.


In [1]:
import numpy as np
import tensorflow as tf

step 1. read in the data, create the word dictionary, and create one-hot vectors for each word


In [2]:
# load data
with open('darksouls_training.txt', 'r') as fh:
    training = [sent.replace('.','').replace('\n', '').lower() for sent in fh.readlines()]
# with open('darksouls_test.txt', 'r') as fh:
#     test = [sent.replace('.','').replace('\n', '').lower() for sent in fh.readlines()]
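
The cleanup above only strips periods and newlines, so any other punctuation stays glued to the surrounding words and inflates the vocabulary. A slightly more thorough normalization pass could look like this (a sketch; whether it is needed depends on how clean darksouls_training.txt already is):

import re

def normalize(sent):
    # lower-case and replace anything that is not a letter, digit, or space
    return re.sub(r'[^a-z0-9 ]+', ' ', sent.lower()).strip()

# training = [normalize(sent) for sent in open('darksouls_training.txt').readlines()]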

In [5]:
# create vocabulary
word_list = []
for sent in training:
    for word in sent.split(' '):
        word_list.append(word)
# for sent in test:
#     for word in sent.split(' '):
#         word_list.append(word)
voc = set(word_list)
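
It can be worth checking the vocabulary size and the most frequent tokens before training, since word2vec pipelines often drop very rare words. A small inspection sketch (collections.Counter is not used elsewhere in the notebook):

from collections import Counter

word_counts = Counter(word_list)
print('vocabulary size: {}'.format(len(voc)))
print('10 most common words: {}'.format(word_counts.most_common(10)))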

In [6]:
# map each word to an integer index and back; the index gives the position of the 1 in its one-hot vector
word2int = {}
int2word = {}
for ind, word in enumerate(voc):
    word2int[word] = ind
    int2word[ind] = word

In [7]:
# split the sentences
sent_train = []
for sent in training:
    sent_train.append(sent.split(' '))
# sent_test = []
# for sent in test:
#     sent_test.append(sent.split(' '))

In [8]:
# create word pairs
data_train = []
WINDOW_SIZE = 5
for sentence in sent_train:
    for ind, word in enumerate(sentence):
        # pair each word with every neighbour within WINDOW_SIZE positions
        # (Python slicing clamps the end index, so no explicit min() is needed)
        for nb_word in sentence[max(ind - WINDOW_SIZE, 0) : ind + WINDOW_SIZE + 1]:
            # skip the centre word itself (this also drops any neighbour that is the same token)
            if nb_word != word:
                data_train.append([word, nb_word])
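
To make the windowing concrete, here is what the same loop produces for a toy three-word sentence with a window of 2:

toy_sentence = 'praise the sun'.split(' ')
toy_pairs = []
TOY_WINDOW = 2
for ind, word in enumerate(toy_sentence):
    for nb_word in toy_sentence[max(ind - TOY_WINDOW, 0) : ind + TOY_WINDOW + 1]:
        if nb_word != word:
            toy_pairs.append([word, nb_word])
print(toy_pairs)
# [['praise', 'the'], ['praise', 'sun'], ['the', 'praise'], ['the', 'sun'], ['sun', 'praise'], ['sun', 'the']]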

In [10]:
# convert to one-hot
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
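
A quick sanity check of the helper on a tiny vocabulary:

to_one_hot(2, 5)
# array([0., 0., 1., 0., 0.])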

In [12]:
data_train[0]


Out[12]:
['postrelease', 'the']

In [14]:
x_train = []
y_train = []

for word_pair in data_train:
    x_train.append(to_one_hot(word2int[word_pair[0]], len(voc)))
    y_train.append(to_one_hot(word2int[word_pair[1]], len(voc)))
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
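
Both arrays have one row per word pair and one column per vocabulary word, so their size grows as (number of pairs) x (vocabulary size); for a larger corpus it would be cheaper to feed integer indices and build the one-hot encoding inside the graph. A quick shape check:

print(x_train.shape, y_train.shape)
# both should be (len(data_train), len(voc))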

step 2. create tensorflow word2vec model


In [24]:
# placeholders for the one-hot centre word (x) and the one-hot context word (y_label)
x = tf.placeholder(dtype=tf.float32, shape=(None, len(voc)))
y_label = tf.placeholder(dtype=tf.float32, shape=(None, len(voc)))

In [31]:
# hidden layer
EMBEDDING_DIM = 5
W1 = tf.Variable(tf.random_normal([len(voc), EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))
hidden_rep = tf.add(tf.matmul(x, W1), b1)

# output layer: maps the hidden representation back to vocabulary size
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, len(voc)]))
b2 = tf.Variable(tf.random_normal([len(voc)]))
pred = tf.nn.softmax(tf.add(tf.matmul(hidden_rep, W2), b2))
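
The next cell computes the loss as the mean of -sum(y_label * log(pred)). Taking the softmax and then the log separately can underflow when a predicted probability gets close to zero; a common, more numerically stable variant works on the raw logits and lets TensorFlow fuse the two steps. This is only a sketch (it assumes TensorFlow 1.x and is not what the cell below actually runs):

# sketch: numerically stabler loss on the pre-softmax logits (not used below)
logits = tf.add(tf.matmul(hidden_rep, W2), b2)
stable_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=logits))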

In [65]:
# loss function: cross-entropy between the softmax output and the one-hot context word
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(pred), axis=1))
# training step: full-batch gradient descent
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy_loss)

# run the model (initialize variables after the full graph is defined)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# epoch number
n_epoch = 10000
for epoch in xrange(n_epoch):
    sess.run(train_step,
             feed_dict={x: x_train, y_label: y_train})
    if epoch % 100 == 0:
        print('epoch {}: loss is '.format(epoch), sess.run(cross_entropy_loss,
                                                           feed_dict={x: x_train, y_label: y_train}))


('epoch 0: loss is ', 9.6988478)
('epoch 100: loss is ', 7.0252614)
('epoch 200: loss is ', 6.4158936)
('epoch 300: loss is ', 6.1022382)
('epoch 400: loss is ', 5.9212928)
('epoch 500: loss is ', 5.7993145)
('epoch 600: loss is ', 5.7100825)
('epoch 700: loss is ', 5.6409087)
('epoch 800: loss is ', 5.5849442)
('epoch 900: loss is ', 5.5382609)
('epoch 1000: loss is ', 5.498414)
('epoch 1100: loss is ', 5.4637589)
('epoch 1200: loss is ', 5.4331913)
('epoch 1300: loss is ', 5.4058681)
('epoch 1400: loss is ', 5.3811574)
('epoch 1500: loss is ', 5.3585715)
('epoch 1600: loss is ', 5.3377142)
('epoch 1700: loss is ', 5.3182721)
('epoch 1800: loss is ', 5.2999716)
('epoch 1900: loss is ', 5.2826042)
('epoch 2000: loss is ', 5.2659693)
('epoch 2100: loss is ', 5.249928)
('epoch 2200: loss is ', 5.2343483)
('epoch 2300: loss is ', 5.2191362)
('epoch 2400: loss is ', 5.204206)
('epoch 2500: loss is ', 5.1894779)
('epoch 2600: loss is ', 5.1749139)
('epoch 2700: loss is ', 5.1604729)
('epoch 2800: loss is ', 5.146121)
('epoch 2900: loss is ', 5.1318474)
('epoch 3000: loss is ', 5.1176348)
('epoch 3100: loss is ', 5.1034875)
('epoch 3200: loss is ', 5.0893998)
('epoch 3300: loss is ', 5.075387)
('epoch 3400: loss is ', 5.0614438)
('epoch 3500: loss is ', 5.0476041)
('epoch 3600: loss is ', 5.0338593)
('epoch 3700: loss is ', 5.0202312)
('epoch 3800: loss is ', 5.0067334)
('epoch 3900: loss is ', 4.9933681)
('epoch 4000: loss is ', 4.9801502)
('epoch 4100: loss is ', 4.9670796)
('epoch 4200: loss is ', 4.9541721)
('epoch 4300: loss is ', 4.9414344)
('epoch 4400: loss is ', 4.9288616)
('epoch 4500: loss is ', 4.9164505)
('epoch 4600: loss is ', 4.9041944)
('epoch 4700: loss is ', 4.8921089)
('epoch 4800: loss is ', 4.8801861)
('epoch 4900: loss is ', 4.8684297)
('epoch 5000: loss is ', 4.8568211)
('epoch 5100: loss is ', 4.8453736)
('epoch 5200: loss is ', 4.8340864)
('epoch 5300: loss is ', 4.8229489)
('epoch 5400: loss is ', 4.8119636)
('epoch 5500: loss is ', 4.8011231)
('epoch 5600: loss is ', 4.7904363)
('epoch 5700: loss is ', 4.7798896)
('epoch 5800: loss is ', 4.7694983)
('epoch 5900: loss is ', 4.7592487)
('epoch 6000: loss is ', 4.749135)
('epoch 6100: loss is ', 4.7391682)
('epoch 6200: loss is ', 4.729341)
('epoch 6300: loss is ', 4.7196498)
('epoch 6400: loss is ', 4.710103)
('epoch 6500: loss is ', 4.7006865)
('epoch 6600: loss is ', 4.6914148)
('epoch 6700: loss is ', 4.6822829)
('epoch 6800: loss is ', 4.6732841)
('epoch 6900: loss is ', 4.6644254)
('epoch 7000: loss is ', 4.6556988)
('epoch 7100: loss is ', 4.6471109)
('epoch 7200: loss is ', 4.6386619)
('epoch 7300: loss is ', 4.6303449)
('epoch 7400: loss is ', 4.6221704)
('epoch 7500: loss is ', 4.6141214)
('epoch 7600: loss is ', 4.6062188)
('epoch 7700: loss is ', 4.598444)
('epoch 7800: loss is ', 4.590807)
('epoch 7900: loss is ', 4.5833068)
('epoch 8000: loss is ', 4.5759311)
('epoch 8100: loss is ', 4.5686908)
('epoch 8200: loss is ', 4.5615811)
('epoch 8300: loss is ', 4.554605)
('epoch 8400: loss is ', 4.5477576)
('epoch 8500: loss is ', 4.5410385)
('epoch 8600: loss is ', 4.5344453)
('epoch 8700: loss is ', 4.527976)
('epoch 8800: loss is ', 4.5216336)
('epoch 8900: loss is ', 4.5154109)
('epoch 9000: loss is ', 4.5093064)
('epoch 9100: loss is ', 4.5033216)
('epoch 9200: loss is ', 4.4974566)
('epoch 9300: loss is ', 4.4917049)
('epoch 9400: loss is ', 4.486064)
('epoch 9500: loss is ', 4.4805408)
('epoch 9600: loss is ', 4.475121)
('epoch 9700: loss is ', 4.4698205)
('epoch 9800: loss is ', 4.4646115)
('epoch 9900: loss is ', 4.4595108)
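
With the session still open, the learned embeddings can be read out of the first layer: row i of W1 (plus b1) is the EMBEDDING_DIM-dimensional vector for the word with index i. A small sketch for looking up nearest neighbours by Euclidean distance (closest_word is a hypothetical helper, not part of the original post):

# embedding matrix: one EMBEDDING_DIM-dimensional vector per vocabulary word
vectors = sess.run(W1 + b1)

def closest_word(word, k=5):
    # Euclidean distances from the query word to every word in the vocabulary
    dists = np.linalg.norm(vectors - vectors[word2int[word]], axis=1)
    # skip index 0 of the sorted list, which is the query word itself
    return [int2word[i] for i in np.argsort(dists)[1:k + 1]]

# closest_word('the')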

In [38]:
saver = tf.train.Saver()
saver.save(sess, 'dark_souls_word2vec_model.ckpt')


Out[38]:
'dark_souls_word2vec_model.ckpt'
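
To reuse the model later, the checkpoint can be restored into a fresh session and the embedding matrix read back out (a sketch; it assumes the graph above has already been rebuilt in the same process):

with tf.Session() as restored_sess:
    # restores the trained variable values saved above
    tf.train.Saver().restore(restored_sess, 'dark_souls_word2vec_model.ckpt')
    restored_vectors = restored_sess.run(W1 + b1)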
