In [4]:
import numpy as np
import tensorflow as tf
import hdf5storage
f = hdf5storage.loadmat('data.mat')
In [5]:
data = f["data"]
# Word ids in the .mat file are 1-based (MATLAB), so subtract 1 for 0-based indexing.
train = data["trainData"][0][0].transpose() - 1
trainX = train[:,:3]
trainY = train[:,3]
valid = data["validData"][0][0].transpose() - 1
validX = valid[:,:3]
validY = valid[:,3]
test = data["testData"][0][0].transpose() - 1
testX = test[:,:3]
testY = test[:,3]
vocab = data["vocab"][0][0][0]
In [6]:
data["trainData"][0][0] - 1
Out[6]:
In [7]:
vocab[[27,25,89,143]]
Out[7]:
In [8]:
#data["vocab"][0][0][0].shape
for i in range(len(vocab)):
    vocab[i] = vocab[i][0]
print(vocab)
In [9]:
for row in train[:5,:]:
    print(row)
In [10]:
def gen_data(train):
    # For each 4-gram [w0, w1, w2, w3], emit skip-gram pairs: the two middle
    # words each predict their immediate neighbours (w1->w0, w1->w2, w2->w1, w2->w3).
    buffer = list()
    label_buffer = list()
    for row in train:
        buffer.append(row[1])
        buffer.append(row[1])
        label_buffer.append([row[0]])
        label_buffer.append([row[2]])
        buffer.append(row[2])
        buffer.append(row[2])
        label_buffer.append([row[1]])
        label_buffer.append([row[3]])
    batch = np.array(buffer, dtype=np.int32)
    labels = np.array(label_buffer, dtype=np.int32)
    return batch, labels
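A quick sanity check (not part of the original notebook) makes the pairing scheme explicit on a single made-up 4-gram; the ids 10, 20, 30, 40 are hypothetical:
In [ ]:
# Hypothetical 4-gram [10, 20, 30, 40]: expect the pairs 20->10, 20->30, 30->20, 30->40.
demo_batch, demo_labels = gen_data(np.array([[10, 20, 30, 40]], dtype=np.int32))
for inp, lab in zip(demo_batch, demo_labels):
    print(inp, "->", lab[0])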
In [11]:
b,l = gen_data(train)
print(len(b))
In [12]:
b.shape[0]/200
Out[12]:
In [13]:
import math
train_size = 1490200   # number of (input, label) pairs produced by gen_data, i.e. b.shape[0]
batch_size = 200
num_batches = 7451     # train_size / batch_size
embedding_size = 128   # Dimension of the embedding vector.
epochs = 10
num_sampled = 25       # Number of negative examples to sample for the NCE loss.
vocabulary_size = 250
In [14]:
tf.__version__
Out[14]:
In [15]:
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# Weights and biases for the NCE loss
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Placeholders for inputs
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Compute the NCE loss, using a sample of the negative labels each time.
# Keyword arguments avoid the inputs/labels positional-order change between TF versions.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=num_sampled, num_classes=vocabulary_size))
# We use the SGD optimizer.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
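A small sketch (added here, not in the original) to make the shapes flowing through the graph explicit, assuming batch_size=200 and embedding_size=128 as set above:
In [ ]:
# Static shapes of the graph defined above.
print(embeddings.get_shape())    # (250, 128): one embedding row per vocabulary word
print(embed.get_shape())         # (200, 128): looked-up rows for one mini-batch
print(train_labels.get_shape())  # (200, 1):   one target word id per input word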
In [16]:
def gen_batches(b, l, batch_size=200):
    # Slice the full (input, label) arrays into consecutive mini-batches.
    inputs = []
    labels = []
    num_batches = int(len(b) / batch_size)
    for i in range(num_batches):
        input_buff = b[i*batch_size:(i+1)*batch_size]
        label_buff = l[i*batch_size:(i+1)*batch_size]
        inputs.append(input_buff)
        labels.append(label_buff)
    return zip(inputs, labels)
In [ ]:
In [17]:
with tf.Session() as session:
    tf.global_variables_initializer().run()
    for epoch in range(epochs):
        for inps, labls in gen_batches(b, l):
            feed_dict = {train_inputs: inps, train_labels: labls}
            _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)
        print("Epoch #", epoch+1, " Loss : ", cur_loss)
    word_embeddings = embeddings.eval()
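A hedged follow-up sketch (not part of the original run): inspect the learned embeddings by ranking all words against one query word with cosine similarity.
In [ ]:
# Normalise the embedding rows, then rank every word against one query word.
norms = np.linalg.norm(word_embeddings, axis=1, keepdims=True)
normalized = word_embeddings / norms
query_idx = 0                      # any index in [0, vocabulary_size)
sims = normalized @ normalized[query_idx]
nearest = np.argsort(-sims)[1:6]   # top 5, skipping the query word itself
print("closest to", vocab[query_idx], ":", [vocab[i] for i in nearest])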
In [18]:
word_embeddings
Out[18]:
In [ ]:
In [19]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
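plot_with_labels is defined above but never called; a minimal usage sketch (the TSNE settings are assumptions borrowed from the standard word2vec tutorial, not from this notebook) could be:
In [ ]:
# Project the 128-d embeddings to 2-D with t-SNE and plot all 250 vocabulary words.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
low_dim_embs = tsne.fit_transform(word_embeddings)
plot_with_labels(low_dim_embs, list(vocab))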
In [ ]: