In [4]:
import numpy as np
import tensorflow as tf
import hdf5storage

f = hdf5storage.loadmat('data.mat')

In [5]:
data = f["data"]
# Subtract 1 to convert MATLAB's 1-based word indices to 0-based
train = data["trainData"][0][0].transpose() - 1
trainX = train[:,:3]
trainY = train[:,3]

valid = data["validData"][0][0].transpose() - 1
validX = valid[:,:3]
validY = valid[:,3]

test = data["testData"][0][0].transpose() - 1
testX = test[:,:3]
testY = test[:,3]

vocab = data["vocab"][0][0][0]

In [6]:
data["trainData"][0][0] - 1


Out[6]:
array([[ 27, 183, 182, ..., 222, 151,  90],
       [ 25,  43,  31, ...,  89, 197,  31],
       [ 89, 248,  75, ...,  95, 132, 222],
       [143, 116, 121, ..., 203, 143, 199]], dtype=int32)

In [7]:
vocab[[27,25,89,143]]


Out[7]:
array([array(['going'], 
      dtype='<U5'),
       array(['to'], 
      dtype='<U2'),
       array(['be'], 
      dtype='<U2'), array(['.'], 
      dtype='<U1')], dtype=object)

In [8]:
#data["vocab"][0][0][0].shape

# Unwrap each 1-element nested array so vocab becomes a flat array of strings
for i in range(len(vocab)):
    vocab[i] = vocab[i][0]
print(vocab)


['all' 'set' 'just' 'show' 'being' 'money' 'over' 'both' 'years' 'four'
 'through' 'during' 'go' 'still' 'children' 'before' 'police' 'office'
 'million' 'also' 'less' 'had' ',' 'including' 'should' 'to' 'only' 'going'
 'under' 'has' 'might' 'do' 'them' 'good' 'around' 'get' 'very' 'big' 'dr.'
 'game' 'every' 'know' 'they' 'not' 'world' 'now' 'him' 'school' 'several'
 'like' 'did' 'university' 'companies' 'these' 'she' 'team' 'found' 'where'
 'right' 'says' 'people' 'house' 'national' 'some' 'back' 'see' 'street'
 'are' 'year' 'home' 'best' 'out' 'even' 'what' 'said' 'for' 'federal'
 'since' 'its' 'may' 'state' 'does' 'john' 'between' 'new' ';' 'three'
 'public' '?' 'be' 'we' 'after' 'business' 'never' 'use' 'here' 'york'
 'members' 'percent' 'put' 'group' 'come' 'by' '$' 'on' 'about' 'last'
 'her' 'of' 'could' 'days' 'against' 'times' 'women' 'place' 'think'
 'first' 'among' 'own' 'family' 'into' 'each' 'one' 'down' 'because' 'long'
 'another' 'such' 'old' 'next' 'your' 'market' 'second' 'city' 'little'
 'from' 'would' 'few' 'west' 'there' 'political' 'two' 'been' '.' 'their'
 'much' 'music' 'too' 'way' 'white' ':' 'was' 'war' 'today' 'more' 'ago'
 'life' 'that' 'season' 'company' '-' 'but' 'part' 'court' 'former'
 'general' 'with' 'than' 'those' 'he' 'me' 'high' 'made' 'this' 'work' 'up'
 'us' 'until' 'will' 'ms.' 'while' 'officials' 'can' 'were' 'country' 'my'
 'called' 'and' 'program' 'have' 'then' 'is' 'it' 'an' 'states' 'case'
 'say' 'his' 'at' 'want' 'in' 'any' 'as' 'if' 'united' 'end' 'no' ')'
 'make' 'government' 'when' 'american' 'same' 'how' 'mr.' 'other' 'take'
 'which' 'department' '--' 'you' 'many' 'nt' 'day' 'week' 'play' 'used'
 "'s" 'though' 'our' 'who' 'yesterday' 'director' 'most' 'president' 'law'
 'man' 'a' 'night' 'off' 'center' 'i' 'well' 'or' 'without' 'so' 'time'
 'five' 'the' 'left']

In [9]:
for row in train[:5,:]:
    print(row)


[ 27  25  89 143]
[183  43 248 116]
[182  31  75 121]
[116 246 200 185]
[222 189 248   5]

In [10]:
def gen_data(train):
    # Expand each 4-gram row [w0, w1, w2, w3] into four skip-gram
    # (input, label) pairs: (w1, w0), (w1, w2), (w2, w1), (w2, w3).
    input_buffer = list()
    label_buffer = list()
    for row in train:
        # w1 predicts its left and right neighbours
        input_buffer.append(row[1])
        input_buffer.append(row[1])
        label_buffer.append([row[0]])
        label_buffer.append([row[2]])
        # w2 predicts its left and right neighbours
        input_buffer.append(row[2])
        input_buffer.append(row[2])
        label_buffer.append([row[1]])
        label_buffer.append([row[3]])

    batch = np.array(input_buffer, dtype=np.int32)
    labels = np.array(label_buffer, dtype=np.int32)
    return batch, labels
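
As a quick sanity check (a sketch, not part of the original run), the first 4-gram row should expand into exactly four skip-gram pairs; decoding them through vocab makes the pairing easy to verify by eye:

In [ ]:
# Expand only the first 4-gram and decode each (input, label) pair to words
sample_inputs, sample_labels = gen_data(train[:1])
for inp, lab in zip(sample_inputs, sample_labels):
    print(vocab[inp], "->", vocab[lab[0]])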

In [11]:
b,l = gen_data(train)
print(len(b))


1490200

In [12]:
b.shape[0]/200


Out[12]:
7451.0

In [13]:
import math

train_size = 1490200  # == len(b): total number of (input, label) pairs


batch_size = 200
num_batches = 7451
embedding_size = 128  # Dimension of the embedding vector.
epochs = 10
num_sampled = 25
vocabulary_size = 250

In [14]:
tf.__version__


Out[14]:
'0.12.1'

In [15]:
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# Weights and biases for the NCE loss
nce_weights = tf.Variable(
  tf.truncated_normal([vocabulary_size, embedding_size],
                      stddev=1.0 / math.sqrt(embedding_size)))

nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Placeholders for inputs
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Compute the NCE loss, using a sample of the negative labels each time.
loss = tf.reduce_mean(
  tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                 num_sampled, vocabulary_size))

# We use the SGD optimizer.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
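
Note that the positional argument order above (inputs before labels) is specific to tf.nn.nce_loss in TensorFlow 0.12; in TensorFlow 1.0+ labels and inputs were swapped, so a version-safe variant passes everything by keyword (a sketch, not executed here):

In [ ]:
# Equivalent loss with keyword arguments, compatible with TensorFlow 1.x
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=train_labels,
                   inputs=embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))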

In [16]:
def gen_batches(b, l, batch_size=200):
    inputs = []
    labels = []

    num_batches = len(b) // batch_size
    for i in range(num_batches):
        input_buff = b[i*batch_size:(i+1)*batch_size]
        label_buff = l[i*batch_size:(i+1)*batch_size]

        inputs.append(input_buff)
        labels.append(label_buff)

    return zip(inputs, labels)
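
A quick shape check (a sketch): the generator should yield 7451 batches, each a (200,) input vector paired with a (200, 1) label matrix.

In [ ]:
# Peek at the first batch to confirm the shapes match the placeholders
first_inputs, first_labels = next(iter(gen_batches(b, l)))
print(first_inputs.shape, first_labels.shape)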

In [17]:
with tf.Session() as session:
    tf.global_variables_initializer().run()
    
    for epoch in range(epochs):
        for inps, labls in gen_batches(b,l):
            feed_dict = {train_inputs: inps, train_labels: labls}
            _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)
        
        print("Epoch #",epoch+1," Loss : ", cur_loss)
        
    word_embeddings  = embeddings.eval()


Epoch # 1  Loss :  2.16318
Epoch # 2  Loss :  3.02129
Epoch # 3  Loss :  1.87356
Epoch # 4  Loss :  2.58635
Epoch # 5  Loss :  2.14795
Epoch # 6  Loss :  2.47761
Epoch # 7  Loss :  2.07374
Epoch # 8  Loss :  2.85126
Epoch # 9  Loss :  2.13043
Epoch # 10  Loss :  2.6385

In [18]:
word_embeddings


Out[18]:
array([[ 0.07745709, -0.05804505,  0.00131782, ..., -0.31897098,
         0.48136714, -0.01462711],
       [-0.79447001, -0.71987045,  0.66330582, ..., -1.2404027 ,
        -0.75644547,  0.52904773],
       [ 0.71211201, -0.17814596,  0.17807025, ..., -0.04730324,
        -0.04074759, -0.16970505],
       ..., 
       [ 0.2184823 , -1.01309025,  0.04281931, ..., -1.19828022,
        -0.08744483,  0.11709936],
       [-0.14953709, -0.2999098 ,  0.08950499, ...,  0.17020635,
         0.38466185,  0.04196095],
       [ 0.53140539, -0.40258363, -0.48989901, ..., -0.60220033,
         0.23682934,  0.25157347]], dtype=float32)
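
A quick way to check that the embeddings learned something is to look up nearest neighbours by cosine similarity (a sketch; "day" is just an arbitrary query word from the vocabulary):

In [ ]:
# Nearest neighbours in the embedding space, by cosine similarity
normalized = word_embeddings / np.linalg.norm(word_embeddings, axis=1, keepdims=True)

query = "day"
query_id = int(np.where(vocab == query)[0][0])
sims = normalized.dot(normalized[query_id])
nearest = np.argsort(-sims)[1:9]   # index 0 is the query word itself
print(query, "->", [vocab[i] for i in nearest])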

In [19]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)
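
plot_with_labels is never invoked above; a typical usage (a sketch, following the usual word2vec tutorial pattern) projects all 250 embeddings to 2-D with t-SNE and labels each point with its word:

In [ ]:
# Project the learned embeddings to 2-D and save the labelled scatter plot
tsne = TSNE(n_components=2, init='pca', random_state=0)
low_dim_embs = tsne.fit_transform(word_embeddings)
plot_with_labels(low_dim_embs, list(vocab), filename='tsne.png')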
