importing require packages


In [21]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

Instantiate Embeddings


In [2]:
embeddings = Embeddings(100, 4, 1, 4)


Loading the embeddings from the cache

getting data from preprocessing


In [3]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

generating training data


In [4]:
window_size = 3
vocab_size = len(word2index)
print(vocab_size)
#sorted(window_size,reverse=True)
#sentence_max_length = max([len(sentence) for sentence in tokenized_indexed_sentence ])


170760

In [5]:
seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_indexed_sentences:
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)#[]
        seq_out.append(word2vec_weights[y])

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)


Number of samples :  3381370

Defining model


In [6]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights]))
model.add(LSTM(512,return_sequences =True))
model.add(Dropout(0.2))
model.add(LSTM(512))
model.add(Dropout(0.2))
model.add(Dense(word2vec_weights.shape[1], activation='relu'))
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, None, 100)         17076000  
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 512)         1255424   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               51300     
=================================================================
Total params: 20,481,924
Trainable params: 20,481,924
Non-trainable params: 0
_________________________________________________________________

In [7]:
model_weights_path = "../weights/lstm-2-512-batchsize-128-epochs-15"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

Train Model


In [8]:
model_fit_summary = model.fit(seq_in, seq_out, epochs=15, verbose=1, validation_split=0.2, batch_size=128, callbacks=[checkpoint])


Train on 1 samples, validate on 1 samples
Epoch 1/15
Epoch 00000: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.00-0.00.hdf5
1/1 [==============================] - 2s - loss: 2.5437 - acc: 0.0000e+00 - val_loss: 4.5901 - val_acc: 0.0000e+00
Epoch 2/15
Epoch 00001: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.01-0.00.hdf5
1/1 [==============================] - 3s - loss: 2.4388 - acc: 1.0000 - val_loss: 4.5755 - val_acc: 0.0000e+00
Epoch 3/15
Epoch 00002: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.02-0.00.hdf5
1/1 [==============================] - 3s - loss: 2.2880 - acc: 1.0000 - val_loss: 4.5908 - val_acc: 0.0000e+00
Epoch 4/15
Epoch 00003: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.03-0.00.hdf5
1/1 [==============================] - 5s - loss: 2.1385 - acc: 1.0000 - val_loss: 4.6593 - val_acc: 0.0000e+00
Epoch 5/15
Epoch 00004: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.04-0.00.hdf5
1/1 [==============================] - 3s - loss: 1.9581 - acc: 1.0000 - val_loss: 4.8363 - val_acc: 0.0000e+00
Epoch 6/15
Epoch 00005: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.05-0.00.hdf5
1/1 [==============================] - 2s - loss: 1.8593 - acc: 1.0000 - val_loss: 5.0636 - val_acc: 0.0000e+00
Epoch 7/15
Epoch 00006: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.06-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.8721 - acc: 1.0000 - val_loss: 5.1063 - val_acc: 0.0000e+00
Epoch 8/15
Epoch 00007: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.07-0.00.hdf5
1/1 [==============================] - 6s - loss: 1.8634 - acc: 1.0000 - val_loss: 5.0172 - val_acc: 0.0000e+00
Epoch 9/15
Epoch 00008: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.08-0.00.hdf5
1/1 [==============================] - 1s - loss: 1.8261 - acc: 1.0000 - val_loss: 4.8945 - val_acc: 0.0000e+00
Epoch 10/15
Epoch 00009: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.09-0.00.hdf5
1/1 [==============================] - 3s - loss: 1.8317 - acc: 1.0000 - val_loss: 4.7707 - val_acc: 0.0000e+00
Epoch 11/15
Epoch 00010: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.10-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.8087 - acc: 1.0000 - val_loss: 4.6840 - val_acc: 0.0000e+00
Epoch 12/15
Epoch 00011: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.11-0.00.hdf5
1/1 [==============================] - 5s - loss: 1.7886 - acc: 1.0000 - val_loss: 4.6261 - val_acc: 0.0000e+00
Epoch 13/15
Epoch 00012: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.12-0.00.hdf5
1/1 [==============================] - 2s - loss: 1.7964 - acc: 1.0000 - val_loss: 4.5961 - val_acc: 0.0000e+00
Epoch 14/15
Epoch 00013: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.13-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.7958 - acc: 1.0000 - val_loss: 4.5890 - val_acc: 0.0000e+00
Epoch 15/15
Epoch 00014: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.14-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.7889 - acc: 1.0000 - val_loss: 4.5976 - val_acc: 0.0000e+00

model predict


In [39]:
start = 101
pattern = list(seq_in[start])
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(10):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(word2index[pred_word])
    pattern = pattern[1:len(pattern)]


" l so that
was was was was was was was was was was 

Accuracy


In [10]:
def accuracy():
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in[:5], seq_out[:5]):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        pred_word = word2vec_model.similar_by_vector(ypred)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity >= 0.85:
            correct += 1
        count += 1
    print("Accuracy {0}".format(correct/count))

In [11]:
seq_out[0]


Out[11]:
array([ 0.27063617, -1.88592136, -4.60332966,  1.38032007, -3.50511742,
        0.88621974, -0.93677127, -0.79569846, -3.14880967, -1.55611396,
        1.76569748, -1.56616974, -1.98014486, -0.53436381,  0.75873446,
       -0.45234403,  1.88288879,  0.71259999, -1.32030022, -0.96863592,
        1.75420606, -0.84065282, -0.16819586, -1.39510345,  1.12865663,
       -2.25509238, -1.10569012,  0.45855987,  1.38602078,  1.99246311,
       -0.84836662,  1.90263259, -2.77856398, -1.62563455,  3.46212029,
       -2.01167464,  3.70865345,  1.20190263, -0.10993056, -0.10728802,
        1.09963083,  2.20335579, -1.30530989,  0.05224802, -0.61327147,
       -2.37654543,  0.88630396,  1.29029536,  1.82292056, -0.13894933,
       -0.37759912, -2.38005781,  0.1164887 , -0.08346321, -0.20829169,
       -0.32738602,  1.08387029,  1.02966964, -1.70243227, -1.14024544,
        0.93973899,  2.65973043, -3.04378986, -0.42154336,  2.16397929,
        0.52978057, -0.14344002, -0.3650367 , -0.76624018,  1.28313541,
        0.81438124, -1.13663208,  1.64061928, -2.34302354, -0.09878732,
        0.13976237,  1.15372658, -2.66882372, -1.91749787,  1.4919076 ,
        0.25347689,  1.3125174 ,  1.26598108,  1.53122878, -0.29941618,
       -0.71239001, -0.6470781 ,  0.03714816,  1.96788037, -2.39959288,
        0.43021342,  0.47725865,  2.3280437 ,  0.05286156, -2.84247327,
       -1.7078588 , -0.40347713, -1.13539743, -0.94796562,  0.78792363], dtype=float32)

In [22]:
model_results = model_fit_summary.history

In [23]:
model_results.update(model_fit_summary.params)

In [24]:
model_results["train_accuracy"] = accuracy()


Accuracy 0.2

In [28]:
# n = no. of predictions
# accuracy = accuracy(400)
print(model_results)


{'val_acc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'do_validation': True, 'metrics': ['loss', 'acc', 'val_loss', 'val_acc'], 'samples': 1, 'acc': [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'epochs': 15, 'val_loss': [4.5900764465332031, 4.5754680633544922, 4.5907573699951172, 4.6593356132507324, 4.8363356590270996, 5.0636167526245117, 5.1062922477722168, 5.0171608924865723, 4.8945426940917969, 4.7707195281982422, 4.683952808380127, 4.6261215209960938, 4.5961475372314453, 4.588953971862793, 4.5975632667541504], 'batch_size': 128, 'train_accuracy': None, 'verbose': 1, 'loss': [2.5437352657318115, 2.4388267993927002, 2.2880129814147949, 2.1385490894317627, 1.9580711126327515, 1.8592877388000488, 1.8720529079437256, 1.863358736038208, 1.8261202573776245, 1.831728458404541, 1.8086607456207275, 1.788567066192627, 1.796411395072937, 1.7958264350891113, 1.7889491319656372]}

In [26]:
text_file_path = "../weights/lstm-2-512-batchsize-128-epochs-15/model_results.json"

In [27]:
with open(text_file_path, "w") as f:
        json.dump(model_results, f)

In [ ]: