Importing required packages

from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

Instantiate Embeddings

# Build the project-local Embeddings helper (imported from embeddings.py).
# NOTE(review): the meaning of the four positional arguments is not visible
# here; the first (100) presumably is the embedding dimension, since the
# model summary below reports 100-wide embeddings - confirm in embeddings.py.
embeddings = Embeddings(100, 4, 1, 4)

Loading the embeddings from the cache

getting data from preprocessing

# Weight matrix handed to the Keras Embedding layer below
# (its shape[0] is used as input_dim and its shape[1] as output_dim).
word2vec_weights = embeddings.get_weights()
# Lookup tables between words and their integer indices.
word2index, index2word = embeddings.get_vocabulary()
# Underlying word2vec model; used later for nearest-word lookups.
word2vec_model = embeddings.get_model()
# Corpus as a collection of sentences, each a sequence of word indices.
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

generating training data

# Number of consecutive word indices fed to the model per sample.
window_size = 3
vocab_size = len(word2index)

# Build the training pairs: each input is a window of `window_size` word
# indices, each target is the embedding vector of the word that follows it
# (the model regresses onto embedding vectors with an MSE loss).
seq_in = []
seq_out = []
for sentence in tokenized_indexed_sentences:
    # NOTE(review): this bound stops one window early - the final word of
    # each sentence is never used as a target. Kept as-is so the sample count
    # does not change; use `len(sentence) - window_size` to include it.
    for i in range(len(sentence) - window_size - 1):
        seq_in.append(sentence[i:i + window_size])
        seq_out.append(word2vec_weights[sentence[i + window_size]])

# Convert to numpy arrays so they can be passed straight to Keras.
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print("Number of samples : ", n_samples)

Number of samples :  3381370

Defining model

# Next-word model: pretrained embeddings -> two stacked 512-unit LSTMs with
# dropout -> dense projection back into embedding space.
# Changes to the model to be done here.
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0],
                    output_dim=word2vec_weights.shape[1],
                    weights=[word2vec_weights]))
# The first LSTM must emit the full sequence so the second LSTM can consume it.
model.add(LSTM(512, return_sequences=True))
# NOTE(review): the dropout rate is not recorded anywhere in this file;
# 0.2 is assumed - confirm against the original experiment.
model.add(Dropout(0.2))
# The second LSTM returns only its last state, giving one vector per sample
# to match the (n_samples, embedding_dim) targets.
model.add(LSTM(512))
model.add(Dropout(0.2))
# NOTE(review): a relu output cannot produce the negative components that
# word vectors contain; a linear activation may be more appropriate.
model.add(Dense(word2vec_weights.shape[1], activation='relu'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
model.summary()

Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         17076000  
lstm_1 (LSTM)                (None, None, 512)         1255424   
dropout_1 (Dropout)          (None, None, 512)         0         
lstm_2 (LSTM)                (None, 512)               2099200   
dropout_2 (Dropout)          (None, 512)               0         
dense_1 (Dense)              (None, 100)               51300     
Total params: 20,481,924
Trainable params: 20,481,924
Non-trainable params: 0

# Directory that receives one weights file per epoch.
model_weights_path = "../weights/lstm-2-512-batchsize-128-epochs-15"
# Create the directory up front so the checkpoint files have somewhere to go.
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
# The {epoch}/{val_acc} placeholders are filled in by Keras at save time.
checkpoint_path = model_weights_path + '/weights.{epoch:02d}-{val_acc:.2f}.hdf5'
# save_best_only=False: keep a checkpoint for every epoch, not just the best.
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

Train Model

# Train on the generated windows, holding out 20% of the samples for
# validation and writing a checkpoint after every epoch.
model_fit_summary = model.fit(seq_in, seq_out, epochs=15, verbose=1, validation_split=0.2, batch_size=128, callbacks=[checkpoint])

Train on 1 samples, validate on 1 samples
Epoch 1/15
Epoch 00000: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.00-0.00.hdf5
1/1 [==============================] - 2s - loss: 2.5437 - acc: 0.0000e+00 - val_loss: 4.5901 - val_acc: 0.0000e+00
Epoch 2/15
Epoch 00001: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.01-0.00.hdf5
1/1 [==============================] - 3s - loss: 2.4388 - acc: 1.0000 - val_loss: 4.5755 - val_acc: 0.0000e+00
Epoch 3/15
Epoch 00002: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.02-0.00.hdf5
1/1 [==============================] - 3s - loss: 2.2880 - acc: 1.0000 - val_loss: 4.5908 - val_acc: 0.0000e+00
Epoch 4/15
Epoch 00003: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.03-0.00.hdf5
1/1 [==============================] - 5s - loss: 2.1385 - acc: 1.0000 - val_loss: 4.6593 - val_acc: 0.0000e+00
Epoch 5/15
Epoch 00004: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.04-0.00.hdf5
1/1 [==============================] - 3s - loss: 1.9581 - acc: 1.0000 - val_loss: 4.8363 - val_acc: 0.0000e+00
Epoch 6/15
Epoch 00005: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.05-0.00.hdf5
1/1 [==============================] - 2s - loss: 1.8593 - acc: 1.0000 - val_loss: 5.0636 - val_acc: 0.0000e+00
Epoch 7/15
Epoch 00006: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.06-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.8721 - acc: 1.0000 - val_loss: 5.1063 - val_acc: 0.0000e+00
Epoch 8/15
Epoch 00007: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.07-0.00.hdf5
1/1 [==============================] - 6s - loss: 1.8634 - acc: 1.0000 - val_loss: 5.0172 - val_acc: 0.0000e+00
Epoch 9/15
Epoch 00008: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.08-0.00.hdf5
1/1 [==============================] - 1s - loss: 1.8261 - acc: 1.0000 - val_loss: 4.8945 - val_acc: 0.0000e+00
Epoch 10/15
Epoch 00009: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.09-0.00.hdf5
1/1 [==============================] - 3s - loss: 1.8317 - acc: 1.0000 - val_loss: 4.7707 - val_acc: 0.0000e+00
Epoch 11/15
Epoch 00010: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.10-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.8087 - acc: 1.0000 - val_loss: 4.6840 - val_acc: 0.0000e+00
Epoch 12/15
Epoch 00011: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.11-0.00.hdf5
1/1 [==============================] - 5s - loss: 1.7886 - acc: 1.0000 - val_loss: 4.6261 - val_acc: 0.0000e+00
Epoch 13/15
Epoch 00012: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.12-0.00.hdf5
1/1 [==============================] - 2s - loss: 1.7964 - acc: 1.0000 - val_loss: 4.5961 - val_acc: 0.0000e+00
Epoch 14/15
Epoch 00013: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.13-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.7958 - acc: 1.0000 - val_loss: 4.5890 - val_acc: 0.0000e+00
Epoch 15/15
Epoch 00014: saving model to ../weights/lstm-2-512-batchsize-128-epochs-15/weights.14-0.00.hdf5
1/1 [==============================] - 4s - loss: 1.7889 - acc: 1.0000 - val_loss: 4.5976 - val_acc: 0.0000e+00

Model prediction

# Generate 10 words, seeded with the training window at index `start`.
start = 101
pattern = list(seq_in[start])
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(10):
    # The model outputs a vector in embedding space; map it to the nearest word.
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word+" ")
    # Slide the window: append the predicted word and drop the oldest one so
    # the input keeps its fixed length across iterations.
    pattern.append(word2index[pred_word])
    pattern = pattern[1:]

" l so that
was was was was was was was was was was 


def accuracy(n=5, threshold=0.85):
    """Estimate word-level accuracy on the first *n* training samples.

    For each sample the predicted vector and the target vector are both
    mapped to their nearest vocabulary word; the prediction counts as correct
    when the word2vec similarity of those two words is at least *threshold*.

    Args:
        n: Number of leading samples of ``seq_in``/``seq_out`` to evaluate.
        threshold: Minimum similarity for a prediction to count as correct.

    Returns:
        The fraction of correct predictions as a float, or ``None`` when
        there are no samples to evaluate.
    """
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in[:n], seq_out[:n]):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        # Only the single nearest neighbour is used, so request just one.
        pred_word = word2vec_model.similar_by_vector(ypred, topn=1)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue, topn=1)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity >= threshold:
            correct += 1
        count += 1
    if count == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return None
    result = float(correct) / count
    print("Accuracy {0}".format(result))
    # Return the value so callers can record it instead of getting None.
    return result

array([ 0.27063617, -1.88592136, -4.60332966,  1.38032007, -3.50511742,
        0.88621974, -0.93677127, -0.79569846, -3.14880967, -1.55611396,
        1.76569748, -1.56616974, -1.98014486, -0.53436381,  0.75873446,
       -0.45234403,  1.88288879,  0.71259999, -1.32030022, -0.96863592,
        1.75420606, -0.84065282, -0.16819586, -1.39510345,  1.12865663,
       -2.25509238, -1.10569012,  0.45855987,  1.38602078,  1.99246311,
       -0.84836662,  1.90263259, -2.77856398, -1.62563455,  3.46212029,
       -2.01167464,  3.70865345,  1.20190263, -0.10993056, -0.10728802,
        1.09963083,  2.20335579, -1.30530989,  0.05224802, -0.61327147,
       -2.37654543,  0.88630396,  1.29029536,  1.82292056, -0.13894933,
       -0.37759912, -2.38005781,  0.1164887 , -0.08346321, -0.20829169,
       -0.32738602,  1.08387029,  1.02966964, -1.70243227, -1.14024544,
        0.93973899,  2.65973043, -3.04378986, -0.42154336,  2.16397929,
        0.52978057, -0.14344002, -0.3650367 , -0.76624018,  1.28313541,
        0.81438124, -1.13663208,  1.64061928, -2.34302354, -0.09878732,
        0.13976237,  1.15372658, -2.66882372, -1.91749787,  1.4919076 ,
        0.25347689,  1.3125174 ,  1.26598108,  1.53122878, -0.29941618,
       -0.71239001, -0.6470781 ,  0.03714816,  1.96788037, -2.39959288,
        0.43021342,  0.47725865,  2.3280437 ,  0.05286156, -2.84247327,
       -1.7078588 , -0.40347713, -1.13539743, -0.94796562,  0.78792363], dtype=float32)

# Collect the training history and attach whatever accuracy() returns
# under the "train_accuracy" key.
model_results = model_fit_summary.history
model_results.update({"train_accuracy": accuracy()})

Accuracy 0.2

# n = no. of predictions
# accuracy = accuracy(400)

{'val_acc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'do_validation': True, 'metrics': ['loss', 'acc', 'val_loss', 'val_acc'], 'samples': 1, 'acc': [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'epochs': 15, 'val_loss': [4.5900764465332031, 4.5754680633544922, 4.5907573699951172, 4.6593356132507324, 4.8363356590270996, 5.0636167526245117, 5.1062922477722168, 5.0171608924865723, 4.8945426940917969, 4.7707195281982422, 4.683952808380127, 4.6261215209960938, 4.5961475372314453, 4.588953971862793, 4.5975632667541504], 'batch_size': 128, 'train_accuracy': None, 'verbose': 1, 'loss': [2.5437352657318115, 2.4388267993927002, 2.2880129814147949, 2.1385490894317627, 1.9580711126327515, 1.8592877388000488, 1.8720529079437256, 1.863358736038208, 1.8261202573776245, 1.831728458404541, 1.8086607456207275, 1.788567066192627, 1.796411395072937, 1.7958264350891113, 1.7889491319656372]}

# Persist the collected results as JSON in the same directory as the
# per-epoch weight checkpoints.
text_file_path = "../weights/lstm-2-512-batchsize-128-epochs-15/model_results.json"

with open(text_file_path, "w") as results_file:
    json.dump(model_results, results_file)

