Importing required packages


In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random


Using TensorFlow backend.

Instantiate Embeddings


In [3]:
word_embedding_dimension = 300
word_embedding_window_size = 4
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)


Loading embeddings....
Loading Squad Data
Loading the embeddings from the cache
Starting tokenized, pos squad data.....
Combining Squad Data
Creating Tokenized Squad Corpus
Creating Tokenized Squad Corpus
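
The Embeddings class comes from the local embeddings.py module, which is not part of this notebook. As a rough, hypothetical sketch (the real class also loads, tokenizes and caches the SQuAD data; get_tokenized_indexed_sentences is omitted here), the four constructor arguments are assumed to map onto gensim Word2Vec's size, window, min_count and workers parameters. Word2Vec is already imported in the first cell.


In [ ]:
# Hypothetical sketch of the local Embeddings helper; the method names match
# how it is used below, but the internals are an assumption based on the
# pre-4.0 gensim Word2Vec API.
class EmbeddingsSketch:
    def __init__(self, size, window, min_count, workers):
        # the real class loads and tokenizes the SQuAD corpus itself;
        # `corpus` here is just a stand-in for those tokenized sentences
        corpus = [["stand", "in", "tokenized", "sentence"]]
        self.model = Word2Vec(corpus, size=size, window=window,
                              min_count=min_count, workers=workers)

    def get_model(self):
        return self.model

    def get_weights(self):
        # embedding matrix of shape (vocab_size, embedding_dimension)
        return self.model.wv.syn0

    def get_vocabulary(self):
        index2word = dict(enumerate(self.model.wv.index2word))
        word2index = {word: index for index, word in index2word.items()}
        return word2index, index2word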

Getting data from preprocessing


In [4]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

Generating training data


In [5]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)
#sorted(window_size,reverse=True)
#sentence_max_length = max([len(sentence) for sentence in tokenized_indexed_sentence ])


42049

In [6]:
seq_in = []
seq_out = []
# generating dataset: each window of `window_size` word indices becomes an
# input sample, and the word2vec vector of the word that follows it becomes
# the corresponding target
for sentence in tokenized_indexed_sentences:
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(word2vec_weights[y])

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)


Number of samples :  610848
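
Each input sample is therefore a window of window_size word indices, and its target is the word2vec vector of the word that follows the window. A toy illustration of the same windowing, with made-up indices:


In [ ]:
# Toy illustration of the sliding-window generation above; the indices are
# made up. In the real cell the target index y is replaced by its
# 300-dimensional word2vec vector via word2vec_weights[y].
toy_sentence = [10, 42, 7, 3, 99, 5, 61, 18]
toy_window = 5
for i in range(len(toy_sentence) - toy_window - 1):
    context = toy_sentence[i:i + toy_window]   # model input
    target = toy_sentence[i + toy_window]      # index of the next word
    print(context, "->", target)
# prints:
# [10, 42, 7, 3, 99] -> 5
# [42, 7, 3, 99, 5] -> 61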

In [10]:
pattern = list(seq_in[97])

In [14]:
type(print("\"",' '.join(index2word[index] for index in pattern)))


" would super bowl 50 have
Out[14]:
NoneType

In [17]:
word2vec_weights.shape


Out[17]:
(42049, 300)

Defining model


In [15]:
# Two-layer LSTM: maps a window of word indices to the 300-dimensional
# word2vec vector of the next word (regression, hence the mse loss)
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights]))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512))
model.add(Dropout(0.2))
model.add(Dense(word2vec_weights.shape[1], activation='sigmoid'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, None, 300)         12614700  
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 512)         1665024   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               153900    
=================================================================
Total params: 16,532,824
Trainable params: 16,532,824
Non-trainable params: 0
_________________________________________________________________
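
The parameter counts in the summary follow from the layer shapes: the Embedding layer stores vocab_size * 300 weights, each LSTM layer stores 4 * ((input_dim + units) * units + units) weights, and the Dense layer stores units * 300 + 300. As a quick check:


In [ ]:
# Reproducing the parameter counts reported by model.summary()
vocab_size, embedding_dim = word2vec_weights.shape   # (42049, 300)
units = 512

embedding_params = vocab_size * embedding_dim                   # 12,614,700
lstm_1_params = 4 * ((embedding_dim + units) * units + units)   # 1,665,024
lstm_2_params = 4 * ((units + units) * units + units)           # 2,099,200
dense_params = units * embedding_dim + embedding_dim            # 153,900

print(embedding_params + lstm_1_params + lstm_2_params + dense_params)   # 16,532,824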

In [18]:
model_weights_path = "../weights/LSTM_1_Layer"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')
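
A checkpoint written by this callback can later be restored with model.load_weights; the filename below is only a placeholder for one of the files actually saved under model_weights_path.


In [ ]:
# Restoring a previously saved checkpoint (placeholder filename; substitute
# one of the weights.{epoch}-{val_acc}.hdf5 files written during training)
saved_checkpoint = model_weights_path + "/weights.00-0.00.hdf5"
if os.path.exists(saved_checkpoint):
    model.load_weights(saved_checkpoint)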

Train Model


In [ ]:
model_fit_summary = model.fit(seq_in, seq_out, epochs=1, verbose=1, validation_split=0.2, batch_size=256, callbacks=[checkpoint])


Train on 488678 samples, validate on 122170 samples
Epoch 1/1
116480/488678 [======>.......................] - ETA: 2093s - loss: 0.1635 - acc: 0.0764

Model prediction


In [ ]:
start = 97
pattern = list(seq_in[start])
print("\"", ' '.join(index2word[index] for index in pattern))
# greedy decoding: predict the vector of the next word, map it back to its
# nearest word2vec word, then slide the window forward by one and repeat
for i in range(10):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word + " ")
    pattern.append(word2index[pred_word])
    pattern = pattern[1:]

Accuracy


In [ ]:
def accuracy(n):
    # compares the nearest word2vec word of the predicted vector with the
    # nearest word of the true vector on the first n samples
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in[:n], seq_out[:n]):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        pred_word = word2vec_model.similar_by_vector(ypred)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity >= 0.85:
            correct += 1
        count += 1
    print("Accuracy {0}".format(correct / count))
    return correct / count

In [ ]:
n = 5  # number of samples to evaluate accuracy on
accuracy(n)

In [ ]:
model_results = model_fit_summary.history
model_results.update(model_fit_summary.params)
model_results["word_embedding_dimension"] = word_embedding_dimension
model_results["word_embedding_window_size"] = word_embedding_window_size
model_results["window_size"] = window_size

model_results["train_accuracy"] = accuracy()
text_file_path = "../weights/lstm-2-512-batchsize-128-epochs-15/model_results.json"
with open(text_file_path, "w") as f:
        json.dump(model_results, f)
