importing require packages


In [3]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

Instantiate Embeddings


In [4]:
embeddings = Embeddings(300, 4, 1, 4)


Loading the embeddings from the cache

getting data from preprocessing


In [5]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

generating training data


In [6]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)


42047

In [7]:
seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_indexed_sentences:
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)#[]
        seq_out.append(word2vec_weights[y])

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)


Number of samples :  47295

In [8]:
seq_in.shape


Out[8]:
(47295, 5)

Defining model


In [9]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights]))
model.add(LSTM(1024,return_sequences =True))
model.add(Dropout(0.2))
model.add(LSTM(512))
#model.add(Dropout(0.2))
model.add(Dense(word2vec_weights.shape[1], activation='relu'))
model.load_weights("../weights/lstm-2-1024-512-batchsize-128-epochs-25/weights.24-0.22.hdf5")
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, None, 100)         4204700   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 1024)        4608000   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 1024)        0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
dense_1 (Dense)              (None, 100)               51300     
=================================================================
Total params: 12,011,776
Trainable params: 12,011,776
Non-trainable params: 0
_________________________________________________________________

In [10]:
model_weights_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

Train Model


In [11]:
#model_fit_summary = model.fit(seq_in, seq_out, epochs=25, verbose=1, validation_split=0.2, batch_size=128, callbacks=[checkpoint])

model predict


In [14]:
start = 0
sentence_test = "In which regions in particular did"
indexed_sentences = embeddings.tokenize_index_sentence(sentence_test)
print("indexed_sentences ",indexed_sentences)
sent = np.array(indexed_sentences)
#pattern = list(seq_in[start])
pattern = list(sent[start])
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(5):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(word2index[pred_word])
    pattern = pattern[:len(pattern)]


indexed_sentences  [[3, 15, 949, 3, 1878, 7]]
" in which regions in particular did
sonderbundskrieg corrientes corrientes profitabl profitabl 

In [15]:
#e_model = embeddings.get_model()

In [16]:
#e_model.similar_by_word("profitabl")


Out[16]:
[('coca-cola', 0.7328959703445435),
 ('khubilai', 0.7250775098800659),
 ('zimbabwe', 0.7238492965698242),
 ('pitatus', 0.717526912689209),
 ('underpays', 0.7077121734619141),
 ('somer', 0.6942623257637024),
 ('letterman', 0.6938465237617493),
 ('psychiatrist', 0.6875547170639038),
 ('vetting', 0.6872419118881226),
 ('chomsky', 0.6800742745399475)]

Accuracy


In [19]:
def accuracy():
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in, seq_out):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        pred_word = word2vec_model.similar_by_vector(ypred)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity == 1:
            correct += 1
        count += 1
    print("Accuracy {0}".format(correct/count))

In [20]:
#seq_out[0]

In [21]:
accuracy()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-2dfa93d7db72> in <module>()
----> 1 accuracy()

<ipython-input-19-b175fdb52cc0> in accuracy()
      3     correct = 0
      4     for sub_sample_in, sub_sample_out in zip(seq_in, seq_out):
----> 5         ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
      6         ytrue = sub_sample_out
      7         pred_word = word2vec_model.similar_by_vector(ypred)[0][0]

/home/user/venvs/autofill/lib/python3.5/site-packages/keras/models.py in predict_on_batch(self, x)
    914         if self.model is None:
    915             self.build()
--> 916         return self.model.predict_on_batch(x)
    917 
    918     def train_on_batch(self, x, y, class_weight=None,

/home/user/venvs/autofill/lib/python3.5/site-packages/keras/engine/training.py in predict_on_batch(self, x)
   1694             ins = x
   1695         self._make_predict_function()
-> 1696         outputs = self.predict_function(ins)
   1697         if len(outputs) == 1:
   1698             return outputs[0]

/home/user/venvs/autofill/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2227         session = get_session()
   2228         updated = session.run(self.outputs + [self.updates_op],
-> 2229                               feed_dict=feed_dict)
   2230         return updated[:len(self.outputs)]
   2231 

/home/user/venvs/autofill/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    776     try:
    777       result = self._run(None, fetches, feed_dict, options_ptr,
--> 778                          run_metadata_ptr)
    779       if run_metadata:
    780         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/home/user/venvs/autofill/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    980     if final_fetches or final_targets:
    981       results = self._do_run(handle, final_targets, final_fetches,
--> 982                              feed_dict_string, options, run_metadata)
    983     else:
    984       results = []

/home/user/venvs/autofill/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1030     if handle is None:
   1031       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032                            target_list, options, run_metadata)
   1033     else:
   1034       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/home/user/venvs/autofill/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1037   def _do_call(self, fn, *args):
   1038     try:
-> 1039       return fn(*args)
   1040     except errors.OpError as e:
   1041       message = compat.as_text(e.message)

/home/user/venvs/autofill/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1019         return tf_session.TF_Run(session, options,
   1020                                  feed_dict, fetch_list, target_list,
-> 1021                                  status, run_metadata)
   1022 
   1023     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [22]:
#model_results = model_fit_summary.history


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-fdb260eebd22> in <module>()
----> 1 model_results = model_fit_summary.history

NameError: name 'model_fit_summary' is not defined

In [23]:
#model_results.update(model_fit_summary.params)

In [67]:
#model_results["train_accuracy"] = accuracy()


Accuracy 0.0
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-67-8b164948de0e> in <module>()
----> 1 model_results["train_accuracy"] = accuracy()

NameError: name 'model_results' is not defined

In [28]:
# n = no. of predictions
# accuracy = accuracy(400)
#print(model_results)


{'val_acc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'do_validation': True, 'metrics': ['loss', 'acc', 'val_loss', 'val_acc'], 'samples': 1, 'acc': [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'epochs': 15, 'val_loss': [4.5900764465332031, 4.5754680633544922, 4.5907573699951172, 4.6593356132507324, 4.8363356590270996, 5.0636167526245117, 5.1062922477722168, 5.0171608924865723, 4.8945426940917969, 4.7707195281982422, 4.683952808380127, 4.6261215209960938, 4.5961475372314453, 4.588953971862793, 4.5975632667541504], 'batch_size': 128, 'train_accuracy': None, 'verbose': 1, 'loss': [2.5437352657318115, 2.4388267993927002, 2.2880129814147949, 2.1385490894317627, 1.9580711126327515, 1.8592877388000488, 1.8720529079437256, 1.863358736038208, 1.8261202573776245, 1.831728458404541, 1.8086607456207275, 1.788567066192627, 1.796411395072937, 1.7958264350891113, 1.7889491319656372]}

In [26]:
#text_file_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25/model_results.json"

In [27]:
#with open(text_file_path, "w") as f:
        #json.dump(model_results, f)

In [ ]: