Importing required packages


In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize


Using TensorFlow backend.
EMBEDDING(100,4,1,4) STARTED .....
Loading the embeddings from the cache
EMBEDDING(100,4,1,4) COMPLETED .....

Instantiate Embeddings


In [2]:
embeddings = Embeddings(100, 3, 1, 4)

Getting data from preprocessing


In [3]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()
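These getters are assumed (from how their results are used below) to return, respectively: a (vocab_size, 100) NumPy array of word vectors, word-to-index and index-to-word lookup tables, the underlying gensim Word2Vec model, and the corpus sentences already converted to lists of word indices. A quick sanity check along those lines:

print(word2vec_weights.shape)               # expected: (vocab_size, 100)
print(len(word2index), len(index2word))     # both should equal vocab_size
print(tokenized_indexed_sentences[0][:10])  # first sentence as a list of word indices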

Generating training data


In [4]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)
# sentence_max_length = max([len(sentence) for sentence in tokenized_indexed_sentences])


138607

In [5]:
seq_in = []
seq_out = []
# generating the dataset: each window of `window_size` word indices predicts
# the embedding vector of the word that follows it
for sentence in tokenized_indexed_sentences:
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(word2vec_weights[y])

# converting seq_in and seq_out into numpy arrays
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print("Number of samples : ", n_samples)


Number of samples :  3487573
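As a toy illustration of the windowing above (made-up indices, same window_size logic): every run of 5 consecutive word indices becomes one input sequence, and the index that follows it selects the target word vector.

# hypothetical 8-word sentence of word indices
toy_sentence = [10, 11, 12, 13, 14, 15, 16, 17]
toy_in = [toy_sentence[i:i + 5] for i in range(len(toy_sentence) - 5 - 1)]
toy_out = [toy_sentence[i + 5] for i in range(len(toy_sentence) - 5 - 1)]
# toy_in  -> [[10, 11, 12, 13, 14], [11, 12, 13, 14, 15]]
# toy_out -> [15, 16]
# note: the "- 1" mirrors the loop above and skips the last possible window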

Defining the model


In [14]:
# two-layer LSTM over pretrained word2vec embeddings, regressing the next word's embedding vector
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights]))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512))
model.add(Dropout(0.2))
model.add(Dense(word2vec_weights.shape[1], activation='sigmoid'))
# resume from previously saved weights
model.load_weights("../weights/Model-LSTM-2-Layers-512-50-Epochs/weights.09-0.20.hdf5")
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, None, 100)         13860700  
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 512)         1255424   
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               51300     
=================================================================
Total params: 17,266,624
Trainable params: 17,266,624
Non-trainable params: 0
_________________________________________________________________
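As a sanity check, the parameter counts in the summary can be reproduced from the layer shapes with the standard Keras formulas (plain arithmetic, not part of the original notebook):

embedding_params = 138607 * 100                   # vocab_size * embedding_dim = 13,860,700
lstm1_params = 4 * (512 * (100 + 512) + 512)      # 4 * (units * (input_dim + units) + units) = 1,255,424
lstm2_params = 4 * (512 * (512 + 512) + 512)      # = 2,099,200
dense_params = 512 * 100 + 100                    # = 51,300
print(embedding_params + lstm1_params + lstm2_params + dense_params)  # 17,266,624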

In [15]:
model_weights_path = "../weights/Model-LSTM-2-Layers-512-50-Epochs"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights-02.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

Train Model


In [16]:
# training is skipped here because pretrained weights were loaded above; uncomment to (re)train
# model.fit(seq_in, seq_out, epochs=40, verbose=1, validation_split=0.2, batch_size=256, callbacks=[checkpoint])

Model prediction


In [17]:
start = 97
pattern = list(seq_in[start])
print("\"", ' '.join(index2word[index] for index in pattern))
# greedy generation: predict the next word vector, map it back to the nearest
# vocabulary word, then slide the window forward by one word
for i in range(100):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word + " ")
    pattern.append(word2index[pred_word])
    pattern = pattern[1:]


" been known as
the Philippines manes are no opposed to restricted corruption in flavoured with confusing up-direction is understood even because it is up-direction is up-direction therefore is even less restricted to the strangled by Sukuma Kyogoku and hot-spot diverted to glue subject to solution solution problematic in restricted environment restricted to restricted corruption in flavoured with confusing up-direction is understood even because it is up-direction is up-direction therefore is even less restricted to the strangled by Sukuma Kyogoku and hot-spot diverted to glue subject to solution solution problematic in restricted environment restricted to restricted corruption in flavoured with confusing up-direction is understood 

In [18]:
pred_words = word2vec_model.similar_by_vector(prediction[0])
pred_words


Out[18]:
[('understood', 0.644871175289154),
 ('considered', 0.6332186460494995),
 ('problematic', 0.6309871077537537),
 ("'false", 0.6276131868362427),
 ('perceived', 0.6244549751281738),
 ('seen', 0.6208130121231079),
 ('defined', 0.6191920042037964),
 ('identified', 0.6179676651954651),
 ('repugnant', 0.6161847114562988),
 ('treated', 0.6141287684440613)]

In [19]:
from itertools import product

In [20]:
# pair the word closest to the true target vector (seq_out[97]) with each predicted candidate word
word_pairs = list(product([word2vec_model.similar_by_vector(seq_out[97])[0][0]], list(zip(*pred_words))[0]))

In [21]:
# highest similarity between the true next word and any predicted candidate
max([word2vec_model.similarity(w1, w2) for w1, w2 in word_pairs])


Out[21]:
0.2680437971440815
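The spot check above compares a single target word against one set of predicted candidates. A small, hypothetical helper (not part of the original notebook) that repeats the same comparison for several start indices and averages the top-1 similarity could look like this, reusing only objects already defined above:

def average_top1_similarity(indices):
    """Average similarity between the true next word and the model's top predicted word."""
    sims = []
    for idx in indices:
        prediction = model.predict(np.array([seq_in[idx]]))
        pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
        true_word = word2vec_model.similar_by_vector(seq_out[idx])[0][0]
        sims.append(word2vec_model.similarity(true_word, pred_word))
    return float(np.mean(sims))

# e.g. average_top1_similarity([97, 500, 1000])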
