In [31]:
import numpy as np
import os
import sys
import h5py
import datetime
import json

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings

Setting Parameters

In [35]:
# Run configuration.
# The UPPER_CASE names below are not defined in any visible cell -- presumably
# they are template placeholders substituted before the notebook is executed
# (TODO confirm against whatever generates this notebook).
word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = BATCH_SIZE # 32, 64, 128
epochs = EPOCH_SIZE # 10, 15, 30
window_size = WINDOW_SIZE # 3, 4, 5
accuracy_threshold = 0.85  # only recorded in model_results.json; not used as a stopping criterion here
activation = ACTIVATION_FUNCTION # sigmoid, relu, softmax
custom_accuracy = 0  # placeholder; overwritten by the evaluation cell
loss_function = LOSS_FUNCTION # mse

# The model name must be built AFTER the parameters above: it was previously
# defined in an earlier cell and only worked because of leftover kernel state
# (NameError on Restart & Run All).
# NOTE(review): the literal '_1024_1024_' does not match the 512-unit LSTM
# layers built below, and 'POS_LSTM_' + '_1024...' yields a double underscore.
# Left unchanged because the name is also the weights directory name.
model_name = 'POS_LSTM_' + '_1024_1024_' + loss_function + "_" + activation + "_" + str(window_size) + "_" + str(batch_size) #MODEL_NAME #POS-LSTM

In [3]:
# Build the project's Embeddings helper and pull the POS-tagged corpus and
# tag vocabulary out of it.
# NOTE(review): the meaning of the trailing positional arguments (1, 4) is not
# visible from this notebook -- see embeddings.Embeddings for their semantics.
embeddings = Embeddings(
    word_embedding_dimension,
    word_embedding_window_size,
    1,
    4,
)

# One entry per sentence; each sentence is a sequence of encoded POS tags.
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()

# Forward and reverse tag lookups; the number of distinct tags sizes the
# model's output layer.
(pos2index, index2pos) = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)

Loading the embeddings from the cache

In [4]:
# Build the supervised dataset: each sample is a sliding window of
# `window_size` consecutive tags (input) and the tag that follows it (target).
seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_pos_sentences:
    # NOTE(review): this bound stops one window short of the end of each
    # sentence (range(len(sentence) - window_size) would also be in range).
    # Kept as-is so the sample count matches previously recorded runs --
    # confirm whether skipping the final tag is intentional.
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        # Previously x / y were computed but never stored, leaving both
        # lists (and therefore the training arrays) empty.
        seq_in.append(x)
        seq_out.append(y)
# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  68424

In [25]:
# Aliases used by the model, training and evaluation cells below.
# These are references to the same arrays, not copies.
x_data, y_data = seq_in, seq_out

In [24]:
# Changes to the model to be done here
# Two stacked LSTM layers followed by a dense layer with one unit per POS tag.
model = Sequential()
# First LSTM returns the full sequence so it can feed the second LSTM.
model.add(LSTM(512, input_shape=(x_data.shape[1], x_data.shape[2]), return_sequences=True))
# Second LSTM returns only its final state, collapsing the time axis so the
# Dense output has one prediction per sample (matching y_data). Without this
# layer the Dense layer is applied per timestep and the output no longer lines
# up with the single-tag targets.
model.add(LSTM(512))
# NOTE(review): 'relu' outputs combined with categorical_crossentropy is
# unusual (softmax is the conventional pairing), and neither the `activation`
# nor the `loss_function` parameter is used here even though both are written
# to model_results.json. Left unchanged -- confirm intent.
model.add(Dense(no_of_unique_tags, activation='relu'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 3, 512)            1089536   
lstm_2 (LSTM)                (None, 512)               2099200   
dense_1 (Dense)              (None, 19)                9747      
Total params: 3,198,483
Trainable params: 3,198,483
Non-trainable params: 0

In [26]:
# Directory that holds the per-epoch weight files for this run.
model_weights_path = "../weights/"+ model_name
# Create the directory up front; ModelCheckpoint does not necessarily create
# missing parent directories. exist_ok avoids the check-then-create race.
os.makedirs(model_weights_path, exist_ok=True)
# Keras fills in {epoch} and {val_acc} at save time.
checkpoint_path = model_weights_path + '/pos_weights.{epoch:02d}-{val_acc:.2f}.hdf5'
# save_best_only=False: a checkpoint is written after every epoch, so
# monitor/mode only affect the logged messages, not which files are kept.
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

In [27]:
# Train the model; the last 25% of the samples are held out for validation and
# a checkpoint is written after every epoch. The returned History object is
# consumed by the results cell further down.
model_fit_summary = model.fit(x_data, y_data, epochs=epochs, batch_size=batch_size, verbose=1, validation_split=0.25, callbacks=[checkpoint])

Train on 75 samples, validate on 25 samples
Epoch 1/5
Epoch 00000: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.00-0.28.hdf5
75/75 [==============================] - 0s - loss: 9.7370 - acc: 0.1867 - val_loss: 8.5557 - val_acc: 0.2800
Epoch 2/5
Epoch 00001: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.01-0.20.hdf5
75/75 [==============================] - 0s - loss: 5.1961 - acc: 0.3867 - val_loss: 5.1698 - val_acc: 0.2000
Epoch 3/5
Epoch 00002: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.02-0.16.hdf5
75/75 [==============================] - 0s - loss: 3.1124 - acc: 0.5067 - val_loss: 5.0623 - val_acc: 0.1600
Epoch 4/5
Epoch 00003: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.03-0.08.hdf5
75/75 [==============================] - 0s - loss: 3.0978 - acc: 0.3067 - val_loss: 5.0362 - val_acc: 0.0800
Epoch 5/5
Epoch 00004: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.04-0.08.hdf5
75/75 [==============================] - 0s - loss: 3.1098 - acc: 0.2400 - val_loss: 5.0249 - val_acc: 0.0800

In [29]:
# Custom accuracy: fraction of samples in x_data[test_start:test_end] whose
# predicted tag index (argmax of the model output) equals the true tag index
# (argmax of the target row).
# NOTE(review): the slice starts at 0, so it overlaps the samples the model
# was trained on (validation_split holds out the *last* 25%). This is not a
# held-out accuracy -- confirm that is intended.
test_start = 0
test_end = 10000

# Kept because the (currently commented-out) histogram cells reference them;
# nothing in this cell fills them.
list_for_hist_words = []
list_for_hist_index = []
list_for_hist_words_ori = []
list_for_hist_index_ori = []

# Never evaluate past the end of the data: with fewer than `test_end` samples
# the old per-sample loop ended up calling predict/argmax on empty slices.
test_end = min(test_end, len(x_data))
n_predictions = max(test_end - test_start, 0)

counter = 0
custom_accuracy = 0
if n_predictions > 0:
    # One batched predict call instead of one call per sample, and a direct
    # argmax comparison instead of scanning pos2index to rediscover the index
    # (which also let stale values leak between iterations when no vocabulary
    # entry matched).
    predicted_index = np.argmax(model.predict(x_data[test_start:test_end]), axis=-1)
    expected_index = np.argmax(y_data[test_start:test_end], axis=-1)
    counter = int(np.sum(predicted_index == expected_index))
    custom_accuracy = counter / float(n_predictions)

print("Correct predictions: ",counter, '\nTotal Predictions: ',n_predictions)

Correct predictions:  19 
Total Predictions:  100

In [32]:
# Collect the training history, the run configuration and a short description
# of the network into one dict and write it next to the weights as JSON.

# Copy so the History object's own dict is not mutated (keeps this cell
# idempotent when re-run).
model_results = dict(model_fit_summary.history)
model_results["word_embedding_dimension"] = word_embedding_dimension
model_results["word_embedding_window_size"] = word_embedding_window_size
model_results["window_size"] = window_size
model_results["batch_size"] = batch_size
model_results["epochs"] = epochs
model_results["model_name"] = model_name
model_results["accuracy_threshold"] = accuracy_threshold
model_results["activation"] = activation 
model_results["custom_accuracy"] = custom_accuracy
model_results["loss_function"] = loss_function
model_results["layers"] = []
model_results["dropouts"] = []
for layer in model.layers:
    layer_config = layer.get_config()
    # Layers that expose `units` (e.g. LSTM, Dense) are recorded by size and name.
    if hasattr(layer, "units"):
        layer_summary = {}
        layer_summary["units"] = layer_config["units"]
        layer_summary["name"] = layer_config["name"]
        # Previously the summary was built but never stored.
        model_results["layers"].append(layer_summary)
    # Layers that expose `rate` (e.g. Dropout) are recorded by their rate.
    if hasattr(layer, "rate"):
        dropout_summary = {}
        dropout_summary["rate"] = layer_config["rate"]
        model_results["dropouts"].append(dropout_summary)

text_file_path = "../weights/{0}/model_results.json".format(model_name)
# Make sure the target directory exists even if the checkpoint cell was skipped.
os.makedirs(os.path.dirname(text_file_path), exist_ok=True)
with open(text_file_path, "w") as f:
    # default=float converts numpy scalar metrics (e.g. np.float32), which the
    # json module cannot serialise natively; non-numeric objects still raise.
    json.dump(model_results, f, default=float)

In [15]:
# import matplotlib.pyplot as plt
# %matplotlib inline

In [16]:
# pos_vocab = [v for (k,v) in index2pos.items()]

In [17]:
# plt.figure(figsize=(16,5))
# plt.hist(list_for_hist_index, width=1, color='r', alpha=0.5)
# plt.hist(list_for_hist_index_ori, width=1, color='b', alpha=0.5)
# plt.xticks(range(len(pos_vocab)),pos_vocab, rotation='vertical')

In [18]:
# list_x = []
# list_y = []
# data_all = []
# for i in range(0,1500):
#     list_x.append((index2pos[np.argmax(x_data[i][0])], index2pos[np.argmax(x_data[i][1])]))
#     list_y.append(index2pos[np.argmax(y_data[i])])
#     data_all.append((str(list_x[i]),list_y[i]))

In [19]:
# from nltk import ConditionalFreqDist as cfd
# from nltk.collocations import *
# import plotly.offline as plot
# import plotly.graph_objs as go
# plot.offline.init_notebook_mode(connected=True)
# import pandas as pd

In [20]:
# cfd_res = cfd(data_all)

In [21]:
# df = pd.DataFrame(cfd_res).fillna(0)
# mat = df.as_matrix()
# #mat

In [22]:
# trace = go.Heatmap(z = mat,
#                    x=df.columns,
#                    y=list(df.index))
# data=[trace]
# plot.iplot(data, filename='labelled-heatmap')

In [ ]: