In [8]:
import numpy as np
import os
import sys
import h5py
import datetime
import json

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from embeddings import Embeddings


Using TensorFlow backend.
EMBEDDING(100,4,1,4) STARTED .....
Loading the embeddings from the cache
EMBEDDING(100,4,1,4) COMPLETED .....

Setting Parameters


In [13]:
word_embedding_dimension = 100
word_embedding_window_size = 4
batch_size = 128 #BATCH_SIZE # 32, 64, 128
epochs = 15 #EPOCH_SIZE # 10, 15, 30
window_size = 3 #WINDOW_SIZE # 3, 4, 5
accuracy_threshold = 0.85
activation = 'relu' #ACTIVATION_FUNCTION # sigmoid, relu, softmax
custom_accuracy = 0
loss_function = 'mse' #LOSS_FUNCTION # mse

In [20]:
model_name = 'POS_LSTM_' + '_1024_1024_' + loss_function + "_" + activation + "_" + str(window_size) + "_" + str(batch_size) #MODEL_NAME #POS-LSTM

In [15]:
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)
tokenized_pos_sentences = embeddings.get_pos_categorical_indexed_sentences()
pos2index, index2pos = embeddings.get_pos_vocabulary()
no_of_unique_tags = len(pos2index)


Loading the embeddings from the cache

In [16]:
seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_pos_sentences:
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)
        seq_out.append(y)
# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)


Number of samples :  68424

In [17]:
x_data = seq_in
y_data = seq_out

In [18]:
# Changes to the model to be done here
model = Sequential()
model.add(LSTM(1024, input_shape=(x_data.shape[1], x_data.shape[2]), return_sequences=True))
#model.add(Dropout(0.2))
model.add(LSTM(1024))
#model.add(Dropout(0.2))
model.add(Dense(no_of_unique_tags, activation='relu'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 3, 1024)           4276224   
_________________________________________________________________
lstm_2 (LSTM)                (None, 1024)              8392704   
_________________________________________________________________
dense_1 (Dense)              (None, 19)                19475     
=================================================================
Total params: 12,688,403
Trainable params: 12,688,403
Non-trainable params: 0
_________________________________________________________________

In [19]:
model_weights_path = "../weights/"+ model_name
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/pos_weights.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

In [27]:
model_fit_summary = model.fit(x_data, y_data, epochs=epochs, batch_size=batch_size, verbose=1, validation_split=0.25, callbacks=[checkpoint])


Train on 75 samples, validate on 25 samples
Epoch 1/5
Epoch 00000: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.00-0.28.hdf5
75/75 [==============================] - 0s - loss: 9.7370 - acc: 0.1867 - val_loss: 8.5557 - val_acc: 0.2800
Epoch 2/5
Epoch 00001: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.01-0.20.hdf5
75/75 [==============================] - 0s - loss: 5.1961 - acc: 0.3867 - val_loss: 5.1698 - val_acc: 0.2000
Epoch 3/5
Epoch 00002: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.02-0.16.hdf5
75/75 [==============================] - 0s - loss: 3.1124 - acc: 0.5067 - val_loss: 5.0623 - val_acc: 0.1600
Epoch 4/5
Epoch 00003: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.03-0.08.hdf5
75/75 [==============================] - 0s - loss: 3.0978 - acc: 0.3067 - val_loss: 5.0362 - val_acc: 0.0800
Epoch 5/5
Epoch 00004: saving model to ../weights/POS_LSTMmse_0_relu_3_128/pos_weights.04-0.08.hdf5
75/75 [==============================] - 0s - loss: 3.1098 - acc: 0.2400 - val_loss: 5.0249 - val_acc: 0.0800

In [29]:
check_ori = 0
check_pre = 0
counter = 0
test_start = 0
test_end = 10000
list_for_hist_words = []
list_for_hist_index = []
list_for_hist_words_ori = []
list_for_hist_index_ori = []
for i in range(test_start, test_end):
    test_no = i
    to_predict = x_data[test_no:test_no+1]
    y_ans = model.predict(to_predict)
    
    for word, corr_int in pos2index.items():
        if corr_int == np.argmax(y_ans):
            #print ("pridicted: ",word, corr_int)
            check_pre = corr_int
            list_for_hist_words.append(word)
            list_for_hist_index.append(corr_int)
        if corr_int == np.argmax(y_data[test_no:test_no+1]):
            #print ("original: ",word, corr_int)
            check_ori = corr_int
            list_for_hist_words_ori.append(word)
            list_for_hist_index_ori.append(corr_int)
    if check_ori == check_pre :
        counter += 1
    #print('\n')

print("Correct predictions: ",counter, '\nTotal Predictions: ',test_end - test_start)
custom_accuracy = counter/(test_end-test_start)


Correct predictions:  19 
Total Predictions:  100

In [32]:
model_results = model_fit_summary.history
model_results.update(model_fit_summary.params)
model_results["word_embedding_dimension"] = word_embedding_dimension
model_results["word_embedding_window_size"] = word_embedding_window_size
model_results["window_size"] = window_size
model_results["batch_size"] = batch_size
model_results["epochs"] = epochs
model_results["model_name"] = model_name
model_results["accuracy_threshold"] = accuracy_threshold
model_results["activation"] = activation 
model_results["custom_accuracy"] = custom_accuracy
model_results["loss_function"] = loss_function
model_results["layers"] = []
model_results["dropouts"] = []
for layer in model.layers:
    if hasattr(layer, "units"):
        layer_summary = {}
        layer_summary["units"] = layer.get_config()["units"]
        layer_summary["name"] = layer.name
        model_results["layers"].append(layer_summary)
    if hasattr(layer, "rate"):
        dropout_summary = {}
        dropout_summary["rate"] = layer.get_config()["rate"]
        model_results["dropouts"].append(dropout_summary)
text_file_path = "../weights/{0}/model_results.json".format(model_name)
with open(text_file_path, "w") as f:
        json.dump(model_results, f)

In [15]:
# import matplotlib.pyplot as plt
# %matplotlib inline

In [16]:
# pos_vocab = [v for (k,v) in index2pos.items()]

In [17]:
# plt.figure(figsize=(16,5))
# plt.hist(list_for_hist_index, width=1, color='r', alpha=0.5)
# plt.hist(list_for_hist_index_ori, width=1, color='b', alpha=0.5)
# plt.xticks(range(len(pos_vocab)),pos_vocab, rotation='vertical')
# plt.show()

In [18]:
# list_x = []
# list_y = []
# data_all = []
# for i in range(0,1500):
#     list_x.append((index2pos[np.argmax(x_data[i][0])], index2pos[np.argmax(x_data[i][1])]))
#     list_y.append(index2pos[np.argmax(y_data[i])])
#     data_all.append((str(list_x[i]),list_y[i]))

In [19]:
# from nltk import ConditionalFreqDist as cfd
# from nltk.collocations import *
# import plotly.offline as plot
# import plotly.graph_objs as go
# plot.offline.init_notebook_mode(connected=True)
# import pandas as pd

In [20]:
# cfd_res = cfd(data_all)

In [21]:
# df = pd.DataFrame(cfd_res).fillna(0)
# mat = df.as_matrix()
# #mat

In [22]:
# trace = go.Heatmap(z = mat,
#                    x=df.columns,
#                    y=list(df.index))
# data=[trace]
# plot.iplot(data, filename='labelled-heatmap')

In [ ]: