In [1]:
import numpy as np
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from w2v import train_word2vec
import pickle, datetime
import difflib
import keras
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.utils import np_utils
from keras.layers import Input, Dense, Dropout, Flatten, Embedding
from keras import regularizers
from keras.layers import Conv1D, MaxPooling1D
from keras.layers.merge import Concatenate
from keras import optimizers
from keras.preprocessing import sequence
from keras import backend as K
np.random.seed(0)
In [2]:
df = pd.read_csv('../../Datasets/SST1_dataset/Processed_SST1.tsv', sep='\t')
train_df = pd.read_csv('../../Datasets/Kaggle_dataset/train.tsv', sep='\t', header=0)
raw_docs_train = df[df.split_ind == 1]['Phrases'].values
sentiment_train = df[df.split_ind == 1]['Label'].values
# raw_docs_train = train_df['Phrase'].values
# sentiment_train = train_df['Sentiment'].values
raw_docs_test = df[df.split_ind == 2]['Phrases'].values
sentiment_test = df[df.split_ind == 2]['Label'].values
num_labels = len(np.unique(sentiment_train))
N_TRAIN = len(raw_docs_train)
N_TEST = len(raw_docs_test)
In [3]:
#text pre-processing
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stemmer = SnowballStemmer('english')
print ("pre-processing train docs...")
processed_docs_train = []
for doc in raw_docs_train:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_train.append(stemmed)
print ("pre-processing test docs...")
processed_docs_test = []
for doc in raw_docs_test:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_test.append(stemmed)
processed_docs_all = processed_docs_train + processed_docs_test  # plain list concat; the phrase lists are ragged
dictionary = corpora.Dictionary(processed_docs_all)
dictionary_size = len(dictionary.keys())
print ("converting to token ids...")
word_id_train, word_id_len = [], []
for doc in processed_docs_train:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_train.append(word_ids)
    word_id_len.append(len(word_ids))
word_id_test, word_ids = [], []
for doc in processed_docs_test:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_test.append(word_ids)
    word_id_len.append(len(word_ids))
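# Cap the padded length at mean + 2*std of the token counts over train + test phrases;
# pad_sequences below zero-pads shorter phrases and truncates longer ones to this length.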
seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
#pad sequences
x_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
x_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
y_train = np_utils.to_categorical(sentiment_train, num_labels)
y_test = np_utils.to_categorical(sentiment_test, num_labels)
In [4]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
In [5]:
model_type = 'CNN-non-static' # CNN-rand|CNN-non-static|CNN-static
embedding_dim = 100 # word2vec dim
vocab_size = dictionary_size
max_sent_len = seq_len
N_CATEGORY = 5
In [6]:
if model_type in ['CNN-non-static', 'CNN-static']:
    embedding_wts = train_word2vec(np.vstack((x_train, x_test)),
                                   dictionary.token2id, num_features=embedding_dim)
    if model_type == 'CNN-static':
        x_train = embedding_wts[0][x_train]
        x_test = embedding_wts[0][x_test]
elif model_type == 'CNN-rand':
    embedding_wts = None
else:
    raise ValueError("Unknown model type")
In [7]:
filter_sizes = [3,4,5]
num_filters = 50
dropout_prob = (0.5, 0.8)
hidden_dims = 50
batch_size = 64
l2_reg = 0.3
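# Note: l2_reg is declared with the other hyperparameters, but no kernel_regularizer
# is attached to any layer in the model built below.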
# Deciding dimension of input based on the model
input_shape = (max_sent_len, embedding_dim) if model_type == "CNN-static" else (max_sent_len,)
model_input = Input(shape = input_shape)
# CNN-static feeds pre-computed word2vec vectors straight in, so it has no embedding layer;
# CNN-rand and CNN-non-static learn an Embedding layer (non-static is initialised with the
# word2vec weights in the compile/fit cell further below).
if model_type == "CNN-static":
z = Dropout(dropout_prob[0])(model_input)
else:
z = Embedding(vocab_size, embedding_dim, input_length = max_sent_len, name="embedding", trainable=True)(model_input)
z = Dropout(dropout_prob[0])(z)
# Convolution layers
z1 = Conv1D(filters=num_filters, kernel_size=filter_sizes[0],
            padding="valid", activation="relu", strides=1)(z)
z1 = MaxPooling1D(pool_size=int(max_sent_len - filter_sizes[0] + 1))(z1)
z1 = Flatten()(z1)
z2 = Conv1D(filters=num_filters, kernel_size=filter_sizes[1],
            padding="valid", activation="relu", strides=1)(z)
z2 = MaxPooling1D(pool_size=int(max_sent_len - filter_sizes[1] + 1))(z2)
z2 = Flatten()(z2)
z3 = Conv1D(filters=num_filters, kernel_size=filter_sizes[2],
            padding="valid", activation="relu", strides=1)(z)
z3 = MaxPooling1D(pool_size=int(max_sent_len - filter_sizes[2] + 1))(z3)
z3 = Flatten()(z3)
# Concatenate the output of all convolution layers
z = Concatenate()([z1, z2, z3])
z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense( N_CATEGORY, activation="softmax")(z)
model = Model(model_input, model_output)
model.summary()
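In [ ]:
# For reference: each branch above convolves with padding="valid" and then max-pools over
# the full output length (max_sent_len - kernel_size + 1), which is exactly global max
# pooling. A self-contained sketch of the same three-branch block written with
# GlobalMaxPooling1D (illustrative only; the model above is left unchanged):
from keras.layers import GlobalMaxPooling1D

ref_input = Input(shape=(max_sent_len,))
ref_emb = Embedding(vocab_size, embedding_dim, input_length=max_sent_len)(ref_input)
ref_branches = []
for fs in filter_sizes:
    c = Conv1D(filters=num_filters, kernel_size=fs, padding="valid",
               activation="relu", strides=1)(ref_emb)
    ref_branches.append(GlobalMaxPooling1D()(c))
ref_merged = Concatenate()(ref_branches)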
In [8]:
from sklearn.metrics import roc_auc_score
import time
import pylab as pl
from IPython import display
pl.style.use('ggplot')
%matplotlib inline
class Histories(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.acc = []
        self.loss = []
        self.val_loss = []
        self.val_acc = []

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        self.acc.append(logs['acc'])
        self.loss.append(logs['loss'])
        self.val_acc.append(logs['val_acc'])
        self.val_loss.append(logs['val_loss'])
        pl.hold(True)
        pl.plot(self.acc)
        pl.plot(self.loss)
        pl.plot(self.val_acc)
        pl.plot(self.val_loss)
        pl.legend(['Train acc', 'Train loss', 'Valid acc', 'Valid loss'], loc=2)
        display.clear_output(wait=True)
        display.display(pl.gcf())
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return
In [ ]:
if model_type == "CNN-non-static":
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights(embedding_wts)
model.compile(loss="categorical_crossentropy", optimizer=optimizers.SGD(), metrics=["accuracy"])
histories = Histories()
res = model.fit(x_train, y_train,
batch_size = batch_size,
epochs=200,
validation_data=(x_test, y_test), verbose=2, callbacks=[histories])
In [27]:
import os
os.system('say done')
In [15]:
scores = model.evaluate(x_train, y_train, verbose=0)
print("Train Accuracy: %.2f%%" % (scores[1]*100))
scores = model.evaluate(x_test, y_test, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))
In [17]:
date = str(datetime.date.today() )
time = str(datetime.datetime.now().time())[:-7]
filename = './newtrain_' + model_type + '_' + date + '_' +time
with open( filename + '_history', 'wb') as output:
    pickle.dump([model.history.history], output, pickle.HIGHEST_PROTOCOL)
model.save(filename + '.h5')
In [20]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (10,6)
plt.plot(model.history.history['acc'])
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['val_acc'])
plt.plot(model.history.history['val_loss'])
plt.legend(['Train acc','Train loss','Valid acc', 'Valid loss'], loc=2)
plt.xlabel('Epochs')
plt.ylabel('Loss/Accuracy')
plt.title('Using '+ model_type)
# imgName = 'Images/' + model_type + '_' + date + '_' + time + '.jpg'
# plt.savefig( imgName, dpi= 200, bbox_inches='tight', transparent=False)
plt.show()
In [19]:
import pickle
# filename = './newtrain_CNN-rand_2017-04-08_16:29:11'
model = keras.models.load_model(filename + '.h5')
In [38]:
model.compile(loss="categorical_crossentropy", optimizer=optimizers.SGD(0.005), metrics=["accuracy"])
histories = Histories()
res = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=110,
validation_data=(x_test, y_test), verbose=2, callbacks=[histories])
In [24]:
date = str(datetime.date.today() )
time = str(datetime.datetime.now().time())[:-7]
filename = './newtrain_continued_' + model_type + '_' + date + '_' +time
with open( filename + '_history', 'wb') as output:
    pickle.dump([model.history.history], output, pickle.HIGHEST_PROTOCOL)
model.save(filename + '.h5')
In [23]:
import os
os.system('say done')
In [24]:
files = [
'CNN-rand_continued_2017-04-02_19:26:35',
'CNN-rand_continued_2017-04-03_16:50:47',
'CNN-rand_continued_2017-04-03_17:18:47'
]
In [25]:
train_acc, train_loss, val_acc, val_loss = [],[],[],[]
for file in files:
    # each pickle was written above as a one-element list holding the history dict
    with open(file, 'rb') as f:
        out = pickle.load(f)
    hist = out[0]
    train_acc += hist['acc']
    train_loss += hist['loss']
    val_acc += hist['val_acc']
    val_loss += hist['val_loss']
In [35]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (10,6)
plt.plot(train_acc)
plt.plot(train_loss)
plt.plot(val_acc)
# plt.plot(val_loss)
plt.legend(['Train acc', 'Train loss', 'Valid acc'], loc=2)
plt.xlabel('Epochs')
plt.ylabel('Loss/Accuracy')
plt.title('Using '+ model_type)
date = str(datetime.date.today() )
time = str(datetime.datetime.now().time())[:-7]
imgName = 'Images/' + model_type + '_' + date + '_' + time + '.jpg'
plt.savefig( imgName, dpi= 200, bbox_inches='tight', transparent=False)
plt.show()