Created by Peter Nagy, February 2017 (GitHub)
As an improvement to my previous Kernel, here I am trying to achieve better results with a Recurrent Neural Network.
In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
os.environ['KERAS_BACKEND']='theano'
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, LSTM, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate, Convolution1D
from keras.optimizers import Adam, SGD
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.callbacks import TensorBoard
seed = 42
np.random.seed(seed)
import matplotlib.pyplot as plt
%matplotlib inline
First, I load the data and keep only the labelled rows, dropping the under-represented 'environmental', 'religious' and 'economical' classes. The remaining labels are mapped to integer categories, and the keyword columns are joined into a single text field ('topicFlat') that serves as the corpus.
In [2]:
MAX_SEQUENCE_LENGTH = 1000 # maximum padded sequence length
MAX_NB_WORDS = 20000 # larger than the vocabulary size
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
data_train = pd.read_csv('../result_all_windows_labels.csv')
# Cleanup - remove no labels
data_train = data_train[data_train['label'].notnull()]
data_train = data_train[data_train.label != 'environmental']
data_train = data_train[data_train.label != 'religious']
data_train = data_train[data_train.label != 'economical']
label_cat = {'violence/terrorism' : 1, 'misc': 2, 'political': 3,
# 'religious': 4, 'economical': 5, 'environmental': 6
}
print(label_cat)
def to_category(x):
    return label_cat[x]
data_train['target'] = data_train.apply(lambda row: to_category(row['label']), axis=1)
data_train['target'].plot.hist(alpha=0.5)
texts = []
# Get corpus by joining all keywords
for index, row in data_train.iloc[:, 2:32].iterrows():
    texts.append(u' '.join(row.tolist()))
data_train['topicFlat'] = texts
labels = data_train['target']
# print(labels)
data_train['topicFlat'].head()
Out[2]:
Next, I tokenize the flattened keyword text. I set the number of max features to 2000 and use the Keras Tokenizer to vectorize the text and convert it into integer sequences, which are then padded to a common length so the network can take them as input.
In [4]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data_train['topicFlat'].values)
X = tokenizer.texts_to_sequences(data_train['topicFlat'].values)
print(X[0])
X = pad_sequences(X)
print(X[0])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index)) # all the tokens in corpus
print(X.shape)
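To verify that the sequences encode what we expect, the padded integers can be mapped back to words. This is a small sanity-check sketch added here; index_to_word is a helper built by inverting the tokenizer's word_index dict.
In [ ]:
# Sanity check (sketch): decode the first padded sequence back into words.
# word_index maps word -> integer index; 0 is the padding value and has no word.
index_to_word = {i: w for w, i in word_index.items()}
decoded = [index_to_word.get(i, '?') for i in X[0] if i != 0]
print(decoded)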
Next, I compose the LSTM network. Note that embed_dim, lstm_out, batch_size and the dropout rates are hyperparameters; their values are chosen somewhat intuitively and should be tuned to achieve good results. Please also note that I am using softmax as the activation of the output layer. The reason is that the network is trained with categorical crossentropy, and softmax is the matching activation for that loss.
In [5]:
# Y = data_train['target'].values
Y = pd.get_dummies(data_train['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
print(Y_train[100])
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
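Note that pd.get_dummies derives the one-hot columns from the sorted unique label values, which need not match the label_cat numbering defined earlier. A small sketch to inspect the column-to-class correspondence (the variable name dummy_columns is illustrative):
In [ ]:
# The columns of the one-hot target follow pandas' sorted ordering of the
# label values, so class index 0 in Y corresponds to the first column printed here.
dummy_columns = pd.get_dummies(data_train['label']).columns.tolist()
print(dummy_columns)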
In [6]:
def plot_history(history):
    # list all data in history
    print(history.history.keys())
    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
In [19]:
embed_dim = 128
lstm_out = 196
epochs = 100
sequence_length = X.shape[1]  # length of the padded input sequences
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=sequence_length))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_cat), activation='softmax'))
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss = 'categorical_crossentropy',
optimizer=adam, metrics = ['accuracy'])
print(model.summary())
batch_size = 32
model_name = 'topicConvNet-Reg.h5'
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
# checkpoint = ModelCheckpoint(model_name, verbose=0, save_best_only=True)
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5',
monitor='val_acc', verbose=0, save_best_only=True, mode='max')
# the held-out test set doubles as the validation set here
network_hist = model.fit(X_train, Y_train, epochs=epochs,
                         validation_data=(X_test, Y_test),
                         callbacks=[early_stop, checkpoint],
                         verbose=1, batch_size=batch_size)
score, acc = model.evaluate(X_test, Y_test)
print('Test score:', score)
print('Test accuracy:', acc)
# print(network_hist.history)
plot_history(network_hist)
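For completeness, here is a minimal inference sketch for the trained LSTM: the new keyword string and the inv_label helper below are illustrative and not part of the original pipeline.
In [ ]:
# Inference sketch: vectorise a new keyword string with the same tokenizer,
# pad it to the training sequence length and take the most probable class.
inv_label = dict(enumerate(pd.get_dummies(data_train['label']).columns))
new_text = ['protest election government parliament vote']  # illustrative example
seq = pad_sequences(tokenizer.texts_to_sequences(new_text), maxlen=X.shape[1])
probs = model.predict(seq)
print(probs)
print(inv_label[int(np.argmax(probs[0]))])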
In [49]:
%%time
# Source
# https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/model.py
sequence_length = X.shape[1]  # length of the padded input sequences
print(sequence_length)
print(len(X))
vocabulary_size = len(word_index) + 1  # +1 because Tokenizer word indices start at 1
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 128 # 512
drop = 0.2
epochs = 20
batch_size = 30
# the Input layer returns a placeholder tensor for the padded integer sequences
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim,
input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim),
padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim),
padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim),
padding='valid', kernel_initializer='normal', activation='relu')(reshape)
maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1),
strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1),
strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1),
strides=(1,1), padding='valid')(conv_2)
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=len(label_cat), activation='softmax')(dropout)
# this creates a model that includes the Input layer and everything built from it
model = Model(inputs=inputs, outputs=output)
print(model.summary())
# early_stop = EarlyStopping(monitor='val_loss', patience=4, verbose=1)
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5',
monitor='val_acc', verbose=0,
save_best_only=True, mode='auto')
# Note: the Adam instance below is defined but not used; the model is compiled with Adagrad.
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer='Adagrad', loss='categorical_crossentropy',
              metrics=['accuracy'])
print("Traning Model...")
network_hist = model.fit(X_train, Y_train, batch_size=batch_size,
epochs=epochs, verbose=1,
callbacks=[checkpoint],
validation_data=(X_test, Y_test)) # starts training
score, acc = model.evaluate(X_test, Y_test)
print('Test score:', score)
print('Test accuracy:', acc)
# print(network_hist.history)
plot_history(network_hist)
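Overall accuracy can hide per-class behaviour, so a per-class breakdown is useful here. This is a sketch using scikit-learn's classification_report; the target_names are taken from the get_dummies column order, matching the one-hot encoding of Y.
In [ ]:
# Per-class precision/recall/F1 on the test split, comparing argmax predictions
# against the argmax of the one-hot test labels.
from sklearn.metrics import classification_report
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(Y_test, axis=1)
print(classification_report(y_true, y_pred,
      target_names=pd.get_dummies(data_train['label']).columns.tolist()))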
In [47]:
%%time
embed_dim = 128
sequence_length = X.shape[1]  # length of the padded input sequences
print(sequence_length)
print(len(X))
epochs = 30
batch_size = 30
# https://github.com/Theo-/sentiment-analysis-keras-conv/blob/master/train_keras.py
# Using keras to load the dataset with the top_words
# top_words = 10000
# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Pad the sequence to the same length
# max_review_length = 1600
# X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
# X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# Using embedding from Keras
# embedding_vecor_length = 300
model = Sequential()
# model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Embedding(max_features, embed_dim, input_length=sequence_length))
# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same', kernel_initializer='normal', activation='relu'))
model.add(Convolution1D(32, 3, padding='same', kernel_initializer='normal', activation='relu'))
model.add(Convolution1D(16, 3, padding='same', kernel_initializer='normal', activation='relu'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(Dense(180, activation='sigmoid'))
model.add(Dropout(0.1))
model.add(Dense(units=len(label_cat), activation='softmax'))  # softmax output to match categorical_crossentropy
print(model.summary())
# Note: the Adam instance below is defined but not used; the model is compiled with SGD (Adagrad is another option).
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
sgd = SGD(lr=0.01, clipnorm=1.)
# Log to tensorboard
# tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='categorical_crossentropy',
optimizer=sgd, metrics=['accuracy'])
# early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5',
monitor='val_acc', verbose=0,
save_best_only=True, mode='auto')
print("Traning Model...")
network_hist = model.fit(X_train, Y_train, batch_size=batch_size,
epochs=epochs, verbose=1,
callbacks=[checkpoint],
validation_data=(X_test, Y_test)) # starts training
# Evaluation on the test set
# scores = model.evaluate(X_test, Y_test, verbose=0)
# print("Accuracy: %.2f%%" % (scores[1]*100))
score, acc = model.evaluate(X_test, Y_test)
print('Test score:', score)
print('Test accuracy:', acc)
# print(network_hist.history)
plot_history(network_hist)
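Once a run looks good, the whole model can be persisted so it does not have to be retrained. This is a sketch using Keras' save/load; the filename topic_conv1d.h5 is illustrative.
In [ ]:
# model.save writes architecture, weights and optimizer state to one HDF5 file;
# load_model restores it without re-running the training cell.
from keras.models import load_model
model.save('topic_conv1d.h5')
restored = load_model('topic_conv1d.h5')
print(restored.summary())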