In [1]:
# author - Richard Liao
# Dec 26 2016
# https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model


Using Theano backend.
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.

In [2]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

data_train = pd.read_csv('../data/labeledTrainData.tsv', sep='\t')
print (data_train.shape)


texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx], "lxml")
    texts.append(clean_str(str(text.get_text().encode('ascii','ignore'))))
    labels.append(data_train.sentiment[idx])

# print(labels)
data_train.head()


(25000, 3)
Out[2]:
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w...
1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
2 7759_3 0 The film starts with a manager (Nicholas Bell)...
3 3630_4 0 It must be assumed that those who praised this...
4 9495_8 1 Superbly trashy and wondrously unpretentious 8...

In [3]:
%%time
    

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# print(sequences[0])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index)) # all the tokens in corpus
# print(word_index)

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Found 83244 unique tokens.
Shape of data tensor: (25000, 1000)
Shape of label tensor: (25000, 2)
CPU times: user 7.07 s, sys: 112 ms, total: 7.18 s
Wall time: 7.14 s

In [4]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set ')
print (y_train.sum(axis=0))
print (y_val.sum(axis=0))


Number of positive and negative reviews in traing and validation set 
[  9962.  10038.]
[ 2538.  2462.]

In [5]:
%%time

# https://www.kaggle.com/rtatman/glove-global-vectors-for-word-representation/data

GLOVE_DIR = "../data/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))


Total 400000 word vectors in Glove 6B 100d.
CPU times: user 10.8 s, sys: 148 ms, total: 11 s
Wall time: 11 s

In [6]:
%%time


embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)


CPU times: user 172 ms, sys: 16 ms, total: 188 ms
Wall time: 186 ms

In [7]:
%%time


sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - simplified convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=128)


model fitting - simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         8324500   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
=================================================================
Total params: 8,569,494
Trainable params: 8,569,494
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 770s 38ms/step - loss: 0.7107 - acc: 0.5737 - val_loss: 0.7683 - val_acc: 0.5492
Epoch 2/10
20000/20000 [==============================] - 769s 38ms/step - loss: 0.4687 - acc: 0.7813 - val_loss: 0.3534 - val_acc: 0.8466
Epoch 3/10
18048/20000 [==========================>...] - ETA: 1:10 - loss: 0.3405 - acc: 0.8543
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<timed exec> in <module>()

/usr/local/lib/python3.5/dist-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1655                               initial_epoch=initial_epoch,
   1656                               steps_per_epoch=steps_per_epoch,
-> 1657                               validation_steps=validation_steps)
   1658 
   1659     def evaluate(self, x=None, y=None,

/usr/local/lib/python3.5/dist-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
   1211                     batch_logs['size'] = len(batch_ids)
   1212                     callbacks.on_batch_begin(batch_index, batch_logs)
-> 1213                     outs = f(ins_batch)
   1214                     if not isinstance(outs, list):
   1215                         outs = [outs]

/usr/local/lib/python3.5/dist-packages/keras/backend/theano_backend.py in __call__(self, inputs)
   1222     def __call__(self, inputs):
   1223         assert isinstance(inputs, (list, tuple))
-> 1224         return self.function(*inputs)
   1225 
   1226 

/usr/local/lib/python3.5/dist-packages/theano/compile/function_module.py in __call__(self, *args, **kwargs)
    901         try:
    902             outputs =\
--> 903                 self.fn() if output_subset is None else\
    904                 self.fn(output_subset=output_subset)
    905         except Exception:

/usr/local/lib/python3.5/dist-packages/theano/gof/op.py in rval(p, i, o, n)
    889         if params is graph.NoParams:
    890             # default arguments are stored in the closure of `rval`
--> 891             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
    892                 r = p(n, [x[0] for x in i], o)
    893                 for o in node.outputs:

KeyboardInterrupt: 

In [ ]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

# applying a more complex convolutional approach
convs = []
filter_sizes = [3,4,5]

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)
    
l_merge = Merge(mode='concat', concat_axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - more complex convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=50)