Sentiment model with CNNs

CAUTION: This notebook was written for, and has only been run with, Python 2.7.


In [1]:
#Imports 
from __future__ import print_function

import numpy as np

import tensorflow as tf
# Show the installed TensorFlow version (the recorded run of this cell printed 1.0.0)
print(tf.__version__)

# Root folder of the dataset; later cells append 'train/pos/', 'train/neg/',
# 'test/pos/' and 'test/neg/' to this string, so the trailing slash is required.
data_path='/home/ubuntu/data/training/keras/aclImdb/'


1.0.0

Read data and create sequences


In [2]:
#Imports 
import sys
import os
import shutil
import fnmatch

import numpy as np

from collections import OrderedDict
from nltk import word_tokenize

# Dataset root folder (same value as assigned in the first cell); the trailing slash is
# required because sub-folder names are appended by plain string concatenation below.
data_path='/home/ubuntu/data/training/keras/aclImdb/'

# Lazily yield the path of every file under a folder (recursively) whose name matches a glob
def gen_find(filepath, top):
    """Walk `top` recursively and yield the full path of each matching file.

    Args:
        filepath: fnmatch-style pattern applied to the bare file names (e.g. "*.txt").
        top: directory where the walk starts.

    Yields:
        os.path.join(directory, name) for every file name that matches the pattern,
        in the order produced by os.walk.
    """
    for current_dir, _subdirs, file_names in os.walk(top):
        matches = fnmatch.filter(file_names, filepath)
        for file_name in matches:
            yield os.path.join(current_dir, file_name)

def read_sentences(path):
    """Return the stripped first line of every "*.txt" file found under `path`.

    Files are discovered recursively through gen_find. Only the first line of each
    file is read; any further lines in a file are ignored.
    """
    collected = []
    for file_name in gen_find("*.txt", path):
        with open(file_name, 'r') as handle:
            first_line = handle.readline()
        collected.append(first_line.strip())
    return collected

def tokenize(sentences):
    """Split every sentence into a list of tokens with nltk's word_tokenize.

    On Python 2 each sentence is decoded from UTF-8 before tokenizing; on Python 3
    it is passed to word_tokenize unchanged.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    print('Tokenizing...',)
    # The interpreter version cannot change mid-call, so evaluate the check once.
    running_py2 = sys.version_info.major == 2
    tokens = []
    for sentence in sentences:
        text = sentence.decode('utf-8') if running_py2 else sentence
        tokens.append(word_tokenize(text))
    print('Done!')
    return tokens

def build_dict(sentences):
    """Count token frequencies and give every token a frequency-ranked integer id.

    Args:
        sentences: iterable of token sequences (each an iterable of hashable tokens).

    Returns:
        (worddict, wordcount):
          wordcount maps token -> number of occurrences;
          worddict maps token -> rank id, starting at 1 for the most frequent token.
        Id 0 is never assigned, so it stays free for unknown words (UNK).
    """
    print('Building dictionary..')
    wordcount = dict()
    for ss in sentences:
        for w in ss:
            wordcount[w] = wordcount.get(w, 0) + 1

    keys = list(wordcount)
    # Materialize the counts in the same order as `keys`. On Python 3 dict.values()
    # is a view object, which np.argsort cannot sort as a 1-D sequence.
    counts = [wordcount[k] for k in keys]
    sorted_idx = np.argsort(counts)[::-1]  # positions in `keys`, most frequent first

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+1  # leave 0 (UNK)
    print( np.sum(counts), ' total words ', len(keys), ' unique words')
    return worddict, wordcount

def generate_sequence(sentences, dictionary, unk_index=0):
    """Encode tokenized sentences as lists of integer word ids.

    Args:
        sentences: iterable of token sequences.
        dictionary: mapping token -> integer id (e.g. the worddict from build_dict,
            whose ids start at 1).
        unk_index: id used for tokens missing from `dictionary`. Defaults to 0, the id
            build_dict leaves unassigned for unknown words; the previous hard-coded
            value 1 collided with the id of the most frequent word.

    Returns:
        A list with one list of ids per input sentence, in input order.
    """
    return [[dictionary.get(w, unk_index) for w in ss] for ss in sentences]

# ---------- Data extraction ----------

# Training split: tokenize each polarity separately, then concatenate (positives first)
sentences_trn_pos = tokenize(read_sentences(data_path + 'train/pos/'))
sentences_trn_neg = tokenize(read_sentences(data_path + 'train/neg/'))
sentences_trn = sentences_trn_pos + sentences_trn_neg

# The vocabulary is built from the training tokens only
worddict, wordcount = build_dict(sentences_trn)

# Encode the training reviews as lists of word ids
train_x_pos = generate_sequence(sentences_trn_pos, worddict)
train_x_neg = generate_sequence(sentences_trn_neg, worddict)
X_train_full = train_x_pos + train_x_neg
# One-hot targets in the same order as X_train_full: [1, 0] = positive, [0, 1] = negative
y_trn = np.array(len(train_x_pos) * [[1., 0.]] + len(train_x_neg) * [[0., 1.]],
                 dtype=np.float32)


# Test split: the raw sentences are kept, and tokenized on the fly while encoding
# them with the training vocabulary
sentences_tst_pos = read_sentences(data_path + 'test/pos/')
sentences_tst_neg = read_sentences(data_path + 'test/neg/')

test_x_pos = generate_sequence(tokenize(sentences_tst_pos), worddict)
test_x_neg = generate_sequence(tokenize(sentences_tst_neg), worddict)
X_test_full = test_x_pos + test_x_neg
y_tst = np.array(len(test_x_pos) * [[1., 0.]] + len(test_x_neg) * [[0., 1.]],
                 dtype=np.float32)


print('Preprocess done!')


Tokenizing...
Done!
Tokenizing...
Done!
Building dictionary..
7056193  total words  135098  unique words
Tokenizing...
Done!
Tokenizing...
Done!
Preprocess done!

In [3]:
max_features = 20000 # Vocabulary cut-off: word ids >= max_features are recoded to 0
max_len = 100  # cut texts after this number of words (among top max_features most common words)


# Keep only the most frequent words, recode the others using 0
def remove_features(x, n_features=None):
    """Replace every word id at or above a threshold with 0.

    Args:
        x: iterable of sequences of integer word ids.
        n_features: ids >= n_features are replaced by 0. When omitted (None) the
            module-level `max_features` is used, which is the original behaviour.

    Returns:
        A new list of lists with the same layout as `x`; the input is not modified.
    """
    limit = max_features if n_features is None else n_features
    return [[0 if w >= limit else w for w in sen] for sen in x]

# Recode every word id outside the kept vocabulary to 0
X_train = remove_features(X_train_full)
X_test  = remove_features(X_test_full)


# Shuffle data
from sklearn.utils import shuffle
# Rebuild the labels in their original order (positives first, then negatives) before
# shuffling. X_train is recomputed from the unshuffled X_train_full on every run of this
# cell, so shuffling the already-shuffled y_trn again would misalign labels and reviews
# whenever the cell is executed more than once.
y_trn_ordered = np.array([[1.,0.]]*len(train_x_pos) + [[0.,1.]]*len(train_x_neg), dtype=np.float32)
X_train, y_trn = shuffle(X_train, y_trn_ordered, random_state=0)


# Cut or complete the sentences to length = max_len
# NOTE(review): no padding/truncating arguments are passed, so the Keras defaults apply
# (presumably zero pre-padding and pre-truncation - confirm for the installed version).
from keras.preprocessing import sequence

print("Pad sequences (samples x time)")
X_trn = sequence.pad_sequences(X_train, maxlen=max_len)
X_tst = sequence.pad_sequences(X_test, maxlen=max_len)
print('X_train shape:', X_trn.shape)
print('X_test shape:', X_tst.shape)


Using TensorFlow backend.
Pad sequences (samples x time)
X_train shape: (25000, 100)
X_test shape: (25000, 100)

The model


In [5]:
# Model parameters

# Tied to the preprocessing settings instead of repeating the literals, so the graph
# always matches the padded input (width max_len) and the id range (0 .. max_features-1).
sequence_length = max_len
vocab_size = max_features

embedding_size = 128      # dimension of each word embedding vector
num_filters = 128         # output channels of every convolution branch
filter_sizes = [3, 4, 5]  # heights (in words) of the convolution windows, one branch each

# Folder where the TensorBoard event files are written
LOG_DIR = '/tmp/tensorboard/sentiment_cnn'

In [6]:
# Open an interactive session: GPU memory is allocated on demand (allow_growth) and
# the device chosen for every op is logged (log_device_placement)
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(
    gpu_options=gpu_options,
    log_device_placement=True)
sess = tf.InteractiveSession(config=session_config)

In [7]:
# Inputs
# Batch of padded reviews as word ids, shape [batch, sequence_length]
input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name="input_x")
# One-hot targets, shape [batch, 2]. Declared float32 (was int32) so that it matches the
# float32 label arrays that are fed in and the float logits they are compared with;
# no float -> int conversion is needed at feed time.
input_y = tf.placeholder(tf.float32, shape=[None, 2], name="input_y")
# Dropout keep probability, fed per run (0.5 for training steps, 1 for evaluation)
dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

# Embedding layer
with tf.name_scope("embedding"):
    # Trainable lookup table with one row of size embedding_size per word id
    W_embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W_embedding")
    embedded_chars = tf.nn.embedding_lookup(W_embedding, input_x)
    # Append a channel axis so the result can be consumed by tf.nn.conv2d
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    
# Create a convolution + maxpool layer for each filter size
def conv_layer(x, size_x=2, size_y=2, input_channels=1, output_channels=32):
    """Convolution + ReLU followed by a max-pool over the whole feature-map height.

    Args:
        x: 4-D input tensor for tf.nn.conv2d; its height (axis 1) must be statically known.
        size_x: filter height.
        size_y: filter width.
        input_channels: number of channels of `x`.
        output_channels: number of filters.

    Returns:
        The pooled tensor; axis 1 has size 1 because the pooling window spans the
        complete height of the convolution output.
    """
    W_conv = tf.Variable(tf.truncated_normal([size_x, size_y, input_channels, output_channels], stddev=0.1), name='W')
    b_conv = tf.Variable(tf.constant(0.1, shape=[output_channels]), name='b')
    conv_out = tf.nn.relu(tf.nn.conv2d(x, W_conv, strides=[1, 1, 1, 1], padding='VALID') + b_conv, name='conv')
    # Read the pooling window from the conv output itself. The previous version used the
    # globals `sequence_length` and `filter_size` (the caller's loop variable) instead of
    # this function's own arguments, so it only worked when called from that exact loop.
    conv_height = conv_out.get_shape().as_list()[1]
    pooled = tf.nn.max_pool(conv_out, ksize=[1, conv_height, 1, 1],
                            strides=[1, 1, 1, 1], padding='VALID', name="pool")
    return pooled

# One convolution + max-pool branch per filter size; every branch reads the same embeddings
pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        pooled = conv_layer(
            embedded_chars_expanded,
            size_x=filter_size,
            size_y=embedding_size,
            input_channels=1,
            output_channels=num_filters)
        pooled_outputs += [pooled]

# Join the branches along the channel axis and flatten to [batch, num_filters_total]
num_filters_total = len(filter_sizes) * num_filters
h_pool = tf.concat(pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

# Dropout on the pooled features; the keep probability is fed at run time
with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

# Output layer: linear map from the pooled features to two unnormalized class scores
with tf.name_scope("output"):
    # NOTE(review): tf.get_variable is not affected by tf.name_scope and registers "W" in
    # the variable store, so re-running this cell on the same graph is expected to fail
    # with "Variable W already exists" - confirm and reset the graph before re-running.
    W = tf.get_variable(
        "W",
        shape=[num_filters_total, 2],
        initializer=tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.constant(0.1, shape=[2]), name="b")

    # L2 penalty over the output weights and bias only
    l2_loss = tf.nn.l2_loss(W) + tf.nn.l2_loss(b)

    scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
    # Index of the highest score per example
    predictions = tf.argmax(scores, 1, name="predictions")
    
# Loss: mean softmax cross-entropy plus the weighted L2 penalty of the output layer
loss_factor = 0.1

with tf.name_scope("loss"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=input_y)
    loss = tf.reduce_mean(cross_entropy, name='loss') + loss_factor * l2_loss
    loss_summ = tf.summary.scalar("Loss", loss) #TENSORBOARD

# Accuracy: fraction of examples whose predicted class equals the one-hot target's class
with tf.name_scope("accuracy"):
    correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
    acc_summ = tf.summary.scalar("Accuracy", accuracy) #TENSORBOARD

# Optimizer: one Adam step (learning rate 1e-3) on the regularized loss
with tf.name_scope("train") as scope:
    optimizer = tf.train.AdamOptimizer(1e-3)
    train_step = optimizer.minimize(loss)

In [8]:
# Merge all the summaries and create the TensorBoard writers under LOG_DIR
# (event files go to LOG_DIR + '/train' and LOG_DIR + '/test')
with tf.name_scope('summaries') as scope:
    # Single op that evaluates every summary registered on the graph so far
    merged = tf.summary.merge_all()
    # Only the train writer is given the session graph
    train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
    test_writer  = tf.summary.FileWriter(LOG_DIR + '/test')

Batch iterator


In [9]:
def batch_iter(X, y, batch_size, do_shuffle=True):
    """
    Generates a batch iterator for inputs and targets.

    Args:
        X: array-like of inputs, indexed along the first axis.
        y: array-like of targets aligned with X along the first axis.
        batch_size: maximum number of examples per batch.
        do_shuffle: when True (default, the original behaviour) the examples are
            visited in a fresh random order drawn from np.random on every call.

    Yields:
        (X_batch, y_batch) pairs of numpy arrays. The last batch may be smaller than
        batch_size. Nothing is yielded for empty input.
    """
    # Accept plain lists as well: the permutation indexing below needs numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)
    data_size = len(X)

    if do_shuffle:
        # Shuffle the data at each epoch (same permutation for inputs and targets)
        shuffle_indices = np.random.permutation(np.arange(data_size))
        X = X[shuffle_indices]
        y = y[shuffle_indices]

    # Integer ceiling division. The previous int((data_size-1)/batch_size) + 1 relied on
    # the meaning of "/", and on Python 3 produced one empty batch for empty input.
    num_batches = (data_size + batch_size - 1) // batch_size
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, data_size)
        yield X[start_index:end_index], y[start_index:end_index]

Training loop


In [10]:
# Initialization.
sess.run(tf.global_variables_initializer())

# Training process
num_epochs = 20
batch_size = 128

# Global batch counters, used as the x-axis of the TensorBoard summaries
step_trn = 0
step_tst = 0

# Per-epoch history, plotted in the cells below
loss_trn_epoch = []
loss_tst_epoch = []
acc_trn_epoch = []
acc_tst_epoch = []

print('e- LssTrn - AccTrn - LssTst - AccTst' )
for epoch in range(num_epochs):
    loss_trn = []
    acc_trn = []
    loss_tst = []
    acc_tst = []
    for x_batch, y_batch in batch_iter(X_trn, y_trn, batch_size):
        # One optimization step with dropout enabled
        train_step.run(feed_dict={input_x: x_batch, input_y: y_batch, dropout_keep_prob: 0.5})

        # Measure loss/accuracy on the same batch with dropout disabled. Every 10th step
        # the merged summary is fetched in the same run instead of a second forward pass.
        step_trn += 1
        eval_feed = {input_x: x_batch, input_y: y_batch, dropout_keep_prob: 1}
        if step_trn % 10 == 0:
            loss_step, acc_step, summary_str = sess.run([loss, accuracy, merged], feed_dict=eval_feed)
            train_writer.add_summary(summary_str, step_trn) #TENSORBOARD
        else:
            loss_step, acc_step = sess.run([loss, accuracy], feed_dict=eval_feed)
        loss_trn.append(loss_step)
        acc_trn.append(acc_step)

    # Evaluate on the whole test set, batch by batch, without dropout
    for x_batch_test, y_batch_test in batch_iter(X_tst, y_tst, batch_size):
        step_tst += 1
        eval_feed = {input_x: x_batch_test, input_y: y_batch_test, dropout_keep_prob: 1}
        if step_tst % 10 == 0:
            # Summary written every 10th test batch
            loss_step, acc_step, summary_str = sess.run([loss, accuracy, merged], feed_dict=eval_feed)
            test_writer.add_summary(summary_str, step_tst) #TENSORBOARD
        else:
            loss_step, acc_step = sess.run([loss, accuracy], feed_dict=eval_feed)
        loss_tst.append(loss_step)
        acc_tst.append(acc_step)

    # Make the summaries of this epoch visible to TensorBoard right away
    train_writer.flush()
    test_writer.flush()

    # Epoch figures are plain means of the per-batch values (a smaller final batch
    # weighs the same as a full one)
    print(epoch, np.mean(loss_trn), np.mean(acc_trn), np.mean(loss_tst), np.mean(acc_tst))
    loss_trn_epoch += [np.mean(loss_trn)]
    loss_tst_epoch += [np.mean(loss_tst)]
    acc_trn_epoch += [np.mean(acc_trn)]
    acc_tst_epoch += [np.mean(acc_tst)]


e- LssTrn - AccTrn - LssTst - AccTst
0 0.796526 0.643806 0.677261 0.699809
1 0.629114 0.744149 0.649683 0.731872
2 0.60023 0.768503 0.64605 0.66305
3 0.549839 0.78188 0.599157 0.714334
4 0.495927 0.806848 0.541825 0.747202
5 0.440494 0.836201 0.484567 0.794388
6 0.385066 0.86533 0.452905 0.808315
7 0.329724 0.892403 0.421362 0.822409
8 0.275408 0.913528 0.400643 0.833139
9 0.222787 0.937101 0.395609 0.833865
10 0.177874 0.954592 0.401047 0.834343
11 0.140424 0.969308 0.41633 0.832254
12 0.108526 0.982135 0.430941 0.834495
13 0.0844958 0.990179 0.46018 0.831617
14 0.0678115 0.995177 0.489607 0.827655
15 0.0571514 0.997321 0.508476 0.829576
16 0.0497373 0.998525 0.527613 0.826666
17 0.0444149 0.999163 0.547335 0.823637
18 0.0401861 0.999641 0.559652 0.82453
19 0.0370001 0.99988 0.570758 0.824131

In [11]:
# Plot a scaterplot real vs predict
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(loss_trn_epoch)
plt.plot(loss_tst_epoch)
plt.show()



In [12]:
# Plot the per-epoch accuracy on the training and test sets
plt.plot(acc_trn_epoch, label='train')
plt.plot(acc_tst_epoch, label='test')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Accuracy per epoch')
plt.legend()
plt.show()



In [ ]: