In [33]:
# Import packages
import os
import re
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from random import shuffle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tflearn.data_utils import VocabularyProcessor
# Define dataset directories
train_pos_dir = 'aclImdb/train/pos/'
train_neg_dir = 'aclImdb/train/neg/'
test_pos_dir = 'aclImdb/test/pos/'
test_neg_dir = 'aclImdb/test/neg/'
# Define dataset size (number of reviews per class in each split)
data_set_size = 12500
In [36]:
# Read data
train_pos = []
for file_name in os.listdir(train_pos_dir):
    with open(train_pos_dir + file_name, 'r') as myfile:
        train_pos.append(myfile.read())
train_neg = []
for file_name in os.listdir(train_neg_dir):
    with open(train_neg_dir + file_name, 'r') as myfile:
        train_neg.append(myfile.read())
test_pos = []
for file_name in os.listdir(test_pos_dir):
    with open(test_pos_dir + file_name, 'r') as myfile:
        test_pos.append(myfile.read())
test_neg = []
for file_name in os.listdir(test_neg_dir):
    with open(test_neg_dir + file_name, 'r') as myfile:
        test_neg.append(myfile.read())
# Cleanse data
stop_words = set(stopwords.words("english"))
def parse_html(data):
    data = BeautifulSoup(data, 'lxml').get_text()  # Remove HTML markup
    data = re.sub("[^a-zA-Z]", " ", data)  # Keep only alphabetic characters
    data = ' '.join([x for x in data.lower().split() if x not in stop_words])  # Remove stopwords
    return data
for i in xrange(data_set_size):
    train_pos[i] = parse_html(train_pos[i])
    train_neg[i] = parse_html(train_neg[i])
    test_pos[i] = parse_html(test_pos[i])
    test_neg[i] = parse_html(test_neg[i])
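# Combine all reviews; positives are labeled [0, 1] and negatives [1, 0]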
total_x = np.concatenate([train_pos, train_neg, test_pos, test_neg])
train_y = np.concatenate([[[0, 1] for _ in xrange(data_set_size)],
                          [[1, 0] for _ in xrange(data_set_size)]])
test_y = np.concatenate([[[0, 1] for _ in xrange(data_set_size)],
                         [[1, 0] for _ in xrange(data_set_size)]])
print '%i total reviews:' % len(total_x)
print total_x[0][:50]
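As a quick sanity check of the cleansing step, parse_html can be applied to a made-up snippet (the example string below is hypothetical, not taken from the dataset):

sample = "<br />This movie was great, and the acting was even better!"
print parse_html(sample)
# Prints something like: movie great acting even better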
In [3]:
# Graph distribution of review lengths
%matplotlib inline
review_lengths = sorted([len(x.split()) for x in total_x])[:49900]  # Drop the 100 longest reviews
plt.bar(range(len(review_lengths)), review_lengths)
plt.show()
In [5]:
# Convert reviews into padded word indexes
num_words = 250 # Based on graph (want to capture most reviews without too much padding)
vocab_processor = VocabularyProcessor(num_words)
total_x_vector = np.array(list(vocab_processor.fit_transform(total_x)))
train_x_vector = total_x_vector[:25000]
test_x_vector = total_x_vector[25000:]
# Shuffle data
shuffled_i = np.random.permutation(np.arange(data_set_size * 2))
train_x_vector_shuffled = train_x_vector[shuffled_i]
train_y_shuffled = train_y[shuffled_i]
test_x_vector_shuffled = test_x_vector[shuffled_i]
test_y_shuffled = test_y[shuffled_i]
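For intuition, here is a rough, hand-rolled sketch of what the vectorization step does (an illustration only, not tflearn's actual implementation; vocabulary and unknown-word handling differ): assign each word an integer id and pad or truncate every review to num_words entries.

# Illustration only (not tflearn's implementation): map words to integer ids,
# pad short reviews with 0 and truncate long ones to num_words.
def to_padded_ids(reviews, num_words):
    word_ids = {}
    vectors = []
    for review in reviews:
        ids = []
        for word in review.split()[:num_words]:
            if word not in word_ids:
                word_ids[word] = len(word_ids) + 1  # Reserve 0 for padding
            ids.append(word_ids[word])
        vectors.append(ids + [0] * (num_words - len(ids)))
    return np.array(vectors)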
In [32]:
# Variables
epochs = 10
batch_size = 64
num_steps = int(data_set_size * 2 / batch_size * epochs) # 3900
reg_constant = 0.01
num_classes = 2
vocab_size = max([max(x) for x in total_x_vector]) + 1 # 101244
embedding_size = 128
patch_size_1 = 3
patch_size_2 = 4
patch_size_3 = 5
num_channels = 1
conv_depth = 128
conv_stride = [1, 1, 1, 1]
pool_stride = [1, 1, 1, 1]
padding = 'VALID'
losses = []
# Graph
with tf.Graph().as_default():
    with tf.Session() as session:
        # Input data
        data_x_tf = tf.placeholder(tf.int32, [batch_size, num_words])
        data_y_tf = tf.placeholder(tf.float32, [batch_size, num_classes])
        dropout_tf = tf.placeholder(tf.float32)
        # Embedding
        embedding_space = tf.Variable(tf.random_uniform([vocab_size, embedding_size]))
        data_x_embedded = tf.nn.embedding_lookup(embedding_space, data_x_tf)
        data_x_embedded_expanded = tf.expand_dims(data_x_embedded, -1)
        # First convolution
        conv_1_weights = tf.Variable(tf.truncated_normal([patch_size_1,
                                                          embedding_size,
                                                          num_channels,
                                                          conv_depth]))
        conv_1_biases = tf.Variable(tf.constant(0.1, shape=(conv_depth,)))
        conv_1 = tf.nn.conv2d(data_x_embedded_expanded, conv_1_weights, conv_stride, padding)
        conv_1_relu = tf.nn.relu(tf.nn.bias_add(conv_1, conv_1_biases))
        pool_1 = tf.nn.max_pool(conv_1_relu,
                                [1, num_words - patch_size_1 + 1, 1, 1],
                                pool_stride, padding)
        # Second convolution
        conv_2_weights = tf.Variable(tf.truncated_normal([patch_size_2,
                                                          embedding_size,
                                                          num_channels,
                                                          conv_depth]))
        conv_2_biases = tf.Variable(tf.constant(0.1, shape=(conv_depth,)))
        conv_2 = tf.nn.conv2d(data_x_embedded_expanded, conv_2_weights, conv_stride, padding)
        conv_2_relu = tf.nn.relu(tf.nn.bias_add(conv_2, conv_2_biases))
        pool_2 = tf.nn.max_pool(conv_2_relu,
                                [1, num_words - patch_size_2 + 1, 1, 1],
                                pool_stride, padding)
        # Third convolution
        conv_3_weights = tf.Variable(tf.truncated_normal([patch_size_3,
                                                          embedding_size,
                                                          num_channels,
                                                          conv_depth]))
        conv_3_biases = tf.Variable(tf.constant(0.1, shape=(conv_depth,)))
        conv_3 = tf.nn.conv2d(data_x_embedded_expanded, conv_3_weights, conv_stride, padding)
        conv_3_relu = tf.nn.relu(tf.nn.bias_add(conv_3, conv_3_biases))
        pool_3 = tf.nn.max_pool(conv_3_relu,
                                [1, num_words - patch_size_3 + 1, 1, 1],
                                pool_stride, padding)
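        # Shape note: with 'VALID' padding each convolution outputs
        # [batch_size, num_words - patch_size + 1, 1, conv_depth], and max-pooling over
        # the full remaining height reduces that to [batch_size, 1, 1, conv_depth],
        # so concatenating the three branches on the last axis below yields
        # [batch_size, 1, 1, conv_depth * 3] ahead of the reshape.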
        # Reshape
        pool = tf.concat(3, [pool_1, pool_2, pool_3])
        pool_shape = pool.get_shape().as_list()
        reshaped_pool = tf.reshape(pool, [pool_shape[0], pool_shape[3]])
        reshaped_pool_dropout = tf.nn.dropout(reshaped_pool, dropout_tf)
        # Output layer weights and biases
        output_weights = tf.Variable(tf.truncated_normal([conv_depth * 3, num_classes]))
        output_biases = tf.Variable(tf.constant(0.1, shape=(num_classes,)))
        output = tf.nn.bias_add(tf.matmul(reshaped_pool_dropout, output_weights),
                                output_biases)
        # Loss, optimizer, and predictions
        regularization = reg_constant * tf.nn.l2_loss(output_weights)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, data_y_tf) +
                              regularization)
        loss_summary = tf.scalar_summary('loss', loss)
        optimizer = tf.train.AdamOptimizer().minimize(loss)
        preds = tf.nn.softmax(output)
        # Write accuracy to summary
        sim = tf.equal(tf.argmax(preds, 1), tf.argmax(data_y_tf, 1))
        accuracy = tf.reduce_mean(tf.cast(sim, tf.float32)) * 100
        accuracy_summary = tf.scalar_summary('accuracy', accuracy)
        merged_train = tf.merge_summary([loss_summary, accuracy_summary])
        summary_writer_train = tf.train.SummaryWriter('tensorboard/train', session.graph)
        # Train model
        tf.initialize_all_variables().run()
        for i in xrange(num_steps):
            batch_start = batch_size * i % (data_set_size * 2 - batch_size)
            batch_end = batch_start + batch_size
            batch_x = train_x_vector_shuffled[batch_start:batch_end, :]
            batch_y = train_y_shuffled[batch_start:batch_end, :]
            feed_dict = {data_x_tf: batch_x, data_y_tf: batch_y, dropout_tf: 0.5}
            _, l, summary, accuracy_train = session.run([optimizer,
                                                         loss,
                                                         merged_train,
                                                         accuracy],
                                                        feed_dict)
            summary_writer_train.add_summary(summary, i)
            if i % int(num_steps / epochs) == 0:
                epoch_num = int(i / int(num_steps / epochs))
                print('Train epoch %d: loss = %.2f, train accuracy = %.2f%%'
                      % (epoch_num, l, accuracy_train))
        # Test model
        test_preds_total = []
        for i in xrange(int(np.floor(data_set_size * 2 / batch_size))):
            batch_start = batch_size * i
            batch_end = batch_start + batch_size
            batch_x = test_x_vector_shuffled[batch_start:batch_end, :]
            batch_y = test_y_shuffled[batch_start:batch_end, :]
            feed_dict = {data_x_tf: batch_x, data_y_tf: batch_y, dropout_tf: 1.0}
            test_preds_total += session.run([accuracy], feed_dict)
        print 'Test set accuracy: %.2f%%' % (sum(test_preds_total) / len(test_preds_total))
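        # Illustrative extra step (an addition, not part of the original run): inspect
        # the raw softmax output for one test batch; column 1 is the positive-class
        # probability under the [0, 1] / [1, 0] label convention defined earlier.
        sample_feed = {data_x_tf: test_x_vector_shuffled[:batch_size, :], dropout_tf: 1.0}
        sample_preds = session.run(preds, sample_feed)
        for probs, label in zip(sample_preds[:5], test_y_shuffled[:5]):
            print('P(positive) = %.2f, true label = %s' % (probs[1], label))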