In [1]:
import tensorflow as tf
In [2]:
import tarfile
import re
import urllib.request
import os
import random
class ImdbMovieReviews:
"""
The movie review dataset is offered by Stanford University’s AI department:
http://ai.stanford.edu/~amaas/data/sentiment/. It comes as a compressed tar archive where
positive and negative reviews can be found as text files in two according folders. We apply
the same pre-processing to the text as in the last section: Extracting plain words using a
regular expression and converting to lower case.
"""
DEFAULT_URL = \
'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')
    def __init__(self):
        # Cache the downloaded archive locally so repeated runs do not re-download it.
        self._cache_file = './imdb'
        self._url = type(self).DEFAULT_URL
        if not os.path.isfile(self._cache_file):
            urllib.request.urlretrieve(self._url, self._cache_file)
        self.filepath = self._cache_file
    def __iter__(self):
        with tarfile.open(self.filepath) as archive:
            for filename in archive.getnames():
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False
def _read(self, archive, filename):
with archive.extractfile(filename) as file_:
data = file_.read().decode('utf-8')
data = type(self).TOKEN_REGEX.findall(data)
data = [x.lower() for x in data]
return data
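As a quick sanity check, the tokenizer can be applied by hand to a made-up sentence (the example text below is purely illustrative) to see which tokens survive the regular expression and the lower-casing:

# Illustrative only: tokenize a made-up sentence the same way _read() does.
sample = "A surprisingly good movie, 10/10!"
tokens = [t.lower() for t in ImdbMovieReviews.TOKEN_REGEX.findall(sample)]
print(tokens)  # ['a', 'surprisingly', 'good', 'movie', ',', '!'] -- digits and '/' are dropped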
In [3]:
import numpy as np
# spaCy is my favourite NLP framework; it ships with word embeddings pre-trained on large text corpora.
from spacy.en import English
class Embedding:
def __init__(self, length):
# spaCy makes using word vectors very easy.
# The Lexeme , Token , Span and Doc classes all have a .vector property,
# which is a 1-dimensional numpy array of 32-bit floats:
self.parser = English()
self._length = length
self.dimensions = 300
    def __call__(self, sequence):
        # Zero-pad every review to the maximum length; the padding rows stay all-zero,
        # which the model later uses to infer the true sequence length.
        data = np.zeros((self._length, self.dimensions))
        # Known words can be looked up directly in the parser's vocabulary.
        embedded = [self.parser.vocab[w].vector for w in sequence]
        data[:len(sequence)] = embedded
        return data
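A rough usage sketch of the wrapper above (the toy length and words are assumptions, not taken from the dataset): every review is mapped to a fixed-size matrix of shape (max_length, 300), with unused rows left as zeros.

# Illustrative only: embed a short, made-up token sequence.
toy_embedding = Embedding(4)
vectors = toy_embedding(['great', 'acting', '!'])
print(vectors.shape)     # (4, 300)
print(vectors[3].any())  # False -- the padding row stays all zeros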
In [4]:
from lazy import lazy
class SequenceClassificationModel:
    def __init__(self, params):
        self.params = params
        self._create_placeholders()
        # Touch the lazy properties once so that the full graph is built
        # before the session initialises the variables.
        self.prediction
        self.cost
        self.error
        self.optimize
        self.global_step = 0
        self._create_summaries()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
def _create_placeholders(self):
with tf.name_scope("data"):
self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
self.target = tf.placeholder(tf.float32, [None, 2])
    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('error', self.error)
            self.summary = tf.summary.merge_all()
    @lazy
    def length(self):
        with tf.name_scope("seq_length"):
            # Padded time steps are all-zero word vectors, so their maximum absolute
            # value is 0; tf.sign marks used steps with 1 and padding with 0, and
            # summing over the time axis yields the true length of each sequence.
            used = tf.sign(tf.reduce_max(tf.abs(self.data), axis=2))
            length = tf.reduce_sum(used, axis=1)
            length = tf.cast(length, tf.int32)
            return length
@lazy
def prediction(self):
with tf.name_scope("recurrent_layer"):
output, _ = tf.nn.dynamic_rnn(
self.params.rnn_cell(self.params.rnn_hidden),
self.data,
dtype=tf.float32,
sequence_length=self.length
)
last = self._last_relevant(output, self.length)
with tf.name_scope("softmax_layer"):
num_classes = int(self.target.get_shape()[1])
weight = tf.Variable(tf.truncated_normal(
[self.params.rnn_hidden, num_classes], stddev=0.01))
bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
return prediction
@lazy
def cost(self):
cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
return cross_entropy
@lazy
def error(self):
self.mistakes = tf.not_equal(
tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))
@lazy
def optimize(self):
with tf.name_scope("optimization"):
gradient = self.params.optimizer.compute_gradients(self.cost)
if self.params.gradient_clipping:
limit = self.params.gradient_clipping
gradient = [
(tf.clip_by_value(g, -limit, limit), v)
if g is not None else (None, v)
for g, v in gradient]
optimize = self.params.optimizer.apply_gradients(gradient)
return optimize
@staticmethod
def _last_relevant(output, length):
with tf.name_scope("last_relevant"):
            # tf.gather() only indexes along the first dimension, so we flatten the first
            # two dimensions of the output activations (batch x time_steps x rnn_hidden)
            # and construct a flat index into the resulting tensor. A small NumPy sketch
            # after this cell illustrates the same trick.
batch_size = tf.shape(output)[0]
max_length = int(output.get_shape()[1])
output_size = int(output.get_shape()[2])
# The index takes into account the start indices for each sequence in the flat tensor and adds
# the sequence length to it. Actually, we only add length - 1 so that we select the last valid
# time step.
index = tf.range(0, batch_size) * max_length + (length - 1)
flat = tf.reshape(output, [-1, output_size])
relevant = tf.gather(flat, index)
return relevant
    def train(self, batches, save_prefix, save_every=10):
        saver = tf.train.Saver()
        checkpoint_dir = './checkpoints/'
        # Resume from the latest checkpoint if one exists, otherwise start fresh.
        if os.path.isdir(checkpoint_dir) and tf.train.latest_checkpoint(checkpoint_dir):
            saver.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir))
        else:
            os.makedirs(checkpoint_dir, exist_ok=True)
        summary_writer = tf.summary.FileWriter(
            'graphs/run{}'.format(self.global_step), self.sess.graph)
        self.global_step += 1
        for index, batch in enumerate(batches):
            feed = {self.data: batch[0], self.target: batch[1]}
            error, _, summary_str = self.sess.run(
                [self.error, self.optimize, self.summary], feed)
            print('{}: {:3.1f}%'.format(index + 1, 100 * error))
            if index % save_every == 0:
                summary_writer.add_summary(summary_str, index)
                summary_writer.flush()
                save_path = os.path.join(checkpoint_dir, save_prefix)
                print('saving...', save_path)
                saver.save(self.sess, save_path, global_step=index)
        saver.save(self.sess, os.path.join(checkpoint_dir, save_prefix + '_final'))
    def predict_proba(self, data):
        feed = {self.data: data}
        prediction = self.sess.run(self.prediction, feed)
        return prediction
    def close(self):
        self.sess.close()
        tf.reset_default_graph()
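To see why the flat index in _last_relevant selects the last valid time step of every sequence, here is a small NumPy sketch of the same flatten-and-gather trick; the shapes and lengths are made up for illustration.

# Illustrative only: emulate _last_relevant() with NumPy on a tiny fake batch.
import numpy as np

batch_size, max_length, output_size = 2, 3, 4
output = np.arange(batch_size * max_length * output_size).reshape(
    batch_size, max_length, output_size)
length = np.array([2, 3])  # true lengths of the two padded sequences

index = np.arange(batch_size) * max_length + (length - 1)  # -> [1, 5]
flat = output.reshape(-1, output_size)                      # (batch * time, output_size)
relevant = flat[index]                                      # rows 1 and 5: last valid steps
print(relevant)
# [[ 4  5  6  7]    <- sequence 0, its last valid step is time step 1
#  [20 21 22 23]]   <- sequence 1, its last valid step is time step 2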
In [5]:
def preprocess_batched(iterator, length, embedding, batch_size):
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                return  # stop cleanly once the reviews are exhausted
            data[index] = embedding(text)
            # One-hot targets: [1, 0] for positive reviews, [0, 1] for negative ones.
            target[index] = [1, 0] if label else [0, 1]
        yield data, target
In [6]:
reviews = list(ImdbMovieReviews())
In [7]:
random.shuffle(reviews)
In [8]:
length = max(len(x[0]) for x in reviews)
embedding = Embedding(length)
In [9]:
from attrdict import AttrDict
params = AttrDict(
rnn_cell=tf.contrib.rnn.GRUCell,
rnn_hidden=300,
optimizer=tf.train.RMSPropOptimizer(0.002),
batch_size=20,
gradient_clipping=100,
seq_length=length,
embed_length=embedding.dimensions
)
In [10]:
batches = preprocess_batched(reviews, length, embedding, params.batch_size)
In [11]:
tf.reset_default_graph()
model = SequenceClassificationModel(params)
In [12]:
saver = tf.train.Saver()
# The baseline model was trained in a separate run; here we only restore its weights.
checkpoint_dir = 'no_att_checkpoints'
checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
if checkpoint:
    print("Reading model parameters from %s" % checkpoint.model_checkpoint_path)
    saver.restore(model.sess, checkpoint.model_checkpoint_path)
else:
    raise FileNotFoundError("Cannot restore model")
In [13]:
# The GPU options below only take effect if passed to tf.Session(config=config) when the
# model's session is created; the restored session above keeps its own graph, so no
# graph reset is needed here.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)
config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
correct = 0
all_pred = 0
steps = 50
for step_num, (data, labels_batch) in enumerate(batches, 1):
    prediction = np.round(model.predict_proba(data))
    # A correct prediction matches the one-hot label in both positions, hence the // 2.
    correct += (labels_batch == prediction).sum() // 2
    all_pred += len(labels_batch)
print(step_num, end='_')
if step_num == steps:
break
In [14]:
print("Baseline RNN accuracy:", TP / all_pred)