In [1]:
import tensorflow as tf

In [2]:
import tarfile
import re
import urllib.request
import os
import random

class ImdbMovieReviews:
    """
    The movie review dataset is offered by Stanford University’s AI department:
    http://ai.stanford.edu/~amaas/data/sentiment/. It comes as a compressed  tar  archive where
    positive and negative reviews can be found as text files in two according folders. We apply
    the same pre-processing to the text as in the last section: Extracting plain words using a
    regular expression and converting to lower case.
    """
    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')
    
    def __init__(self):
        self._cache_dir = './imdb'
        self._url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
        
        if not os.path.isfile(self._cache_dir):
            urllib.request.urlretrieve(self._url, self._cache_dir)
        self.filepath = self._cache_dir

    def __iter__(self):
        with tarfile.open(self.filepath) as archive:
            items = archive.getnames()
            for filename in archive.getnames():
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False
                    
    def _read(self, archive, filename):
        with archive.extractfile(filename) as file_:
            data = file_.read().decode('utf-8')
            data = type(self).TOKEN_REGEX.findall(data)
            data = [x.lower() for x in data]
            return data

In [3]:
import numpy as np
# Spacy is my favourite nlp framework, which havu builtin word embeddings trains on wikipesia
from spacy.en import English

class Embedding:
    
    def __init__(self, length):
#          spaCy makes using word vectors very easy. 
#             The Lexeme , Token , Span  and Doc  classes all have a .vector property,
#             which is a 1-dimensional numpy array of 32-bit floats:
        self.parser = English()
        self._length = length
        self.dimensions = 300
        
    def __call__(self, sequence):
        data = np.zeros((self._length, self.dimensions))
        # you can access known words from the parser's vocabulary
        embedded = [self.parser.vocab[w].vector for w in sequence]
        data[:len(sequence)] = embedded
        return data

In [4]:
from lazy import lazy

class SequenceClassificationModel:
    def __init__(self, params):
        self.params = params
        self._create_placeholders()
        self.prediction
        self.cost
        self.error
        self.optimize
        self.global_step = 0
        self._create_summaries()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    
    def _create_placeholders(self):
        with tf.name_scope("data"):
            self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
            self.target = tf.placeholder(tf.float32, [None, 2])
  
    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('erroe', self.error)
            self.summary = tf.summary.merge_all()
            saver = tf.train.Saver()
            
    @lazy
    def length(self):
        with tf.name_scope("seq_length"):
            used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
            length = tf.reduce_sum(used, reduction_indices=1)
            length = tf.cast(length, tf.int32)
        return length
    
    @lazy
    def prediction(self):
        with tf.name_scope("recurrent_layer"):
            output, _ = tf.nn.dynamic_rnn(
                self.params.rnn_cell(self.params.rnn_hidden),
                self.data,
                dtype=tf.float32,
                sequence_length=self.length
            )
        last = self._last_relevant(output, self.length)

        with tf.name_scope("softmax_layer"):
            num_classes = int(self.target.get_shape()[1])
            weight = tf.Variable(tf.truncated_normal(
                [self.params.rnn_hidden, num_classes], stddev=0.01))
            bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction
    
    @lazy
    def cost(self):
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy
    
    @lazy
    def error(self):
        self.mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))
    
    @lazy
    def optimize(self):
        with tf.name_scope("optimization"):
            gradient = self.params.optimizer.compute_gradients(self.cost)
            if self.params.gradient_clipping:
                limit = self.params.gradient_clipping
                gradient = [
                    (tf.clip_by_value(g, -limit, limit), v)
                    if g is not None else (None, v)
                    for g, v in gradient]
            optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize
    
    @staticmethod
    def _last_relevant(output, length):
        with tf.name_scope("last_relevant"):
            # As of now, TensorFlow only supports indexing along the first dimension, using
            # tf.gather() . We thus flatten the first two dimensions of the output activations from their
            # shape of  sequences x time_steps x word_vectors  and construct an index into this resulting tensor.
            batch_size = tf.shape(output)[0]
            max_length = int(output.get_shape()[1])
            output_size = int(output.get_shape()[2])

            # The index takes into account the start indices for each sequence in the flat tensor and adds
            # the sequence length to it. Actually, we only add  length - 1  so that we select the last valid
            # time step.
            index = tf.range(0, batch_size) * max_length + (length - 1)
            flat = tf.reshape(output, [-1, output_size])
            relevant = tf.gather(flat, index)
        return relevant
    
    def train(self, batches, save_prefix, save_every=10):
        saver = tf.train.Saver()
        if os.path.isdir('./saved/'):
            saver.restore(self.sess, tf.train.latest_checkpoint('./saved/'))
        else:
            os.makedirs('saved')
        summary_writer = tf.summary.FileWriter('graphs/run{}'.format(self.global_step), self.sess.graph)
        self.global_step += 1
        for index, batch in enumerate(batches):
            feed = {model.data: batch[0], model.target: batch[1]}
            error, _, summary_str = self.sess.run([model.error, model.optimize, model.summary], feed)
            print('{}: {:3.1f}%'.format(index + 1, 100 * error))
            if index % save_every == 0:
                summary_writer.add_summary(summary_str, index)
                summary_writer.flush()
            if index % save_every == 0:
                save_path = os.path.join('checkpoints', save_prefix)
                print('saving...', save_path)
                saver.save(self.sess, save_path, global_step=index)
        saver.save(self.sess, os.path.join('checkpoints', save_prefix + '_final'))

    def predict_proba(self, data):
        feed = {model.data: data, }
        prediction = self.sess.run([model.prediction], feed)        
        return prediction
        
    def close(self):
        tf.reset_default_graph()
        self.session.close()

In [5]:
def preprocess_batched(iterator, length, embedding, batch_size):
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            text, label = next(iterator)
            data[index] = embedding(text)
            target[index] = [1, 0] if label else [0, 1]
        yield data, target

In [6]:
reviews = list(ImdbMovieReviews())

In [7]:
random.shuffle(reviews)

In [8]:
length = max(len(x[0]) for x in reviews)
embedding = Embedding(length)

In [9]:
from attrdict import AttrDict

params = AttrDict(
    rnn_cell=tf.contrib.rnn.GRUCell,
    rnn_hidden=300,
    optimizer=tf.train.RMSPropOptimizer(0.002),
    batch_size=20,
    gradient_clipping=100,
    seq_length=length,
    embed_length=embedding.dimensions
)

In [10]:
batches = preprocess_batched(reviews, length, embedding, params.batch_size)

In [11]:
tf.reset_default_graph()

model = SequenceClassificationModel(params)


/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:95: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

In [12]:
saver = tf.train.Saver()
checkpoint_dir = 'no_att_checkpoints'
checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
if checkpoint:
    print("Reading model parameters from %s" % checkpoint.model_checkpoint_path)
    saver.restore(model.sess, checkpoint.model_checkpoint_path)
else:
    raise FileNotFoundError("Cannot restore model")


Reading model parameters from no_att_checkpoints/simple-rnn-1240
INFO:tensorflow:Restoring parameters from no_att_checkpoints/simple-rnn-1240

In [13]:
tf.reset_default_graph()

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)
config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)

TP = 0
all_pred = 0
steps = 50

for step_num, (data, labels_batch) in enumerate(batches, 1):

    prediction = np.round(model.predict_proba(data))

    TP += (labels_batch == prediction).sum() // 2
    all_pred += len(labels_batch)

    print(step_num, end='_')

    if step_num == steps:
        break


1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_48_49_50_

In [14]:
print("Baseline RNN accuracy:", TP / all_pred)


Baseline RNN accuracy: 0.882