In [1]:
from utils import *
import numpy as np
import sklearn.datasets
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split
import time
import random
import os

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [10]:
# Build one-hot label vectors, then split 80/20 into train and validation sets.
ONEHOT = np.zeros((len(trainset.data), len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)), trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocabulary size: %d'%(vocabulary_size))
print('Most common words', count[4:10])  # skip the first four entries, presumably the special tokens
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocabulary size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [544, 2559, 3142, 10271, 36, 7703, 217, 151, 19, 3801] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']
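
The str_idx helper from utils is used later to turn raw sentences into fixed-length id matrices. Its implementation is not shown here; a minimal sketch of the assumed behavior (pad or truncate to maxlen, map out-of-vocabulary words to UNK) would be:

def str_idx_sketch(corpus, dic, maxlen):
    # Hypothetical stand-in for utils.str_idx: ids padded with PAD up to maxlen,
    # unknown words mapped to UNK.
    X = np.full((len(corpus), maxlen), dic['PAD'], dtype = np.int32)
    for i, sentence in enumerate(corpus):
        for t, word in enumerate(sentence.split()[:maxlen]):
            X[i, t] = dic.get(word, dic['UNK'])
    return X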

In [6]:
from tensorflow.python.ops.rnn_cell import RNNCell

def linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Apply a shared affine map, W * concat(args) + b, to a list of 2D tensors."""
    if args is None or (isinstance(args, (list, tuple)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (list, tuple)):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "linear expects shape[1] of its arguments to be known: %s" % str(shapes))
        else:
            total_arg_size += shape[1]

    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable("Matrix", [total_arg_size, output_size])
        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            res = tf.matmul(tf.concat(args, 1), matrix)
        if not bias:
            return res
        bias_term = tf.get_variable(
            "Bias", [output_size],
            initializer=tf.constant_initializer(bias_start))
    return res + bias_term

class SRUCell(RNNCell):
    """Simple Recurrent Unit (Lei et al., 2017)."""

    def __init__(self, num_units, activation=None, reuse=None):
        super(SRUCell, self).__init__(_reuse=reuse)
        self._num_units = num_units
        self._activation = activation or tf.tanh

    @property
    def output_size(self):
        return self._num_units

    @property
    def state_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope='SRUCell'):

        with tf.variable_scope(scope):
            with tf.variable_scope("Inputs"):
                # Candidate values, computed from the current input only.
                x = linear([inputs], self._num_units, False)
            with tf.variable_scope("Gate"):
                # Forget gate f and reset (highway) gate r in one matrix multiply.
                concat = tf.sigmoid(linear([inputs], 2 * self._num_units, True))
                f, r = tf.split(axis=1, num_or_size_splits=2, value=concat)

            # Internal state: blend the previous state with the candidate.
            c = f * state + (1 - f) * x
            # Output: gated activation plus a highway connection to the raw input.
            h = r * self._activation(c) + (1 - r) * inputs

        return h, c
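
# In equations, SRUCell above computes, per time step t (g = activation, tanh by default):
#   x~_t = W x_t                       (candidate, no bias)
#   f_t  = sigmoid(W_f x_t + b_f)      (forget gate)
#   r_t  = sigmoid(W_r x_t + b_r)      (reset / highway gate)
#   c_t  = f_t * c_{t-1} + (1 - f_t) * x~_t
#   h_t  = r_t * g(c_t) + (1 - r_t) * x_t
# Since none of the matrix products involve c_{t-1}, they can be batched across
# all time steps; only the cheap elementwise recurrence on c_t is sequential.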

class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output):
        
        def cells(reuse=False):
            return SRUCell(size_layer, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        # Trainable word embeddings, initialized uniformly in [-1, 1].
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype = tf.float32)
        W = tf.get_variable('w', shape=(size_layer, dimension_output), initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=[dimension_output], initializer=tf.zeros_initializer())
        # Classify from the hidden state at the last time step.
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
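
A quick, hypothetical smoke test (not part of the original run) to confirm the graph wires up and the logits have shape (batch, classes). Note that embedded_size must equal size_layer, because the highway term (1 - r) * inputs adds the raw input to the cell output:

tf.reset_default_graph()
toy = Model(size_layer = 128, num_layers = 2, embedded_size = 128,
            dict_size = 100, dimension_output = 2)
with tf.Session() as toy_sess:
    toy_sess.run(tf.global_variables_initializer())
    dummy_ids = np.random.randint(0, 100, size = (4, 50))
    # Expect (4, 2): one logit pair per example.
    print(toy_sess.run(toy.logits, feed_dict = {toy.X: dummy_ids}).shape)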

In [7]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
maxlen = 50
batch_size = 128

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output)
sess.run(tf.global_variables_initializer())

In [11]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
# Stop once validation accuracy has not improved for EARLY_STOPPING consecutive epochs.
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.554199
time taken: 3.4405574798583984
epoch: 0, training loss: 0.689039, training acc: 0.535748, valid loss: 0.680192, valid acc: 0.554199

epoch: 1, pass acc: 0.554199, current acc: 0.583984
time taken: 3.066664934158325
epoch: 1, training loss: 0.630970, training acc: 0.649858, valid loss: 0.671964, valid acc: 0.583984

epoch: 2, pass acc: 0.583984, current acc: 0.626465
time taken: 3.135775327682495
epoch: 2, training loss: 0.532484, training acc: 0.734730, valid loss: 0.686642, valid acc: 0.626465

epoch: 3, pass acc: 0.626465, current acc: 0.646484
time taken: 3.11407732963562
epoch: 3, training loss: 0.408517, training acc: 0.818892, valid loss: 0.734317, valid acc: 0.646484

epoch: 4, pass acc: 0.646484, current acc: 0.653809
time taken: 3.2255287170410156
epoch: 4, training loss: 0.288211, training acc: 0.884233, valid loss: 0.821830, valid acc: 0.653809

time taken: 3.1103758811950684
epoch: 5, training loss: 0.185976, training acc: 0.932528, valid loss: 0.953978, valid acc: 0.650391

time taken: 3.197439193725586
epoch: 6, training loss: 0.107132, training acc: 0.967685, valid loss: 1.143521, valid acc: 0.649414

epoch: 7, pass acc: 0.653809, current acc: 0.655273
time taken: 3.0919787883758545
epoch: 7, training loss: 0.056378, training acc: 0.986387, valid loss: 1.360909, valid acc: 0.655273

epoch: 8, pass acc: 0.655273, current acc: 0.659180
time taken: 3.200023651123047
epoch: 8, training loss: 0.028726, training acc: 0.994437, valid loss: 1.571254, valid acc: 0.659180

time taken: 3.148078441619873
epoch: 9, training loss: 0.014215, training acc: 0.997869, valid loss: 1.786603, valid acc: 0.658203

time taken: 3.2151803970336914
epoch: 10, training loss: 0.008183, training acc: 0.999290, valid loss: 1.966834, valid acc: 0.652344

time taken: 2.972202777862549
epoch: 11, training loss: 0.004848, training acc: 0.999645, valid loss: 2.058176, valid acc: 0.657715

time taken: 3.206712007522583
epoch: 12, training loss: 0.003433, training acc: 0.999408, valid loss: 2.168201, valid acc: 0.651367

time taken: 3.1262435913085938
epoch: 13, training loss: 0.002158, training acc: 0.999882, valid loss: 2.243634, valid acc: 0.654297

break epoch:14


In [12]:
logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.66      0.65      0.66      1087
   positive       0.64      0.65      0.65      1046

avg / total       0.65      0.65      0.65      2133
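
As a usage sketch (hypothetical, assuming the session and vocabulary above are still live), the trained model can score a new sentence:

sentence = 'the film was a delightful surprise'
batch = str_idx([sentence], dictionary, maxlen)
probs = sess.run(tf.nn.softmax(model.logits), feed_dict = {model.X: batch})
# The index of the larger probability picks the predicted class name.
print(trainset.target_names[np.argmax(probs[0])], probs[0])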


In [ ]: