In [1]:
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab from size: 20465
Most common words [('the', 10129), ('a', 7312), ('and', 6199), ('of', 6063), ('to', 4233), ('is', 3378)]
Sample data [4, 668, 9, 2822, 8, 22, 4, 3424, 16369, 97] ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'centurys', 'new']

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [6]:
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.layers import base as base_layer

_BIAS_VARIABLE_NAME = "bias"
_WEIGHTS_VARIABLE_NAME = "kernel"


class NLSTMCell(rnn_cell_impl.RNNCell):
    def __init__(self, num_units, depth, forget_bias=1.0,
               state_is_tuple=True, use_peepholes=True,
               activation=None, gate_activation=None,
               cell_activation=None,
               initializer=None,
               input_gate_initializer=None,
               use_bias=True, reuse=None, name=None):
    
        super(NLSTMCell, self).__init__(_reuse=reuse, name=name)
        if not state_is_tuple:
            logging.warn("%s: Using a concatenated state is slower and will soon be "
                   "deprecated.  Use state_is_tuple=True.", self)

        self.input_spec = base_layer.InputSpec(ndim=2)
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._state_is_tuple = state_is_tuple
        self._use_peepholes = use_peepholes
        self._depth = depth
        self._activation = activation or math_ops.tanh
        self._gate_activation = gate_activation or math_ops.sigmoid
        self._cell_activation = cell_activation or array_ops.identity
        self._initializer = initializer or init_ops.orthogonal_initializer()
        self._input_gate_initializer = (input_gate_initializer 
                                    or init_ops.glorot_normal_initializer())
        self._use_bias = use_bias
        self._kernels = None
        self._biases = None
        self.built = False

    @property
    def state_size(self):
        if self._state_is_tuple:
            return tuple([self._num_units] * (self.depth + 1))
        else:
            return self._num_units * (self.depth + 1)

    @property
    def output_size(self):
        return self._num_units

    @property
    def depth(self):
        return self._depth

    def build(self, inputs_shape):
        if inputs_shape[1].value is None:
            raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)

        input_depth = inputs_shape[1].value
        h_depth = self._num_units
        self._kernels = []
        if self._use_bias:
            self._biases = []

        if self._use_peepholes:
            self._peep_kernels = []
        for i in range(self.depth):
            if i == 0:
                input_kernel = self.add_variable("input_gate_kernel",
                                                 shape=[input_depth, 4 * self._num_units],
                                                 initializer=self._input_gate_initializer)
                hidden_kernel = self.add_variable("hidden_gate_kernel",
                                                  shape=[h_depth, 4 * self._num_units],
                                                  initializer=self._initializer)
                kernel = tf.concat([input_kernel, hidden_kernel],
                                   axis=0, name="kernel_0")
                self._kernels.append(kernel)
            else:
                self._kernels.append(self.add_variable("kernel_{}".format(i),
                                                       shape=[2 * h_depth, 4 * self._num_units],
                                                       initializer=self._initializer))
            if self._use_bias:
                self._biases.append(self.add_variable("bias_{}".format(i),
                                                      shape=[4 * self._num_units],
                                                      initializer=init_ops.zeros_initializer(dtype=self.dtype)))
            if self._use_peepholes:
                self._peep_kernels.append(self.add_variable("peep_kernel_{}".format(i),
                                                            shape=[h_depth, 3 * self._num_units],
                                                            initializer=self._initializer))

        self.built = True

    def _recurrence(self, inputs, hidden_state, cell_states, depth):

        sigmoid = math_ops.sigmoid
        one = constant_op.constant(1, dtype=dtypes.int32)
        c = cell_states[depth]
        h = hidden_state

        gate_inputs = math_ops.matmul(array_ops.concat([inputs, h], 1), self._kernels[depth])
        if self._use_bias:
            gate_inputs = nn_ops.bias_add(gate_inputs, self._biases[depth])
        if self._use_peepholes:
            peep_gate_inputs = math_ops.matmul(c, self._peep_kernels[depth])
        i_peep, f_peep, o_peep = array_ops.split(value=peep_gate_inputs, num_or_size_splits=3, axis=one)

        i, j, f, o = array_ops.split(value=gate_inputs, num_or_size_splits=4, axis=one)
        if self._use_peepholes:
            i += i_peep
            f += f_peep
            o += o_peep 

        if self._use_peepholes:
            peep_gate_inputs = math_ops.matmul(c, self._peep_kernels[depth])
            i_peep, f_peep, o_peep = array_ops.split(value=peep_gate_inputs, num_or_size_splits=3, axis=one)
            i += i_peep
            f += f_peep
            o += o_peep 

        add = math_ops.add
        multiply = math_ops.multiply

        if self._use_bias:
            forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype)
            f = add(f, forget_bias_tensor)

        inner_hidden = multiply(c, self._gate_activation(f))

        if depth == 0:
            inner_input = multiply(self._gate_activation(i), self._cell_activation(j))
        else:
            inner_input = multiply(self._gate_activation(i), self._activation(j))

        if depth == (self.depth - 1):
            new_c = add(inner_hidden, inner_input)
            new_cs = [new_c]
        else:
            new_c, new_cs = self._recurrence(inputs=inner_input,
                                             hidden_state=inner_hidden,
                                             cell_states=cell_states,
                                             depth=depth + 1)
        new_h = multiply(self._activation(new_c), self._gate_activation(o))
        new_cs = [new_h] + new_cs
        return new_h, new_cs

    def call(self, inputs, state):
        if not self._state_is_tuple:
            states = array_ops.split(state, self.depth + 1, axis=1)
        else:
            states = state
        hidden_state = states[0]
        cell_states = states[1:]
        outputs, next_state = self._recurrence(inputs, hidden_state, cell_states, 0)
        if self._state_is_tuple:
            next_state = tuple(next_state)
        else:
            next_state = array_ops.concat(next_state, axis=1)
        return outputs, next_state

In [7]:
class Model:
    def __init__(self, size_layer, embedded_size,
                 dict_size, dimension_output, learning_rate, batch_size,
                timestamp, depth=1):
        self.X = tf.placeholder(tf.int32, [batch_size, maxlen])
        self.Y = tf.placeholder(tf.float32, [batch_size, dimension_output])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        cell = NLSTMCell(size_layer, depth)
        init_state = cell.zero_state(batch_size, dtype=dtypes.float32)
        state = init_state
        outputs = []
        with tf.variable_scope("RNN"):
            for time_step in range(timestamp):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                out, state = cell(encoder_embedded[:, time_step, :], state)
                outputs.append(out)
        outputs = tf.reshape(tf.concat(outputs,axis=1),[batch_size,timestamp,size_layer])
        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
size_layer = 64
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,embedded_size,vocabulary_size+4,
              dimension_output,learning_rate,
             batch_size,maxlen)
sess.run(tf.global_variables_initializer())

In [10]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.531738
time taken: 2.8110363483428955
epoch: 0, training loss: 0.695090, training acc: 0.517756, valid loss: 0.689452, valid acc: 0.531738

epoch: 1, pass acc: 0.531738, current acc: 0.592773
time taken: 2.0530991554260254
epoch: 1, training loss: 0.668451, training acc: 0.598130, valid loss: 0.669185, valid acc: 0.592773

epoch: 2, pass acc: 0.592773, current acc: 0.634766
time taken: 2.054539918899536
epoch: 2, training loss: 0.594847, training acc: 0.699574, valid loss: 0.646298, valid acc: 0.634766

epoch: 3, pass acc: 0.634766, current acc: 0.668945
time taken: 2.0540168285369873
epoch: 3, training loss: 0.471607, training acc: 0.790365, valid loss: 0.666460, valid acc: 0.668945

epoch: 4, pass acc: 0.668945, current acc: 0.698242
time taken: 2.0545835494995117
epoch: 4, training loss: 0.345011, training acc: 0.862571, valid loss: 0.695080, valid acc: 0.698242

epoch: 5, pass acc: 0.698242, current acc: 0.708984
time taken: 2.0536608695983887
epoch: 5, training loss: 0.250333, training acc: 0.911340, valid loss: 0.695614, valid acc: 0.708984

epoch: 6, pass acc: 0.708984, current acc: 0.719238
time taken: 2.05330491065979
epoch: 6, training loss: 0.168091, training acc: 0.945668, valid loss: 0.761937, valid acc: 0.719238

epoch: 7, pass acc: 0.719238, current acc: 0.720215
time taken: 2.0530500411987305
epoch: 7, training loss: 0.111187, training acc: 0.968277, valid loss: 0.817810, valid acc: 0.720215

time taken: 2.050994873046875
epoch: 8, training loss: 0.077725, training acc: 0.980114, valid loss: 0.883286, valid acc: 0.719238

time taken: 2.053058624267578
epoch: 9, training loss: 0.056436, training acc: 0.986861, valid loss: 0.986419, valid acc: 0.713379

time taken: 2.0518455505371094
epoch: 10, training loss: 0.041559, training acc: 0.991004, valid loss: 1.005594, valid acc: 0.717285

epoch: 11, pass acc: 0.720215, current acc: 0.721191
time taken: 2.052720546722412
epoch: 11, training loss: 0.032299, training acc: 0.993134, valid loss: 1.040796, valid acc: 0.721191

epoch: 12, pass acc: 0.721191, current acc: 0.722656
time taken: 2.0508596897125244
epoch: 12, training loss: 0.024856, training acc: 0.995028, valid loss: 1.161813, valid acc: 0.722656

time taken: 2.0544140338897705
epoch: 13, training loss: 0.019435, training acc: 0.995975, valid loss: 1.237681, valid acc: 0.714355

time taken: 2.05448317527771
epoch: 14, training loss: 0.012193, training acc: 0.997869, valid loss: 1.242486, valid acc: 0.713867

time taken: 2.0531187057495117
epoch: 15, training loss: 0.008589, training acc: 0.998580, valid loss: 1.282168, valid acc: 0.721191

time taken: 2.054352045059204
epoch: 16, training loss: 0.006250, training acc: 0.998935, valid loss: 1.329985, valid acc: 0.721680

time taken: 2.05053973197937
epoch: 17, training loss: 0.004599, training acc: 0.999171, valid loss: 1.405500, valid acc: 0.721680

break epoch:18