In [1]:
import tensorflow as tf
import numpy as np
import sklearn.datasets
from sklearn import metrics
from utils import *
from sklearn.cross_validation import train_test_split  # deprecated since sklearn 0.18; moved to sklearn.model_selection
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# one sub-folder per class under data/; load_files infers the labels from the folder names
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
# separate_dataset is a helper from utils; it returns the texts and labels as parallel lists
trainset.data, trainset.target = separate_dataset(trainset, 1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662
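
Since load_files reads the labels straight from the directory layout, the two target names printed above imply a structure like the following (a hedged reconstruction; the actual file names inside each folder are not shown):

data/
├── negative/
└── positive/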

In [3]:
# one-hot encode the labels: row i gets a 1 in column trainset.target[i]
ONEHOT = np.zeros((len(trainset.data), len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)), trainset.target] = 1.0
# 80/20 split, keeping texts, integer labels and one-hot labels aligned
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data,
                                                                               trainset.target,
                                                                               ONEHOT, test_size = 0.2)
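
The one-hot construction above relies on NumPy fancy indexing: the pair of index arrays selects one (row, column) element per example. A tiny standalone illustration with three hypothetical labels:

labels = np.array([0, 1, 1])                    # hypothetical class ids
onehot = np.zeros((len(labels), 2))
onehot[np.arange(len(labels)), labels] = 1.0
# onehot -> [[1., 0.], [0., 1.], [0., 1.]]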

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocabulary size: %d' % (vocabulary_size))
print('Most common words', count[4:10])   # count[0:4] are the special tokens
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocabulary size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [555, 2515, 3325, 14971, 36, 9275, 219, 150, 19, 3686] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']
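
build_dataset is imported from utils and its source is not shown here. A minimal sketch of the usual word2vec-style construction it appears to follow, assuming the first four ids are reserved for the GO/PAD/EOS/UNK tokens used in the next cell:

import collections

def build_dataset_sketch(words, n_words):
    # hypothetical stand-in for utils.build_dataset
    count = [['GO', 0], ['PAD', 0], ['EOS', 0], ['UNK', 0]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = [dictionary.get(word, dictionary['UNK']) for word in words]
    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, rev_dictionary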

In [5]:
# ids of the four special tokens reserved at the front of the vocabulary
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']
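
str_idx, used throughout training below, also comes from utils. A hedged sketch of the behaviour the rest of the notebook assumes (truncate or pad every sentence to maxlen, map out-of-vocabulary words to UNK):

def str_idx_sketch(corpus, dic, maxlen):
    # hypothetical stand-in for utils.str_idx
    X = np.full((len(corpus), maxlen), PAD, dtype=np.int32)
    for i, sentence in enumerate(corpus):
        for t, word in enumerate(sentence.split()[:maxlen]):
            X[i, t] = dic.get(word, UNK)
    return X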

In [6]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate, maxlen):

        def cells(reuse=False):
            return tf.nn.rnn_cell.GRUCell(size_layer, reuse=reuse)

        # X holds padded token ids, Y holds one-hot labels
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        # stacked GRU encoder; outputs: [batch, time, size_layer]
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)

        # additive attention parameters; the attention size is tied to maxlen here
        w_omega = tf.Variable(tf.random_normal([size_layer, maxlen], stddev=0.1))
        b_omega = tf.Variable(tf.random_normal([maxlen], stddev=0.1))
        u_omega = tf.Variable(tf.random_normal([maxlen], stddev=0.1))

        with tf.name_scope('v'):
            v = tf.tanh(tf.tensordot(outputs, w_omega, axes=1) + b_omega)  # [batch, time, maxlen]

        vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # [batch, time] unnormalized scores
        alphas = tf.nn.softmax(vu, name='alphas')         # attention weights over the time steps

        # attention-weighted sum of the GRU outputs -> [batch, size_layer]
        outputs = tf.reduce_sum(outputs * tf.expand_dims(alphas, -1), 1)
        W = tf.get_variable('w', shape=(size_layer, dimension_output), initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs, W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
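
For intuition: the attention block computes v = tanh(H W + b), scores vu = v · u, weights alpha = softmax(vu) over time, then an alpha-weighted sum of the hidden states. The same pooling in plain NumPy with toy shapes (a sketch, detached from the graph above):

B, T, H, A = 2, 5, 128, 50                      # batch, time, hidden, attention size
hidden = np.random.randn(B, T, H)               # stands in for the GRU outputs
w, b, u = np.random.randn(H, A), np.random.randn(A), np.random.randn(A)

v = np.tanh(hidden @ w + b)                     # [B, T, A]
vu = v @ u                                      # [B, T] unnormalized scores
alphas = np.exp(vu) / np.exp(vu).sum(axis=1, keepdims=True)   # softmax over time
pooled = (hidden * alphas[..., None]).sum(axis=1)             # [B, H]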

In [7]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate,maxlen)
sess.run(tf.global_variables_initializer())


WARNING:tensorflow:From <ipython-input-6-1087f6a1d38e>:29: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.


In [8]:
# early stopping: quit after EARLY_STOPPING consecutive epochs with no validation improvement
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    # iterate over full batches only; the remainder (< batch_size examples) is dropped
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size], dictionary, maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer],
                                feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc

    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size], dictionary, maxlen)
        acc, loss = sess.run([model.accuracy, model.cost],
                             feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc

    # average the accumulated per-batch metrics
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)

    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f' % (EPOCH, CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n' % (EPOCH, train_loss,
                                                                                               train_acc, test_loss,
                                                                                               test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.576172
time taken: 3.125619411468506
epoch: 0, training loss: 0.696378, training acc: 0.510890, valid loss: 0.687591, valid acc: 0.576172

epoch: 1, pass acc: 0.576172, current acc: 0.669434
time taken: 3.052114725112915
epoch: 1, training loss: 0.644105, training acc: 0.620739, valid loss: 0.619567, valid acc: 0.669434

epoch: 2, pass acc: 0.669434, current acc: 0.707031
time taken: 3.0710995197296143
epoch: 2, training loss: 0.479612, training acc: 0.768466, valid loss: 0.604454, valid acc: 0.707031

epoch: 3, pass acc: 0.707031, current acc: 0.707520
time taken: 3.0152018070220947
epoch: 3, training loss: 0.322288, training acc: 0.860322, valid loss: 0.693783, valid acc: 0.707520

epoch: 4, pass acc: 0.707520, current acc: 0.718750
time taken: 3.0594992637634277
epoch: 4, training loss: 0.207350, training acc: 0.916075, valid loss: 0.810913, valid acc: 0.718750

time taken: 3.0505475997924805
epoch: 5, training loss: 0.123278, training acc: 0.951113, valid loss: 1.074653, valid acc: 0.715332

time taken: 3.075266122817993
epoch: 6, training loss: 0.069617, training acc: 0.974432, valid loss: 1.203621, valid acc: 0.716309

time taken: 3.0302562713623047
epoch: 7, training loss: 0.038215, training acc: 0.986624, valid loss: 1.464504, valid acc: 0.712891

time taken: 3.0507609844207764
epoch: 8, training loss: 0.020674, training acc: 0.992306, valid loss: 1.714260, valid acc: 0.711914

time taken: 3.0347683429718018
epoch: 9, training loss: 0.013206, training acc: 0.995028, valid loss: 1.841591, valid acc: 0.715820

break epoch:10
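
The loop only remembers the best validation accuracy in memory; nothing is written to disk. A hedged sketch of persisting the best weights with tf.train.Saver (hypothetical path, not part of the original run):

saver = tf.train.Saver(tf.trainable_variables())
# inside the `if test_acc > CURRENT_ACC:` branch above:
#     saver.save(sess, 'checkpoint/attention-gru.ckpt')
# to restore later:
#     saver.restore(sess, 'checkpoint/attention-gru.ckpt')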


In [9]:
# evaluate on the full held-out split in a single pass
logits = sess.run(model.logits, feed_dict = {model.X : str_idx(test_X, dictionary, maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits, 1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.71      0.71      0.71      1065
   positive       0.71      0.71      0.71      1068

avg / total       0.71      0.71      0.71      2133


In [ ]:
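
A natural next step for the empty cell: scoring a new sentence with the trained graph. A minimal sketch (the example sentence is hypothetical; assumes str_idx maps unseen words to UNK):

text = 'this film was a pleasant surprise'      # hypothetical input
batch = str_idx([text], dictionary, maxlen)
probs = sess.run(tf.nn.softmax(model.logits), feed_dict={model.X: batch})
print(dict(zip(trainset.target_names, probs[0])))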