In [1]:
import tensorflow as tf
import numpy as np
import sklearn.datasets
from sklearn import metrics
from utils import *
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)
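
The one-hot labels above are built with NumPy fancy indexing: ONEHOT[np.arange(n), target] = 1.0 sets, for each row, the column given by that row's label. A minimal illustration of the same trick (the three labels are made up):

labels = np.array([0, 1, 1])            # hypothetical labels for 3 samples
onehot = np.zeros((3, 2))
onehot[np.arange(3), labels] = 1.0      # row i gets a 1 in column labels[i]
# onehot -> [[1., 0.], [0., 1.], [0., 1.]]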

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocabulary size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocabulary size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [555, 2672, 3381, 11896, 36, 8200, 219, 151, 19, 4002] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']
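
build_dataset comes from utils, so its implementation is not shown here. Judging from the output (the four special tokens take the first ids, which is why count[4:10] starts at the most frequent real words), it presumably behaves roughly like the sketch below; the function body is an assumption, not the actual helper.

import collections

def build_dataset_sketch(words, n_words):
    # reserve the first ids for the special tokens used in the next cell
    count = [['GO', 0], ['PAD', 0], ['EOS', 0], ['UNK', 0]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = {word: idx for idx, (word, _) in enumerate(count)}
    data = [dictionary.get(word, dictionary['UNK']) for word in words]
    rev_dictionary = {idx: word for word, idx in dictionary.items()}
    return data, count, dictionary, rev_dictionary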

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [6]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate, maxlen):
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.GRUCell(size_layer,reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embeddings_query = tf.Variable(tf.random_uniform([dict_size, embedded_size], -5, 5))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        encoder_embedded_query = tf.nn.embedding_lookup(encoder_embeddings_query, self.X)
        
        with tf.variable_scope('document', initializer=tf.orthogonal_initializer()):
            rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
            outputs, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
            
        with tf.variable_scope('query', initializer=tf.orthogonal_initializer()):
            rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
            outputs_query, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded_query, dtype = tf.float32)
        
        # attention-style pooling over the two encoders:
        M = tf.multiply(outputs, outputs_query)   # (batch, maxlen, size_layer) interaction
        alpha = tf.nn.softmax(M, 1)               # normalized over time steps
        beta = tf.nn.softmax(M, 2)                # normalized over hidden units
        # collapse beta into a per-unit importance vector, (batch, size_layer, 1)
        query_importance = tf.expand_dims(tf.reduce_sum(beta, 1), -1)
        # one attention score per time step, (batch, maxlen)
        s = tf.squeeze(tf.matmul(alpha, query_importance), 2)

        # dense layer mapping the maxlen scores to the output classes
        W = tf.get_variable('w', shape=(maxlen, dimension_output), initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        self.logits = tf.matmul(s, W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
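
The pooling in the class multiplies the document and query RNN outputs element-wise, normalizes the result once over the time axis (alpha) and once over the hidden axis (beta), collapses beta into a per-unit importance vector, and scores each time step by a matmul against alpha; the final dense layer maps those maxlen scores to the two classes. A plain-NumPy shape walk-through on toy sizes (random data, illustration only):

import numpy as np

def softmax(x, axis):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, maxlen, size_layer = 2, 5, 3
outputs = np.random.randn(batch, maxlen, size_layer)        # document RNN states
outputs_query = np.random.randn(batch, maxlen, size_layer)  # query RNN states

M = outputs * outputs_query                        # (2, 5, 3) element-wise interaction
alpha = softmax(M, axis=1)                         # normalized over time steps
beta = softmax(M, axis=2)                          # normalized over hidden units
query_importance = beta.sum(axis=1)[..., None]     # (2, 3, 1)
s = (alpha @ query_importance).squeeze(2)          # (2, 5): one score per time step
print(s.shape)                                     # the dense (maxlen -> 2) layer turns s into logits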

In [7]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate,maxlen)
sess.run(tf.global_variables_initializer())


WARNING:tensorflow:From <ipython-input-6-ae570a8b0dbd>:34: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.
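
str_idx is another helper from utils. The training loop below passes it a batch of raw strings together with the dictionary and maxlen, so it presumably maps every word to its id (UNK for out-of-vocabulary words) and pads or truncates each sentence to maxlen; the sketch below is an assumption about that behavior, not the real implementation.

import numpy as np

def str_idx_sketch(corpus, dic, maxlen, pad=1, unk=3):   # 1/3 = PAD/UNK ids from the sketch above
    X = np.full((len(corpus), maxlen), pad, dtype=np.int32)
    for i, text in enumerate(corpus):
        for j, word in enumerate(text.split()[:maxlen]):
            X[i, j] = dic.get(word, unk)
    return X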


In [8]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.499512
time taken: 5.658096075057983
epoch: 0, training loss: 0.699142, training acc: 0.493371, valid loss: 0.693512, valid acc: 0.499512

epoch: 1, pass acc: 0.499512, current acc: 0.559570
time taken: 5.507836103439331
epoch: 1, training loss: 0.685375, training acc: 0.566998, valid loss: 0.681353, valid acc: 0.559570

epoch: 2, pass acc: 0.559570, current acc: 0.625488
time taken: 5.518467903137207
epoch: 2, training loss: 0.586436, training acc: 0.718277, valid loss: 0.659593, valid acc: 0.625488

epoch: 3, pass acc: 0.625488, current acc: 0.652832
time taken: 5.4672651290893555
epoch: 3, training loss: 0.414610, training acc: 0.829072, valid loss: 0.716053, valid acc: 0.652832

time taken: 5.470943212509155
epoch: 4, training loss: 0.280202, training acc: 0.902462, valid loss: 0.825094, valid acc: 0.645508

epoch: 5, pass acc: 0.652832, current acc: 0.672363
time taken: 5.52541184425354
epoch: 5, training loss: 0.205628, training acc: 0.935014, valid loss: 0.796941, valid acc: 0.672363

time taken: 5.532345533370972
epoch: 6, training loss: 0.176779, training acc: 0.942590, valid loss: 0.906282, valid acc: 0.646973

time taken: 5.5103089809417725
epoch: 7, training loss: 0.130328, training acc: 0.961174, valid loss: 0.973442, valid acc: 0.663086

time taken: 5.517350912094116
epoch: 8, training loss: 0.082498, training acc: 0.981179, valid loss: 1.045645, valid acc: 0.664551

time taken: 5.511422157287598
epoch: 9, training loss: 0.065562, training acc: 0.986032, valid loss: 1.118302, valid acc: 0.664062

time taken: 5.517352819442749
epoch: 10, training loss: 0.059507, training acc: 0.986387, valid loss: 1.091805, valid acc: 0.663574

break epoch:11


In [9]:
logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.68      0.64      0.66      1069
   positive       0.66      0.69      0.67      1064

avg / total       0.67      0.67      0.67      2133


In [ ]: