In [1]:
import tensorflow as tf
import numpy as np
import sklearn.datasets
from sklearn import metrics
from utils import *
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662
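
separate_dataset comes from the local utils module and is not shown in this notebook. Judging from how it is called (a loaded sklearn bunch plus a sampling ratio of 1.0, returning parallel lists of cleaned sentences and integer labels, 10662 of each), a minimal sketch of what such a helper might look like is below; the exact splitting and cleaning rules in utils may differ.

import re, random

def separate_dataset_sketch(bunch, ratio = 1.0):
    # split every loaded file into sentences, keep a random fraction, lightly clean them
    texts, labels = [], []
    for doc, label in zip(bunch.data, bunch.target):
        sentences = doc.split('\n')
        sentences = random.sample(sentences, int(len(sentences) * ratio))
        for s in sentences:
            s = re.sub(r'[^a-z0-9 ]+', '', s.lower()).strip()
            if s:
                texts.append(s)
                labels.append(label)
    return texts, labels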

In [3]:
ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)
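
The advanced-indexing trick above builds the one-hot label matrix; an equivalent and arguably more readable formulation uses np.eye. train_test_split accepts any number of parallel arrays and shuffles them with the same indices, which is why the texts, integer labels and one-hot labels stay aligned after the split.

# equivalent one-hot construction (same values as the two lines above)
onehot_alt = np.eye(len(trainset.target_names))[trainset.target]
assert np.array_equal(onehot_alt, ONEHOT)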

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [553, 2575, 3398, 11452, 36, 8241, 217, 151, 19, 3835] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']
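
build_dataset is also defined in utils. Its signature and the printed output above suggest it follows the classic word2vec-tutorial pattern: reserve the first ids for the special tokens, rank the remaining words by frequency, and return the encoded corpus, the frequency counts, a word-to-id dictionary and its reverse. A rough sketch under those assumptions:

import collections

def build_dataset_sketch(words, n_words):
    # ids 0-3 are reserved for the special tokens used in the next cell
    count = [['GO', 0], ['PAD', 0], ['EOS', 0], ['UNK', 0]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = [dictionary.get(word, dictionary['UNK']) for word in words]
    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, rev_dictionary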

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']
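
Of the four special tokens, only PAD and UNK matter for this classifier: str_idx (another utils helper, used in the training loop below) presumably turns each sentence into a fixed-length row of ids, padding short sentences with PAD and mapping out-of-vocabulary words to UNK. A minimal sketch of that behaviour:

def str_idx_sketch(corpus, dic, maxlen, pad = PAD, unk = UNK):
    # corpus: list of sentences; returns an int matrix of shape [len(corpus), maxlen]
    X = np.full((len(corpus), maxlen), pad, dtype = np.int32)
    for i, sentence in enumerate(corpus):
        for j, word in enumerate(sentence.split()[:maxlen]):
            X[i, j] = dic.get(word, unk)
    return X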

In [6]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate, maxlen):
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.GRUCell(size_layer,reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        batch_size = tf.shape(encoder_embedded)[0]
        seq_len = tf.shape(encoder_embedded)[1]
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
        
        # score every timestep of the GRU outputs with a single learned attention vector
        x_attention = tf.reshape(outputs, [-1, size_layer])
        attention_size = tf.get_variable(name = 'attention', shape = [size_layer, 1], dtype = tf.float32,
                                         initializer = tf.random_uniform_initializer(-0.01, 0.01))
        bias_ = tf.get_variable(name = 'bias_', shape = [1], dtype = tf.float32,
                                initializer = tf.random_uniform_initializer(-0.01, 0.01))
        linear_projection = tf.add(tf.matmul(x_attention, attention_size), bias_)
        reshape_ = tf.reshape(linear_projection, [batch_size, seq_len, -1])
        # normalise the scores into a distribution over the time axis
        attention_output = tf.nn.softmax(reshape_, dim = 1)
        # expose the per-token weights under a fixed name so they can be fetched later
        atten_visualize = tf.reshape(attention_output, [batch_size, seq_len], name = 'plot_dis')
        # weighted sum of the GRU outputs -> one context vector per sentence
        multi = tf.multiply(attention_output, outputs)
        atten_out_s = tf.reduce_sum(multi, 1)
        
        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        self.logits = tf.matmul(atten_out_s, W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
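
The attention block in the model scores every timestep of the GRU outputs with one learned vector, turns the scores into a distribution over time with a softmax, and collapses the sequence into a single weighted-sum vector before the final projection. The same computation in plain NumPy, with random stand-in values purely for illustration:

# illustrative shapes: batch = 2, seq_len = 5, size_layer = 4
outs = np.random.randn(2, 5, 4).astype(np.float32)   # stands in for the GRU outputs
w_att = np.random.randn(4, 1).astype(np.float32)     # stands in for the 'attention' variable
b_att = np.zeros(1, dtype = np.float32)               # stands in for 'bias_'

scores = outs @ w_att + b_att                                          # (2, 5, 1) linear projection
alpha = np.exp(scores) / np.exp(scores).sum(axis = 1, keepdims = True) # softmax over time
context = (alpha * outs).sum(axis = 1)                                  # (2, 4), i.e. atten_out_s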

In [7]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate,maxlen)
sess.run(tf.global_variables_initializer())


WARNING:tensorflow:From <ipython-input-6-cd613c8ac9ec>:22: calling softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.
Instructions for updating:
dim is deprecated, use axis instead
WARNING:tensorflow:From <ipython-input-6-cd613c8ac9ec>:30: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.


In [8]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.500000
time taken: 3.057192802429199
epoch: 0, training loss: 0.706160, training acc: 0.494555, valid loss: 0.693313, valid acc: 0.500000

epoch: 1, pass acc: 0.500000, current acc: 0.520508
time taken: 2.934628963470459
epoch: 1, training loss: 0.691318, training acc: 0.517874, valid loss: 0.688289, valid acc: 0.520508

epoch: 2, pass acc: 0.520508, current acc: 0.685059
time taken: 3.0018999576568604
epoch: 2, training loss: 0.598523, training acc: 0.667732, valid loss: 0.606993, valid acc: 0.685059

epoch: 3, pass acc: 0.685059, current acc: 0.700195
time taken: 2.9580676555633545
epoch: 3, training loss: 0.395388, training acc: 0.822088, valid loss: 0.629455, valid acc: 0.700195

epoch: 4, pass acc: 0.700195, current acc: 0.707031
time taken: 3.007512331008911
epoch: 4, training loss: 0.250498, training acc: 0.902580, valid loss: 0.756995, valid acc: 0.707031

epoch: 5, pass acc: 0.707031, current acc: 0.710938
time taken: 2.955876111984253
epoch: 5, training loss: 0.146345, training acc: 0.948627, valid loss: 1.039384, valid acc: 0.710938

epoch: 6, pass acc: 0.710938, current acc: 0.718750
time taken: 2.941133499145508
epoch: 6, training loss: 0.093190, training acc: 0.968632, valid loss: 1.144270, valid acc: 0.718750

time taken: 2.9638419151306152
epoch: 7, training loss: 0.063832, training acc: 0.977391, valid loss: 1.218100, valid acc: 0.714355

epoch: 8, pass acc: 0.718750, current acc: 0.721191
time taken: 2.9340081214904785
epoch: 8, training loss: 0.026701, training acc: 0.991359, valid loss: 1.417536, valid acc: 0.721191

time taken: 2.9663562774658203
epoch: 9, training loss: 0.010883, training acc: 0.997277, valid loss: 1.641903, valid acc: 0.717285

time taken: 2.9637436866760254
epoch: 10, training loss: 0.005974, training acc: 0.998224, valid loss: 1.742597, valid acc: 0.716309

time taken: 2.9169058799743652
epoch: 11, training loss: 0.003839, training acc: 0.998816, valid loss: 1.839181, valid acc: 0.720703

time taken: 2.9449501037597656
epoch: 12, training loss: 0.002979, training acc: 0.998935, valid loss: 1.924913, valid acc: 0.717773

time taken: 2.9460959434509277
epoch: 13, training loss: 0.002424, training acc: 0.999171, valid loss: 1.986492, valid acc: 0.716797

break epoch:14


In [10]:
logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.72      0.71      0.72      1068
   positive       0.71      0.72      0.72      1065

avg / total       0.72      0.72      0.72      2133
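
Because the model registers its per-token attention distribution under the name 'plot_dis', the weights can be pulled back out of the graph for inspection. A sketch of how one might look at which words the classifier attends to, assuming str_idx behaves as it does in the training loop above (the tensor name comes from name='plot_dis' in the model definition):

alpha_tensor = tf.get_default_graph().get_tensor_by_name('plot_dis:0')
sentence = test_X[0]
alpha = sess.run(alpha_tensor, feed_dict = {model.X: str_idx([sentence], dictionary, maxlen)})[0]
for word, weight in zip(sentence.split()[:maxlen], alpha):
    print('%-15s %.4f' % (word, weight))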


In [ ]: