In [1]:
import tensorflow as tf
import numpy as np
import sklearn.datasets
from sklearn import metrics
from utils import *
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [562, 2617, 3517, 16582, 36, 9639, 217, 152, 19, 4369] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']

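build_dataset comes from the accompanying utils module and is not shown here. Judging from the output above (the four special tokens occupy the first dictionary ids, so the most common real words start at count[4:]), it likely follows the word2vec-tutorial pattern. A minimal sketch, assuming that return signature; build_dataset_sketch is a hypothetical stand-in, not the actual helper:

import collections

def build_dataset_sketch(words, n_words):
    # reserve the first ids for the special tokens looked up in the next cell
    count = [['GO', 0], ['PAD', 0], ['EOS', 0], ['UNK', 0]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = [dictionary.get(word, dictionary['UNK']) for word in words]
    rev_dictionary = {i: word for word, i in dictionary.items()}
    return data, count, dictionary, rev_dictionary
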
In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

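str_idx, also from utils and used in the training loop below, converts raw sentences into fixed-length id matrices. A minimal sketch, assuming it truncates/pads each sentence to maxlen with the PAD id and maps out-of-vocabulary words to UNK; str_idx_sketch is a hypothetical stand-in, not the actual helper:

def str_idx_sketch(corpus, dic, maxlen):
    # one row per sentence, PAD-filled, truncated to maxlen tokens
    X = np.full((len(corpus), maxlen), dic['PAD'], dtype = np.int32)
    for i, sentence in enumerate(corpus):
        for no, word in enumerate(sentence.split()[:maxlen]):
            X[i, no] = dic.get(word, dic['UNK'])
    return X
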
In [6]:
class Attention:
    def __init__(self, hidden_size):
        self.hidden_size = hidden_size
        self.dense_layer = tf.layers.Dense(hidden_size)
        # fixed random scoring vector (a plain tensor, not a trainable Variable)
        self.v = tf.random_normal([hidden_size], mean = 0, stddev = 1 / np.sqrt(hidden_size))
        
    def score(self, hidden_tensor, encoder_outputs):
        # project encoder outputs and reshape to [batch, hidden, seq_len]
        energy = tf.nn.tanh(self.dense_layer(encoder_outputs))
        energy = tf.transpose(energy, [0, 2, 1])
        batch_size = tf.shape(encoder_outputs)[0]
        # tile v to [batch, 1, hidden] and reduce the energies to one score per timestep
        v = tf.expand_dims(tf.tile(tf.expand_dims(self.v, 0), [batch_size, 1]), 1)
        energy = tf.matmul(v, energy)
        return tf.squeeze(energy, 1)
    
    def __call__(self, hidden, encoder_outputs):
        seq_len = tf.shape(encoder_outputs)[1]
        batch_size = tf.shape(encoder_outputs)[0]
        # repeat the query state across every encoder timestep
        H = tf.tile(tf.expand_dims(hidden, 1), [1, seq_len, 1])
        attn_energies = self.score(H, encoder_outputs)
        # normalized attention weights, shape [batch, 1, seq_len]
        return tf.expand_dims(tf.nn.softmax(attn_energies), 1)

class Luong(tf.contrib.rnn.RNNCell):
    def __init__(self, hidden_size, output_size, encoder_outputs):
        self.hidden_size = hidden_size
        self.batch_size = tf.shape(encoder_outputs)[0]
        self.gru = tf.contrib.rnn.GRUCell(hidden_size)
        self.attention = Attention(hidden_size)
        self.out = tf.layers.Dense(output_size)
        self.encoder_outputs = encoder_outputs
        self.reset_state()
    
    @property
    def state_size(self):
        return self.hidden_size
 
    @property
    def output_size(self):
        return self.hidden_size
    
    def reset_state(self):
        # attention context carried into the next timestep, initialised to zeros
        self.context = tf.zeros(shape = (self.batch_size, self.hidden_size))
    
    def __call__(self, inputs, state, scope = None):
        # concatenate the previous context with the current input before the GRU
        rnn_input = tf.concat([inputs, self.context], 1)
        output, hidden = self.gru(rnn_input, state)
        # attend over the encoder outputs using the new GRU output as the query
        attn_weights = self.attention(output, self.encoder_outputs)
        self.context = tf.matmul(attn_weights, self.encoder_outputs)[:, 0, :]
        output = tf.concat([output, self.context], 1)
        output = tf.nn.softmax(self.out(output))
        return output, hidden

In [7]:
class Model:
    def __init__(self, size_layer, embedded_size, dict_size, dimension_output, learning_rate):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        # word embeddings, looked up for every token id in the batch
        self.encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(self.encoder_embeddings, self.X)
        # run the custom attention cell over the embedded sequence
        self.luong_cell = Luong(size_layer, size_layer, encoder_embedded)
        outputs, last_states = tf.nn.dynamic_rnn(self.luong_cell, encoder_embedded, dtype = tf.float32)
        # classify from the last timestep's output
        W = tf.get_variable('w', shape = (size_layer, dimension_output), initializer = tf.orthogonal_initializer())
        b = tf.get_variable('b', shape = [dimension_output], initializer = tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
size_layer = 128
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,embedded_size,len(dictionary),dimension_output,learning_rate)
sess.run(tf.global_variables_initializer())


WARNING:tensorflow:From <ipython-input-7-50dce70c2347>:13: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.


In [9]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    # stop once validation accuracy has not improved for EARLY_STOPPING consecutive epochs
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        model.luong_cell.reset_state()
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        model.luong_cell.reset_state()
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.540039
time taken: 9.742317914962769
epoch: 0, training loss: 0.692666, training acc: 0.503551, valid loss: 0.688301, valid acc: 0.540039

epoch: 1, pass acc: 0.540039, current acc: 0.660645
time taken: 9.577836036682129
epoch: 1, training loss: 0.653488, training acc: 0.645952, valid loss: 0.636614, valid acc: 0.660645

epoch: 2, pass acc: 0.660645, current acc: 0.691895
time taken: 9.636506795883179
epoch: 2, training loss: 0.561013, training acc: 0.780777, valid loss: 0.610502, valid acc: 0.691895

time taken: 9.737434148788452
epoch: 3, training loss: 0.484725, training acc: 0.844579, valid loss: 0.607925, valid acc: 0.687988

epoch: 4, pass acc: 0.691895, current acc: 0.698730
time taken: 9.770654678344727
epoch: 4, training loss: 0.428225, training acc: 0.876539, valid loss: 0.606709, valid acc: 0.698730

time taken: 9.933742046356201
epoch: 5, training loss: 0.371753, training acc: 0.911695, valid loss: 0.612761, valid acc: 0.698242

epoch: 6, pass acc: 0.698730, current acc: 0.701172
time taken: 10.066709518432617
epoch: 6, training loss: 0.325556, training acc: 0.931226, valid loss: 0.617917, valid acc: 0.701172

time taken: 10.072763204574585
epoch: 7, training loss: 0.291312, training acc: 0.942590, valid loss: 0.636979, valid acc: 0.699707

epoch: 8, pass acc: 0.701172, current acc: 0.701660
time taken: 10.197040557861328
epoch: 8, training loss: 0.267891, training acc: 0.947325, valid loss: 0.644354, valid acc: 0.701660

epoch: 9, pass acc: 0.701660, current acc: 0.705078
time taken: 10.20655870437622
epoch: 9, training loss: 0.245343, training acc: 0.953480, valid loss: 0.652071, valid acc: 0.705078

time taken: 13.8681321144104
epoch: 10, training loss: 0.228671, training acc: 0.957150, valid loss: 0.667429, valid acc: 0.705078

epoch: 11, pass acc: 0.705078, current acc: 0.708984
time taken: 20.409197330474854
epoch: 11, training loss: 0.215550, training acc: 0.959635, valid loss: 0.676031, valid acc: 0.708984

time taken: 20.692713260650635
epoch: 12, training loss: 0.203894, training acc: 0.961293, valid loss: 0.693618, valid acc: 0.705078

time taken: 20.51321005821228
epoch: 13, training loss: 0.194636, training acc: 0.962831, valid loss: 0.710977, valid acc: 0.704102

time taken: 20.729293823242188
epoch: 14, training loss: 0.187575, training acc: 0.963305, valid loss: 0.726808, valid acc: 0.702637

time taken: 20.588152647018433
epoch: 15, training loss: 0.181914, training acc: 0.963187, valid loss: 0.742926, valid acc: 0.699219

time taken: 20.96327304840088
epoch: 16, training loss: 0.174368, training acc: 0.965199, valid loss: 0.743632, valid acc: 0.708496

break epoch:17


In [10]:
model.luong_cell.reset_state()
logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.71      0.68      0.70      1068
   positive       0.69      0.72      0.71      1065

avg / total       0.70      0.70      0.70      2133
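
As a usage note, the same graph can score a new sentence once training is done. A minimal sketch, assuming the example text is tokenized the way utils expects (plain lower-cased whitespace tokens); the sentence itself is made up:

model.luong_cell.reset_state()
new_text = ['the acting is superb and the story never drags']
new_idx = str_idx(new_text, dictionary, maxlen)
probs = sess.run(tf.nn.softmax(model.logits), feed_dict = {model.X: new_idx})
print(trainset.target_names[np.argmax(probs[0])], probs[0])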