In [1]:
import tensorflow as tf
import numpy as np
from utils import *
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))
ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, 
                                                                               trainset.target, 
                                                                               ONEHOT, test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab from size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [561, 2546, 3559, 15743, 36, 7860, 219, 150, 19, 3709] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [6]:
class Attention:
    def __init__(self,hidden_size):
        self.hidden_size = hidden_size
        self.dense_layer = tf.layers.Dense(hidden_size)
        self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size))
        
    def score(self, hidden_tensor, encoder_outputs):
        energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2)))
        energy = tf.transpose(energy,[0,2,1])
        batch_size = tf.shape(encoder_outputs)[0]
        v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1)
        energy = tf.matmul(v,energy)
        return tf.squeeze(energy,1)
    
    def __call__(self, hidden, encoder_outputs):
        seq_len = tf.shape(encoder_outputs)[1]
        batch_size = tf.shape(encoder_outputs)[0]
        H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1])
        attn_energies = self.score(H,encoder_outputs)
        return tf.expand_dims(tf.nn.softmax(attn_energies),1)

class Bahdanau(tf.contrib.rnn.RNNCell):
    def __init__(self, hidden_size, output_size, encoder_outputs):
        self.hidden_size = hidden_size
        self.gru = tf.contrib.rnn.GRUCell(hidden_size)
        self.attention = Attention(hidden_size)
        self.out = tf.layers.Dense(output_size)
        self.encoder_outputs = encoder_outputs
        self.stack = []
    
    @property
    def state_size(self):
        return self.hidden_size
 
    @property
    def output_size(self):
        return self.hidden_size
    
    def reset_state(self):
        self.stack = []
    
    def __call__(self,inputs, state, scope=None):
        attn_weights = self.attention(state, self.encoder_outputs)
        context = tf.matmul(attn_weights, self.encoder_outputs)[:,0,:]
        rnn_input = tf.concat([inputs,context],1)
        output, hidden = self.gru(rnn_input, state)
        output = tf.nn.softmax(self.out(output))
        return output, hidden
    
    def get_attention(self, inputs, state):
        attn_weights = self.attention(state, self.encoder_outputs)
        self.stack.append(attn_weights)
        context = tf.matmul(attn_weights, self.encoder_outputs)[:,0,:]
        rnn_input = tf.concat([inputs,context],1)
        output, hidden = self.gru(rnn_input, state)
        output = tf.nn.softmax(self.out(output))
        return output, hidden, attn_weights

In [7]:
class Model:
    def __init__(self, size_layer, embedded_size, dict_size, dimension_output, learning_rate):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        self.encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(self.encoder_embeddings, self.X)
        self.bahdanau_cell = Bahdanau(size_layer,size_layer,encoder_embedded)
        outputs, last_states = tf.nn.dynamic_rnn(self.bahdanau_cell, encoder_embedded, dtype = tf.float32)
        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:,-1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
size_layer = 128
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,embedded_size,len(dictionary),dimension_output,learning_rate)
sess.run(tf.global_variables_initializer())


WARNING:tensorflow:From <ipython-input-7-90b23bdd3511>:13: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.


In [9]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        model.bahdanau_cell.reset_state()
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_onehot[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        model.bahdanau_cell.reset_state()
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_onehot[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.534668
time taken: 20.91740655899048
epoch: 0, training loss: 0.692880, training acc: 0.504261, valid loss: 0.691029, valid acc: 0.534668

epoch: 1, pass acc: 0.534668, current acc: 0.653320
time taken: 20.791388750076294
epoch: 1, training loss: 0.663596, training acc: 0.623816, valid loss: 0.646891, valid acc: 0.653320

epoch: 2, pass acc: 0.653320, current acc: 0.687988
time taken: 20.562223434448242
epoch: 2, training loss: 0.583401, training acc: 0.752959, valid loss: 0.616532, valid acc: 0.687988

epoch: 3, pass acc: 0.687988, current acc: 0.698730
time taken: 20.59592342376709
epoch: 3, training loss: 0.503855, training acc: 0.825284, valid loss: 0.600355, valid acc: 0.698730

time taken: 20.402120113372803
epoch: 4, training loss: 0.459829, training acc: 0.841619, valid loss: 0.652106, valid acc: 0.647461

epoch: 5, pass acc: 0.698730, current acc: 0.708984
time taken: 20.807710647583008
epoch: 5, training loss: 0.418119, training acc: 0.860559, valid loss: 0.585371, valid acc: 0.708984

epoch: 6, pass acc: 0.708984, current acc: 0.717285
time taken: 15.171148777008057
epoch: 6, training loss: 0.343612, training acc: 0.911813, valid loss: 0.590517, valid acc: 0.717285

time taken: 11.792047500610352
epoch: 7, training loss: 0.286065, training acc: 0.940341, valid loss: 0.608303, valid acc: 0.716309

time taken: 11.779529333114624
epoch: 8, training loss: 0.250861, training acc: 0.952770, valid loss: 0.620904, valid acc: 0.717285

time taken: 11.797487020492554
epoch: 9, training loss: 0.228651, training acc: 0.957623, valid loss: 0.648205, valid acc: 0.712402

time taken: 11.778848648071289
epoch: 10, training loss: 0.210854, training acc: 0.960938, valid loss: 0.666810, valid acc: 0.712891

epoch: 11, pass acc: 0.717285, current acc: 0.720215
time taken: 11.76455569267273
epoch: 11, training loss: 0.195384, training acc: 0.964134, valid loss: 0.666509, valid acc: 0.720215

time taken: 11.788504838943481
epoch: 12, training loss: 0.187069, training acc: 0.963660, valid loss: 0.688140, valid acc: 0.713379

time taken: 11.83158564567566
epoch: 13, training loss: 0.181791, training acc: 0.962831, valid loss: 0.720466, valid acc: 0.706543

time taken: 11.792386054992676
epoch: 14, training loss: 0.182083, training acc: 0.959872, valid loss: 0.758919, valid acc: 0.695801

time taken: 11.777226209640503
epoch: 15, training loss: 0.165933, training acc: 0.965909, valid loss: 0.720335, valid acc: 0.717285

time taken: 11.784944295883179
epoch: 16, training loss: 0.146761, training acc: 0.973248, valid loss: 0.740853, valid acc: 0.714355

break epoch:17


In [10]:
model.bahdanau_cell.reset_state()
logits = sess.run(model.logits, feed_dict={model.X:str_idx(test_X,dictionary,maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits,1), target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.73      0.69      0.71      1091
   positive       0.70      0.74      0.72      1042

avg / total       0.72      0.71      0.71      2133