In [1]:
import time

import numpy as np
import sklearn.datasets
import tensorflow as tf
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from utils import *


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# Load the labeled text corpus from the data/ directory (one folder per class)
# and keep the full dataset (ratio 1.0) via the project's separate_dataset helper.
trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')
trainset.data, trainset.target = separate_dataset(trainset, 1.0)
for info in (trainset.target_names, len(trainset.data), len(trainset.target)):
    print(info)


['negative', 'positive']
10662
10662

In [3]:
# Dense one-hot label matrix: row i has a 1.0 in the column of sample i's class.
n_samples = len(trainset.data)
n_classes = len(trainset.target_names)
ONEHOT = np.zeros((n_samples, n_classes))
ONEHOT[np.arange(n_samples), trainset.target] = 1.0
# A single train_test_split call keeps texts, integer labels and one-hot
# labels aligned across the same 80/20 shuffle.
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(
    trainset.data,
    trainset.target,
    ONEHOT,
    test_size=0.2,
)

In [4]:
# Flatten the whole corpus into one token stream and build the vocabulary
# mappings (id sequence, frequency counts, word->id and id->word dicts).
concat = ' '.join(trainset.data).split()
vocabulary_size = len(set(concat))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab from size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [547, 2619, 3487, 18663, 36, 8027, 218, 151, 19, 3746] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']

In [5]:
# Ids of the special tokens reserved by build_dataset.
GO, PAD, EOS, UNK = (dictionary[token] for token in ('GO', 'PAD', 'EOS', 'UNK'))

In [9]:
class Model:
    """Stacked-GRU encoder with an additive-attention scoring head.

    The per-timestep attention scores themselves (length ``maxlen`` after
    padding) are fed to a single linear layer as the classifier features.

    Parameters
    ----------
    size_layer : int        -- GRU hidden size per layer
    num_layers : int        -- number of stacked GRU layers
    embedded_size : int     -- word-embedding dimension
    dict_size : int         -- vocabulary size for the embedding table
    dimension_output : int  -- number of target classes
    learning_rate : float   -- Adam step size
    maxlen : int            -- fixed padded sequence length; inputs fed to
                               ``self.X`` must have exactly this many timesteps
                               (see the shape of ``W`` below)
    """
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate, maxlen):
        
        def cells(reuse=False):
            # One GRU cell per stacked layer.
            return tf.nn.rnn_cell.GRUCell(size_layer,reuse=reuse)
        
        # X: [batch, time] int32 token ids; Y: [batch, dimension_output] one-hot.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        # outputs: [batch, time, size_layer] hidden states at every timestep.
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
        
        # Additive (Bahdanau-style) attention parameters:
        # U projects RNN states, V projects the query, Av reduces to a scalar score.
        U = tf.get_variable('U', 
            shape=[size_layer, embedded_size],
            dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-0.01, 0.01))
        V = tf.get_variable('V', 
            shape=[embedded_size, embedded_size],
            dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-0.01, 0.01))
        Av = tf.get_variable('Av', 
            shape=[embedded_size, 1],
            dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-0.01, 0.01))
        
        timesteps = tf.shape(outputs)[1]
        
        # Project every RNN state: [batch, time, embedded_size].
        ref_proj = tf.reshape(tf.matmul(tf.reshape(outputs, [-1, size_layer]), U),[-1, timesteps, embedded_size])
        # NOTE(review): the attention query is the EMBEDDING of the last input
        # token (encoder_embedded[:,-1]), not the last RNN output — with
        # right-padded inputs this is usually the PAD embedding; confirm intended.
        hi = tf.expand_dims(tf.matmul(encoder_embedded[:,-1], V), axis=1)
        blended = (ref_proj + hi)
        # Raw (un-normalized) attention score per timestep: [batch, time].
        self.scores = tf.reshape(tf.matmul(tf.reshape(blended, [-1, embedded_size]),Av),[-1, timesteps])
        # W is [maxlen, dimension_output]: the scores are used directly as
        # features, so the matmul below requires timesteps == maxlen (str_idx
        # pads every batch to maxlen).
        W = tf.get_variable('w',shape=(maxlen, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        self.logits = tf.matmul(self.scores, W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
# Hyperparameters (referenced by the training loop below).
size_layer = 128        # GRU hidden units per layer
num_layers = 2          # stacked GRU layers
embedded_size = 128     # word-embedding dimension
dimension_output = len(trainset.target_names)  # number of classes
learning_rate = 1e-3    # Adam step size
maxlen = 50             # fixed padded sequence length (must match Model's W)
batch_size = 128
# Fresh graph + session so re-running this cell does not duplicate variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate,maxlen)
sess.run(tf.global_variables_initializer())

In [11]:
# Train until validation accuracy fails to improve for EARLY_STOPPING
# consecutive epochs; partial trailing batches are dropped on both splits.
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    n_train_batches = len(train_X) // batch_size
    n_test_batches = len(test_X) // batch_size

    # One optimization pass over the training split.
    for b in range(n_train_batches):
        lo = b * batch_size
        hi = lo + batch_size
        batch_x = str_idx(train_X[lo:hi], dictionary, maxlen)
        acc, loss, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict={model.X: batch_x, model.Y: train_onehot[lo:hi]})
        train_loss += loss
        train_acc += acc

    # Evaluation-only pass over the held-out split (no optimizer op).
    for b in range(n_test_batches):
        lo = b * batch_size
        hi = lo + batch_size
        batch_x = str_idx(test_X[lo:hi], dictionary, maxlen)
        acc, loss = sess.run(
            [model.accuracy, model.cost],
            feed_dict={model.X: batch_x, model.Y: test_onehot[lo:hi]})
        test_loss += loss
        test_acc += acc

    # Average the per-batch sums into per-epoch metrics.
    train_loss /= n_train_batches
    train_acc /= n_train_batches
    test_loss /= n_test_batches
    test_acc /= n_test_batches

    # Reset the patience counter whenever validation accuracy improves.
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.483398
time taken: 3.0111024379730225
epoch: 0, training loss: 0.693123, training acc: 0.509706, valid loss: 0.694546, valid acc: 0.483398

epoch: 1, pass acc: 0.483398, current acc: 0.519043
time taken: 2.975780963897705
epoch: 1, training loss: 0.681662, training acc: 0.564039, valid loss: 0.693371, valid acc: 0.519043

epoch: 2, pass acc: 0.519043, current acc: 0.534180
time taken: 2.9888012409210205
epoch: 2, training loss: 0.651196, training acc: 0.625829, valid loss: 0.700180, valid acc: 0.534180

epoch: 3, pass acc: 0.534180, current acc: 0.537109
time taken: 2.95375657081604
epoch: 3, training loss: 0.610450, training acc: 0.671520, valid loss: 0.725243, valid acc: 0.537109

epoch: 4, pass acc: 0.537109, current acc: 0.546875
time taken: 2.952038288116455
epoch: 4, training loss: 0.563615, training acc: 0.706321, valid loss: 0.755379, valid acc: 0.546875

epoch: 5, pass acc: 0.546875, current acc: 0.633789
time taken: 2.9598748683929443
epoch: 5, training loss: 0.493952, training acc: 0.758523, valid loss: 0.703989, valid acc: 0.633789

epoch: 6, pass acc: 0.633789, current acc: 0.675781
time taken: 2.95723557472229
epoch: 6, training loss: 0.326752, training acc: 0.861742, valid loss: 0.803185, valid acc: 0.675781

time taken: 2.980271816253662
epoch: 7, training loss: 0.171324, training acc: 0.934304, valid loss: 1.110790, valid acc: 0.669922

epoch: 8, pass acc: 0.675781, current acc: 0.681152
time taken: 2.9620633125305176
epoch: 8, training loss: 0.074807, training acc: 0.975379, valid loss: 1.592094, valid acc: 0.681152

time taken: 2.9621477127075195
epoch: 9, training loss: 0.032625, training acc: 0.990294, valid loss: 1.895876, valid acc: 0.660156

time taken: 2.9880611896514893
epoch: 10, training loss: 0.014231, training acc: 0.996094, valid loss: 2.302230, valid acc: 0.679688

epoch: 11, pass acc: 0.681152, current acc: 0.684570
time taken: 2.9962332248687744
epoch: 11, training loss: 0.007939, training acc: 0.997277, valid loss: 2.728962, valid acc: 0.684570

time taken: 2.945730209350586
epoch: 12, training loss: 0.003370, training acc: 0.999645, valid loss: 2.733052, valid acc: 0.680664

epoch: 13, pass acc: 0.684570, current acc: 0.686035
time taken: 2.9486236572265625
epoch: 13, training loss: 0.000349, training acc: 1.000000, valid loss: 2.926276, valid acc: 0.686035

epoch: 14, pass acc: 0.686035, current acc: 0.687500
time taken: 2.950187921524048
epoch: 14, training loss: 0.000149, training acc: 1.000000, valid loss: 3.054515, valid acc: 0.687500

epoch: 15, pass acc: 0.687500, current acc: 0.687988
time taken: 2.9908742904663086
epoch: 15, training loss: 0.000103, training acc: 1.000000, valid loss: 3.148623, valid acc: 0.687988

time taken: 2.9619953632354736
epoch: 16, training loss: 0.000078, training acc: 1.000000, valid loss: 3.225630, valid acc: 0.687988

time taken: 2.985590696334839
epoch: 17, training loss: 0.000062, training acc: 1.000000, valid loss: 3.292313, valid acc: 0.687988

epoch: 18, pass acc: 0.687988, current acc: 0.688965
time taken: 3.023634910583496
epoch: 18, training loss: 0.000051, training acc: 1.000000, valid loss: 3.351618, valid acc: 0.688965

time taken: 2.978372097015381
epoch: 19, training loss: 0.000043, training acc: 1.000000, valid loss: 3.405291, valid acc: 0.688965

epoch: 20, pass acc: 0.688965, current acc: 0.689453
time taken: 2.979339599609375
epoch: 20, training loss: 0.000037, training acc: 1.000000, valid loss: 3.454500, valid acc: 0.689453

time taken: 2.9494800567626953
epoch: 21, training loss: 0.000032, training acc: 1.000000, valid loss: 3.500069, valid acc: 0.689453

epoch: 22, pass acc: 0.689453, current acc: 0.689941
time taken: 2.9744861125946045
epoch: 22, training loss: 0.000028, training acc: 1.000000, valid loss: 3.542605, valid acc: 0.689941

time taken: 3.0208091735839844
epoch: 23, training loss: 0.000025, training acc: 1.000000, valid loss: 3.582568, valid acc: 0.689941

time taken: 2.9876980781555176
epoch: 24, training loss: 0.000022, training acc: 1.000000, valid loss: 3.620321, valid acc: 0.689941

epoch: 25, pass acc: 0.689941, current acc: 0.690918
time taken: 2.9925127029418945
epoch: 25, training loss: 0.000020, training acc: 1.000000, valid loss: 3.656149, valid acc: 0.690918

epoch: 26, pass acc: 0.690918, current acc: 0.691406
time taken: 2.9557723999023438
epoch: 26, training loss: 0.000018, training acc: 1.000000, valid loss: 3.690288, valid acc: 0.691406

time taken: 2.986029624938965
epoch: 27, training loss: 0.000016, training acc: 1.000000, valid loss: 3.722929, valid acc: 0.690918

time taken: 2.980065107345581
epoch: 28, training loss: 0.000015, training acc: 1.000000, valid loss: 3.754234, valid acc: 0.690918

time taken: 2.985074043273926
epoch: 29, training loss: 0.000014, training acc: 1.000000, valid loss: 3.784340, valid acc: 0.691406

time taken: 2.995537281036377
epoch: 30, training loss: 0.000013, training acc: 1.000000, valid loss: 3.813363, valid acc: 0.691406

epoch: 31, pass acc: 0.691406, current acc: 0.691895
time taken: 2.981157064437866
epoch: 31, training loss: 0.000012, training acc: 1.000000, valid loss: 3.841403, valid acc: 0.691895

time taken: 2.979064464569092
epoch: 32, training loss: 0.000011, training acc: 1.000000, valid loss: 3.868545, valid acc: 0.691895

time taken: 2.9751744270324707
epoch: 33, training loss: 0.000010, training acc: 1.000000, valid loss: 3.894866, valid acc: 0.691895

time taken: 2.954972982406616
epoch: 34, training loss: 0.000009, training acc: 1.000000, valid loss: 3.920431, valid acc: 0.691895

time taken: 2.9637160301208496
epoch: 35, training loss: 0.000008, training acc: 1.000000, valid loss: 3.945302, valid acc: 0.691895

epoch: 36, pass acc: 0.691895, current acc: 0.692383
time taken: 2.9549238681793213
epoch: 36, training loss: 0.000008, training acc: 1.000000, valid loss: 3.969530, valid acc: 0.692383

epoch: 37, pass acc: 0.692383, current acc: 0.692871
time taken: 2.950207233428955
epoch: 37, training loss: 0.000007, training acc: 1.000000, valid loss: 3.993162, valid acc: 0.692871

time taken: 2.9671473503112793
epoch: 38, training loss: 0.000007, training acc: 1.000000, valid loss: 4.016239, valid acc: 0.692871

time taken: 2.946089744567871
epoch: 39, training loss: 0.000006, training acc: 1.000000, valid loss: 4.038800, valid acc: 0.692871

time taken: 2.949782133102417
epoch: 40, training loss: 0.000006, training acc: 1.000000, valid loss: 4.060879, valid acc: 0.692871

epoch: 41, pass acc: 0.692871, current acc: 0.693359
time taken: 2.9578158855438232
epoch: 41, training loss: 0.000006, training acc: 1.000000, valid loss: 4.082506, valid acc: 0.693359

time taken: 3.0046489238739014
epoch: 42, training loss: 0.000005, training acc: 1.000000, valid loss: 4.103708, valid acc: 0.693359

time taken: 2.9298746585845947
epoch: 43, training loss: 0.000005, training acc: 1.000000, valid loss: 4.124513, valid acc: 0.693359

time taken: 3.0205276012420654
epoch: 44, training loss: 0.000005, training acc: 1.000000, valid loss: 4.144944, valid acc: 0.693359

time taken: 2.9677155017852783
epoch: 45, training loss: 0.000004, training acc: 1.000000, valid loss: 4.165024, valid acc: 0.693359

epoch: 46, pass acc: 0.693359, current acc: 0.693848
time taken: 2.962214946746826
epoch: 46, training loss: 0.000004, training acc: 1.000000, valid loss: 4.184771, valid acc: 0.693848

time taken: 2.97208571434021
epoch: 47, training loss: 0.000004, training acc: 1.000000, valid loss: 4.204202, valid acc: 0.693848

time taken: 2.961679220199585
epoch: 48, training loss: 0.000004, training acc: 1.000000, valid loss: 4.223339, valid acc: 0.693848

time taken: 2.9789092540740967
epoch: 49, training loss: 0.000003, training acc: 1.000000, valid loss: 4.242197, valid acc: 0.693848

time taken: 2.9738428592681885
epoch: 50, training loss: 0.000003, training acc: 1.000000, valid loss: 4.260787, valid acc: 0.693359

time taken: 2.9665794372558594
epoch: 51, training loss: 0.000003, training acc: 1.000000, valid loss: 4.279125, valid acc: 0.693359

break epoch:52


In [12]:
# Fix: `metrics` was never imported anywhere in this notebook, so this cell
# fails under Restart & Run All; import it explicitly here.
from sklearn import metrics

# Score the entire test split in one forward pass and report per-class
# precision/recall/F1 against the integer labels.
logits = sess.run(model.logits, feed_dict={model.X: str_idx(test_X, dictionary, maxlen)})
print(metrics.classification_report(test_Y, np.argmax(logits, 1), target_names=trainset.target_names))


             precision    recall  f1-score   support

   negative       0.71      0.67      0.69      1105
   positive       0.67      0.71      0.69      1028

avg / total       0.69      0.69      0.69      2133


In [ ]: