In [1]:
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
train_X, test_X, train_Y, test_Y = train_test_split(trainset.data, trainset.target, test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab from size: 20332
Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]
Sample data [543, 2605, 3331, 18780, 36, 7289, 218, 150, 19, 4121] ['rock', 'destined', '21st', 'centurys', 'new', 'conan', 'hes', 'going', 'make', 'splash']

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [6]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128

In [7]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate):
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        batch_size = tf.shape(self.X)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        outputs, last_state = tf.nn.dynamic_rnn(tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
                                                encoder_embedded, dtype = tf.float32)
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units = size_layer, 
                                                                    memory = outputs)
        rnn_cells = tf.contrib.seq2seq.AttentionWrapper(cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
                                                        attention_mechanism = attention_mechanism,
                                                        attention_layer_size = size_layer,
                                                       alignment_history=True)
        decoder_outputs, decoder_last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, 
                                                                initial_state = rnn_cells.zero_state(batch_size, tf.float32).clone(cell_state=last_state),
                                                                dtype = tf.float32)
        self.alignments = tf.transpose(decoder_last_state.alignment_history.stack(),[1,2,0])
        W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),dimension_output,learning_rate)
sess.run(tf.global_variables_initializer())

In [9]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break
        
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i:i+batch_size],dictionary,maxlen)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X : batch_x, model.Y : train_Y[i:i+batch_size]})
        train_loss += loss
        train_acc += acc
    
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i:i+batch_size],dictionary,maxlen)
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X : batch_x, model.Y : test_Y[i:i+batch_size]})
        test_loss += loss
        test_acc += acc
    
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))
    EPOCH += 1


epoch: 0, pass acc: 0.000000, current acc: 0.595703
time taken: 2.8624331951141357
epoch: 0, training loss: 0.684345, training acc: 0.543797, valid loss: 0.664340, valid acc: 0.595703

epoch: 1, pass acc: 0.595703, current acc: 0.662109
time taken: 2.77538800239563
epoch: 1, training loss: 0.585240, training acc: 0.692235, valid loss: 0.626291, valid acc: 0.662109

epoch: 2, pass acc: 0.662109, current acc: 0.675293
time taken: 2.777212381362915
epoch: 2, training loss: 0.431262, training acc: 0.807765, valid loss: 0.638474, valid acc: 0.675293

epoch: 3, pass acc: 0.675293, current acc: 0.688477
time taken: 2.793844699859619
epoch: 3, training loss: 0.293421, training acc: 0.875710, valid loss: 0.705397, valid acc: 0.688477

epoch: 4, pass acc: 0.688477, current acc: 0.698242
time taken: 2.8104021549224854
epoch: 4, training loss: 0.185434, training acc: 0.929569, valid loss: 1.045832, valid acc: 0.698242

time taken: 2.790931463241577
epoch: 5, training loss: 0.123517, training acc: 0.955492, valid loss: 1.152864, valid acc: 0.695801

epoch: 6, pass acc: 0.698242, current acc: 0.701172
time taken: 2.7840640544891357
epoch: 6, training loss: 0.065245, training acc: 0.980824, valid loss: 1.655402, valid acc: 0.701172

epoch: 7, pass acc: 0.701172, current acc: 0.706055
time taken: 2.7694666385650635
epoch: 7, training loss: 0.033158, training acc: 0.989228, valid loss: 1.710513, valid acc: 0.706055

epoch: 8, pass acc: 0.706055, current acc: 0.711426
time taken: 2.7587289810180664
epoch: 8, training loss: 0.018099, training acc: 0.994437, valid loss: 1.959445, valid acc: 0.711426

time taken: 2.773940324783325
epoch: 9, training loss: 0.014265, training acc: 0.995739, valid loss: 2.054330, valid acc: 0.694336

time taken: 2.797513723373413
epoch: 10, training loss: 0.013101, training acc: 0.997041, valid loss: 2.234490, valid acc: 0.700195

time taken: 2.780038833618164
epoch: 11, training loss: 0.005318, training acc: 0.998580, valid loss: 2.492469, valid acc: 0.707520

time taken: 2.7738423347473145
epoch: 12, training loss: 0.002818, training acc: 0.999171, valid loss: 2.571098, valid acc: 0.699219

time taken: 2.7855167388916016
epoch: 13, training loss: 0.002345, training acc: 0.999527, valid loss: 2.692470, valid acc: 0.690430

break epoch:14


In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [11]:
heatmap=sess.run(model.alignments,feed_dict={model.X:str_idx(test_X[1:2],dictionary,len(test_X[1].split()))})

In [12]:
plt.figure(figsize=(15,10))
sns.heatmap(heatmap[:,0,:],
           xticklabels=test_X[1].split(),yticklabels=test_X[1].split())
plt.show()