In [1]:
from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time
import random


/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))


['negative', 'positive']
10662
10662

In [3]:
train_X, test_X, train_Y, test_Y = train_test_split(trainset.data, trainset.target,
                                                    test_size = 0.2)

In [4]:
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


vocab from size: 20465
Most common words [('the', 10129), ('a', 7312), ('and', 6199), ('of', 6063), ('to', 4233), ('is', 3378)]
Sample data [4, 663, 9, 2542, 8, 22, 4, 3378, 17841, 97] ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'centurys', 'new']

In [5]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [6]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output,margin=0.2):
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.BasicRNNCell(size_layer,reuse=reuse)
        
        def rnn(embedded,reuse=False):
            with tf.variable_scope('model', reuse=reuse):
                rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
                outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype = tf.float32)
                W = tf.get_variable('w',shape=(size_layer, dimension_output),initializer=tf.orthogonal_initializer())
                b = tf.get_variable('b',shape=(dimension_output),initializer=tf.zeros_initializer())
                return tf.matmul(outputs[:, -1], W) + b
            
        with tf.device('/cpu:0'):    
            self.INPUT_1 = tf.placeholder(tf.int32, [None, None])
            self.INPUT_2 = tf.placeholder(tf.int32, [None, None])
            self.Y = tf.placeholder(tf.float32, [None, 1])
            encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
            input1_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.INPUT_1)
            input2_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.INPUT_2)
            self.logits_1 = rnn(input1_embedded,False)
            self.logits_2 = rnn(input2_embedded,True)
            d = tf.sqrt(tf.reduce_sum(tf.pow(self.logits_1-self.logits_2, 2), 1, keep_dims=True))
            tmp = self.Y * tf.square(d)    
            tmp2 = (1 - self.Y) * tf.square(tf.maximum((margin - d),0))
            self.cost = tf.reduce_mean(tmp + tmp2) /2
            self.optimizer = tf.train.MomentumOptimizer(0.01, 0.99, use_nesterov=True).minimize(self.cost)

In [7]:
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = 32
maxlen = 50
batch_size = 128

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,vocabulary_size+4,dimension_output)
sess.run(tf.global_variables_initializer())

In [9]:
c = list(zip(train_X, train_Y))
random.shuffle(c)
train_X_1, train_Y_1 = zip(*c)

c = list(zip(train_X, train_Y))
random.shuffle(c)
train_X_2, train_Y_2 = zip(*c)

label_shuffle = np.expand_dims((np.array(train_Y_1) == np.array(train_Y_2)).astype('int'),1)

In [10]:
for i in range(50):
    total_loss = 0
    lasttime = time.time()
    for k in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x_1 = str_idx(train_X_1[i:i+batch_size],dictionary,maxlen)
        batch_x_2 = str_idx(train_X_2[i:i+batch_size],dictionary,maxlen)
        batch_y = label_shuffle[i:i+batch_size]
        loss, _ = sess.run([model.cost,model.optimizer],feed_dict={model.INPUT_1:batch_x_1,
                                                                 model.INPUT_2:batch_x_2,
                                                                 model.Y:batch_y})
        total_loss += loss
    total_loss /= (len(train_X) // batch_size)
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f\n'%(i,total_loss))


time taken: 7.159314870834351
epoch: 0, training loss: 0.371119

time taken: 7.075716257095337
epoch: 1, training loss: 0.005651

time taken: 7.074535369873047
epoch: 2, training loss: 0.003678

time taken: 7.048555135726929
epoch: 3, training loss: 0.002070

time taken: 7.1200714111328125
epoch: 4, training loss: 0.000913

time taken: 7.097279787063599
epoch: 5, training loss: 0.000275

time taken: 7.189687728881836
epoch: 6, training loss: 0.000149

time taken: 7.128118515014648
epoch: 7, training loss: 0.000183

time taken: 7.178845405578613
epoch: 8, training loss: 0.000073

time taken: 7.181195974349976
epoch: 9, training loss: 0.000084

time taken: 7.181663990020752
epoch: 10, training loss: 0.000042

time taken: 7.173894882202148
epoch: 11, training loss: 0.000089

time taken: 7.198666095733643
epoch: 12, training loss: 0.000019

time taken: 7.187653064727783
epoch: 13, training loss: 0.000041

time taken: 7.254589319229126
epoch: 14, training loss: 0.000016

time taken: 7.234525918960571
epoch: 15, training loss: 0.000020

time taken: 7.272420883178711
epoch: 16, training loss: 0.000047

time taken: 7.242069721221924
epoch: 17, training loss: 0.000019

time taken: 7.216881275177002
epoch: 18, training loss: 0.000018

time taken: 7.233326196670532
epoch: 19, training loss: 0.000023

time taken: 7.3419716358184814
epoch: 20, training loss: 0.000152

time taken: 7.247962951660156
epoch: 21, training loss: 0.000361

time taken: 7.204127550125122
epoch: 22, training loss: 0.000110

time taken: 7.303826570510864
epoch: 23, training loss: 0.000189

time taken: 7.336746692657471
epoch: 24, training loss: 0.000092

time taken: 7.272712707519531
epoch: 25, training loss: 0.000052

time taken: 7.328545331954956
epoch: 26, training loss: 0.000023

time taken: 7.156804084777832
epoch: 27, training loss: 0.000019

time taken: 7.222546577453613
epoch: 28, training loss: 0.000051

time taken: 7.252626657485962
epoch: 29, training loss: 0.000023

time taken: 7.3687944412231445
epoch: 30, training loss: 0.000084

time taken: 7.205474615097046
epoch: 31, training loss: 0.000109

time taken: 7.292163133621216
epoch: 32, training loss: 0.000102

time taken: 7.217748641967773
epoch: 33, training loss: 0.000041

time taken: 7.308082580566406
epoch: 34, training loss: 0.000016

time taken: 7.210784912109375
epoch: 35, training loss: 0.000069

time taken: 7.206620693206787
epoch: 36, training loss: 0.000019

time taken: 7.314574956893921
epoch: 37, training loss: 0.000008

time taken: 7.243310928344727
epoch: 38, training loss: 0.000010

time taken: 7.3154518604278564
epoch: 39, training loss: 0.000038

time taken: 7.164898872375488
epoch: 40, training loss: 0.000040

time taken: 7.209126234054565
epoch: 41, training loss: 0.000066

time taken: 7.138073921203613
epoch: 42, training loss: 0.000025

time taken: 7.223880767822266
epoch: 43, training loss: 0.000138

time taken: 7.107582330703735
epoch: 44, training loss: 0.000226

time taken: 7.197932958602905
epoch: 45, training loss: 0.000137

time taken: 7.154123783111572
epoch: 46, training loss: 0.000055

time taken: 7.160715103149414
epoch: 47, training loss: 0.000101

time taken: 7.12724232673645
epoch: 48, training loss: 0.000039

time taken: 7.135353326797485
epoch: 49, training loss: 0.000095


In [11]:
from scipy.spatial.distance import cdist

batch_x = str_idx(train_X_1,dictionary,maxlen)
batch_y = str_idx(test_X, dictionary,maxlen)

In [12]:
logits_train = sess.run(model.logits_1,feed_dict={model.INPUT_1:batch_x})
logits_test = sess.run(model.logits_1,feed_dict={model.INPUT_1:batch_y})

In [26]:
label_test = []
for i in range(logits_test.shape[0]):
    label_test.append(train_Y_1[np.argsort(cdist(logits_train, [logits_test[i,:]], 'cosine').ravel())[0]])

In [28]:
print(metrics.classification_report(test_Y, label_test, target_names = trainset.target_names))


             precision    recall  f1-score   support

   negative       0.52      0.51      0.52      1068
   positive       0.52      0.52      0.52      1065

avg / total       0.52      0.52      0.52      2133


In [ ]: