In [3]:
from __future__ import division, print_function, absolute_import

import tensorflow as tf
import tflearn
import pickle
import numpy as np
from tensorflow.contrib import learn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression

In [4]:
# Load preprocessed datasets
with open('preprocess_x_2.pickle', 'rb') as handle:
    x_shuffled = pickle.load(handle)

with open('preprocess_y_2.pickle', 'rb') as handle:
    y_shuffled = pickle.load(handle)

print ("Files loaded.")
print ("x_shuffled size: {:d}".format(len(x_shuffled)))
print ("y_shuffled size: {:d}".format(len(y_shuffled)))


Files loaded.
x_shuffled size: 108
y_shuffled size: 108
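
As a quick sanity check on the pickled arrays (a minimal sketch; it assumes y_shuffled is a one-hot array of shape (n_samples, 2), which the dimension printouts in the split cell below suggest):

In [ ]:
# Sanity-check shapes and class balance of the loaded data
x_arr = np.asarray(x_shuffled)
y_arr = np.asarray(y_shuffled)
print("x_shuffled shape:", x_arr.shape)
print("y_shuffled shape:", y_arr.shape)
print("Samples per class:", y_arr.sum(axis=0))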

In [5]:
# Split train/test set
from sklearn.model_selection import StratifiedShuffleSplit

# Note: with n_splits=10 this loop walks through ten random stratified splits,
# and only the last one is kept as the train/validation split used below.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
for train_ind, test_ind in sss.split(x_shuffled, y_shuffled):
    print("TRAIN:", train_ind, "TEST:", test_ind)
    X_train, X_val = x_shuffled[train_ind], x_shuffled[test_ind]
    y_train, y_val = y_shuffled[train_ind], y_shuffled[test_ind]

print("Train features dimensions: {:d}, {:d}".format(*X_train.shape))
print("Train labels dimensions: {:d}, {:d}".format(*y_train.shape))
print("Test features dimensions: {:d}, {:d}".format(*X_val.shape))
print("Test labels dimensions: {:d}, {:d}".format(*y_val.shape))


TRAIN: [ 47  67  10  73  15   0  83  80  11  91  36  34  18  12 100  37  35  48
  33  65  20  81  72  68  23  45 103  88  68  84  45  84  18  53   7  55
  98  74  95  30 103  62  96  96  28  64  80  67  82  10  15   7  31  92
  30  55  39  44  89  65   9  22  49  76  79  78   5  53   9  81  97  16
  86 107  69  70  42   4  69  99  25  70  79  38  60  26  40  56  17  63
  59  39  24 101  90  66  16] TEST: [43 27 66 73 19 90 19 57 54 29 88]
TRAIN: [ 74  17  26  19  87   2  52  55  74  10  30  34  77   6  51  48   0  86
  44  70  61  94  56  39  78  13  65  37   9  55  11  43 105  13   5  40
  83  69  45 101  89  14  99  58  70  85  35   4  81   7  50   1  67   2
  21  93  58  15   6  19  99  42 104  41  14  79  73  12   7  98 106  82
  71  95  37  24  80  27  10  20  36  93  90 100  49  40  72  92 102  84
  72  76  67  33  81  59  64] TEST: [23 76 49 71 94 25 44 69 83 12  3]
TRAIN: [ 54  54  86  37  31  26  61  57 105 105  85  48  20  87  99  90  83   8
  60  58  86  73 101  41  27  35  40  21  18  19   9 106  77  69   6  17
  65  72  45  82  43  91  61  92   1 106  67   6  24  80  21  87  43  23
  55  25  67  29  68 101  24  44  39  75   3  51  88 107  82  66   7  14
  42  98   0  58  36  22  70  68  97  10 104  39  34  36 103  33  64  89
  81  41  32  97  80  38  70] TEST: [107  90  30   4  93  76  50  15  94   5  34]
TRAIN: [ 65  91  46   2  84  31   6  96 102  42  31  32  33 104  99  63  66  65
  13  70  81  87  91  27  20  26  95  64  16  92  80  50  51  47  30  17
  95   2  60  72 101  98  57  99  83  78  96  90  28  29  60  47   3  23
  24  55 106  87  15  88  67  41   6  21  48  49  58  38   7  85  79  84
  54  82  44  80  52  57  11  40  15  86  19  27   1  35  88  43   5  18
  32  69  56  45 105 100  82] TEST: [ 34   9   7  55  38  97  86  76  53 103  51]
TRAIN: [ 35  47  29  59   5  78 104  49  63  62 103  99  43  15  73  51  48  57
 100  26  90  19  55  50  91   2  91  60  88  80  65  83  13  67  90  42
  51  74  87  94  97  54  65  87  69  40  37 102  81  18  77  98  93 107
  31  73  49  66   0  56  79  98  20   6  83  66  45  86   0  92  61  62
  93  19   4  52  14  12   3   8  27  64  75  60  27  67  10  22  53  68
  92  76   7   7  57  18  95] TEST: [33 34 76 46 29  4 21 32 23 55  1]
TRAIN: [ 96  67  39  64   8  60  94  24   5  47  28  33   4  86  42  32 105   3
  80  75  27  48  57  90  39   5  32  89  15  69  41  89  66  10  77  53
  95  37   2  98  55  15  44  43  18 101  23  72 106  21  81   3  56  33
  36  48  40 102  54  61  38   4   6  42  80  22  35  50  14 107  88  40
  20  75  58  51   7  56  97 107  53  91   9  66   0 102  82  13  50  97
  94   0  78 104  25  87  18] TEST: [51 54 73 61 65 35 44 83 74  6 31]
TRAIN: [ 86  29  93  31   6  96  23  80  55  42  45  38  57  51  73   4  56  50
  23  54  92  24  41  82   2  21  35   6  36  60  66  26  44  40  99  85
  47  70  38  52  54  58  16  33  24  61  41  28  71  17  58   5   2  51
  53  27  74  50 100  95  81  80  15  19  97  20  96  59  76  32  37  94
  11  78 100  98  17  85  11  81  39  15  29  21  33  46  90  75  49   3
  26  72  62  89  12  86  18] TEST: [106  25   8  57   9  28   8   1 104  83  99]
TRAIN: [ 75  47  96  45  38  21 106  39  56  87  74  72  92  77  37   3  36  42
  23  33  41   6  58 103  91  94  60  24  41   4 101  86  10   0  61   8
   3 104  47  57  64   5  10  25  89  80   8  78  38  50  37  46   0  84
  23  58  91  43 105  86 103  45  49  15  17  32  82  70  26  13  16  28
  18  76  48  54  51  52  62  55  53  69  25  14  36  81  95  75  97  18
  33 106  99  68  56   9  74] TEST: [98 39 22 60 50 52 42 62 55 92 76]
TRAIN: [ 48  67  60  96  43  39  34 106  58  49  66  83 105  28  25   5  64  48
  21  79  22  57  32  28  60  37  30  56  75 103  33  65  64  14  69   2
  47   4   0  92  95  44   8   3  10   1  41  32  36  51  59   7   0  13
  80  62  38  96  47   6  18  42  68  74   5  34  27 101  31  99 104  26
  76  63   4  86  91  41  99  72  19  46  87 103  27  70  23 102  78  70
 107  49  87  55  68  24  61] TEST: [18 42 17 29 50 94 92 30 53 78 81]
TRAIN: [104  87  66  99  91  96  90  94  88  11  60  45  37  28  77   3  47  14
  71  54   6  39  55  42  85   8  66  13  30  28  12  71  33  36  96  99
  41  30 106  17  29  10   7 107  54  49  86  22 100  23  83  81  90 101
  35  61  44  78  79   9  68  44   1  80  80  72  52  64  91  87 105 102
  36  48  93  53  38  63  79  89  77  81  74  86  74  23  46 100  94  52
  82 104  53  69   0  35  18] TEST: [ 41  21 102   5  70  76  16  82  92  51  34]
Train features dimensions: 97, 8405
Train labels dimensions: 97, 2
Test features dimensions: 11, 8405
Test labels dimensions: 11, 2
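
As noted in the cell above, only the last of the ten splits is retained. If a single hold-out split is all that is needed, a one-split variant (a sketch, using the same 90/10 stratified protocol and one-hot labels as above) would be:

In [ ]:
sss_single = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_ind, test_ind = next(sss_single.split(x_shuffled, y_shuffled))
X_train, X_val = x_shuffled[train_ind], x_shuffled[test_ind]
y_train, y_val = y_shuffled[train_ind], y_shuffled[test_ind]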

In [14]:
'''# Building convolutional network (TFLearn version; commented out after running, output kept below)
network = input_data(shape=[None, 407], name='input')
# Map each vocabulary index to a 128-dimensional embedding vector
network = tflearn.embedding(network, input_dim=3800, output_dim=128)
branch1 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 6, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 7, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
# Insert a dummy dimension (axis 2) so global_max_pool receives a 4-D tensor
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')

# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(X_train, y_train, 
          n_epoch = 5, shuffle=True, 
          validation_set=(X_val, y_val), 
          show_metric=True, batch_size=32, 
          run_id='oc_1')'''


Training Step: 19  | total loss: 0.56784 | time: 1.757s
| Adam | epoch: 005 | loss: 0.56784 - acc: 0.8582 -- iter: 96/97
Training Step: 20  | total loss: 0.55129 | time: 2.790s
| Adam | epoch: 005 | loss: 0.55129 - acc: 0.9038 | val_loss: 0.56559 - val_acc: 0.8182 -- iter: 97/97
--

In [ ]:
'''# TFLearn bi-directional RNN (commented out, no recorded run)
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

net = input_data(shape=[None, 8405])
net = embedding(net, input_dim=15299, output_dim=256)
net = bidirectional_rnn(net, BasicLSTMCell(256), BasicLSTMCell(256))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', learning_rate=0.0001, loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
model.fit(X_train, 
          y_train, 
          n_epoch=5,
          shuffle=True,
          validation_set=(X_val, y_val),
          show_metric=True, 
          batch_size=16,
          run_id='os_1'
         )'''

In [ ]:
'''# TFLearn LSTM (commented out, no recorded run)
from __future__ import division, print_function, absolute_import

import tflearn

net = tflearn.input_data([None, 8405])
net = tflearn.embedding(net, input_dim=15299, output_dim=256)
net = tflearn.lstm(net, 256, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, y_train, validation_set=(X_val, y_val), show_metric=True,
          batch_size=16)'''

In [23]:
# Build TensorFlow model
# Note: this graph uses the pre-1.0 TensorFlow API (tf.concat(dim, values),
# positional softmax_cross_entropy_with_logits, tf.initialize_all_variables, tf.contrib.*).
sequence_length = X_train.shape[1]
num_classes = y_train.shape[1]
vocab_size = 15299
#embedding_size = 128
embedding_size = 300 # for word2vec
#filter_sizes = [5, 6, 7]
filter_sizes = [3, 5, 7]
num_filters = 256
l2_reg_lambda = 0.0

graph = tf.Graph()
with graph.as_default():
    # Placeholders for input, output and dropout
    input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
    input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Keep track of L2 regularization loss
    l2_loss = tf.constant(0.0)

    # Build model
    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        W_em = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W_em")
        embedded_chars = tf.nn.embedding_lookup(W_em, input_x)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

    # Create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W-%s" % filter_size)
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b-%s" % filter_size)

            conv = tf.nn.conv2d(
                embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")

            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")

            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="pool")
            pooled_outputs.append(pooled)

    # Combine all pooled features
    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(3, pooled_outputs)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Add dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())

        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
        predictions = tf.argmax(scores, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits(scores, input_y)
        loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # Accuracy
    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
        
    # AUC (`predictions` is already an argmax over classes, so it is cast directly;
    # streaming_auc also creates local variables that must be initialized before use)
    with tf.name_scope("auc"):
        a = tf.cast(predictions, tf.float32)
        b = tf.cast(tf.argmax(input_y, 1), tf.float32)
        auc = tf.contrib.metrics.streaming_auc(a, b)

    # Optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    starter_learning_rate = 0.0005
    # Decay factor of 0.95 after every 10000 steps.
    with tf.name_scope('learning_rate'):
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 10000, 0.95)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
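
For reference, the static shapes implied by the hyperparameters above can be recomputed by hand; the sketch below only prints the conv/pool output shape of each branch and adds nothing to the graph:

In [ ]:
# Shape walk-through for the conv-maxpool branches defined above
for fs in filter_sizes:
    conv_len = sequence_length - fs + 1  # VALID convolution over the sequence axis
    print("filter size %d: conv output [batch, %d, 1, %d] -> pooled [batch, 1, 1, %d]"
          % (fs, conv_len, num_filters, num_filters))
print("Concatenated feature vector per example: %d" % (num_filters * len(filter_sizes)))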

In [24]:
# ======== Training =========
with tf.Session(graph=graph) as sess:
    tf.initialize_all_variables().run()
    # Local variables must also be initialized if the streaming AUC op is evaluated:
    # sess.run(tf.initialize_local_variables())
    saver = tf.train.Saver(tf.all_variables())
    print('Initialized')

    def train_step(x_batch, y_batch):
        feed_dict = {
                input_x: x_batch,
                input_y: y_batch,
                dropout_keep_prob: 0.5
                }
        
        _, step, l, accuracy_train = sess.run(
            [optimizer, global_step, loss, accuracy], feed_dict=feed_dict)
        return step, l, accuracy_train
        
    def val_step(x_val, y_val):
        feed_dict = {
                input_x: x_val,
                input_y: y_val,
                dropout_keep_prob: 1.0
                }
            
        step, loss_val, accuracy_val = sess.run(
            [global_step, loss, accuracy], feed_dict=feed_dict)
        return accuracy_val

    def load_word2vec(filepath, vocab_size, embedding_size, max_document_length):
        '''
        Loads pretrained word2vec weights to seed the embedding matrix,
        instead of training the word embeddings from scratch.
        '''
        # Words not found in word2vec keep a small random initialization
        initW = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_size))
        # NOTE: to map vectors onto the right rows of initW this should be the same
        # VocabularyProcessor that was fitted during preprocessing (e.g. saved there
        # and restored here with VocabularyProcessor.restore); a freshly constructed
        # one only contains '<UNK>'. It is created once (not per word) and frozen so
        # that lookups of unknown words return the default id 0 instead of growing it.
        vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
        vocab_processor.vocabulary_.freeze()
        # Load vectors from the binary word2vec file
        print("Load word2vec file {}\n".format(filepath))
        with open(filepath, 'rb') as f:
            header = f.readline()
            w2v_vocab_size, layer1_size = map(int, header.split())
            binary_len = np.dtype('float32').itemsize * layer1_size
            for _ in xrange(w2v_vocab_size):
                # Read one space-terminated word
                word = []
                while True:
                    ch = f.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':
                        word.append(ch)
                idx = vocab_processor.vocabulary_.get(word)
                if idx:  # 0 is the default id returned for out-of-vocabulary words
                    initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)

        sess.run(W_em.assign(initW))
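
    # Alternative (a sketch, assuming gensim is installed): the binary word2vec
    # file could be parsed with gensim instead of by hand, e.g.
    #   from gensim.models import KeyedVectors
    #   w2v = KeyedVectors.load_word2vec_format(filepath, binary=True)
    # and w2v[word] copied into initW for every word in the fitted vocabulary.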
    
    def batch_iter(data, batch_size, num_epochs, shuffle=False):
        '''
        Generates a batch iterator for a dataset.
        '''
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    batch_size = 16
    num_epochs = 5
    evaluate_every = 10
    checkpoint_every = 10
    checkpoint = '/home/ubuntu/pynb/oscars/cp'
    filepath = '/home/ubuntu/pynb/rt-movie-reviews/GoogleNews-vectors-negative300.bin'

    # Load word2vec weights
    load_word2vec(filepath, vocab_size, embedding_size, sequence_length)
    # Generate batches
    batches = batch_iter(
        list(zip(X_train, y_train)), batch_size, num_epochs)
    # Training loop. For each batch...
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        step, l, accuracy_train = train_step(x_batch, y_batch)
        if step % evaluate_every == 0:
            accuracy_val = val_step(X_val, y_val)
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: {:.4f}'.format(accuracy_train))
            print('Validation accuracy: {:.4f}'.format(accuracy_val))
            #print('Minibatch AUC: {:.4f}'.format(auc_train))
            #print('Validation AUC: {:.4f}'.format(auc_val))
        if step % checkpoint_every == 0:
            path = saver.save(sess, checkpoint, global_step=step)
            print("Saved model checkpoint to {}\n".format(path))


Initialized
Load word2vec file /home/ubuntu/pynb/rt-movie-reviews/GoogleNews-vectors-negative300.bin

Minibatch loss at step 10: 0.006366
Minibatch accuracy: 1.0000
Validation accuracy: 0.9091
Saved model checkpoint to /home/ubuntu/pynb/oscars/cp-10

Minibatch loss at step 20: 0.475943
Minibatch accuracy: 0.8750
Validation accuracy: 0.9091
Saved model checkpoint to /home/ubuntu/pynb/oscars/cp-20

Minibatch loss at step 30: 0.018249
Minibatch accuracy: 1.0000
Validation accuracy: 0.9091
Saved model checkpoint to /home/ubuntu/pynb/oscars/cp-30
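
The commented-out AUC printouts in the training loop were never wired up. One way to get a validation ROC AUC offline (a sketch; it assumes scikit-learn is available, reuses the 'cp-30' checkpoint saved above, and treats column 1 of the one-hot labels as the positive class) is:

In [ ]:
from sklearn.metrics import roc_auc_score

with tf.Session(graph=graph) as sess:
    saver.restore(sess, '/home/ubuntu/pynb/oscars/cp-30')
    val_scores = sess.run(scores, feed_dict={input_x: X_val,
                                             dropout_keep_prob: 1.0})
    # Rank validation examples by the unnormalized score of the positive class
    print("Validation ROC AUC: {:.4f}".format(roc_auc_score(y_val[:, 1], val_scores[:, 1])))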


In [25]:
checkpoint_file = 'cp-30'  # checkpoint saved by the training loop above (path relative to the notebook directory)

# Load preprocessed test dataset
with open('preprocess_test_2.pickle', 'rb') as handle:
    X_test = pickle.load(handle)
    print("Test set loaded.")

# Restore model and run predictions
with tf.Session(graph=graph) as sess:
    print("Loading variables from '%s'." % checkpoint_file)
    saver.restore(sess, checkpoint_file)
    print("Model restored.") 
    pred = sess.run(predictions, feed_dict={
                                input_x: X_test,
                                dropout_keep_prob: 1.0
                                })
    print(pred)


Test set loaded.
Loading variables from 'cp-30'.
Model restored.
[1 1 1 1 1 1 1 1 1]
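
If class probabilities are wanted instead of hard argmax predictions, the raw scores can be pushed through a softmax (a sketch; it adds one op to the graph above and reuses the same checkpoint and X_test):

In [ ]:
with graph.as_default():
    probabilities = tf.nn.softmax(scores, name="probabilities")

with tf.Session(graph=graph) as sess:
    saver.restore(sess, checkpoint_file)
    probs = sess.run(probabilities, feed_dict={input_x: X_test,
                                               dropout_keep_prob: 1.0})
    print(probs)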

In [ ]: