In [1]:
from __future__ import division, print_function, absolute_import
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline

import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell, lstm
from tflearn.layers.estimator import regression

In [2]:
# Load the preprocessed train/validation splits from a single pickle file.
# The four objects were dumped sequentially, so they must be read back in
# the same order: X_train, X_val, y_train, y_val.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('preprocess_3.pickle', 'rb') as f:
    X_train, X_val, y_train, y_val = (pickle.load(f) for _ in range(4))

print("File loaded.")
for split in (X_train, X_val, y_train, y_val):
    print(len(split))


File loaded.
859345
15262
859345
15262

In [3]:
# Check class balance of the train and validation labels.
# Fix: use sum(counter.values()) for the denominator — the original
# c[0] + c[1] silently assumes the only labels are 0 and 1.  The float()
# cast is also unnecessary: true division is active via the
# `from __future__ import division` in the imports cell.
from collections import Counter

c = Counter(y_train)
d = Counter(y_val)

# Fraction of positive (class-1) examples in each split.
c_pct = c[1] / sum(c.values())
d_pct = d[1] / sum(d.values())

print(c, c_pct)
print(d, d_pct)


Counter({0: 717553, 1: 141792}) 0.165000087276
Counter({0: 12744, 1: 2518}) 0.164984929891

In [4]:
# NOTE(review): dead code — an earlier joblib-based loader kept as a bare
# string literal (hence the Out[4] echo below).  Consider deleting this
# cell; the pickle loader above is the live load path.
'''with open('preprocess_train', 'rb'):  
    X_train = joblib.load('preprocess_train')

print("File loaded.")
print(len(X_train))'''


Out[4]:
'with open(\'preprocess_train\', \'rb\'):  \n    X_train = joblib.load(\'preprocess_train\')\n\nprint("File loaded.")\nprint(len(X_train))'

In [5]:
print(X_train[:5])  # peek at the first 5 integer-encoded, zero-padded sequences


[[ 1  2  3  4  5  6  7  8  6  9 10 11 12 13  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30  2 31 32 33 21 34 25
  35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 36 37 38 39 40  1 36 41 42 40  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  2 43 21 44  9 45 46 47 48 49 17 50  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [51 52 53 27 54 51 52 53 55 56 25 57 25 54 58  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]

In [6]:
print(y_train[:5])  # first 5 raw labels (0/1) before the one-hot conversion below


[0, 0, 1, 0, 1]

In [7]:
# Data preprocessing
# Sequence padding was applied upstream; the pad_sequences calls are kept
# commented out for reference.
#X_train = pad_sequences(X_train, maxlen=235, value=0.)
#X_val = pad_sequences(X_val, maxlen=235, value=0.)
# One-hot encode the binary labels into (n, 2) vectors for the softmax head.
y_train, y_val = (to_categorical(labels, nb_classes=2) for labels in (y_train, y_val))

In [8]:
print(y_train[:5])  # labels are now one-hot vectors: [1, 0] = class 0, [0, 1] = class 1


[[ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]

In [9]:
print(X_val[0])  # sanity-check one validation sequence (45 entries, zero-padded)


[   9   10   16 1111 1968 1969  633   21 1797    9   10   16 1111 1968 1969
  633   21  395    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0]

In [10]:
# Network building
# Bidirectional LSTM classifier over zero-padded sequences of length 45:
#   embedding: vocab 87514 -> 128-dim vectors
#   BiLSTM:    64 units per direction
#   dropout:   keep 0.5, then softmax over 2 classes
net = input_data(shape=[None, 45])
net = embedding(net, input_dim=87514, output_dim=128)
net = bidirectional_rnn(net,
                        BasicLSTMCell(64),
                        BasicLSTMCell(64))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training
# Fix: the checkpoint path was a hardcoded absolute path
# (/home/ubuntu/pynb/quora-question-pairs/checkpoints), which breaks on any
# other machine.  Use a path relative to the current working directory.
# clip_gradients=0. disables gradient clipping (tflearn convention).
checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
model = tflearn.DNN(net,
                    clip_gradients=0.,
                    tensorboard_verbose=0,
                    checkpoint_path=checkpoint_dir,
                    max_checkpoints=1)

# Optionally resume from a previously saved snapshot:
#model.load('birnn_1.tflearn')

model.fit(X_train,
          y_train,
          n_epoch=3,
          snapshot_epoch=True,
          validation_set=(X_val, y_val),
          show_metric=True,
          batch_size=64)

model.save('birnn_1.tflearn')


Training Step: 40283  | total loss: 0.12815 | time: 879.704s
| Adam | epoch: 003 | loss: 0.12815 - acc: 0.9542 -- iter: 859328/859345
Training Step: 40284  | total loss: 0.13320 | time: 882.545s
| Adam | epoch: 003 | loss: 0.13320 - acc: 0.9479 | val_loss: 0.38263 - val_acc: 0.8301 -- iter: 859345/859345
--

In [11]:
# NOTE(review): dead code — a Keras re-implementation of the same BiLSTM,
# kept as a bare string literal (hence the Out[11] echo below).  Consider
# deleting this cell or moving the alternative to its own notebook.
'''from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

max_features = 87506
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 235
batch_size = 32

print("Pad sequences (samples x time)")
#X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
#X_val = sequence.pad_sequences(X_val, maxlen=maxlen)
y_train = np.array(y_train)
y_val = np.array(y_val)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          validation_data=[X_val, y_val])'''


Out[11]:
'from __future__ import print_function\nimport numpy as np\n\nfrom keras.preprocessing import sequence\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional\n\nmax_features = 87506\n# cut texts after this number of words\n# (among top max_features most common words)\nmaxlen = 235\nbatch_size = 32\n\nprint("Pad sequences (samples x time)")\n#X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n#X_val = sequence.pad_sequences(X_val, maxlen=maxlen)\ny_train = np.array(y_train)\ny_val = np.array(y_val)\n\nmodel = Sequential()\nmodel.add(Embedding(max_features, 128, input_length=maxlen))\nmodel.add(Bidirectional(LSTM(64)))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(1, activation=\'sigmoid\'))\n\n# try using different optimizers and different optimizer configs\nmodel.compile(\'adam\', \'binary_crossentropy\', metrics=[\'accuracy\'])\n\nprint(\'Train...\')\nmodel.fit(X_train, y_train,\n          batch_size=batch_size,\n          validation_data=[X_val, y_val])'