In [1]:
from __future__ import division, print_function, absolute_import
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell, lstm
from tflearn.layers.estimator import regression
In [2]:
# Load the preprocessed train/validation split and inspect sizes.
# NOTE(review): the pickle.load order must match the dump order used by the
# preprocessing notebook -- X_train, X_val, y_train, y_val.
# (pickle.load on an untrusted file can execute arbitrary code; this file is
# produced locally by our own preprocessing step.)
with open('preprocess_3.pickle', 'rb') as f:
    X_train = pickle.load(f)
    X_val = pickle.load(f)
    y_train = pickle.load(f)
    y_val = pickle.load(f)
print("File loaded.")
print(len(X_train))
print(len(X_val))
print(len(y_train))
print(len(y_val))
In [3]:
# Check class balance: count each label and report the fraction of
# positive (label 1) examples in the train and validation splits.
from collections import Counter

train_counts = Counter(y_train)
val_counts = Counter(y_val)
train_pos_frac = float(train_counts[1]) / (train_counts[0] + train_counts[1])
val_pos_frac = float(val_counts[1]) / (val_counts[0] + val_counts[1])
print(train_counts, train_pos_frac)
print(val_counts, val_pos_frac)
In [4]:
# (removed) Dead cell: an earlier joblib-based loading experiment, superseded
# by the pickle loading above. It was kept as a bare triple-quoted string,
# which echoed the whole snippet as the cell's Out[] value on every run.
# with open('preprocess_train', 'rb'):
#     X_train = joblib.load('preprocess_train')
# print("File loaded.")
# print(len(X_train))
In [5]:
# Peek at the first few training inputs (as loaded from the pickle).
print(X_train[:5])
In [6]:
# Peek at the corresponding labels before one-hot conversion.
print(y_train[:5])
In [7]:
# Data preprocessing
# Sequence padding -- disabled: inputs are presumably already padded upstream
# (the network below expects fixed length 45); kept for reference.
#X_train = pad_sequences(X_train, maxlen=235, value=0.)
#X_val = pad_sequences(X_val, maxlen=235, value=0.)
# Converting labels to binary (one-hot) vectors.
# Guard makes this cell idempotent: re-running it will not re-encode labels
# that have already been converted to 2-D one-hot arrays.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, nb_classes=2)
if np.ndim(y_val) == 1:
    y_val = to_categorical(y_val, nb_classes=2)
In [8]:
# Labels after one-hot conversion (each row should be a length-2 vector).
print(y_train[:5])
In [9]:
# A single validation input sequence, for a sanity check of its contents.
print(X_val[0])
In [10]:
# Network hyperparameters (previously inline magic numbers).
MAX_SEQ_LEN = 45      # fixed input sequence length expected by the network
VOCAB_SIZE = 87514    # NOTE(review): the Keras draft below uses 87506 -- confirm true vocab size
EMBED_DIM = 128       # embedding output dimension
LSTM_UNITS = 64       # units per direction of the bidirectional LSTM

# Network building: embedding -> bidirectional LSTM -> dropout -> 2-way softmax.
net = input_data(shape=[None, MAX_SEQ_LEN])
net = embedding(net, input_dim=VOCAB_SIZE, output_dim=EMBED_DIM)
net = bidirectional_rnn(net,
                        BasicLSTMCell(LSTM_UNITS),
                        BasicLSTMCell(LSTM_UNITS))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training.
# clip_gradients=0. turns off tflearn's gradient clipping.
# NOTE(review): checkpoint_path is an absolute, machine-specific path --
# consider a path relative to the notebook for portability.
model = tflearn.DNN(net,
                    clip_gradients=0.,
                    tensorboard_verbose=0,
                    checkpoint_path='/home/ubuntu/pynb/quora-question-pairs/checkpoints',
                    max_checkpoints=1)
#model.load('birnn_1.tflearn')
model.fit(X_train,
          y_train,
          n_epoch=3,
          snapshot_epoch=True,
          validation_set=(X_val, y_val),
          show_metric=True,
          batch_size=64)
model.save('birnn_1.tflearn')
In [11]:
# Alternative Keras implementation of the same bidirectional-LSTM model,
# kept for reference only (not executed). It was previously a bare
# triple-quoted string, which echoed the whole snippet as Out[11] on every
# run; converted to comments so the cell produces no output.
# NOTE(review): max_features=87506 here vs. 87514 in the tflearn cell above
# -- confirm which vocabulary size is correct.
#
# from __future__ import print_function
# import numpy as np
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
# max_features = 87506
# # cut texts after this number of words
# # (among top max_features most common words)
# maxlen = 235
# batch_size = 32
# print("Pad sequences (samples x time)")
# #X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# #X_val = sequence.pad_sequences(X_val, maxlen=maxlen)
# y_train = np.array(y_train)
# y_val = np.array(y_val)
# model = Sequential()
# model.add(Embedding(max_features, 128, input_length=maxlen))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation='sigmoid'))
# # try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
# print('Train...')
# model.fit(X_train, y_train,
#           batch_size=batch_size,
#           validation_data=[X_val, y_val])