In [1]:
import os
import csv
import codecs
import numpy as np
import pandas as pd
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
import sys


Using TensorFlow backend.

In [6]:
BASE_DIR = 'data/'
GLOVE_DIR = '/home/mageswarand/dataset/glove/'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.01

In [4]:
print('Indexing word vectors.')
embeddings_index = {}
f = codecs.open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 2196017 word vectors.

Now read the train and test questions into lists of questions.


In [7]:
print('Processing text dataset')
texts_1 = [] 
texts_2 = []
labels = []  # list of label ids
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(values[3])
        texts_2.append(values[4])
        labels.append(int(values[5]))
print('Found %s texts.' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_labels = []  # list of label ids
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(values[1])
        test_texts_2.append(values[2])
        test_labels.append(values[0])
print('Found %s texts.' % len(test_texts_1))


Processing text dataset
Found 404290 texts.
Found 2345796 texts.

Use the Keras tokenizer to tokenize the text, then pad the sentences to 30 words.
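
As a small illustration of what the tokenizer and padding produce, a toy run on two made-up questions might look like this:

In [ ]:
# Toy illustration: tokenize two short questions and pad them to 30 tokens.
# By default pad_sequences pads with zeros at the front and also truncates
# longer sequences from the front.
toy_texts = ['How do I learn Keras?', 'What is the best way to learn Keras?']
toy_tokenizer = Tokenizer()
toy_tokenizer.fit_on_texts(toy_texts)
toy_seqs = toy_tokenizer.texts_to_sequences(toy_texts)
print(toy_seqs)                                             # lists of word indices
print(pad_sequences(toy_seqs, maxlen=MAX_SEQUENCE_LENGTH))  # shape (2, 30), zero-padded at the front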


In [8]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_labels = np.array(test_labels)
del test_sequences_1
del test_sequences_2
del sequences_1
del sequences_2
import gc
gc.collect()


/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/keras/preprocessing/text.py:89: UserWarning: The `nb_words` argument in `Tokenizer` has been renamed `num_words`.
  warnings.warn('The `nb_words` argument in `Tokenizer` '
Found 137042 unique tokens.
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)
Out[8]:
0

Now let us create the embedding matrix where each row corresponds to a word.


In [9]:
print('Preparing embedding matrix.')
# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Preparing embedding matrix.
Null word embeddings: 50301
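
For a closer look, a small sketch (reusing word_index, nb_words and embedding_matrix from the cell above) can list a few of the tokens behind that count, i.e. tokens whose rows are still all zeros because they have no GloVe vector:

In [ ]:
# Sketch: sample some tokens with no GloVe vector (all-zero embedding rows).
# Useful for spotting typos, rare words and tokenization artifacts.
null_words = [w for w, i in word_index.items()
              if i < nb_words and not np.any(embedding_matrix[i])]
print(len(null_words), null_words[:20])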

Now it's time to build the model. Let us specify the model architecture. The first layer is the embedding layer.


In [10]:
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In the embedding layer, 'trainable' is set to False so that the word embeddings are not updated during backpropagation.
The neural net architecture is as follows:

  • The word embeddings of each question are passed to a 1-dimensional convolution layer followed by max pooling (a quick shape sketch follows this list).
  • Each of the two questions then goes through its own dense layer.
  • The outputs of the two dense layers are merged (concatenated) together.
  • The merged vector is passed through another dense layer.
  • The final layer is a single sigmoid unit.
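
The next cell uses 128 convolution filters of width 3 and a pooling size of 10; a quick shape probe (a sketch reusing the embedding_layer defined above) shows how each 30-token question collapses to a fixed-size vector per branch:

In [ ]:
# Sketch: trace the per-question branch shapes.
# Embedding -> (30, 300), Conv1D(128, 3) -> (28, 128),
# MaxPooling1D(10) -> (2, 128), Flatten -> (256,)
probe_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
probe = embedding_layer(probe_input)
probe = Conv1D(128, 3, activation='relu')(probe)
probe = MaxPooling1D(10)(probe)
probe = Flatten()(probe)
print(Model(probe_input, probe).output_shape)  # expected: (None, 256)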

In [11]:
# Model Architecture #
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences_1)
x1 = MaxPooling1D(10)(x1)
x1 = Flatten()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = Conv1D(128, 3, activation='relu')(embedded_sequences_2)
y1 = MaxPooling1D(10)(y1)
y1 = Flatten()(y1)
y1 = Dense(64, activation='relu')(y1)
y1 = Dropout(0.2)(y1)

merged = merge([x1,y1], mode='concat')
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(input=[sequence_1_input,sequence_2_input], output=preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])


/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/ipykernel/__main__.py:18: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.
/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/keras/legacy/layers.py:456: UserWarning: The `Merge` layer is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.
  name=name)
/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/ipykernel/__main__.py:24: UserWarning: Update your `Model` call to the Keras 2 API: `Model(outputs=Tensor("de..., inputs=[<tf.Tenso...)`
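
The warnings above point to the Keras 2 API; the same merge and model construction could be written in the newer style roughly as follows (a sketch reusing the x1, y1 and input tensors from the cell above):

In [ ]:
# Keras 2 style sketch: functional concatenate instead of the legacy merge(...),
# and Model(inputs=..., outputs=...) instead of input=/output=.
from keras.layers import concatenate

merged = concatenate([x1, y1])
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])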

Model training and predictions: uncomment the cell below and run it locally, as it exceeds the time limits here.


In [12]:
# pass
model.fit([data_1,data_2], labels, validation_split=VALIDATION_SPLIT, nb_epoch=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2])
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)


/home/mageswarand/anaconda3/envs/tensorflow1.0/lib/python3.5/site-packages/ipykernel/__main__.py:2: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.
  from ipykernel import kernelapp as app
Train on 400247 samples, validate on 4043 samples
Epoch 1/1
400247/400247 [==============================] - 84s - loss: 0.6194 - acc: 0.6290 - val_loss: 0.5783 - val_acc: 0.6708
(2345796, 1)

In [ ]: