In [1]:
from __future__ import division, print_function, absolute_import
from IPython.display import display # Allows the use of display() for DataFrames

import pandas as pd
import numpy as np
import re
import os
import csv
import codecs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from string import punctuation
from collections import defaultdict
# Uncomment and run the two lines below on first use to download the NLTK stopwords corpus
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

# From: https://pythonhosted.org/kitchen/unicode-frustrations.html
from kitchen.text.converters import getwriter
UTF8Writer = getwriter('utf8')

# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')


Using TensorFlow backend.

In [2]:
# Set base directories and parameters
BASE_DIR = ''
EMBEDDING_FILE = BASE_DIR + 'glove.6B.100d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1

# Randomly sample LSTM size, dense size, and dropout rates for this run (recorded in STAMP below)
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to match the ~17.5% positive share assumed for the test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [3]:
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    # Format of embedding file: word, weights
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors
Found 400000 word vectors.
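
Each entry in embeddings_index maps a token to a 100-dimensional GloVe vector. A quick lookup sketch (the word chosen here is just an example, not anything used later):

In [ ]:
# Illustrative lookup: returns a 100-dimensional float32 vector, or None for out-of-vocabulary words
vec = embeddings_index.get('computer')
print(None if vec is None else vec.shape)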

In [4]:
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert to lower case and split into words (UTF8Writer handles raw UTF-8 input; see the kitchen link above)
    text = UTF8Writer(text).lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return the cleaned text as a single string
    return(text)
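
A quick sanity check of the cleaning function on a made-up question (the sentence below is illustrative, not from the dataset):

In [ ]:
# Illustrative only: run the cleaner with and without stopword removal / stemming
sample = "What's the best way to learn NLP, e.g. on a 10k budget?"
print(text_to_wordlist(sample))
print(text_to_wordlist(sample, remove_stopwords=True, stem_words=True))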

In [5]:
print('Processing text dataset')

texts_1 = [] 
texts_2 = []
labels = []
with open(TRAIN_DATA_FILE) as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3])) # question 1
        texts_2.append(text_to_wordlist(values[4])) # question 2
        labels.append(int(values[5])) # is_duplicate
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with open(TEST_DATA_FILE) as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1])) # question 1
        test_texts_2.append(text_to_wordlist(values[2])) # question 2
        test_ids.append(values[0]) # test_id
print('Found %s texts in test.csv' % len(test_texts_1))


Processing text dataset
Found 404290 texts in train.csv
Found 2345796 texts in test.csv

In [6]:
'''print('Processing text dataset')

df_train = pd.read_csv(TRAIN_DATA_FILE, encoding='utf-8')
df_train["q1"] = df_train["question1"].apply(lambda row: text_to_wordlist(row))
df_train["q2"] = df_train["question2"].apply(lambda row: text_to_wordlist(row))
texts_1 = df_train["q1"].tolist()
texts_2 = df_train["q2"].tolist()
labels = df_train["is_duplicate"].values.tolist()

print(texts_1[:2])
print(texts_2[:2])
print(labels[:2])'''


In [7]:
# Tokenize texts
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

# Convert text to sequences
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

# Word-to-index mapping built by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Pad sequences
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)


Found 120500 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)
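
As a small illustration of what the tokenizer and padding produce (the sentence is made up, not from the dataset): sequences shorter than MAX_SEQUENCE_LENGTH are zero-padded on the left, since pad_sequences defaults to padding='pre'.

In [ ]:
# Illustrative only: map one cleaned sentence to its padded index sequence
example_seq = tokenizer.texts_to_sequences(["how do i learn python"])
print(example_seq)                                             # one list of word indices
print(pad_sequences(example_seq, maxlen=MAX_SEQUENCE_LENGTH))  # zero-padded on the left to length 30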

In [8]:
# Generate "leaky" graph features: how many distinct questions each question is paired with
# across train+test, and how many counterpart questions the two questions share
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1, raw=True)

leaks = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]

ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
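
The three leak features count, for each row, how many distinct questions each question was ever paired with anywhere in train+test, and how many of those counterparts the two questions share. A toy illustration with made-up questions:

In [ ]:
# Toy illustration of the leak features (made-up rows, not from the dataset)
toy = pd.DataFrame({'question1': ['a', 'a', 'b'],
                    'question2': ['b', 'c', 'c']})
toy_dict = defaultdict(set)
for i in range(toy.shape[0]):
    toy_dict[toy.question1[i]].add(toy.question2[i])
    toy_dict[toy.question2[i]].add(toy.question1[i])
# 'a' was paired with {'b', 'c'}, 'b' with {'a', 'c'}; they share one counterpart ('c')
print(len(toy_dict['a']), len(toy_dict['b']), len(toy_dict['a'] & toy_dict['b']))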

In [9]:
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Preparing embedding matrix
Null word embeddings: 43907

In [10]:
# Split into train/validation sets; each pair is included twice, as (q1, q2) and (q2, q1), so the model sees both question orders
np.random.seed(42)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

# Per-sample validation weights, matching the class re-weighting used during training
weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [11]:
# Save the preprocessed data to a pickle file
import pickle

with open('preprocess_n1.pickle', 'wb') as f:
    pickle.dump(data_1_train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(data_2_train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(leaks_train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(labels_train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(data_1_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(data_2_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(leaks_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(labels_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(weight_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(embedding_matrix, f, protocol=pickle.HIGHEST_PROTOCOL)
    
print("File saved.")


File saved.
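
A matching load sketch, assuming the pickle file written above; pickle returns the objects in the same order they were dumped:

In [ ]:
# Reload the preprocessed arrays (objects come back in dump order)
with open('preprocess_n1.pickle', 'rb') as f:
    data_1_train = pickle.load(f)
    data_2_train = pickle.load(f)
    leaks_train = pickle.load(f)
    labels_train = pickle.load(f)
    data_1_val = pickle.load(f)
    data_2_val = pickle.load(f)
    leaks_val = pickle.load(f)
    labels_val = pickle.load(f)
    weight_val = pickle.load(f)
    embedding_matrix = pickle.load(f)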

In [12]:
# Build the model: both questions share the same embedding and LSTM layers (Siamese-style)
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(int(num_dense/2), activation=act)(leaks_input)

merged = concatenate([x1, y1, leaks_dense])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [13]:
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None
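
These two constants are consistent with re-weighting the training label distribution (roughly 36.9% positives) to the roughly 17.4% positive share assumed for the test set, i.e. w1 = p_test/p_train and w0 = (1 - p_test)/(1 - p_train). A sketch of that calculation; the test share is an assumption, not something computed in this notebook:

In [ ]:
# Assumed positive-class shares: ~36.9% in train, ~17.4% in test (see the re_weight comment above)
p_train = 0.3692
p_test = 0.1743
print(p_test / p_train)              # ~0.472 -> weight for class 1
print((1 - p_test) / (1 - p_train))  # ~1.309 -> weight for class 0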

In [14]:
# Train model
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])


lstm_240_135_0.19_0.22
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
727722/727722 [==============================] - 306s - loss: 0.2694 - acc: 0.8255 - val_loss: 0.3113 - val_acc: 0.7988
Epoch 2/200
727722/727722 [==============================] - 305s - loss: 0.2345 - acc: 0.8367 - val_loss: 0.2229 - val_acc: 0.8408
Epoch 3/200
727722/727722 [==============================] - 305s - loss: 0.2214 - acc: 0.8412 - val_loss: 0.2108 - val_acc: 0.8566
Epoch 4/200
727722/727722 [==============================] - 305s - loss: 0.2119 - acc: 0.8465 - val_loss: 0.2026 - val_acc: 0.8557
Epoch 5/200
727722/727722 [==============================] - 305s - loss: 0.2047 - acc: 0.8504 - val_loss: 0.2038 - val_acc: 0.8632
Epoch 6/200
727722/727722 [==============================] - 305s - loss: 0.1986 - acc: 0.8547 - val_loss: 0.1960 - val_acc: 0.8611
Epoch 7/200
727722/727722 [==============================] - 305s - loss: 0.1942 - acc: 0.8577 - val_loss: 0.2021 - val_acc: 0.8681
Epoch 8/200
727722/727722 [==============================] - 305s - loss: 0.1895 - acc: 0.8611 - val_loss: 0.1958 - val_acc: 0.8643
Epoch 9/200
727722/727722 [==============================] - 305s - loss: 0.1858 - acc: 0.8638 - val_loss: 0.1910 - val_acc: 0.8633
Epoch 10/200
727722/727722 [==============================] - 305s - loss: 0.1825 - acc: 0.8664 - val_loss: 0.1947 - val_acc: 0.8745
Epoch 11/200
727722/727722 [==============================] - 305s - loss: 0.1797 - acc: 0.8686 - val_loss: 0.1986 - val_acc: 0.8769
Epoch 12/200
727722/727722 [==============================] - 305s - loss: 0.1767 - acc: 0.8708 - val_loss: 0.2012 - val_acc: 0.8788
Epoch 13/200
727722/727722 [==============================] - 305s - loss: 0.1742 - acc: 0.8730 - val_loss: 0.1930 - val_acc: 0.8722

In [16]:
print('Start making the submission before fine-tuning')

# Predict on both question orders and average, since the network is not exactly symmetric in its two inputs
preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=2048, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks], batch_size=2048, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)


Start making the submission before fine-tuning
2345796/2345796 [==============================] - 365s   
2345796/2345796 [==============================] - 365s   

In [ ]: