In [1]:
from __future__ import division, print_function, absolute_import
from IPython.display import display # Allows the use of display() for DataFrames
import pandas as pd
import numpy as np
import re
import os
import csv
import codecs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from string import punctuation
from collections import defaultdict
# Uncomment and run the two lines below on first use (downloads the NLTK stopwords corpus)
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
# From: https://pythonhosted.org/kitchen/unicode-frustrations.html
from kitchen.text.converters import to_unicode  # converts byte strings to unicode text
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
In [2]:
# Set base directories and parameters
BASE_DIR = ''
EMBEDDING_FILE = BASE_DIR + 'glove.6B.100d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'
re_weight = True # whether to re-weight classes to match the ~17.5% positive share in the test set
STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
rate_drop_dense)
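# Note: num_lstm, num_dense and the dropout rates are drawn at random so that repeated
# runs explore different configurations; STAMP encodes the chosen values and is reused
# below for the checkpoint and submission file names.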
In [3]:
print('Indexing word vectors')
embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
values = line.split()
# Format of embedding file: word, weights
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
In [4]:
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
# Clean the text, with the option to remove stopwords and to stem words.
# Convert words to lower case and split them
    text = to_unicode(text).lower().split()
# Optionally, remove stop words
if remove_stopwords:
stops = set(stopwords.words("english"))
text = [w for w in text if not w in stops]
text = " ".join(text)
# Clean the text
text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
text = re.sub(r"what's", "what is ", text)
text = re.sub(r"\'s", " ", text)
text = re.sub(r"\'ve", " have ", text)
text = re.sub(r"can't", "cannot ", text)
text = re.sub(r"n't", " not ", text)
text = re.sub(r"i'm", "i am ", text)
text = re.sub(r"\'re", " are ", text)
text = re.sub(r"\'d", " would ", text)
text = re.sub(r"\'ll", " will ", text)
text = re.sub(r",", " ", text)
text = re.sub(r"\.", " ", text)
text = re.sub(r"!", " ! ", text)
text = re.sub(r"\/", " ", text)
text = re.sub(r"\^", " ^ ", text)
text = re.sub(r"\+", " + ", text)
text = re.sub(r"\-", " - ", text)
text = re.sub(r"\=", " = ", text)
text = re.sub(r"'", " ", text)
text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
text = re.sub(r":", " : ", text)
text = re.sub(r" e g ", " eg ", text)
text = re.sub(r" b g ", " bg ", text)
text = re.sub(r" u s ", " american ", text)
text = re.sub(r"\0s", "0", text)
text = re.sub(r" 9 11 ", "911", text)
text = re.sub(r"e - mail", "email", text)
text = re.sub(r"j k", "jk", text)
text = re.sub(r"\s{2,}", " ", text)
# Optionally, shorten words to their stems
if stem_words:
text = text.split()
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(word) for word in text]
text = " ".join(stemmed_words)
# Return a list of words
return(text)
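# Illustrative sanity check (not in the original): print the cleaned form of a sample
# question; the exact output depends on the regex rules above.
print(text_to_wordlist("What's the best way to invest in real estate?"))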
In [5]:
print('Processing text dataset')
texts_1 = []
texts_2 = []
labels = []
with open(TRAIN_DATA_FILE) as f:
reader = csv.reader(f, delimiter=',')
header = next(reader)
for values in reader:
texts_1.append(text_to_wordlist(values[3])) # question 1
texts_2.append(text_to_wordlist(values[4])) # question 2
labels.append(int(values[5])) # is_duplicate
print('Found %s texts in train.csv' % len(texts_1))
test_texts_1 = []
test_texts_2 = []
test_ids = []
with open(TEST_DATA_FILE) as f:
reader = csv.reader(f, delimiter=',')
header = next(reader)
for values in reader:
test_texts_1.append(text_to_wordlist(values[1])) # question 1
test_texts_2.append(text_to_wordlist(values[2])) # question 2
test_ids.append(values[0]) # test_id
print('Found %s texts in test.csv' % len(test_texts_1))
In [6]:
'''print('Processing text dataset')
df_train = pd.read_csv(TRAIN_DATA_FILE, encoding='utf-8')
df_train["q1"] = df_train["question1"].apply(lambda row: text_to_wordlist(row))
df_train["q2"] = df_train["question2"].apply(lambda row: text_to_wordlist(row))
texts_1 = df_train["q1"].tolist()
texts_2 = df_train["q2"].tolist()
labels = df_train["is_duplicate"].values.tolist()
print(texts_1[:2])
print(texts_2[:2])
print(labels[:2])'''
Out[6]:
In [7]:
# Tokenize texts
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
# Convert text to sequences
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)
# Word index assigned by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
# Pad sequences
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)
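# Note (illustrative): pad_sequences pads and truncates at the start by default
# (padding='pre'), e.g. pad_sequences([[3, 7]], maxlen=4) -> [[0, 0, 3, 7]], so the
# end of each question is always preserved.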
In [8]:
# Generate leaky features
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
ques = pd.concat([train_df[['question1', 'question2']], \
test_df[['question1', 'question2']]], axis=0).reset_index(drop=True)
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
q_dict[ques.question1[i]].add(ques.question2[i])
q_dict[ques.question2[i]].add(ques.question1[i])
def q1_freq(row):
return(len(q_dict[row['question1']]))
def q2_freq(row):
return(len(q_dict[row['question2']]))
def q1_q2_intersect(row):
return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))
train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1)
test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1)
leaks = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)
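# Descriptive note: q1_freq / q2_freq count how many distinct questions each question
# is paired with anywhere in train + test, and q1_q2_intersect counts the neighbours
# the two questions share; duplicate pairs tend to share many neighbours. The
# StandardScaler is fit on train and test together so both are scaled consistently.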
In [9]:
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index))+1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
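# Descriptive note: rows that remain all zeros correspond to vocabulary words with no
# GloVe vector (plus row 0, which is unused because Tokenizer indices start at 1).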
In [10]:
# Shuffle and split the training data into train/validation sets
np.random.seed(42)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
weight_val = np.ones(len(labels_val))
if re_weight:
weight_val *= 0.472001959
weight_val[labels_val==0] = 1.309028344
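# Descriptive note: each pair is included twice with the questions swapped so the model
# sees both orderings; weight_val applies the same test-distribution re-weighting to
# the validation loss that class_weight applies to the training loss below.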
In [11]:
# Save preprocessed data into a pickle file
import pickle
with open('preprocess_n1.pickle', 'wb') as f:
pickle.dump(data_1_train, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data_2_train, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(leaks_train, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(labels_train, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data_1_val, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data_2_val, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(leaks_val, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(labels_val, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(weight_val, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(embedding_matrix, f, protocol=pickle.HIGHEST_PROTOCOL)
print("File saved.")
In [12]:
# Build model
embedding_layer = Embedding(nb_words,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)
leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(int(num_dense/2), activation=act)(leaks_input)
merged = concatenate([x1, y1, leaks_dense])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
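# Architecture note: the Embedding and LSTM layers are shared by both question inputs
# (a Siamese-style encoder), the leak features pass through a smaller dense layer, and
# the three representations are concatenated before the dense classifier head.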
In [13]:
if re_weight:
class_weight = {0: 1.309028344, 1: 0.472001959}
else:
class_weight = None
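# Illustrative check (assumes a ~36.9% duplicate rate in train.csv): these constants
# shift the effective positive share seen by the loss close to the ~17.5% assumed for
# the test set, while keeping the total weight roughly unchanged.
p_train = 0.369  # approximate positive rate in the training data (assumption)
eff_pos = p_train * 0.472001959 / (p_train * 0.472001959 + (1 - p_train) * 1.309028344)
print('Effective positive share after re-weighting: %.3f' % eff_pos)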
In [14]:
# Train model
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
outputs=preds)
model.compile(loss='binary_crossentropy',
optimizer='nadam',
metrics=['acc'])
#model.summary()
print(STAMP)
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
epochs=200, batch_size=2048, shuffle=True, \
class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
In [16]:
print('Start making the submission before fine-tuning')
preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=2048, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks], batch_size=2048, verbose=1)
preds /= 2
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)
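# Note: the two predict calls above average the predictions over both question
# orderings (the shared encoder is not strictly symmetric), and the best validation
# loss is prefixed to the submission file name.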
In [ ]: