In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Concatenate, concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import sys
from time import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from itertools import chain
from collections import Counter
from gensim.models import word2vec
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pickle
In [15]:
test_df = pd.read_csv('test_snello_pulito.csv', encoding='latin1')
train_df = pd.read_csv('train_snello_pulito.csv', encoding='latin1')
train_df.fillna("ciccia", inplace=True)  # fill missing questions with a placeholder word
q1, q2 = 'question1_final', 'question2_final'
In [2]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 135
VALIDATION_SPLIT = 0.13
In [ ]:
def build_corpus(data, q1, q2):
    """Create a list of lists containing the words of each sentence."""
    corpus = []
    for col in [q1, q2]:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
    return corpus
corpus = build_corpus(train_df, q1, q2)
corpus.extend(build_corpus(test_df, q1, q2))
In [24]:
model = word2vec.Word2Vec(corpus, size=EMBEDDING_DIM, window=5, min_count=5, workers=4)
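As a quick sanity check (an addition, not part of the original notebook), the freshly trained embeddings can be probed with a nearest-neighbour query; the query word below is only an illustrative guess and may not be in this corpus's vocabulary.
In [ ]:
# Added sanity check: inspect nearest neighbours of a hypothetical query word.
# Replace 'india' with any token that survived min_count=5.
if 'india' in model.wv.vocab:
    print(model.wv.most_similar('india', topn=5))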
In [25]:
model.save('model.bin')
In [13]:
BASE_DIR = ''
EMBEDDING_FILE = BASE_DIR + 'model_best.bin'
In [14]:
word2vec = KeyedVectors.load(EMBEDDING_FILE)
In [ ]:
texts_1 = list(train_df.question1_final.values)
texts_2 = list(train_df.question2_final.values)
test_texts_1 = list(test_df.question1_final.values)
test_texts_2 = list(test_df.question2_final.values)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train_df.is_duplicate.values)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
In [22]:
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.wv.vocab:
        embedding_matrix[i] = word2vec.wv[word]
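A small coverage check (added here, not in the original) shows how many tokenizer words actually received a pretrained vector; rows left at zero fall back to the all-zeros embedding.
In [ ]:
# Added coverage check: count rows of embedding_matrix that received a non-zero vector
covered = int(np.count_nonzero(np.any(embedding_matrix != 0, axis=1)))
print('Words with a pretrained vector: %d / %d (%.1f%%)'
      % (covered, nb_words, 100.0 * covered / nb_words))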
In [32]:
np.savetxt('labels.gz', labels)
np.savetxt('data_1.gz', data_1)
np.savetxt('data_2.gz', data_2)
np.savetxt('embedding_matrix.gz', embedding_matrix)
In [3]:
labels = np.loadtxt('labels.gz')
data_1 = np.loadtxt('data_1.gz')
data_2 = np.loadtxt('data_2.gz')
embedding_matrix = np.loadtxt('embedding_matrix.gz')
In [10]:
np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]
In [11]:
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
In [12]:
weight_val = np.ones(len(labels_val))
weight_val *= 0.472001959
weight_val[labels_val==0] = 1.309028344
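These constants look like the usual rescaling of the training-set duplicate rate (roughly 37%) to a much lower positive rate assumed for the test set (roughly 16.5%). The sketch below (an addition, not computed in the original notebook) shows how such weights could be derived; the test-set prior is an assumption.
In [ ]:
# Hedged sketch (added): one possible origin of the 0.472 / 1.309 re-weighting constants.
p_train = labels.mean()   # duplicate rate observed in the training labels (~0.369)
p_test = 0.165            # assumed duplicate rate on the test set
w_pos = p_test / p_train              # ~0.472, weight for positive (duplicate) pairs
w_neg = (1 - p_test) / (1 - p_train)  # ~1.309, weight for negative pairs
print(w_pos, w_neg)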
The GRU is closely related to the LSTM: both use gating to control how information flows through time and thereby mitigate the vanishing-gradient problem. Here are some key points about GRU vs. LSTM:
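- A GRU has two gates (reset and update), while an LSTM has three (input, forget and output), so a GRU of the same width has fewer parameters and trains faster.
- A GRU has no separate cell state; it exposes its full hidden state at every step, whereas the LSTM controls what it exposes through the output gate.
- In practice the two often perform similarly; LSTMs can have an edge on larger datasets and longer sequences, GRUs on smaller ones.

A minimal sketch (added here, not in the original notebook) comparing the trainable parameter counts of equally sized LSTM and GRU layers under this notebook's dimensions:
In [ ]:
# Added sketch: compare parameter counts of an LSTM and a GRU of the same width.
inp = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
lstm_params = Model(inp, LSTM(210)(inp)).count_params()
gru_params = Model(inp, GRU(210)(inp)).count_params()
print('LSTM params:', lstm_params, '| GRU params:', gru_params)  # GRU is roughly 3/4 of the LSTM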
In [ ]:
# num_lstm, rate_drop_lstm, rate_drop_dense, num_dense, act and STAMP are hyper-parameters
# set in another cell of the notebook (a variant of the settings appears further below)
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm, return_sequences=True)
gru_layer = GRU(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)  # defined but not used in this variant
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)
x2 = LSTM(num_lstm//2, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(x1)
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)
y2 = LSTM(num_lstm//2, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(y1)
merged = concatenate([x2, y2])
merged = Dropout(rate_drop_dense)(merged)
merged = Activation('sigmoid')(merged)
merged = BatchNormalization()(merged)
merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
class_weight = {0: 1.309028344, 1: 0.472001959}
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
print(model.summary())
print(STAMP)
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(
    bst_model_path, save_best_only=True, save_weights_only=True)
In [41]:
hist = model.fit(
    [data_1_train, data_2_train],
    labels_train,
    validation_data=([data_1_val, data_2_val], labels_val, weight_val),
    epochs=10,
    batch_size=4092,
    shuffle=True,
    class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint])
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
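Since matplotlib is already imported, the curves stored in `hist.history` can be plotted to see where early stopping kicked in (a small addition, not in the original notebook).
In [ ]:
# Added sketch: visualize training vs. validation loss from the fit history
plt.plot(hist.history['loss'], label='train loss')
plt.plot(hist.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.ylabel('binary cross-entropy')
plt.legend()
plt.show()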
In [ ]:
num_lstm = 210        # int(np.random.randint(175, 230)) # 175-275
rate_drop_lstm = 0.3  # 0.15 + np.random.rand() * 0.25
num_dense = 135       # np.random.randint(100, 150)
class_weight = {0: 1, 1: 0.282}  # {0: r*rtrain, 1: 1} # {0: 1.309028344, 1: 0.472001959}
first_lstm = {'num_lstm': num_lstm, 'rate_drop_lstm': rate_drop_lstm}
activation = 'relu'
drop_rate = 0.3
dropout = 0.3
recurrent_dropout = 0.3
rate_drop_lstm = first_lstm['rate_drop_lstm']
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
lstm_layer = LSTM(first_lstm['num_lstm'], dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,
                  return_sequences=True)
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
x = lstm_layer(embedded_sequences_1)
y = lstm_layer(embedded_sequences_2)
x = LSTM(num_lstm//2, dropout=dropout, recurrent_dropout=recurrent_dropout)(x)
y = LSTM(num_lstm//2, dropout=dropout, recurrent_dropout=recurrent_dropout)(y)
x = Dropout(drop_rate)(x)
y = Dropout(drop_rate)(y)
merged = concatenate([x, y])
merged = Dropout(drop_rate)(merged)
merged = BatchNormalization()(merged)
merged = Dense(num_dense, activation=activation)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(drop_rate)(merged)
merged = Dense(num_dense, activation=activation)(merged)
merged = Dropout(drop_rate)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
print(model.summary())
In [13]:
# load weights
model.load_weights("lstm_dense_dropout_rmsprop_True_relu_2048_0.3_210_0.3.h5")
# Compile the model (needed before evaluating or continuing training)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Created model and loaded weights from file")
In [53]:
from keras.models import load_model
model = load_model('./lstm_dense_dropout_rmsprop_True_relu_2048_0.3_210_0.3.h5')
In [ ]:
test_ids = test_df['test_id'].values
In [ ]:
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2
In [ ]:
submission = pd.DataFrame({'test_id': test_ids, 'is_duplicate': preds.ravel()})
submission.to_csv('submissionNN.csv', index=False)
In [ ]: