Baseline experiments (MAX_LEN = 40, EMBEDDING_DIM = 300, BATCH_SIZE = 128, VALID_SPLIT = 0.05, RE_WEIGHT = True)

Architecture                                     | LSTM params                | Dense params       | Best checkpoint         | Epoch time | loss   | acc    | val_loss | val_acc
LSTM(256)                                        | 570368                     | 513                | weights.001-0.3333.hdf5 | 475s       | 0.3051 | 0.7766 | 0.3333   | 0.7743
LSTM(128)                                        | 219648                     | 257                | weights.001-0.3358.hdf5 | 389s       | 0.3218 | 0.7598 | 0.3358   | 0.7611
LSTM(64)                                         | 93440                      | 129                | weights.002-0.3393.hdf5 | 356s       | 0.3138 | 0.7675 | 0.3393   | 0.7641
LSTM(64)*2                                       | 126464 = 93440+33024       | 129                | weights.003-0.3385.hdf5 | 400s       | 0.2892 | 0.7953 | 0.3385   | 0.7695
LSTM(64)*2, BATCH_SIZE 256 ==> 128               | 126464 = 93440+33024       | 129                | weights.002-0.3367.hdf5 | 767s       | 0.2986 | 0.7842 | 0.3367   | 0.7680
LSTM(64)*3                                       | 159488 = 93440+33024+33024 | 129                | weights.002-0.3395.hdf5 | 604s       | 0.3081 | 0.7750 | 0.3395   | 0.7612
BIDIRECT(LSTM(64))                               | 186880                     | 257                | weights.006-0.3417.hdf5 | 193s       | 0.3007 | 0.7826 | 0.3417   | 0.7639
LSTM(64) + BatchNormalization()                  | 93440                      | 129                | weights.003-0.3443.hdf5 | 205s       | 0.2958 | 0.7875 | 0.3443   | 0.7702
LSTM(64, dropout=0.5) + Dropout(0.5)             | 93440                      | 129                | weights.004-0.3344.hdf5 | 198s       | 0.3097 | 0.7715 | 0.3344   | 0.7508
LSTM(64) + DENSE(128)                            | 93440                      | 8256+129           | weights.002-0.2749.hdf5 | 199s       | 0.2354 | 0.8314 | 0.2749   | 0.8093
LSTM(64) + DENSE(64)                             | 93440                      | 8256+65            | weights.002-0.2769.hdf5 | 196s       | 0.2383 | 0.8285 | 0.2769   | 0.8106
LSTM(64) + DENSE(64) + Dropout(0.5)              |                            |                    | weights.008-0.3096.hdf5 | 200s       | 0.2776 | 0.7945 | 0.3096   | 0.7730
LSTM(64, dropout=0.5) + DENSE(64)                |                            |                    | weights.002-0.2788.hdf5 | 197s       | 0.2391 | 0.8271 | 0.2788   | 0.8087
LSTM(64) + DENSE(64)*2                           | 93440                      | 8256+4160+65       | weights.002-0.2733.hdf5 | 198s       | 0.2340 | 0.8354 | 0.2733   | 0.8190
LSTM(64) + DENSE(64)*3                           | 93440                      | 8256+4160+4160+65  | weights.002-0.2723.hdf5 | 203s       | 0.2364 | 0.8329 | 0.2723   | 0.8171
LSTM(64) + DENSE(64), lr 1e-3 ==> 1e-2           |                            |                    | weights.005-0.2899.hdf5 | 198s       | 0.2458 | 0.8240 | 0.2899   | 0.7940
LSTM(64) + DENSE(64), lr 1e-3 ==> 1e-4           |                            |                    | weights.021-0.2894.hdf5 | 194s       | 0.2383 | 0.8289 | 0.2894   | 0.7917
LSTM(128)*3 + DENSE(128)*3                       |                            |                    | weights.001-0.2713.hdf5 | 709s       | 0.2539 | 0.8190 | 0.2713   | 0.8094
LSTM(64) + DENSE(64), EMBEDDING_TRAINABLE = True |                            |                    | weights.000-0.2829.hdf5 | 345s       | 0.2980 | 0.7705 | 0.2829   | 0.8035
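The LSTM and Dense parameter counts in the table follow from the usual Keras formulas (4 gates, each with an input kernel, a recurrent kernel and a bias; Dense is weights plus bias). A small sanity-check sketch, not part of the original run:

def lstm_params(input_dim, units):
    # 4 gates, each with kernel (input_dim x units), recurrent kernel (units x units) and bias
    return 4 * (units * input_dim + units * units + units)

def dense_params(input_dim, units):
    return input_dim * units + units

print(lstm_params(300, 256))     # 570368 -> LSTM(256) on 300-d embeddings
print(lstm_params(300, 64))      # 93440  -> LSTM(64) on 300-d embeddings
print(lstm_params(64, 64))       # 33024  -> each additional stacked LSTM(64)
print(2 * lstm_params(300, 64))  # 186880 -> BIDIRECT(LSTM(64))
print(dense_params(2 * 64, 64))  # 8256   -> Dense(64) on the concatenated 128-d encoding
print(dense_params(64, 1))       # 65     -> final sigmoid Dense(1)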

In [1]:
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import glob  # used by get_best_model() below
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import datetime, time, json, os, math, pickle, sys
from string import punctuation

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers import concatenate, Embedding, Dense, Input, Dropout, Bidirectional, LSTM, BatchNormalization, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras import backend as K


Using TensorFlow backend.

In [2]:
DATA_DIR = '../data/'
MODEL = 'Baseline'
if os.getcwd().split('/')[-1] != MODEL:
    print('WRONG MODEL DIR!!!')
CHECKPOINT_DIR = './checkpoint/'
if not os.path.exists(CHECKPOINT_DIR):
    os.mkdir(CHECKPOINT_DIR)
LOG_DIR = './log/'
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)
OUTPUT_DIR = './output/'
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)
    
MAX_LEN = 40
EMBEDDING_DIM = 300
BATCH_SIZE = 256
VALID_SPLIT = 0.05
RE_WEIGHT = True # whether to re-weight classes to fit the 17.5% share in test set
# VOCAB_SIZE = 10000


def get_best_model(checkpoint_dir = CHECKPOINT_DIR):
    files = glob.glob(checkpoint_dir+'*')
    val_losses = [float(f.split('-')[-1][:-5]) for f in files]
    index = val_losses.index(min(val_losses))
    print('Loading model from checkpoint file ' + files[index])
    model = load_model(files[index])
    model_name = files[index].split('/')[-1]
    print('Loading model Done!')
    return (model, model_name)

In [3]:
trainval_df = pd.read_csv(DATA_DIR+"train.csv")
test_df = pd.read_csv(DATA_DIR+"test.csv")
print(trainval_df.shape)
print(test_df.shape)


(404290, 6)
(2345796, 3)
# Check for any null values
# inds = pd.isnull(trainval_df).any(1).nonzero()[0]
# trainval_df.loc[inds]
# inds = pd.isnull(test_df).any(1).nonzero()[0]
# test_df.loc[inds]

# # Add the string 'empty' to empty strings
# trainval_df = trainval_df.fillna('empty')
# test_df = test_df.fillna('empty')

In [4]:
# data cleaning
abbr_dict={
    "i'm":"i am",
    "'re":" are",
    "'s":" is",
    "'ve":" have",
    "'ll":" will",
    "n't":" not",
}

_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")

# stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
#               'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
#               'Is','If','While','This']
# print('stop_words:', len(stop_words))

# # nltk.download("stopwords")
# stop_words = stopwords.words('english')
# print('stop_words:', len(stop_words))


def text_to_wordlist(text, abbr_dict=None, remove_stop_words=False, stem_words=False):
    
    if isinstance(text,float):
        # turn nan to empty string
        text = ""
    else:
#         Convert words to lower case and split them
#         text = text.lower()

#         # abbreviation replace
#         # Create a regular expression  from the dictionary keys
#         regex = re.compile("(%s)" % "|".join(map(re.escape, abbr_dict.keys())))
#         # For each match, look-up corresponding value in dictionary
#         text = regex.sub(lambda mo: abbr_dict[mo.string[mo.start():mo.end()]], text) 

        words = []
        for space_separated_fragment in text.strip().split():
            words.extend(_WORD_SPLIT.split(space_separated_fragment))
        text = [w for w in words if w]
        text = " ".join(text)

#         Remove punctuation from text
#         text = ''.join([c for c in text if c not in punctuation])

        # Optionally, remove stop words
        if remove_stop_words:
            text = text.split()
            text = [w for w in text if not w in stop_words]
            text = " ".join(text)

        # Optionally, shorten words to their stems
        if stem_words:
            text = text.split()
            stemmer = SnowballStemmer('english')
            stemmed_words = [stemmer.stem(word) for word in text]
            text = " ".join(stemmed_words)
        
    # Return the cleaned text as a single string
    return(text)
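For illustration only (not part of the original run): with the defaults (no stop-word removal, no stemming) the function just separates the punctuation characters matched by _WORD_SPLIT, for example:

print(text_to_wordlist("What's the best way to learn C++?"))
# -> What ' s the best way to learn C++ ?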
trainval_df['len1'] = trainval_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
trainval_df['len2'] = trainval_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)
test_df['len1'] = test_df.apply(lambda row: len(row['question1_WL'].split()), axis=1)
test_df['len2'] = test_df.apply(lambda row: len(row['question2_WL'].split()), axis=1)

lengths = pd.concat([trainval_df['len1'], trainval_df['len2']], axis=0)
print(lengths.describe())
print(np.percentile(lengths, 99.0))
print(np.percentile(lengths, 99.4))
print(np.percentile(lengths, 99.5))
print(np.percentile(lengths, 99.9))

In [5]:
# question to word list by data cleaning

file_name = 'trainval_df.pickle'
if os.path.exists(OUTPUT_DIR+file_name):
    print ('Loading from file '+file_name)
    trainval_df = pd.read_pickle(OUTPUT_DIR+file_name)
else:
    print ('Generating file '+file_name)  
    trainval_df['question1_WL'] = trainval_df.apply(lambda row: text_to_wordlist(row['question1']), axis=1)
    trainval_df['question2_WL'] = trainval_df.apply(lambda row: text_to_wordlist(row['question2']), axis=1)
    trainval_df.to_pickle(OUTPUT_DIR+file_name)      

file_name = 'test_df.pickle'
if os.path.exists(OUTPUT_DIR+file_name):
    print ('Loading from file '+file_name)
    test_df = pd.read_pickle(OUTPUT_DIR+file_name)
else:
    print ('Generating file '+file_name)  
    test_df['question1_WL'] = test_df.apply(lambda row: text_to_wordlist(row['question1']), axis=1)
    test_df['question2_WL'] = test_df.apply(lambda row: text_to_wordlist(row['question2']), axis=1)
    test_df.to_pickle(OUTPUT_DIR+file_name)   
    
# round the training split up to a multiple of 1024; the remainder (~5%) is held out for validation
test_size = trainval_df.shape[0]-int(math.ceil(trainval_df.shape[0]*(1-VALID_SPLIT)/1024)*1024)
train_df, valid_df = train_test_split(trainval_df, test_size=test_size, random_state=1986, stratify=trainval_df['is_duplicate'])


Loading from file trainval_df.pickle
Loading from file test_df.pickle

In [6]:
# tokenize and pad

all_questions = pd.concat([trainval_df['question1_WL'],trainval_df['question2_WL'],test_df['question1_WL'],test_df['question2_WL']], axis=0)
tokenizer = Tokenizer(num_words=None, lower=True)
tokenizer.fit_on_texts(all_questions)
word_index = tokenizer.word_index
nb_words = len(word_index)
print("Words in index: %d" % nb_words) #126355

train_q1 = pad_sequences(tokenizer.texts_to_sequences(train_df['question1_WL']), maxlen = MAX_LEN)
train_q2 = pad_sequences(tokenizer.texts_to_sequences(train_df['question2_WL']), maxlen = MAX_LEN)
valid_q1 = pad_sequences(tokenizer.texts_to_sequences(valid_df['question1_WL']), maxlen = MAX_LEN)
valid_q2 = pad_sequences(tokenizer.texts_to_sequences(valid_df['question2_WL']), maxlen = MAX_LEN)
y_train = train_df.is_duplicate.values
y_valid = valid_df.is_duplicate.values

train_q1_Double = np.vstack((train_q1, train_q2))
train_q2_Double = np.vstack((train_q2, train_q1))
valid_q1_Double = np.vstack((valid_q1, valid_q2))
valid_q2_Double = np.vstack((valid_q2, valid_q1))
y_train_Double = np.hstack((y_train, y_train))
y_valid_Double = np.hstack((y_valid, y_valid))

val_sample_weights = np.ones(len(y_valid_Double))
if RE_WEIGHT:
    class_weight = {0: 1.309028344, 1: 0.472001959}
    val_sample_weights *= 0.472001959
    val_sample_weights[y_valid_Double==0] = 1.309028344
else:
    class_weight = None
    val_sample_weights = None


Words in index: 126355
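The class_weight constants above (1.309028344 for class 0, 0.472001959 for class 1) are consistent with re-scaling the ~37% duplicate share in train.csv towards the 17.5% share quoted in the RE_WEIGHT comment. A rough sketch of that arithmetic, assuming this interpretation (it reproduces the constants only approximately, since the exact test-set share behind them is not given here):

p_train = trainval_df['is_duplicate'].mean()  # ~0.369 duplicate share in train.csv
p_test = 0.175                                # assumed duplicate share in the test set

w_pos = p_test / p_train                      # ~0.47 -> roughly class_weight[1]
w_neg = (1 - p_test) / (1 - p_train)          # ~1.31 -> roughly class_weight[0]
print(w_pos, w_neg)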

In [7]:
# load word_embedding_matrix

W2V = 'glove.840B.300d.txt'
file_name = W2V + '.word_embedding_matrix.pickle'
if os.path.exists(OUTPUT_DIR+file_name):
    print ('Loading from file '+file_name)
    with open(OUTPUT_DIR+file_name, 'rb') as f:
        word_embedding_matrix = pickle.load(f)
else:
    print ('Generating file '+file_name)   
    # Load GloVe to use pretrained vectors
    embeddings_index = {}
    with open(DATA_DIR+'/WordEmbedding/'+W2V) as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    print('Word embeddings:', len(embeddings_index)) #1,505,774

    # Need to use EMBEDDING_DIM for embedding dimensions to match GloVe's vectors.
    nb_words = len(word_index)
    null_embedding_words = []
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            word_embedding_matrix[i] = embedding_vector
        else:
            null_embedding_words.append(word)
    print('Null word embeddings: %d' %len(null_embedding_words)) #43,229

    with open(OUTPUT_DIR+file_name, 'wb') as f:
        pickle.dump(word_embedding_matrix, f)


Loading from file glove.840B.300d.word_embedding_matrix.pickle
word_counts = tokenizer.word_counts
null_embedding_word_counts = { word: word_counts[word] for word in null_embedding_words }
print(sum(null_embedding_word_counts.values())) #454210

word_docs = tokenizer.word_docs
null_embedding_word_docs = { word: word_docs[word] for word in null_embedding_words }
print(sum(null_embedding_word_docs.values())) #446584
# 446584/(404290+2345796)/2 = 0.08119

In [17]:
EMBEDDING_TRAINABLE = False
RNNCELL_SIZE = 64
RNNCELL_LAYERS = 1
RNNCELL_DROPOUT = 0
RNNCELL_RECURRENT_DROPOUT = 0
RNNCELL_BIDIRECT = False
DENSE_SIZE = 64
DENSE_LAYERS = 1
DENSE_DROPOUT = 0

In [18]:
encode_model = Sequential()
encode_model.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_LEN, trainable=EMBEDDING_TRAINABLE))
if RNNCELL_BIDIRECT:
    for i in range(RNNCELL_LAYERS-1):
        encode_model.add(Bidirectional(LSTM(RNNCELL_SIZE, dropout=RNNCELL_DROPOUT, recurrent_dropout=RNNCELL_RECURRENT_DROPOUT, 
                                            unroll=True, implementation=2, return_sequences=True)))
    encode_model.add(Bidirectional(LSTM(RNNCELL_SIZE, dropout=RNNCELL_DROPOUT, recurrent_dropout=RNNCELL_RECURRENT_DROPOUT, 
                                        unroll=True, implementation=2)))
else:
    for i in range(RNNCELL_LAYERS-1):
        encode_model.add(LSTM(RNNCELL_SIZE, dropout=RNNCELL_DROPOUT, recurrent_dropout=RNNCELL_RECURRENT_DROPOUT, 
                              unroll=True, implementation=2, return_sequences=True))
    encode_model.add(LSTM(RNNCELL_SIZE, dropout=RNNCELL_DROPOUT, recurrent_dropout=RNNCELL_RECURRENT_DROPOUT, 
                          unroll=True, implementation=2))

sequence1_input = Input(shape=(MAX_LEN,), name='q1')
sequence2_input = Input(shape=(MAX_LEN,), name='q2')
encoded_1 = encode_model(sequence1_input)
encoded_2 = encode_model(sequence2_input)
merged = concatenate([encoded_1, encoded_2], axis=-1)
merged = Dropout(DENSE_DROPOUT)(merged)
# merged = BatchNormalization()(merged)
for i in range(DENSE_LAYERS):
    merged = Dense(DENSE_SIZE, activation='relu', kernel_initializer='he_normal')(merged)
    merged = Dropout(DENSE_DROPOUT)(merged)
predictions = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[sequence1_input, sequence2_input], outputs=predictions)

In [14]:
encode_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 40, 300)           37906800  
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                93440     
=================================================================
Total params: 38,000,240.0
Trainable params: 38,000,240
Non-trainable params: 0.0
_________________________________________________________________

In [15]:
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
q1 (InputLayer)                  (None, 40)            0                                            
____________________________________________________________________________________________________
q2 (InputLayer)                  (None, 40)            0                                            
____________________________________________________________________________________________________
sequential_2 (Sequential)        (None, 64)            38000240                                     
____________________________________________________________________________________________________
concatenate_2 (Concatenate)      (None, 128)           0                                            
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 128)           0                                            
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 64)            8256                                         
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 64)            0                                            
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 1)             65                                           
====================================================================================================
Total params: 38,008,561.0
Trainable params: 38,008,561.0
Non-trainable params: 0.0
____________________________________________________________________________________________________

In [16]:
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
             EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
             ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
             TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]

print('BATCH_SIZE:', BATCH_SIZE)
model.fit({'q1': train_q1_Double, 'q2': train_q2_Double}, y_train_Double, 
          batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks, 
          validation_data=({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, val_sample_weights), 
          shuffle=True, class_weight=class_weight, initial_epoch=0)


BATCH_SIZE: 256
Train on 770048 samples, validate on 38532 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.28294, saving model to ./checkpoint/weights.000-0.2829.hdf5
345s - loss: 0.2980 - acc: 0.7705 - val_loss: 0.2829 - val_acc: 0.8035
Epoch 2/100
Epoch 00001: val_loss did not improve
341s - loss: 0.1793 - acc: 0.8807 - val_loss: 0.3162 - val_acc: 0.8340
Epoch 3/100
Epoch 00002: val_loss did not improve
341s - loss: 0.1099 - acc: 0.9337 - val_loss: 0.3861 - val_acc: 0.8396
Epoch 4/100
Epoch 00003: val_loss did not improve
340s - loss: 0.0677 - acc: 0.9616 - val_loss: 0.4722 - val_acc: 0.8408
Epoch 5/100
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-16-c844293a0ea5> in <module>()
     11           batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks,
     12           validation_data=({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, val_sample_weights),
---> 13           shuffle=True, class_weight=class_weight, initial_epoch=0)

/usr/local/lib64/python2.7/site-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
   1483                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
   1484                               callback_metrics=callback_metrics,
-> 1485                               initial_epoch=initial_epoch)
   1486 
   1487     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/usr/local/lib64/python2.7/site-packages/keras/engine/training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)
   1138                 batch_logs['size'] = len(batch_ids)
   1139                 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1140                 outs = f(ins_batch)
   1141                 if not isinstance(outs, list):
   1142                     outs = [outs]

/usr/local/lib64/python2.7/site-packages/keras/backend/tensorflow_backend.pyc in __call__(self, inputs)
   2071         session = get_session()
   2072         updated = session.run(self.outputs + [self.updates_op],
-> 2073                               feed_dict=feed_dict)
   2074         return updated[:len(self.outputs)]
   2075 

/usr/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    765     try:
    766       result = self._run(None, fetches, feed_dict, options_ptr,
--> 767                          run_metadata_ptr)
    768       if run_metadata:
    769         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    963     if final_fetches or final_targets:
    964       results = self._do_run(handle, final_targets, final_fetches,
--> 965                              feed_dict_string, options, run_metadata)
    966     else:
    967       results = []

/usr/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1013     if handle is None:
   1014       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1015                            target_list, options, run_metadata)
   1016     else:
   1017       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
   1020   def _do_call(self, fn, *args):
   1021     try:
-> 1022       return fn(*args)
   1023     except errors.OpError as e:
   1024       message = compat.as_text(e.message)

/usr/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1002         return tf_session.TF_Run(session, options,
   1003                                  feed_dict, fetch_list, target_list,
-> 1004                                  status, run_metadata)
   1005 
   1006     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 
#resume training
model, model_name = get_best_model()
# model = load_model(CHECKPOINT_DIR + 'weights.025-0.4508.hdf5')
# model_name = 'weights.025-0.4508.hdf5'
# print('model_name', model_name)

# #try increasing learningrate
# optimizer = Adam(lr=1e-4)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
#              EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
#              ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
#              TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]

print('BATCH_SIZE:', BATCH_SIZE)
model.fit({'q1': train_q1_Double, 'q2': train_q2_Double}, y_train_Double,
          batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks,
          validation_data=({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, val_sample_weights),
          shuffle=True, class_weight=class_weight, initial_epoch=)

In [19]:
model = load_model(CHECKPOINT_DIR + 'weights.002-0.2769.hdf5')
model_name = 'weights.002-0.2769.hdf5'
print('model_name', model_name)
val_loss = model.evaluate({'q1': valid_q1_Double, 'q2': valid_q2_Double}, y_valid_Double, sample_weight=val_sample_weights, batch_size=BATCH_SIZE, verbose=2)
val_loss


model_name weights.002-0.2769.hdf5
Out[19]:
[0.27685219645314557, 0.81057303010965698]

In [21]:
#Create submission
test_q1 = pad_sequences(tokenizer.texts_to_sequences(test_df['question1_WL']), maxlen = MAX_LEN)
test_q2 = pad_sequences(tokenizer.texts_to_sequences(test_df['question2_WL']), maxlen = MAX_LEN)
predictions = model.predict({'q1': test_q1, 'q2': test_q2}, batch_size=BATCH_SIZE, verbose=2)
predictions += model.predict({'q1': test_q2, 'q2': test_q1}, batch_size=BATCH_SIZE, verbose=2)
predictions /= 2

submission = pd.DataFrame(predictions, columns=['is_duplicate'])
submission.insert(0, 'test_id', test_df.test_id)
file_name = MODEL+'_'+model_name+'_LSTM{:d}*{:d}_DENSE{:d}*{:d}_valloss{:.4f}.csv' \
.format(RNNCELL_SIZE,RNNCELL_LAYERS,DENSE_SIZE,DENSE_LAYERS,val_loss[0])
submission.to_csv(OUTPUT_DIR+file_name, index=False)
print(file_name)


Baseline_weights.002-0.2769.hdf5_LSTM64*1_DENSE64*1_valloss0.2769.csv
sys.stdout = open(OUTPUT_DIR+'training_output.txt', 'a')
history = model.fit({'q1': train_q1, 'q2': train_q2}, y_train,
                    batch_size=BATCH_SIZE, epochs=3, verbose=2, callbacks=callbacks,
                    validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid),
                    shuffle=True, initial_epoch=0)
sys.stdout = sys.__stdout__

summary_stats = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                              'train_acc': history.history['acc'],
                              'valid_acc': history.history['val_acc'],
                              'train_loss': history.history['loss'],
                              'valid_loss': history.history['val_loss']})
summary_stats

plt.plot(summary_stats.train_loss) # blue
plt.plot(summary_stats.valid_loss) # green
plt.show()
# Alternative model sketch: two CNN branches and two TimeDistributed-Dense/max-pool branches,
# merged through stacked Dense layers.
# NOTE: beyond the imports in cell 1, this cell also needs:
#   from keras import initializers
#   from keras.layers import Convolution1D, Activation, Flatten, Lambda, Merge
# (Merge is the legacy Keras 1 layer, removed in later Keras 2 releases.)

units = 128          # Number of nodes in the Dense layers
dropout = 0.25       # Percentage of nodes to drop
nb_filter = 32       # Number of filters to use in Convolution1D
filter_length = 3    # Length of filter for Convolution1D

# Initialize weights and biases for the Dense layers
weights = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=2)
bias = 'zeros'

model1 = Sequential()
model1.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_LEN, trainable=False))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model1.add(BatchNormalization())
model1.add(Activation('relu'))
model1.add(Dropout(dropout))
model1.add(Flatten())

model2 = Sequential()
model2.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_LEN, trainable=False))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, padding='same'))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(dropout))
model2.add(Flatten())

model3 = Sequential()
model3.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_LEN, trainable=False))
model3.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model3.add(BatchNormalization())
model3.add(Activation('relu'))
model3.add(Dropout(dropout))
model3.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))

model4 = Sequential()
model4.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_LEN, trainable=False))
model4.add(TimeDistributed(Dense(EMBEDDING_DIM)))
model4.add(BatchNormalization())
model4.add(Activation('relu'))
model4.add(Dropout(dropout))
model4.add(Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, )))

modela = Sequential()
modela.add(Merge([model1, model2], mode='concat'))
modela.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))
modela.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modela.add(BatchNormalization())
modela.add(Activation('relu'))
modela.add(Dropout(dropout))

modelb = Sequential()
modelb.add(Merge([model3, model4], mode='concat'))
modelb.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))
modelb.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
modelb.add(BatchNormalization())
modelb.add(Activation('relu'))
modelb.add(Dropout(dropout))

model = Sequential()
model.add(Merge([modela, modelb], mode='concat'))
model.add(Dense(units*2, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(units, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))
model.add(Dense(1, kernel_initializer=weights, bias_initializer=bias))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))