In [1]:
%matplotlib inline
import pandas as pd
import gensim
import bcolz
import keras
import re
import os
import csv
import codecs
import numpy as np


from gensim.models import Word2Vec, KeyedVectors, word2vec
from matplotlib import pyplot as plt
from datetime import datetime
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from bcolz_array_iterator import BcolzArrayIterator
from string import punctuation
from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU, LSTM, Reshape, Conv1D, Conv2D
from keras.layers.merge import concatenate, dot
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, BaseLogger, ProgbarLogger

import sys
np.random.seed(7)
np.set_printoptions(precision=3)
# !pip install gensim nltk keras pandas bcolz h5py


Using TensorFlow backend.

In [2]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
bcolz_chunklen = 64
stem = 0
init_random = 1
stop_words = 0
bcolz_prefix = '%s_%s_%s_%s_%s' % (MAX_SEQUENCE_LENGTH,
                                   bcolz_chunklen,
                                   stem,
                                   init_random,
                                   stop_words)
bcolz_prefix


Out[2]:
'30_64_0_1_0'

In [ ]:
df = pd.read_csv('train.csv.zip')
df = df[df.question1.notnull() & df.question2.notnull()]
print(df.shape)
train_df = df.iloc[:380000]
val_df = df.iloc[380000:]
test_df = pd.read_csv('test.csv.zip')
print(test_df.shape)

In [3]:
# Full (train + val) arrays; the evaluation cells further down use X1, X2 and y.
X1 = bcolz.open('bclz/all/q1_%s' % bcolz_prefix)
X2 = bcolz.open('bclz/all/q2_%s' % bcolz_prefix)
y = bcolz.open('bclz/all/y_%s' % bcolz_chunklen)

X1_train = bcolz.open('bclz/train/q1_%s' % bcolz_prefix)
X2_train = bcolz.open('bclz/train/q2_%s' % bcolz_prefix)
X1_pos_train = bcolz.open('bclz/train/pos1_%s' % bcolz_prefix)
X2_pos_train = bcolz.open('bclz/train/pos2_%s' % bcolz_prefix)
y_train = bcolz.open('bclz/train/y_%s' % bcolz_chunklen)

X1_val = bcolz.open('bclz/val/q1_%s' % bcolz_prefix)
X2_val = bcolz.open('bclz/val/q2_%s' % bcolz_prefix)
X1_pos_val = bcolz.open('bclz/val/pos1_%s' % bcolz_prefix)
X2_pos_val = bcolz.open('bclz/val/pos2_%s' % bcolz_prefix)
y_val = bcolz.open('bclz/val/y_%s' % bcolz_chunklen)

X1_test = bcolz.open('bclz/test/q1_%s' % bcolz_prefix)
X2_test = bcolz.open('bclz/test/q2_%s' % bcolz_prefix)
X1_pos_test = bcolz.open('bclz/test/pos1_%s' % bcolz_prefix)
X2_pos_test = bcolz.open('bclz/test/pos2_%s' % bcolz_prefix)
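
In [ ]:
# Illustrative sketch only (not part of the original run): one plausible way the
# `bclz/*/q1_<prefix>` arrays could have been built. `texts` (a list of question
# strings) and `w2v` (a gensim KeyedVectors with 300-d vectors) are hypothetical
# names here; for the full dataset this would presumably be done in chunks.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
seqs = pad_sequences(tokenizer.texts_to_sequences(texts),
                     maxlen=MAX_SEQUENCE_LENGTH)        # (n, 30) word ids
index_word = {i: w for w, i in tokenizer.word_index.items()}

def embed_row(seq):
    # Map each word id to its 300-d vector; padding/OOV ids become zero vectors.
    return np.stack([w2v[index_word[i]]
                     if i in index_word and index_word[i] in w2v
                     else np.zeros(EMBEDDING_DIM, dtype='float32')
                     for i in seq])

emb = np.stack([embed_row(s) for s in seqs]).astype('float32')  # (n, 30, 300)
bcolz.carray(emb, chunklen=bcolz_chunklen, mode='w',
             rootdir='bclz/train/q1_%s' % bcolz_prefix).flush()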

In [4]:
K.set_epsilon(1e-07)
np.random.seed(1)
STAMP = datetime.now().strftime('att_%m_%d_') + bcolz_prefix
STAMP


Out[4]:
'att_06_04_30_64_0_1_0'

In [8]:
class _Attention(object):
    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
        self.max_length = max_length
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
        self.model.add(
            Dense(nr_hidden, name='attend1',
                kernel_initializer='he_normal', kernel_regularizer=l2(L2),
                input_shape=(nr_hidden,), activation=activation))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='attend2',
            kernel_initializer='he_normal', kernel_regularizer=l2(L2), activation=activation))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent1, sent2):
        def _outer(AB):
            # Pairwise dot products of the transformed token vectors:
            # result[b, i, j] relates token i of sent1 to token j of sent2.
            att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
            return K.permute_dimensions(att_ji, (0, 2, 1))
        return merge(
                [self.model(sent1), self.model(sent2)],
                mode=_outer,
                output_shape=(self.max_length, self.max_length))


class _SoftAlignment(object):
    def __init__(self, max_length, nr_hidden):
        self.max_length = max_length
        self.nr_hidden = nr_hidden

    def __call__(self, sentence, attention, transpose=False):
        def _normalize_attention(attmat):
            att = attmat[0]
            mat = attmat[1]
            if transpose:
                att = K.permute_dimensions(att, (0, 2, 1))
            # Row-wise softmax over the last axis of the 3D attention matrix,
            # written out by hand (with max subtraction for numerical stability).
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            # Weighted sum of the other sentence's token vectors.
            return K.batch_dot(sm_att, mat)
        return merge([attention, sentence], mode=_normalize_attention,
                      output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)


class _Comparison(object):
    def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
        self.words = words
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
        self.model.add(Dense(nr_hidden, name='compare1',
            kernel_initializer='he_normal', kernel_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='compare2',
                        kernel_regularizer=l2(L2), kernel_initializer='he_normal'))
        self.model.add(Activation('relu'))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent, align, **kwargs):
        result = self.model(merge([sent, align], mode='concat'))  # Shape: (i, n)
        avged = GlobalAveragePooling1D()(result)
        maxed = GlobalMaxPooling1D()(result)
        merged = merge([avged, maxed])  # default mode='sum', keeps width nr_hidden
        result = BatchNormalization()(merged)
        return result


class _Entailment(object):
    def __init__(self, nr_hidden, dropout=0.0, L2=0.0):
        self.model = Sequential()
        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
        self.model.add(Dense(nr_hidden, name='entail1',
            kernel_initializer='he_normal', kernel_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='entail2',
            kernel_initializer='he_normal', kernel_regularizer=l2(L2)))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1, name='entail_out', activation='sigmoid',
                        kernel_regularizer=l2(L2), kernel_initializer='zero'))

    def __call__(self, feats1, feats2):
        features = merge([feats1, feats2], mode='concat')
        return self.model(features)
    
    
class _BiRNNEncoding(object):
    def __init__(self, max_length, nr_out, dropout=0.0):
        self.model = Sequential()
        self.model.add(Bidirectional(GRU(nr_out, return_sequences=True,
                                         dropout=dropout, recurrent_dropout=dropout),
                                     input_shape=(max_length, 300)))
        self.model.add(TimeDistributed(Dense(nr_out, activation='relu',
                                             kernel_initializer='he_normal')))
        self.model.add(TimeDistributed(Dropout(0.2)))

    def __call__(self, sentence):
        return self.model(sentence)


class _GlobalSumPooling1D(Layer):
    '''Global sum pooling operation for temporal data (not used by build_att_model below).
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    '''
    def __init__(self, **kwargs):
        super(_GlobalSumPooling1D, self).__init__(**kwargs)
        self.input_spec = [InputSpec(ndim=3)]

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        if mask is not None:
            return K.sum(x * K.clip(mask, 0, 1), axis=1)
        else:
            return K.sum(x, axis=1)
        
        
def build_att_model(shape, settings=None):
    '''Build and compile the decomposable-attention model.'''
    if settings is None:
        settings = {'dropout': 0.2, 'rnn_encode': True}
    max_length, nr_hidden = shape
    # Declare inputs: pre-embedded questions, (max_length, 300) floats each.
    x1 = Input(shape=(max_length, 300), dtype='float32')
    x2 = Input(shape=(max_length, 300), dtype='float32')

    attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
#     embed_dense = TimeDistributed(Dense(nr_hidden, activation=None, bias=False))
    align = _SoftAlignment(max_length, nr_hidden)
    compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
    entail = _Entailment(nr_hidden, dropout=settings['dropout'])
    
#     sent1 = embed_dense(x1)
#     sent2 = embed_dense(x2)
    
    if settings['rnn_encode']:
        encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
        sent1 = encode(x1)
        sent2 = encode(x2)
    else:
        sent1 = x1
        sent2 = x2

    attention = attend(sent1, sent2)

    align1 = align(sent2, attention)
    align2 = align(sent1, attention, transpose=True)

    feats1 = compare(sent1, align1)
    feats2 = compare(sent2, align2)

    scores = entail(feats1, feats2)
    
    # Now that we have the inputs/output, construct the Model object...
    model = Model(inputs=[x1, x2], outputs=scores)

    # ...Compile it...
    model.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['acc'])
    # ...And return it for training.
    return model
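
In [ ]:
# Sanity check (illustrative, plain numpy): the attend/align steps above.
# With F(a), F(b) of shape (batch, L, h), the raw attention matrix is
# F(a) @ F(b)^T and each alignment is a row-softmax of it times the other
# sentence's vectors -- mirroring _Attention._outer and _SoftAlignment.
def np_softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, L, h = 2, 30, 300
fa = np.random.randn(batch, L, h).astype('float32')   # stands in for F(sent1)
fb = np.random.randn(batch, L, h).astype('float32')   # stands in for F(sent2)
att = np.einsum('bih,bjh->bij', fa, fb)               # (batch, L, L)
align1 = np_softmax(att) @ fb                         # sent2 aligned to sent1
align2 = np_softmax(att.transpose(0, 2, 1)) @ fa      # sent1 aligned to sent2
print(att.shape, align1.shape, align2.shape)
# (2, 30, 30) (2, 30, 300) (2, 30, 300)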

In [ ]:
model = build_att_model((30, 300),
                        {'dropout': 0.2,
                         'rnn_encode': False}
                       )
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = 'models/' + STAMP + '{epoch:02d}-{val_loss:.2f}'  + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path,
                                   save_best_only=True,
                                   save_weights_only=True)
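
In [ ]:
# Illustrative: ModelCheckpoint fills the filepath template from the epoch
# number and the monitored metrics, producing names like the checkpoint
# loaded further below.
print(bst_model_path.format(epoch=3, val_loss=0.31))
# models/att_06_04_30_64_0_1_003-0.31.h5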

In [11]:
batch_size = 1024
bclz_iter_train = BcolzArrayIterator([X1_train, X2_train], y_train, batch_size=batch_size, shuffle=True)
bclz_iter_val = BcolzArrayIterator([X1_val, X2_val], y_val, batch_size=batch_size, shuffle=True)
bclz_iter_test = BcolzArrayIterator([X1_test, X2_test], batch_size=64, shuffle=False)
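
In [ ]:
# For reference, a minimal sketch (an assumption, not the imported implementation)
# of what a bcolz-backed batch iterator has to do: slice aligned batches from
# the on-disk carrays and yield Keras-style (inputs, targets) tuples forever.
# The BcolzArrayIterator imported above additionally supports shuffling, which
# bcolz iterators typically do in chunk-sized blocks to keep reads sequential.
def simple_bcolz_batches(Xs, y=None, batch_size=64):
    n = len(Xs[0])
    while True:                       # Keras generators must loop indefinitely
        for start in range(0, n, batch_size):
            sl = slice(start, min(start + batch_size, n))
            xs = [np.array(X[sl]) for X in Xs]
            yield (xs, np.array(y[sl])) if y is not None else xs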

In [ ]:
%%time
    
class_weight = {0: 1.309028344, 1: 0.472001959}  # re-weight toward the estimated test duplicate rate
hist = model.fit_generator(bclz_iter_train, steps_per_epoch=1000, epochs=10,
                           validation_data=bclz_iter_val, validation_steps=100,
                           callbacks=[model_checkpoint,
                                      early_stopping],
                           class_weight=class_weight)
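
In [ ]:
# Where class weights like those above come from (illustrative arithmetic,
# assuming a train duplicate rate near 0.37 and an estimated test rate near
# 0.165, as commonly used for this dataset; the exact constants differ slightly).
p_train, p_test = 0.369, 0.165
w_pos = p_test / p_train                # ~0.447: down-weight positives
w_neg = (1 - p_test) / (1 - p_train)    # ~1.323: up-weight negatives
print({0: round(w_neg, 3), 1: round(w_pos, 3)})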

In [ ]:
!ls -w1 -t models | head -n5

In [12]:
STAMP = 'att_05_24_30_64_0_0_003-0.31'
model.load_weights('models/' + STAMP + '.h5')

In [13]:
%%time
!date
bclz_iter_eval = BcolzArrayIterator([X1, X2], y, batch_size=64, shuffle=False)
pred_iters_eval = int((df.shape[0] / bcolz_chunklen) + 1)  # batch_size == bcolz_chunklen == 64
preds_eval = model.predict_generator(bclz_iter_eval, pred_iters_eval)
df['pred'] = preds_eval[:df.shape[0]]


CPU times: user 2min 24s, sys: 35.6 s, total: 3min
Wall time: 5min 2s
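
In [ ]:
# Illustrative follow-up (not in the original run; sklearn assumed available):
# score the full-set predictions against the training labels.
from sklearn.metrics import log_loss
print('log loss: %.4f' % log_loss(df.is_duplicate, df.pred))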

In [12]:
%%time
# NOTE: cos_model (a separate cosine-distance baseline) is built elsewhere and
# must already exist before running this cell.
bclz_iter_cos = BcolzArrayIterator([X1, X2], y, batch_size=64, shuffle=False)
pred_iters_cos = int((df.shape[0] / bcolz_chunklen) + 1)
preds_cos = cos_model.predict_generator(bclz_iter_cos, pred_iters_cos)[:df.shape[0]]
# df['cos_distance'] = preds_cos


CPU times: user 2min 10s, sys: 27.7 s, total: 2min 38s
Wall time: 5min 52s

In [68]:
%%time
!date
bclz_iter_test = BcolzArrayIterator([X1_test, X2_test], batch_size=64, shuffle=False)
pred_iters = int((test_df.shape[0] / bcolz_chunklen) + 1)
preds = model.predict_generator(bclz_iter_test, pred_iters)
test_df['is_duplicate'] = preds[:test_df.shape[0]]
test_df.index.name = 'test_id'


Wed May 24 12:42:45 UTC 2017
CPU times: user 9min 22s, sys: 4min 24s, total: 13min 47s
Wall time: 18min 44s

In [14]:
def make_submission(data, prefix=STAMP):  # NB: the default is bound to STAMP at definition time
    now = !date
    now = now[0]
    fname = 'submissions/' + prefix + '.csv'
    data.to_csv(fname,
                header=True,
                float_format='%.7f')
    with open("submissions/log.txt", "a") as log_:
        log_.write('%s - %s %s\n' % (now, prefix, STAMP))
    print(fname)

In [65]:
make_submission(test_df.is_duplicate)