In [1]:
# https://www.kaggle.com/lystdo/beat-kkbox-benchmark-without-using-metadata-0-62

In [1]:
########################################
## import packages
########################################

import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD

import h5py


Using TensorFlow backend.
/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

In [2]:
########################################
## load the data
########################################
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
song = pd.read_csv('./input/songs.csv')
song_extra = pd.read_csv('./input/song_extra_info.csv')
member = pd.read_csv('./input/members.csv')
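
A quick shape check of the loaded frames can be useful before going further; a minimal sketch (output omitted, since it depends on the local copy of the data):

In [ ]:
# Illustrative sanity check: row/column counts of each loaded frame.
for name, df in [('train', train), ('test', test), ('song', song),
                 ('song_extra', song_extra), ('member', member)]:
    print(name, df.shape)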

In [3]:
train.dtypes


Out[3]:
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                 int64
dtype: object

In [4]:
test.dtypes


Out[4]:
id                     int64
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
dtype: object

In [5]:
song.dtypes


Out[5]:
song_id         object
song_length      int64
genre_ids       object
artist_name     object
composer        object
lyricist        object
language       float64
dtype: object

In [6]:
song_extra.dtypes


Out[6]:
song_id    object
name       object
isrc       object
dtype: object

In [7]:
member.dtypes


Out[7]:
msno                      object
city                       int64
bd                         int64
gender                    object
registered_via             int64
registration_init_time     int64
expiration_date            int64
dtype: object

In [17]:
########################################
## encoding
########################################
target = train.target
id_test = test.id

def encode_str(train_data, test_data):
    # fit a single LabelEncoder on train + test so ids that appear only in test still get a code
    data_encoder = LabelEncoder()
    data_encoder.fit(train_data.append(test_data))
    t_train_data = data_encoder.transform(train_data)
    t_test_data = data_encoder.transform(test_data)
    return t_train_data, t_test_data

def generate_encoded_data(data_raw, data_test_raw):
    # encode and also return the vocabulary size (max code + 1) needed for the embedding layer
    data, data_test = encode_str(data_raw, data_test_raw)
    data_cnt = int(max(data.max(), data_test.max()) + 1)
    return data, data_test, data_cnt

uid_raw = train.msno
sid_raw = train.song_id

uid_test_raw = test.msno
sid_test_raw = test.song_id

uid, uid_test = encode_str(uid_raw, uid_test_raw)
sid, sid_test = encode_str(sid_raw, sid_test_raw)

u_cnt = int(max(uid.max(), uid_test.max()) + 1)
s_cnt = int(max(sid.max(), sid_test.max()) + 1)
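
The generate_encoded_data helper defined above bundles the same two steps (encode, then take max code + 1 as the embedding vocabulary size), so the last six lines of the cell could equivalently be written as:

In [ ]:
# Equivalent to the manual encode_str + max() calls above, using the helper
# defined in the same cell (not executed in the original run).
uid, uid_test, u_cnt = generate_encoded_data(train.msno, test.msno)
sid, sid_test, s_cnt = generate_encoded_data(train.song_id, test.song_id)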

In [7]:
########################################
## train-validation split
########################################

perm = np.random.permutation(len(train))
trn_cnt = int(len(train) * 0.7)

uid_trn = uid[perm[:trn_cnt]]
sid_trn = sid[perm[:trn_cnt]]
target_trn = target[perm[:trn_cnt]]

uid_val = uid[perm[trn_cnt:]]
sid_val = sid[perm[trn_cnt:]]
target_val = target[perm[trn_cnt:]]
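
No random seed is fixed before the permutation, so the split changes between runs; a minimal reproducible variant (the seed value 42 is an arbitrary choice, not from the original kernel):

In [ ]:
# Reproducible variant of the 70/30 split above; also prints the positive rate
# in each part as a quick balance check (illustrative sketch).
rng = np.random.RandomState(42)
perm = rng.permutation(len(train))
trn_cnt = int(len(train) * 0.7)
print('positive rate train/val: %.4f / %.4f'
      % (target[perm[:trn_cnt]].mean(), target[perm[trn_cnt:]].mean()))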

In [18]:
########################################
## define the model
########################################

def get_model():
    # 64-dimensional embedding tables for users and songs
    user_embeddings = Embedding(u_cnt,
            64,
            embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
            embeddings_regularizer=l2(1e-4),
            input_length=1,
            trainable=True)
    song_embeddings = Embedding(s_cnt,
            64,
            embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
            embeddings_regularizer=l2(1e-4),
            input_length=1,
            trainable=True)

    # look up each id's embedding and drop the length-1 sequence axis
    uid_input = Input(shape=(1,), dtype='int32')
    embedded_usr = user_embeddings(uid_input)
    embedded_usr = Reshape((64,))(embedded_usr)

    sid_input = Input(shape=(1,), dtype='int32')
    embedded_song = song_embeddings(sid_input)
    embedded_song = Reshape((64,))(embedded_song)

    # dot product of the two embeddings (matrix-factorization-style interaction),
    # concatenated with the raw embeddings and fed to an MLP
    preds = dot([embedded_usr, embedded_song], axes=1)
    preds = concatenate([embedded_usr, embedded_song, preds])
    
    preds = Dense(64, activation='relu')(preds)
    preds = Dropout(0.5)(preds)
    
    preds = Dense(128, activation='relu')(preds)
    preds = Dropout(0.5)(preds)
    
    preds = Dense(256, activation='relu')(preds)
    preds = Dropout(0.5)(preds)
    
    preds = Dense(1, activation='sigmoid')(preds)

    model = Model(inputs=[uid_input, sid_input], outputs=preds)
    
    opt = RMSprop(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])

    return model
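
The layer shapes and parameter counts of the model defined above can be inspected before training; a minimal sketch:

In [ ]:
# Inspect the architecture; the two embedding tables (u_cnt x 64 and s_cnt x 64)
# contribute the vast majority of the parameters.
get_model().summary()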

In [19]:
########################################
## train the model
########################################
   
model = get_model()
early_stopping = EarlyStopping(monitor='val_acc', patience=3)
model_path = 'bst_model.h5'
model_checkpoint = ModelCheckpoint(model_path, save_best_only=True,
        save_weights_only=True)

hist = model.fit([uid_trn, sid_trn], target_trn,
        validation_data=([uid_val, sid_val], target_val),
        epochs=100, batch_size=32768, shuffle=True,
        callbacks=[early_stopping, model_checkpoint])
model.load_weights(model_path)

preds_val = model.predict([uid_val, sid_val], batch_size=32768)
val_auc = roc_auc_score(target_val, preds_val)


Train on 5164192 samples, validate on 2213226 samples
Epoch 1/100
5164192/5164192 [==============================] - 12s 2us/step - loss: 1.9932 - acc: 0.6215 - val_loss: 0.6443 - val_acc: 0.6630
Epoch 2/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6352 - acc: 0.6671 - val_loss: 0.6264 - val_acc: 0.6704
Epoch 3/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6250 - acc: 0.6722 - val_loss: 0.6235 - val_acc: 0.6711
Epoch 4/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6197 - acc: 0.6776 - val_loss: 0.6176 - val_acc: 0.6772
Epoch 5/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6156 - acc: 0.6817 - val_loss: 0.6168 - val_acc: 0.6777
Epoch 6/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6126 - acc: 0.6850 - val_loss: 0.6120 - val_acc: 0.6841
Epoch 7/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6094 - acc: 0.6882 - val_loss: 0.6101 - val_acc: 0.6860
Epoch 8/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6066 - acc: 0.6911 - val_loss: 0.6078 - val_acc: 0.6890
Epoch 9/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6044 - acc: 0.6934 - val_loss: 0.6105 - val_acc: 0.6863
Epoch 10/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6022 - acc: 0.6958 - val_loss: 0.6081 - val_acc: 0.6899
Epoch 11/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.6002 - acc: 0.6982 - val_loss: 0.6060 - val_acc: 0.6938
Epoch 12/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5980 - acc: 0.7015 - val_loss: 0.6058 - val_acc: 0.6953
Epoch 13/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5950 - acc: 0.7059 - val_loss: 0.6075 - val_acc: 0.6953
Epoch 14/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5903 - acc: 0.7125 - val_loss: 0.6076 - val_acc: 0.6988
Epoch 15/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5824 - acc: 0.7233 - val_loss: 0.6113 - val_acc: 0.6992
Epoch 16/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5695 - acc: 0.7392 - val_loss: 0.6222 - val_acc: 0.6992
Epoch 17/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5512 - acc: 0.7598 - val_loss: 0.6418 - val_acc: 0.6940
Epoch 18/100
5164192/5164192 [==============================] - 10s 2us/step - loss: 0.5299 - acc: 0.7818 - val_loss: 0.6626 - val_acc: 0.6889
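
The validation AUC computed in the cell above is only used to name the submission file; printing it makes the score visible (value not shown here, as it depends on the run):

In [ ]:
print('validation AUC: %.5f' % val_auc)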

In [16]:
########################################
## make the submission
########################################

preds_test = model.predict([uid_test, sid_test], batch_size=32768, verbose=1)
sub = pd.DataFrame({'id': id_test, 'target': preds_test.ravel()})
sub.to_csv('./keras/sub_%.5f.csv' % val_auc, index=False)


2556790/2556790 [==============================] - 1s 0us/step
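
A short sanity check of the submission before uploading (illustrative; output omitted):

In [ ]:
# The submission should have one row per test id with a probability in [0, 1].
print(sub.shape)
print(sub.head())
print(sub.target.min(), sub.target.max())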

In [ ]: