In [1]:
# https://www.kaggle.com/lystdo/beat-kkbox-benchmark-without-using-metadata-0-62
In [1]:
########################################
## import packages
########################################
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from keras.models import Model
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Reshape
from keras.layers.merge import concatenate, dot
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.initializers import RandomUniform
from keras.optimizers import RMSprop, Adam, SGD
import h5py
In [2]:
########################################
## load the data
########################################
# Read each input CSV into its own DataFrame. The paths are listed in the
# same order as the names they are unpacked into.
train, test, song, song_extra, member = [
    pd.read_csv(csv_path)
    for csv_path in (
        './input/train.csv',
        './input/test.csv',
        './input/songs.csv',
        './input/song_extra_info.csv',
        './input/members.csv',
    )
]
In [3]:
# Show the dtype pandas inferred for each column of `train`.
train.dtypes
Out[3]:
In [4]:
# Show the dtype pandas inferred for each column of `test`.
test.dtypes
Out[4]:
In [5]:
# Show the dtype pandas inferred for each column of `song`.
song.dtypes
Out[5]:
In [6]:
# Show the dtype pandas inferred for each column of `song_extra`.
song_extra.dtypes
Out[6]:
In [7]:
# Show the dtype pandas inferred for each column of `member`.
member.dtypes
Out[7]:
In [17]:
########################################
## encoding
########################################
# Keep the label column of the training frame and the id column of the test
# frame; both are reused by later cells (split, scoring, submission).
target = train['target']
id_test = test['id']
def encode_str(train_data, test_data):
    """Label-encode a train/test pair of Series with one shared encoder.

    The encoder is fitted on the concatenation of both inputs, so a value
    that occurs in either split is mapped to the same integer code in both.

    Parameters
    ----------
    train_data, test_data : pandas.Series
        Raw values for the two splits.

    Returns
    -------
    tuple
        ``(t_train_data, t_test_data)``: the encoder's output for the train
        and the test values respectively, each in its input order.
    """
    data_encoder = LabelEncoder()
    # ``Series.append`` was removed in pandas 2.0; ``pd.concat`` builds the
    # same combined Series and works on old and new pandas alike.
    data_encoder.fit(pd.concat([train_data, test_data]))
    t_train_data = data_encoder.transform(train_data)
    t_test_data = data_encoder.transform(test_data)
    return t_train_data, t_test_data
def generate_encoded_data(data_raw, data_test_raw):
    """Encode a raw train/test pair and report how many codes are needed.

    Returns ``(data, data_test, data_cnt)``: the two outputs of
    ``encode_str`` followed by the largest code found in either of them
    plus one, converted to a Python ``int``.
    """
    encoded = encode_str(data_raw, data_test_raw)
    highest_code = max(part.max() for part in encoded)
    return encoded[0], encoded[1], int(highest_code + 1)
# Raw user (msno) and song (song_id) columns of each split.
uid_raw, sid_raw = train.msno, train.song_id
uid_test_raw, sid_test_raw = test.msno, test.song_id

# Integer-encode both id columns. ``generate_encoded_data`` also returns the
# largest code + 1, kept here as ``u_cnt`` / ``s_cnt`` for the embedding sizes.
uid, uid_test, u_cnt = generate_encoded_data(uid_raw, uid_test_raw)
sid, sid_test, s_cnt = generate_encoded_data(sid_raw, sid_test_raw)
In [7]:
########################################
## train-validation split
########################################
# A fixed seed keeps the train/validation split, and therefore the validation
# score computed later, identical across "Restart & Run All" executions.
SPLIT_SEED = 42
perm = np.random.RandomState(SPLIT_SEED).permutation(len(train))
trn_cnt = int(len(train) * 0.7)  # the first 70% of the shuffled rows are used for training
trn_idx, val_idx = perm[:trn_cnt], perm[trn_cnt:]

uid_trn, sid_trn = uid[trn_idx], sid[trn_idx]
uid_val, sid_val = uid[val_idx], sid[val_idx]

# ``target`` is a pandas Series while ``perm`` holds row positions, so select
# with ``.iloc``; plain ``target[...]`` would treat the values as index labels
# and only gives the right rows while the index is the default 0..n-1 range.
target_trn = target.iloc[trn_idx]
target_val = target.iloc[val_idx]
In [18]:
########################################
## define the model
########################################
def get_model(embedding_dim=64, hidden_units=(64, 128, 256), dropout_rate=0.5,
              l2_reg=1e-4, learning_rate=1e-3):
    """Build and compile the user/song embedding classifier.

    Two int32 inputs of shape ``(1,)`` (an encoded user id and an encoded
    song id) are each looked up in their own embedding table. The dot product
    of the two embedding vectors is concatenated with the vectors themselves
    and passed through a stack of ``Dense(relu)`` + ``Dropout`` layers,
    ending in a single sigmoid unit.

    The embedding table sizes come from the module-level ``u_cnt`` and
    ``s_cnt``. Called with no arguments, the function builds exactly the
    architecture that was previously hard-coded.

    Parameters
    ----------
    embedding_dim : int
        Width of both embedding tables.
    hidden_units : sequence of int
        Output size of each hidden ``Dense`` layer, in order.
    dropout_rate : float
        Rate of the ``Dropout`` layer that follows every hidden layer.
    l2_reg : float
        L2 regularisation factor applied to both embedding matrices.
    learning_rate : float
        Learning rate handed to the RMSprop optimizer.

    Returns
    -------
    keras.models.Model
        The model compiled with binary cross-entropy loss and the ``'acc'``
        metric; its inputs are ``[uid_input, sid_input]``.
    """
    def embed_ids(vocab_size):
        """Return ``(input_tensor, embedded_vector)`` for one id column."""
        id_input = Input(shape=(1,), dtype='int32')
        embedding = Embedding(vocab_size,
                              embedding_dim,
                              embeddings_initializer=RandomUniform(minval=-0.1, maxval=0.1),
                              embeddings_regularizer=l2(l2_reg),
                              input_length=1,
                              trainable=True)
        # Each sample is a single id (input_length=1); reshape the lookup
        # result to a flat vector of ``embedding_dim`` values.
        return id_input, Reshape((embedding_dim,))(embedding(id_input))

    # The user branch is built before the song branch, as in the original.
    uid_input, embedded_usr = embed_ids(u_cnt)
    sid_input, embedded_song = embed_ids(s_cnt)

    # Feed the user-song dot product alongside both raw embedding vectors.
    preds = dot([embedded_usr, embedded_song], axes=1)
    preds = concatenate([embedded_usr, embedded_song, preds])
    for units in hidden_units:
        preds = Dense(units, activation='relu')(preds)
        preds = Dropout(dropout_rate)(preds)
    preds = Dense(1, activation='sigmoid')(preds)

    model = Model(inputs=[uid_input, sid_input], outputs=preds)
    opt = RMSprop(lr=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])
    return model
In [19]:
########################################
## train the model
########################################
model = get_model()

# Weights of the best epoch are written here and reloaded after training.
model_path = 'bst_model.h5'

# Stop once 'val_acc' has gone 3 epochs without improving.
# NOTE(review): the checkpoint below is given no `monitor` argument, so it may
# be tracking a different quantity than early stopping. Confirm this is intended.
early_stopping = EarlyStopping(monitor='val_acc', patience=3)
model_checkpoint = ModelCheckpoint(
    model_path,
    save_best_only=True,
    save_weights_only=True,
)

hist = model.fit(
    [uid_trn, sid_trn],
    target_trn,
    validation_data=([uid_val, sid_val], target_val),
    epochs=100,
    batch_size=32768,
    shuffle=True,
    callbacks=[early_stopping, model_checkpoint],
)

# Restore the checkpointed weights, then score the held-out validation rows.
model.load_weights(model_path)
preds_val = model.predict([uid_val, sid_val], batch_size=32768)
val_auc = roc_auc_score(target_val, preds_val)
In [16]:
########################################
## make the submission
########################################
# Score the test pairs and pair every prediction with its submission id.
preds_test = model.predict([uid_test, sid_test], batch_size=32768, verbose=1)
sub = pd.DataFrame({'id': id_test, 'target': preds_test.ravel()})

# ``to_csv`` does not create missing directories. Make sure the output folder
# exists so the write cannot fail after the whole training run has finished.
os.makedirs('./keras', exist_ok=True)
# The validation AUC is embedded in the file name to tell submissions apart.
sub.to_csv('./keras/sub_%.5f.csv'%(val_auc), index=False)
In [ ]: