code from https://www.kaggle.com/kamilkk/simple-fast-lgbm-0-66683

Submission history: 0.66683 (initial)


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm

# Read the four CSV tables (train, test, songs, members) from the input directory.
print('Loading data...')
data_path = 'input/'
train, test, songs, members = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'songs.csv', 'members.csv')
)

print('Data preprocessing...')

# Left-join the selected song attributes onto every train/test row by 'song_id',
# so rows whose song is absent from the songs table are kept (with missing values).
song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train = pd.merge(train, songs[song_cols], how='left', on='song_id')
test = pd.merge(test, songs[song_cols], how='left', on='song_id')

# Break the two date columns into year / month / day-of-month features by slicing
# the text form of each value at character positions [0:4], [4:6] and [6:8].
# The vectorized `.str` accessor slices the whole column at once instead of
# running a Python lambda for every row; the resulting values are unchanged.
registration_text = members['registration_init_time'].astype(str)
members['registration_year'] = registration_text.str[0:4].astype('int64')
members['registration_month'] = registration_text.str[4:6].astype('int64')
members['registration_date'] = registration_text.str[6:8].astype('int64')

expiration_text = members['expiration_date'].astype(str)
members['expiration_year'] = expiration_text.str[0:4].astype('int64')
members['expiration_month'] = expiration_text.str[4:6].astype('int64')
# NOTE: 'expiration_date' is overwritten here and from now on holds only the
# [6:8] (day) part; year and month were extracted above from the saved text.
members['expiration_date'] = expiration_text.str[6:8].astype('int64')

# The raw registration timestamp is fully represented by the three new columns.
members = members.drop(['registration_init_time'], axis=1)

# Free the temporary string copies of the date columns.
del registration_text, expiration_text

# Left-join every member column onto the train/test rows by 'msno'.
members_cols = members.columns
train = pd.merge(train, members[members_cols], how='left', on='msno')
test = pd.merge(test, members[members_cols], how='left', on='msno')

# Replace every missing cell (including those left by the joins) with -1.
train = train.fillna(value=-1)
test = test.fillna(value=-1)

import gc

# The lookup tables are merged in; drop them and reclaim their memory.
del songs
del members
gc.collect()

# Every train column except the label is a candidate for encoding.
cols = train.columns.tolist()
cols.remove('target')

# Integer-encode each object-typed column. The encoder is fitted on the values
# seen in train and test together so that both frames share one mapping.
for col in tqdm(cols):
    if train[col].dtype != 'object':
        continue

    # Stringify first so mixed content (e.g. the -1 fill value) is comparable.
    train[col] = train[col].map(str)
    test[col] = test[col].map(str)

    le = LabelEncoder()
    le.fit(list(train[col].unique()) + list(test[col].unique()))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Training labels and feature matrix (all columns except 'target').
y = train['target'].values
X = np.array(train.drop('target', axis=1))

# Test row ids and feature matrix (all columns except 'id').
ids = test['id'].values
X_test = np.array(test.drop('id', axis=1))

# The frames are no longer needed once the arrays exist.
del train
del test
gc.collect()

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=12
)

del X
del y
gc.collect()


# Wrap the two splits in LightGBM datasets.
d_train = lgb.Dataset(data=X_train, label=y_train)
d_valid = lgb.Dataset(data=X_valid, label=y_valid)

# Both sets are passed as validation sets so the metric is reported for each.
watchlist = [d_train, d_valid]


print('Training LGBM model...')

# Booster configuration: binary objective evaluated with AUC.
params = {
    'learning_rate': 0.4,
    'application': 'binary',
    'max_depth': 15,
    'num_leaves': 2**8,
    'verbosity': 0,
    'metric': 'auc',
}

# Train for at most 200 rounds, stopping once the validation metric has not
# improved for 10 rounds and logging the metric every 10 rounds.
# LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keywords of
# `lgb.train`; the same behavior is requested through callbacks there. Older
# releases that lack `lgb.log_evaluation` keep using the original keywords.
if hasattr(lgb, 'log_evaluation'):
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=200,
        valid_sets=watchlist,
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=10),
        ],
    )
else:
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=200,
        valid_sets=watchlist,
        early_stopping_rounds=10,
        verbose_eval=10,
    )

import os

print('Making predictions and saving them...')

# Make sure the output directory exists before the (slow) prediction step, so a
# missing folder cannot cause the final write to fail after all the work is done.
submission_path = 'lgbm-66683/submission.csv.gz'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

p_test = model.predict(X_test)

# Two-column submission: the test row id and the predicted value for it.
subm = pd.DataFrame({'id': ids, 'target': p_test})
subm.to_csv(submission_path, compression='gzip', index=False, float_format='%.5f')
print('Done!')


Loading data...
Data preprocessing...
100%|██████████| 19/19 [04:22<00:00, 13.81s/it]
Training LGBM model...
Training until validation scores don't improve for 10 rounds.
[10]	training's auc: 0.711448	valid_1's auc: 0.710435
[20]	training's auc: 0.728443	valid_1's auc: 0.726761
[30]	training's auc: 0.739859	valid_1's auc: 0.737437
[40]	training's auc: 0.749608	valid_1's auc: 0.746279
[50]	training's auc: 0.75565	valid_1's auc: 0.75172
[60]	training's auc: 0.76264	valid_1's auc: 0.758014
[70]	training's auc: 0.767917	valid_1's auc: 0.762533
[80]	training's auc: 0.772212	valid_1's auc: 0.766092
[90]	training's auc: 0.776372	valid_1's auc: 0.769649
[100]	training's auc: 0.780174	valid_1's auc: 0.772931
[110]	training's auc: 0.783211	valid_1's auc: 0.775386
[120]	training's auc: 0.78603	valid_1's auc: 0.777629
[130]	training's auc: 0.788615	valid_1's auc: 0.779748
[140]	training's auc: 0.790999	valid_1's auc: 0.781422
[150]	training's auc: 0.793478	valid_1's auc: 0.783347
[160]	training's auc: 0.795818	valid_1's auc: 0.785116
[170]	training's auc: 0.797215	valid_1's auc: 0.785613
[180]	training's auc: 0.799265	valid_1's auc: 0.787058
[190]	training's auc: 0.801127	valid_1's auc: 0.788348
[200]	training's auc: 0.80268	valid_1's auc: 0.789168
Making predictions and saving them...
Done!

In [ ]: