# Code from https://www.kaggle.com/kamilkk/simple-fast-lgbm-0-66683
# Submission history: 0.66683 (initial)
# In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
print('Loading data...')
data_path = 'input/'
# Read the four input tables from the same directory, in a fixed order so the
# tuple unpacking below lines up with the file names.
train, test, songs, members = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'songs.csv', 'members.csv')
)
print('Data preprocessing...')

# Attach the selected per-song attributes to every train/test row. A left join
# keeps rows whose song_id has no match in songs; their song columns become
# NaN and are filled with -1 at the end of this stage.
song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
train = train.merge(songs[song_cols], on='song_id', how='left')
test = test.merge(songs[song_cols], on='song_id', how='left')

# Split the YYYYMMDD stamps into year / month / day parts. Integer arithmetic
# is applied to the whole column at once instead of calling a Python lambda
# (int -> str -> slice -> int) on every row.
# NOTE(review): assumes both columns are read by pandas as 8-digit integers;
# confirm against members.csv.
registration = members['registration_init_time']
expiration = members['expiration_date']
members['registration_year'] = registration // 10000
members['registration_month'] = registration // 100 % 100
members['registration_date'] = registration % 100
members['expiration_year'] = expiration // 10000
members['expiration_month'] = expiration // 100 % 100
# The raw expiration stamp is overwritten with its day-of-month part; the year
# and month were extracted above from the saved `expiration` series.
members['expiration_date'] = expiration % 100
members = members.drop(['registration_init_time'], axis=1)

# Attach all remaining member columns to every train/test row.
members_cols = members.columns
train = train.merge(members[members_cols], on='msno', how='left')
test = test.merge(members[members_cols], on='msno', how='left')

# Replace every missing value, including those introduced by the left joins,
# with the sentinel -1.
train = train.fillna(-1)
test = test.fillna(-1)
import gc

# The lookup tables have been merged into train/test and are no longer needed.
del members, songs
gc.collect()

# Every train column except the label is considered for encoding.
cols = train.columns.tolist()
cols.remove('target')

for col in tqdm(cols):
    # Only object-typed columns are encoded; everything else is left as-is.
    if train[col].dtype != 'object':
        continue

    # Normalise to strings so that the -1 fill value and real strings are
    # comparable when the encoder sorts its classes.
    train[col] = train[col].map(str)
    test[col] = test[col].map(str)

    # Fit on the distinct values of both frames so that a value present only
    # in test can still be transformed.
    encoder = LabelEncoder()
    encoder.fit(list(train[col].unique()) + list(test[col].unique()))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])
# Separate the label from the training features and the row ids from the test
# features, then release the DataFrames.
y = train['target'].values
X = train.drop('target', axis=1).values
ids = test['id'].values
X_test = test.drop('id', axis=1).values
del train, test
gc.collect()

# Hold out 10% of the training rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=12
)
del X, y
gc.collect()

# Wrap both partitions for LightGBM; both are evaluated during training.
d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, label=y_valid)
watchlist = [d_train, d_valid]
print('Training LGBM model...')

# Booster configuration: binary objective evaluated with AUC.
params = {
    'learning_rate': 0.4,
    'application': 'binary',
    'max_depth': 15,
    'num_leaves': 2 ** 8,
    'verbosity': 0,
    'metric': 'auc',
}

# Train for at most 200 rounds, evaluating on both sets in `watchlist`,
# logging every 10 rounds and stopping after 10 rounds without improvement.
# NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword
# arguments were removed from lgb.train in LightGBM 4.0; newer installs need
# the equivalent callbacks instead - confirm the pinned LightGBM version.
model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=200,
    valid_sets=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10,
)
import os

print('Making predictions and saving them...')
p_test = model.predict(X_test)

# Two-column submission: the test row id and the predicted score.
subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing; otherwise the finished training run would be
# lost to an OSError at the very last step.
submission_path = 'lgbm-66683/submission.csv.gz'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
subm.to_csv(submission_path, compression='gzip', index=False, float_format='%.5f')
print('Done!')
# In [ ]: