• 0.66869 : initial
  • 0.67947 : change kfold 3 -> 10, early stopping 10, num_round 1000, num_leaves 256
  • 0.68066 : drop data of users with wrong expiration time, add registration time / expiration time year/month/day
  • 0.68041 : 'learning_rate': 0.3,'min_data_in_leaf':256,'num_leaves': 512,'max_bin': 256,'max_depth': 20,
  • 0.67679 : add cf result 'learning_rate': 0.1,'min_data_in_leaf':512,'num_leaves': 512,'max_bin': 512,'max_depth': 20,
  • 0.67682 : 'learning_rate': 0.3,'min_data_in_leaf':256,'num_leaves': 256,'max_bin': 256,'max_depth': 20,
  • 0.67758 : 'learning_rate': 0.3,'num_leaves': 256,'max_bin': 256,'max_depth': 20,'min_data_in_leaf':default
  • 0.67314 : add msno, artist_name avg/count/std
  • 0.67304 : 'learning_rate': 0.1,'num_leaves': 256,'max_bin': 256,'max_depth': 20,
  • 0.65317 : add song extra info
  • 0.65288 : 'learning_rate': 0.1,'num_leaves': 256,'max_bin': 256,'max_depth': 20, 'min_data_in_leaf':256,
  • 0.65764 : 'learning_rate': 0.1,'num_leaves': 2048,'max_bin': 512,'max_depth': 30, 'min_data_in_leaf':256,
  • 0.64783 : 'learning_rate': 0.1,'num_leaves': 2048,'max_bin': 512,'max_depth': 10, 'min_data_in_leaf':256,

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import cPickle as pickle
import gc
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

%matplotlib inline

INPUT_DATA_PATH = 'input/'

def make_pickle(file_name, data, force=False):
    import os
    # Make sure the directory for file_name exists.
    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir)
        
    if os.path.exists(file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % file_name)
    else:
        print('Pickling %s.' % file_name)
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    
    return file_name

# draw numeric column plot
def draw_scatter_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
def draw_dist_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()

def draw_np_array_scatter_plot(np_array, col_name):
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
def draw_np_array_dist_plot(np_array, col_name):
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()

# draw category column plot
def draw_category_col(df, col):
    print('null count : {}'.format(df[col].isnull().sum()))
    display(df[col].value_counts())
    draw_count_plot(df, col)
    draw_bar_plot(df, col, 'target')
    draw_factor_count_plot(df, col, "target")

def draw_count_plot(df, col_name, title='plot'):
    plt.figure(figsize=(12,8))
    sns.countplot(data=df, x=col_name)
    plt.xticks(rotation='vertical')
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.title(title, fontsize=15)
    plt.show()
    
def draw_box_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.boxplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()
    
def draw_violin_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.violinplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()

def draw_factor_count_plot(df, x_col, y_col):
    g = sns.factorplot(y_col, col=x_col, data=df, size=3, 
                       palette="muted", kind='count', col_wrap=4, aspect=.8)
    g.despine(left=True)
    g.set_ylabels(y_col)
    g.set_titles("{col_name}")
    g.set_xlabels("")
    plt.xticks(rotation='vertical')

def draw_bar_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    g = sns.barplot(x=x_col, y=y_col, data=df, palette="muted")
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)

# misc helpers
def category_to_numeric(df, column_name):
    for category in df[column_name].unique():
        category_column = column_name + '_' + str(category)
        if category_column in df.columns:
            df = df.drop(category_column, axis=1)
    df = pd.concat([df, pd.get_dummies(df[column_name], prefix=column_name)], axis=1)
    return df

def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    np_array = df[col_name].values
    
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    
    # clip values outside the percentile limits (avoid chained indexing)
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit

# save / load params
def save_obj(obj, datetime_key):
    with open('lightgbm/'+ datetime_key + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(datetime_key):
    with open('lightgbm/' + datetime_key + '.pkl', 'rb') as f:
        return pickle.load(f)
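
The helper functions above are not exercised anywhere in this export. As a rough illustration (assuming df_train has already been built and loaded, e.g. from the pickle cells below), the EDA helpers could be called like this:

# Illustrative only: a quick look at one numeric and one categorical column.
draw_dist_plot(df_train, 'song_length')           # histogram of a numeric column
draw_category_col(df_train, 'source_system_tab')  # null count, value counts, count/target plots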

In [5]:
make_pickle('pickle/df_train', df_train, force=True)
make_pickle('pickle/df_test', df_test, force=True)


Pickling pickle/df_train.
Pickling pickle/df_test.
Out[5]:
'pickle/df_test'

In [10]:
with open('pickle/df_train', 'rb') as f:
    df_train = pickle.load(f)
with open('pickle/df_test', 'rb') as f:
    df_test = pickle.load(f)

In [11]:
df_train.dtypes


Out[11]:
msno                            category
song_id                         category
source_system_tab               category
source_screen_name              category
source_type                     category
target                             uint8
city                            category
bd                                 uint8
gender                          category
registered_via                  category
membership_days                    int64
registration_init_time_year        int64
registration_init_time_month       int64
registration_init_time_day         int64
expiration_date_year               int64
expiration_date_month              int64
expiration_date_day                int64
song_length                       uint32
genre_ids                       category
artist_name                     category
composer                        category
lyricist                        category
language                        category
artist_name_count                float64
artist_name_avg                  float64
artist_name_std                  float64
msno_count                       float64
msno_avg                         float64
msno_std                         float64
isrc_cc                         category
isrc_xxx                        category
isrc_yyyy                        float64
dtype: object
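
The preprocessing cells that produced these dtypes are not part of this export. Purely as a hypothetical sketch, the string-valued columns listed above were presumably cast to pandas 'category' dtype along these lines:

# Hypothetical sketch: cast string columns to 'category' so LightGBM can treat them as categorical.
category_columns = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type',
                    'city', 'gender', 'registered_via', 'genre_ids', 'artist_name',
                    'composer', 'lyricist', 'language', 'isrc_cc', 'isrc_xxx']
for col in category_columns:
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype('category')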

In [12]:
columns = list(df_train.columns)


# Drop engineered columns that are not used in this run.
drop_columns = [
    'registration_init_time_year', 'registration_init_time_month', 'registration_init_time_day',
    'expiration_date_year', 'expiration_date_month', 'expiration_date_day',
    'artist_name_count', 'artist_name_avg', 'artist_name_std',
    'msno_count', 'msno_avg', 'msno_std',
    'isrc_cc', 'isrc_xxx', 'isrc_yyyy',
]
for col in drop_columns:
    columns.remove(col)

test_columns = columns.copy()
test_columns.remove('target')

print(columns)


['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type', 'target', 'city', 'bd', 'gender', 'registered_via', 'membership_days', 'song_length', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
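
The training cell below passes these category-dtype frames straight into lgb.Dataset: LightGBM's Python package detects pandas 'category' columns and treats them as categorical features, so no manual label/one-hot encoding is needed. A tiny self-contained sketch of that behaviour on toy data (not the competition data):

# Toy example: LightGBM consumes pandas 'category' columns directly.
import numpy as np
import pandas as pd
import lightgbm as lgb

toy = pd.DataFrame({
    'genre': pd.Series(np.random.choice(['rock', 'pop', 'jazz'], size=200)).astype('category'),
    'length': np.random.randint(120, 300, size=200),
    'target': np.random.randint(0, 2, size=200),
})
toy_data = lgb.Dataset(toy.drop('target', axis=1), label=toy['target'])
toy_model = lgb.train({'objective': 'binary', 'metric': 'auc', 'verbose': -1}, toy_data, num_boost_round=5)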

In [14]:
gc.collect()

d_train = df_train[columns]
d_test = df_test[test_columns]

# Create a cross-validation splitter with n splits
n_splits = 10
kf = KFold(n_splits=n_splits)

# This array will store the predictions made.
predictions = np.zeros(shape=[len(d_test)])

import datetime
datetime_key = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
datetime_key = "[{}]_{}".format(n_splits, datetime_key)

# Create the parameters for LGBM
# 'min_data_in_leaf':256,
# params = {
#     'verbose': 1,
#     'objective': 'binary',
#     'metric' : 'auc',
#     'boosting': 'gbdt',
#     'learning_rate': 0.1,
#     'num_leaves': 2048,
#     'max_bin': 1024,
#     'max_depth': 20,
#     'bagging_fraction': 0.95,
#     'bagging_freq': 1,
#     'bagging_seed': 1,
#     'feature_fraction': 0.9,
#     'feature_fraction_seed': 1,
#     'num_rounds': 1000,
#     'num_threads' : 8,
#     } 
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',

    'learning_rate': 0.3,
    'min_data_in_leaf': 256,
    'num_leaves': 1024,
    'max_bin': 256,
    'max_depth': 20,

    'verbose': 0,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'num_rounds': 1000,
    'num_threads': 8,
    }

# For each KFold split (KFold yields positional indices, so index with .iloc)
X = d_train.drop(['target'], axis=1)
y = d_train['target']
for train_indices, validate_indices in kf.split(d_train):
    train_data = lgb.Dataset(X.iloc[train_indices], label=y.iloc[train_indices])
    val_data = lgb.Dataset(X.iloc[validate_indices], label=y.iloc[validate_indices])
    
    # Train the model
    bst = lgb.train(params, train_data, valid_sets=[val_data],
                   early_stopping_rounds=10, verbose_eval=10)
    
    # Make the predictions storing them on the predictions array
    predictions += bst.predict(d_test)
    
    # draw feature importance
#     lgb.plot_importance(bst)
#     plt.show()
    
    # Release the model from memory for the next iteration
    del bst
    del train_data
    del val_data
    gc.collect()

print('Training process finished. Generating Output...')

# Average the fold predictions by dividing by the number of KFold splits.
predictions = predictions/n_splits

# Read the sample_submission CSV
submission = pd.read_csv(INPUT_DATA_PATH + 'sample_submission.csv')
# Set the target to our predictions
submission['target'] = predictions
# Save the submission file
submission.to_csv('lightgbm/{}_submission.csv'.format(datetime_key),index=False)

print('Output created.')

save_obj(params, datetime_key + '_params')
save_obj(d_train.columns, datetime_key + '_columns')
print('param saved')


/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/engine.py:98: UserWarning: Found `num_rounds` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.777594
[20]	valid_0's auc: 0.785565
[30]	valid_0's auc: 0.789705
[40]	valid_0's auc: 0.792352
[50]	valid_0's auc: 0.793547
[60]	valid_0's auc: 0.794828
[70]	valid_0's auc: 0.795411
[80]	valid_0's auc: 0.795808
[90]	valid_0's auc: 0.796211
[100]	valid_0's auc: 0.796843
[110]	valid_0's auc: 0.796903
[120]	valid_0's auc: 0.798007
[130]	valid_0's auc: 0.798114
[140]	valid_0's auc: 0.798199
[150]	valid_0's auc: 0.798476
[160]	valid_0's auc: 0.798639
Early stopping, best iteration is:
[157]	valid_0's auc: 0.798695
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.763357
[20]	valid_0's auc: 0.772586
[30]	valid_0's auc: 0.776801
[40]	valid_0's auc: 0.77944
[50]	valid_0's auc: 0.780882
[60]	valid_0's auc: 0.782442
[70]	valid_0's auc: 0.783128
[80]	valid_0's auc: 0.783847
[90]	valid_0's auc: 0.784834
[100]	valid_0's auc: 0.785007
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.739463
[20]	valid_0's auc: 0.74717
[30]	valid_0's auc: 0.752606
[40]	valid_0's auc: 0.756233
[50]	valid_0's auc: 0.757509
[60]	valid_0's auc: 0.758749
[70]	valid_0's auc: 0.759613
[80]	valid_0's auc: 0.760806
[90]	valid_0's auc: 0.761853
[100]	valid_0's auc: 0.762504
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.724063
[20]	valid_0's auc: 0.731736
[30]	valid_0's auc: 0.735603
[40]	valid_0's auc: 0.738383
[50]	valid_0's auc: 0.73995
[60]	valid_0's auc: 0.740972
[70]	valid_0's auc: 0.742015
[80]	valid_0's auc: 0.743394
[90]	valid_0's auc: 0.744202
[100]	valid_0's auc: 0.744563
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.7103
[20]	valid_0's auc: 0.718293
[30]	valid_0's auc: 0.722128
[40]	valid_0's auc: 0.724564
[50]	valid_0's auc: 0.726415
[60]	valid_0's auc: 0.728589
[70]	valid_0's auc: 0.729607
[80]	valid_0's auc: 0.73081
[90]	valid_0's auc: 0.73144
[100]	valid_0's auc: 0.732322
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.696564
[20]	valid_0's auc: 0.70373
[30]	valid_0's auc: 0.708036
[40]	valid_0's auc: 0.710588
[50]	valid_0's auc: 0.712451
[60]	valid_0's auc: 0.714064
[70]	valid_0's auc: 0.715029
[80]	valid_0's auc: 0.715849
[90]	valid_0's auc: 0.716853
[100]	valid_0's auc: 0.717101
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.687226
[20]	valid_0's auc: 0.69487
[30]	valid_0's auc: 0.69844
[40]	valid_0's auc: 0.701612
[50]	valid_0's auc: 0.703074
[60]	valid_0's auc: 0.7048
[70]	valid_0's auc: 0.706297
[80]	valid_0's auc: 0.707301
[90]	valid_0's auc: 0.708178
[100]	valid_0's auc: 0.709021
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.6795
[20]	valid_0's auc: 0.686165
[30]	valid_0's auc: 0.690141
[40]	valid_0's auc: 0.692694
[50]	valid_0's auc: 0.694364
[60]	valid_0's auc: 0.696147
[70]	valid_0's auc: 0.697549
[80]	valid_0's auc: 0.698195
[90]	valid_0's auc: 0.699649
[100]	valid_0's auc: 0.700057
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.674164
[20]	valid_0's auc: 0.680915
[30]	valid_0's auc: 0.685656
[40]	valid_0's auc: 0.68843
[50]	valid_0's auc: 0.689879
[60]	valid_0's auc: 0.691809
[70]	valid_0's auc: 0.692801
[80]	valid_0's auc: 0.69372
[90]	valid_0's auc: 0.695156
[100]	valid_0's auc: 0.695689
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's auc: 0.661388
[20]	valid_0's auc: 0.667599
[30]	valid_0's auc: 0.670661
[40]	valid_0's auc: 0.673039
[50]	valid_0's auc: 0.674294
[60]	valid_0's auc: 0.67586
[70]	valid_0's auc: 0.676789
[80]	valid_0's auc: 0.677549
[90]	valid_0's auc: 0.678221
[100]	valid_0's auc: 0.678934
Training process finished. Generating Output...
Output created.
param saved
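
For a later session, the saved parameters and column list can be restored with the load_obj helper defined at the top. The key below is a placeholder for illustration; substitute the actual datetime_key printed when the artifacts were saved:

# Reload a previous run's configuration (placeholder key, for illustration only).
datetime_key = '[10]_2017-12-01_00:00:00'
saved_params = load_obj(datetime_key + '_params')
saved_columns = load_obj(datetime_key + '_columns')
print(saved_params)
print(list(saved_columns))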

In [ ]: