In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import cPickle as pickle
import gc
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
%matplotlib inline
INPUT_DATA_PATH = 'input/'
def make_pickle(file_name, data, force=False):
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
    if os.path.exists(file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % file_name)
    else:
        print('Pickling %s.' % file_name)
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    return file_name
# draw numeric column plot
def draw_scatter_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(8, 6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()

def draw_dist_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(12, 8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()

def draw_np_array_scatter_plot(np_array, col_name):
    plt.figure(figsize=(8, 6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()

def draw_np_array_dist_plot(np_array, col_name):
    plt.figure(figsize=(12, 8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()
# draw category column plot
def draw_category_col(df, col):
    print('null count : {}'.format(df[col].isnull().sum()))
    display(df[col].value_counts())
    draw_count_plot(df, col)
    draw_bar_plot(df, col, 'target')
    draw_factor_count_plot(df, col, 'target')

def draw_count_plot(df, col_name, title='plot'):
    plt.figure(figsize=(12, 8))
    sns.countplot(data=df, x=col_name)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.title(title, fontsize=15)
    plt.show()

def draw_box_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()

def draw_violin_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.violinplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()

def draw_factor_count_plot(df, x_col, y_col):
    g = sns.factorplot(y_col, col=x_col, data=df, size=3,
                       palette="muted", kind='count', col_wrap=4, aspect=.8)
    g.despine(left=True)
    g.set_ylabels(y_col)
    g.set_titles("{col_name}")
    g.set_xlabels("")
    plt.xticks(rotation='vertical')

def draw_bar_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.barplot(x=x_col, y=y_col, data=df, palette="muted")
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
# etc
def category_to_numeric(df, column_name):
    # Drop any dummy columns left over from a previous call, then append fresh one-hot columns.
    for category in df[column_name].unique():
        category_column = column_name + '_' + str(category)
        if category_column in df.columns:
            df = df.drop(category_column, axis=1)
    df = pd.concat([df, pd.get_dummies(df[column_name], prefix=column_name)], axis=1)
    return df
def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    np_array = df[col_name].values
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    # Clip outliers in place; index with df.loc[mask, col] to avoid chained assignment.
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit
# save param
def save_obj(obj, datetime_key):
    with open('lightgbm/' + datetime_key + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(datetime_key):
    with open('lightgbm/' + datetime_key + '.pkl', 'rb') as f:
        return pickle.load(f)
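A quick, hedged usage sketch of the helpers above: the tiny DataFrame and the column names 'genre' and 'song_length' are made up for illustration only and are not columns assumed to exist in df_train or df_test.
In [ ]:
# Illustration only: synthetic data to show what the helpers do.
demo_df = pd.DataFrame({
    'genre': ['rock', 'pop', 'rock', 'jazz'],
    'song_length': [180.0, 200.0, 5000.0, 190.0],
})
# category_to_numeric appends one-hot columns (genre_jazz, genre_pop, genre_rock)
# while keeping the original 'genre' column in place.
demo_df = category_to_numeric(demo_df, 'genre')
# convert_outlier_value clips 'song_length' to its 1st/99th percentiles in place.
convert_outlier_value(demo_df, 'song_length')
# The plotting helpers take the same (df, column) arguments, e.g.:
draw_dist_plot(demo_df, 'song_length')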
In [5]:
make_pickle('pickle/df_train', df_train, force=True)
make_pickle('pickle/df_test', df_test, force=True)
Out[5]:
In [10]:
with open('pickle/df_train', 'rb') as f:
    df_train = pickle.load(f)
with open('pickle/df_test', 'rb') as f:
    df_test = pickle.load(f)
In [11]:
df_train.dtypes
Out[11]:
In [12]:
columns = list(df_train.columns)
columns.remove('registration_init_time_year')
columns.remove('registration_init_time_month')
columns.remove('registration_init_time_day')
columns.remove('expiration_date_year')
columns.remove('expiration_date_month')
columns.remove('expiration_date_day')
columns.remove('artist_name_count')
columns.remove('artist_name_avg')
columns.remove('artist_name_std')
columns.remove('msno_count')
columns.remove('msno_avg')
columns.remove('msno_std')
columns.remove('isrc_cc')
columns.remove('isrc_xxx')
columns.remove('isrc_yyyy')
test_columns = columns.copy()
test_columns.remove('target')
print(columns)
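As a design note, the repeated columns.remove(...) calls raise ValueError if any listed feature was never created. A hedged equivalent, assuming the same feature names as above, keeps the drop list in one place:
In [ ]:
# Equivalent column selection via a single drop list (illustrative alternative).
drop_cols = [
    'registration_init_time_year', 'registration_init_time_month', 'registration_init_time_day',
    'expiration_date_year', 'expiration_date_month', 'expiration_date_day',
    'artist_name_count', 'artist_name_avg', 'artist_name_std',
    'msno_count', 'msno_avg', 'msno_std',
    'isrc_cc', 'isrc_xxx', 'isrc_yyyy',
]
columns = [c for c in df_train.columns if c not in drop_cols]
test_columns = [c for c in columns if c != 'target']
print(columns)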
In [14]:
gc.collect()
d_train = df_train[columns]
d_test = df_test[test_columns]
# Create a Cross Validation with n splits
n_splits = 10
kf = KFold(n_splits=n_splits)
# This array will store the predictions made.
predictions = np.zeros(shape=[len(d_test)])
import datetime
datetime_key = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
datetime_key = "[{}]_{}".format(n_splits, datetime_key)
# Create the parameters for LGBM
# 'min_data_in_leaf':256,
# params = {
# 'verbose': 1,
# 'objective': 'binary',
# 'metric' : 'auc',
# 'boosting': 'gbdt',
# 'learning_rate': 0.1,
# 'num_leaves': 2048,
# 'max_bin': 1024,
# 'max_depth': 20,
# 'bagging_fraction': 0.95,
# 'bagging_freq': 1,
# 'bagging_seed': 1,
# 'feature_fraction': 0.9,
# 'feature_fraction_seed': 1,
# 'num_rounds': 1000,
# 'num_threads' : 8,
# }
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    'learning_rate': 0.3,
    'min_data_in_leaf': 256,
    'num_leaves': 1024,
    'max_bin': 256,
    'max_depth': 20,
    'verbose': 0,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'num_rounds': 1000,
    'num_threads': 8,
}
# For each KFold split (kf.split yields positional indices, so index with .iloc)
for train_indices, validate_indices in kf.split(d_train):
    train_data = lgb.Dataset(d_train.drop(['target'], axis=1).iloc[train_indices],
                             label=d_train['target'].iloc[train_indices])
    val_data = lgb.Dataset(d_train.drop(['target'], axis=1).iloc[validate_indices],
                           label=d_train['target'].iloc[validate_indices])
    # Train the model
    bst = lgb.train(params, train_data, valid_sets=[val_data],
                    early_stopping_rounds=10, verbose_eval=10)
    # Accumulate this fold's predictions on the test set
    predictions += bst.predict(d_test)
    # draw feature importance
    # lgb.plot_importance(bst)
    # plt.show()
    # Release the model from memory for the next iteration
    del bst
    del train_data
    del val_data
    gc.collect()
print('Training process finished. Generating Output...')
# Average the accumulated fold predictions by dividing by the number of splits.
predictions = predictions / n_splits
# Read the sample_submission CSV
submission = pd.read_csv(INPUT_DATA_PATH + 'sample_submission.csv')
# Set the target to our averaged predictions
submission['target'] = predictions
# Save the submission file
submission.to_csv('lightgbm/{}_submission.csv'.format(datetime_key), index=False)
print('Output created.')
save_obj(params, datetime_key + '_params')
save_obj(d_train.columns, datetime_key + '_columns')
print('param saved')
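To reuse a finished run, the pickled params and column list can be read back with load_obj; the saved_key value below is a placeholder for whatever datetime_key the training cell actually produced.
In [ ]:
# Placeholder key from an earlier run; substitute the real one printed above.
saved_key = '[10]_2018-01-01_00:00:00'
loaded_params = load_obj(saved_key + '_params')
loaded_columns = load_obj(saved_key + '_columns')
print(loaded_params)
print(list(loaded_columns))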
In [ ]: