https://www.kaggle.com/juanumusic/days-instead-of-dates-lgbm-0-66870
In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import cPickle as pickle
import gc
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
%matplotlib inline
INPUT_DATA_PATH = 'input/'
def make_pickle(file_name, data, force=False):
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
    if os.path.exists(file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % file_name)
    else:
        print('Pickling %s.' % file_name)
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    return file_name
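# A minimal usage sketch of the helper above (file name and payload are
# illustrative, not part of the original pipeline):
# df_dummy = pd.DataFrame({'a': [1, 2, 3]})
# make_pickle('pickle/df_dummy', df_dummy)              # skipped if already present
# make_pickle('pickle/df_dummy', df_dummy, force=True)  # forces an overwrite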
# draw numeric column plot
def draw_scatter_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(8, 6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_dist_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(12, 8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()
def draw_np_array_scatter_plot(np_array, col_name):
    plt.figure(figsize=(8, 6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_np_array_dist_plot(np_array, col_name):
    plt.figure(figsize=(12, 8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()
# draw category column plot
def draw_category_col(df, col):
    print('null count : {}'.format(df[col].isnull().sum()))
    display(df[col].value_counts())
    draw_count_plot(df, col)
    draw_bar_plot(df, col, 'target')
    draw_factor_count_plot(df, col, 'target')
def draw_count_plot(df, col_name, title='plot'):
    plt.figure(figsize=(12, 8))
    sns.countplot(data=df, x=col_name)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.title(title, fontsize=15)
    plt.show()
def draw_box_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()
def draw_violin_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.violinplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()
def draw_factor_count_plot(df, x_col, y_col):
    g = sns.factorplot(y_col, col=x_col, data=df, size=3,
                       palette="muted", kind='count', col_wrap=4, aspect=.8)
    g.despine(left=True)
    g.set_ylabels(y_col)
    g.set_titles("{col_name}")
    g.set_xlabels("")
    plt.xticks(rotation='vertical')
def draw_bar_plot(df, x_col, y_col):
    plt.figure(figsize=(12, 8))
    sns.barplot(x=x_col, y=y_col, data=df, palette="muted")
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()
# misc feature-engineering helpers
def category_to_numeric(df, column_name):
    # Drop any dummy columns left over from a previous call
    for category in df[column_name].unique():
        category_column = column_name + '_' + str(category)
        if category_column in df.columns:
            df = df.drop(category_column, axis=1)
    df = pd.concat([df, pd.get_dummies(df[column_name], prefix=column_name)], axis=1)
    return df
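# For intuition, a toy run of category_to_numeric (values illustrative):
# toy = pd.DataFrame({'gender': ['male', 'female', 'male']})
# print(category_to_numeric(toy, 'gender'))
#    gender  gender_female  gender_male
# 0    male              0            1
# 1  female              1            0
# 2    male              0            1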
def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    np_array = df[col_name].values
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    # Clip values beyond the percentile limits; use df.loc rather than chained
    # indexing, which raises SettingWithCopyWarning and may not write back
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit
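# Note: this is plain winsorization; the same effect can be had with
# Series.clip. A sketch, where df/col_name stand for any frame and column:
# lo, hi = np.percentile(df[col_name].values, [1.0, 99.0])
# df[col_name] = df[col_name].clip(lo, hi)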
In [11]:
df_test = pd.read_csv(INPUT_DATA_PATH + 'test.csv',
                      dtype={'msno': 'category',
                             'source_system_tab': 'category',
                             'source_screen_name': 'category',
                             'source_type': 'category',
                             'song_id': 'category'})
df_train = pd.read_csv(INPUT_DATA_PATH + 'train.csv',
                       dtype={'msno': 'category',
                              'source_system_tab': 'category',
                              'source_screen_name': 'category',
                              'source_type': 'category',
                              'target': np.uint8,
                              'song_id': 'category'})
In [14]:
draw_category_col(df_train, 'source_screen_name')
In [15]:
draw_category_col(df_train, 'source_system_tab')
In [16]:
draw_category_col(df_train, 'source_type')
In [9]:
df_members = pd.read_csv(INPUT_DATA_PATH + 'members.csv',
                         dtype={'city': 'category',
                                'bd': np.uint8,
                                'gender': 'category',
                                'registered_via': 'category'},
                         parse_dates=['registration_init_time', 'expiration_date'])
# Convert the registration/expiration dates to a membership length in days
df_members['membership_days'] = (df_members['expiration_date']
                                 - df_members['registration_init_time']).dt.days.astype(int)
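# Quick sanity check of the day arithmetic (dates are illustrative):
# delta = pd.Timestamp('2017-09-30') - pd.Timestamp('2011-08-20')
# print(delta.days)  # 2233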
In [10]:
df_members['registration_init_time_year'] = df_members['registration_init_time'].dt.year
df_members['registration_init_time_month'] = df_members['registration_init_time'].dt.month
df_members['registration_init_time_day'] = df_members['registration_init_time'].dt.day
df_members['expiration_date_year'] = df_members['expiration_date'].dt.year
df_members['expiration_date_month'] = df_members['expiration_date'].dt.month
df_members['expiration_date_day'] = df_members['expiration_date'].dt.day
df_members = df_members.drop(['registration_init_time', 'expiration_date'], axis=1)
display(df_members)
make_pickle('pickle/df_members', df_members)
Out[10]:
In [12]:
with open('pickle/df_members', 'rb') as f:
    df_members = pickle.load(f)
print('load df_members finished')
# Merge the members dataframe into the test dataframe
df_test = pd.merge(left=df_test, right=df_members, how='left', on='msno')
df_test.msno = df_test.msno.astype('category')
print('merge df_test + df_members finished')
# Merge the members dataframe into the train dataframe
df_train = pd.merge(left=df_train, right=df_members, how='left', on='msno')
df_train.msno = df_train.msno.astype('category')
print('merge df_train + df_members finished')
# Release memory
del df_members
In [13]:
df_train.head()
Out[13]:
In [92]:
draw_category_col(df_train, 'city')
In [94]:
draw_category_col(df_train, 'gender')
In [95]:
draw_category_col(df_train, 'registered_via')
In [14]:
draw_dist_plot(df_train, 'membership_days')
display(df_train[df_train['membership_days'] < 0])
# Drop rows whose membership length is not positive (bad date pairs)
df_train = df_train[df_train['membership_days'] > 0]
In [15]:
# Load the songs dataframe
df_songs = pd.read_csv(INPUT_DATA_PATH + 'songs.csv',
                       dtype={'genre_ids': 'category',
                              'language': 'category',
                              'artist_name': 'category',
                              'composer': 'category',
                              'lyricist': 'category',
                              'song_id': 'category'})
# Merge the test dataframe with the songs dataframe
df_test = pd.merge(left=df_test, right=df_songs, how='left', on='song_id')
# Fill missing lengths with 200,000 ms (~3.3 minutes) as a rough typical value
df_test.song_length.fillna(200000, inplace=True)
df_test.song_length = df_test.song_length.astype(np.uint32)
df_test.song_id = df_test.song_id.astype('category')
# Merge the train dataframe with the songs dataframe
df_train = pd.merge(left=df_train, right=df_songs, how='left', on='song_id')
df_train.song_length.fillna(200000, inplace=True)
df_train.song_length = df_train.song_length.astype(np.uint32)
df_train.song_id = df_train.song_id.astype('category')
# Release memory
del df_songs
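# A data-driven alternative to the hard-coded 200000 ms would be the train
# median; a sketch of that variant, not what this kernel actually does:
# median_length = df_train['song_length'].median()
# df_test['song_length'] = df_test['song_length'].fillna(median_length)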
In [12]:
draw_category_col(df_train, 'language')
In [6]:
def draw_count_mean(df_train, col, thresholds=[10]):
    # Per-category play count and mean target (repeat-listen rate)
    count_per_artist = df_train[[col, 'target']] \
        .groupby(col) \
        .count() \
        .reset_index()
    mean_per_artist = df_train[[col, 'target']] \
        .groupby(col) \
        .mean() \
        .reset_index()
    artist_stat_df = count_per_artist.merge(mean_per_artist, on=col)
    artist_stat_df.columns = [col, 'count', 'avg']
    print('total')
    artist_stat_df.hist()
    plt.show()
    for threshold in thresholds:
        print('count >= {}'.format(threshold))
        artist_stat_df[artist_stat_df['count'] >= threshold].hist()
        plt.show()
In [7]:
draw_count_mean(df_train, 'artist_name')
In [8]:
draw_count_mean(df_train, 'msno')
In [9]:
draw_count_mean(df_train, 'song_id')
In [10]:
draw_count_mean(df_train, 'genre_ids')
In [11]:
draw_count_mean(df_train, 'composer')
In [12]:
draw_count_mean(df_train, 'lyricist')
In [6]:
def get_count_mean_std_per_category(df_train, col):
    # Per-category target count, mean and std; works for any category column
    count_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .count() \
        .reset_index()
    mean_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .mean() \
        .reset_index()
    std_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .std() \
        .reset_index()
    category_stat_df = count_per_category.merge(mean_per_category, on=col)
    category_stat_df = category_stat_df.merge(std_per_category, on=col)
    category_stat_df.columns = [col, col + '_count', col + '_avg', col + '_std']
    gc.collect()
    # Keep only categories with at least 100 observations
    return category_stat_df[category_stat_df[col + '_count'] >= 100]
def add_category_column_count_mean_std(df1, df2, col):
    # Compute the statistics on df1 (train) and merge them into both frames,
    # dropping any stale columns from a previous call first
    category_df = get_count_mean_std_per_category(df1, col)
    for df in (df1, df2):
        for suffix in ('_count', '_avg', '_std'):
            if col + suffix in df:
                df.drop(col + suffix, axis=1, inplace=True)
    df1 = df1.merge(category_df, on=col, how='left')
    df2 = df2.merge(category_df, on=col, how='left')
    gc.collect()
    return df1, df2
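# Caveat: the statistics above are computed on the full training set, so each
# row's own label leaks into its *_avg feature. A common mitigation (not used
# here) is out-of-fold mean encoding; a minimal sketch:
def oof_mean_encode(df, col, target='target', n_splits=5):
    oof = np.zeros(len(df))
    for fit_idx, enc_idx in KFold(n_splits=n_splits).split(df):
        # Fit the category means on one part, encode the held-out part
        means = df.iloc[fit_idx].groupby(col)[target].mean()
        oof[enc_idx] = df.iloc[enc_idx][col].map(means).values
    return oof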
In [7]:
df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'artist_name')
df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'msno')
# df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'song_id')
# df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'genre_ids')
# df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'composer')
# df_train, df_test = add_category_column_count_mean_std(df_train, df_test, 'lyricist')
gc.collect()
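# The two calls above should add artist_name_count/_avg/_std and
# msno_count/_avg/_std to both frames; a quick check:
# print([c for c in df_train.columns if c.endswith(('_count', '_avg', '_std'))])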
Out[7]:
In [4]:
print(df_train.columns)
In [5]:
# Load the song extra info (song names and ISRC codes)
df_song_extra_info = pd.read_csv(INPUT_DATA_PATH + 'song_extra_info.csv')
print(df_song_extra_info.head())
In [6]:
df_song_extra_info['isrc_cc'] = df_song_extra_info.isrc.str.slice(0, 2)    # country code
df_song_extra_info['isrc_xxx'] = df_song_extra_info.isrc.str.slice(2, 5)   # ISRC issuer (registrant code)
df_song_extra_info['isrc_yy'] = df_song_extra_info.isrc.str.slice(5, 7).astype(float)  # two-digit reference year
df_song_extra_info['isrc_id'] = df_song_extra_info.isrc.str.slice(7)       # ISRC designation code
gc.collect()
print(df_song_extra_info.head())
In [7]:
# Expand the two-digit year: values above 17 are 1900s songs, the rest 2000s.
# (The order matters: the 1900s rule must run first.)
df_song_extra_info.loc[df_song_extra_info['isrc_yy'] > 17, 'isrc_yy'] += 1900
df_song_extra_info.loc[df_song_extra_info['isrc_yy'] < 18, 'isrc_yy'] += 2000
df_song_extra_info.rename(columns={'isrc_yy': 'isrc_yyyy'}, inplace=True)
print(df_song_extra_info.head())
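# Worked example of the slicing and year pivot on a made-up ISRC:
# 'USABC9512345' -> cc='US', xxx='ABC', yy=95 -> 1995, id='12345'
# isrc = 'USABC9512345'
# yy = float(isrc[5:7])
# yyyy = yy + 1900 if yy > 17 else yy + 2000
# print(isrc[0:2], isrc[2:5], yyyy, isrc[7:])  # US ABC 1995.0 12345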
In [8]:
df_train = df_train.merge(df_song_extra_info, on='song_id', how='left')
df_test = df_test.merge(df_song_extra_info, on='song_id', how='left')
In [11]:
df_train.msno = df_train.msno.astype('category')
df_train.song_id = df_train.song_id.astype('category')
df_train.isrc_cc = df_train.isrc_cc.astype('category')
df_train.isrc_xxx = df_train.isrc_xxx.astype('category')
df_test.msno = df_test.msno.astype('category')
df_test.song_id = df_test.song_id.astype('category')
df_test.isrc_cc = df_test.isrc_cc.astype('category')
df_test.isrc_xxx = df_test.isrc_xxx.astype('category')
In [6]:
# .corr() only uses numeric columns, so this effectively shows target vs. isrc_yyyy
sns.heatmap(df_train[['target', 'isrc_cc', 'isrc_xxx', 'isrc_yyyy']].corr(),
            cmap='coolwarm', center=0)
Out[6]:
In [4]:
# Load the collaborative-filtering result
with open('pickle/cf_result_df_train_pred', 'rb') as f:
    cf_df_train = pickle.load(f)
with open('pickle/cf_result_df_test_pred', 'rb') as f:
    cf_df_test = pickle.load(f)
In [5]:
df_train_org = df_train
df_test_org = df_test
In [6]:
cf_df_train['cf_result'] = cf_df_train['prediction']
cf_df_test['cf_result'] = cf_df_test['prediction']
In [7]:
# Attach the CF result to df_train and df_test as a feature
df_train = df_train_org.merge(cf_df_train[['msno', 'song_id', 'cf_result']],
                              on=['msno', 'song_id'], how='left')
df_test = df_test_org.merge(cf_df_test[['msno', 'song_id', 'cf_result']],
                            on=['msno', 'song_id'], how='left')
In [8]:
df_train.msno = df_train.msno.astype('category')
df_train.song_id = df_train.song_id.astype('category')
df_test.msno = df_test.msno.astype('category')
df_test.song_id = df_test.song_id.astype('category')
In [9]:
print(len(df_train), len(df_test))
In [4]:
sns.heatmap(df_train.corr(), cmap='coolwarm', center=0)
Out[4]:
In [82]:
def draw_heatmap(df, col):
    sns.heatmap(category_to_numeric(df[[col, 'target']], col).corr(),
                cmap='coolwarm', center=0)
    plt.show()
draw_heatmap(df_train, 'source_system_tab')
draw_heatmap(df_train, 'source_screen_name')
draw_heatmap(df_train, 'source_type')
draw_heatmap(df_train, 'language')
In [12]:
make_pickle('pickle/df_train', df_train, force=True)
make_pickle('pickle/df_test', df_test, force=True)
Out[12]:
In [4]:
df_train.drop(['name', 'isrc', 'isrc_id'], axis=1, inplace=True)
df_test.drop(['name', 'isrc', 'isrc_id'], axis=1, inplace=True)
In [3]:
with open('pickle/df_train', 'rb') as f:
    df_train = pickle.load(f)
with open('pickle/df_test', 'rb') as f:
    df_test = pickle.load(f)
In [11]:
gc.collect()
# Create a cross-validation with n splits
n_splits = 10
kf = KFold(n_splits=n_splits)
# This array will accumulate the predictions made on each fold.
predictions = np.zeros(shape=[len(df_test)])
# For each KFold
for train_indices, validate_indices in kf.split(df_train):
    # KFold yields positional indices, so index with .iloc
    train_data = lgb.Dataset(df_train.drop(['target'], axis=1).iloc[train_indices],
                             label=df_train['target'].iloc[train_indices])
    val_data = lgb.Dataset(df_train.drop(['target'], axis=1).iloc[validate_indices],
                           label=df_train['target'].iloc[validate_indices])
    # Create the parameters for LightGBM
    params = {
        'verbose': 1,
        'objective': 'binary',
        'metric': 'auc',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 2048,
        'max_bin': 512,
        'max_depth': 30,
        'min_data_in_leaf': 256,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'num_rounds': 1000,
        'num_threads': 8,
    }
    # Train the model with early stopping on the validation fold
    bst = lgb.train(params, train_data, valid_sets=[val_data],
                    early_stopping_rounds=20, verbose_eval=10)
    # Accumulate this fold's test predictions, using the best iteration found
    predictions += bst.predict(df_test.drop(['id'], axis=1),
                               num_iteration=bst.best_iteration)
    # draw feature importance
    # lgb.plot_importance(bst)
    # plt.show()
    # Release the model from memory for the next iteration
    del bst
    del train_data
    del val_data
    gc.collect()
print('Training process finished. Generating output...')
# Average the accumulated predictions over the n_splits folds
predictions = predictions / n_splits
# Read the sample_submission CSV
submission = pd.read_csv(INPUT_DATA_PATH + 'sample_submission.csv')
# Set the target to our predictions
submission.target = predictions
# Save the submission file
submission.to_csv('lgbm-66870/submission.csv', index=False)
print('Output created.')
In [ ]: