https://www.kaggle.com/juanumusic/days-instead-of-dates-lgbm-0-66870

Submission history

  • 0.66869 : initial
  • 0.67947 : change kfold 3 -> 10, early stopping 10, num_round 1000, num_leaves 256
  • 0.68066 : drop data of users with invalid expiration times; add registration/expiration year/month/day features
  • 0.68041 : 'learning_rate': 0.3, 'min_data_in_leaf': 256, 'num_leaves': 512, 'max_bin': 256, 'max_depth': 20
  • 0.67679 : add collaborative filtering (CF) result; 'learning_rate': 0.1, 'min_data_in_leaf': 512, 'num_leaves': 512, 'max_bin': 512, 'max_depth': 20
  • 0.67682 : 'learning_rate': 0.3, 'min_data_in_leaf': 256, 'num_leaves': 256, 'max_bin': 256, 'max_depth': 20
  • 0.67758 : 'learning_rate': 0.3, 'num_leaves': 256, 'max_bin': 256, 'max_depth': 20, 'min_data_in_leaf': default
  • 0.67314 : add msno and artist_name avg/count/std features
  • 0.67304 : 'learning_rate': 0.1, 'num_leaves': 256, 'max_bin': 256, 'max_depth': 20
  • 0.65317 : add song extra info
  • 0.65288 : 'learning_rate': 0.1, 'num_leaves': 256, 'max_bin': 256, 'max_depth': 20, 'min_data_in_leaf': 256
  • 0.65418 : 'learning_rate': 0.1, 'num_leaves': 512, 'max_bin': 512, 'max_depth': 20, 'min_data_in_leaf': 256

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import cPickle as pickle
import gc
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

%matplotlib inline

INPUT_DATA_PATH = 'input/'

def make_pickle(file_name, data, force=False):
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
        
    if os.path.exists(file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % file_name)
    else:
        print('Pickling %s.' % file_name)
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    
    return file_name
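
# Usage sketch for make_pickle (hypothetical file/data, not from the original run):
#   make_pickle('pickle/example_obj', {'a': 1})              # Pickling pickle/example_obj.
#   make_pickle('pickle/example_obj', {'a': 1})              # already present - Skipping pickling.
#   make_pickle('pickle/example_obj', {'a': 2}, force=True)  # overwrite the existing pickle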

# draw numeric column plot
def draw_scatter_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
def draw_dist_plot(df, col_name):
    np_array = df[col_name].values
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()

def draw_np_array_scatter_plot(np_array, col_name):
    plt.figure(figsize=(8,6))
    plt.scatter(range(len(np_array)), np.sort(np_array))
    plt.xlabel('index', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(col_name, fontsize=12)
    plt.show()
    
def draw_np_array_dist_plot(np_array, col_name):
    plt.figure(figsize=(12,8))
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.show()

# draw category column plot
def draw_category_col(df, col):
    print('null count : {}'.format(df[col].isnull().sum()))
    display(df[col].value_counts())
    draw_count_plot(df, col)
    draw_bar_plot(df, col, 'target')
    draw_factor_count_plot(df, col, "target")

def draw_count_plot(df, col_name, title='plot'):
    plt.figure(figsize=(12,8))
    sns.countplot(data=df, x=col_name)
    plt.xlabel(col_name, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel('count', fontsize=12)
    plt.title(title, fontsize=15)
    plt.show()
    
    
def draw_box_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.boxplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()
    
def draw_violin_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    sns.violinplot(data=df, x=x_col, y=y_col)
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()

def draw_factor_count_plot(df, x_col, y_col):
    g = sns.factorplot(y_col, col=x_col, data=df, size=3, 
                       palette="muted", kind='count', col_wrap=4, aspect=.8)
    g.despine(left=True)
    g.set_ylabels(y_col)
    g.set_titles("{col_name}")
    g.set_xlabels("")
    plt.xticks(rotation='vertical')

def draw_bar_plot(df, x_col, y_col):
    plt.figure(figsize=(12,8))
    g = sns.barplot(x=x_col, y=y_col, data=df, palette="muted")
    plt.xlabel(x_col, fontsize=12)
    plt.xticks(rotation='vertical')
    plt.ylabel(y_col, fontsize=12)
    plt.show()

# etc
def category_to_numeric(df, column_name):
    # drop any previously generated dummy columns, then re-create them with get_dummies
    for category in df[column_name].unique():
        category_column = column_name + '_' + str(category)
        if category_column in df.columns:
            df = df.drop(category_column, axis=1)
    df = pd.concat([df, pd.get_dummies(df[column_name], prefix=column_name)], axis=1)
    return df
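
# Usage sketch (illustrative): category_to_numeric(df_train, 'gender') appends
# one-hot columns such as gender_male / gender_female while keeping the
# original 'gender' column in place.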

def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    np_array = df[col_name].values
    
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    
    # clip outliers in place (df.loc avoids pandas' chained-assignment warning)
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit
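
convert_outlier_value winsorizes a column at the given percentiles. pandas' built-in Series.clip does the same thing in one call; a minimal sketch on toy data (not part of the original pipeline):

In [ ]:
# Same winsorization as convert_outlier_value, via Series.clip (illustrative data).
s = pd.Series([1, 2, 3, 1000])
llimit, ulimit = np.percentile(s.values, [99.0, 1.0])[::-1]
s_clipped = s.clip(llimit, ulimit)
print(s_clipped.tolist())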

In [11]:
df_test = pd.read_csv(INPUT_DATA_PATH + 'test.csv',dtype={'msno' : 'category',
                                                'source_system_tab' : 'category',
                                                'source_screen_name' : 'category',
                                                'source_type' : 'category',
                                                'song_id' : 'category'})

df_train = pd.read_csv(INPUT_DATA_PATH + 'train.csv',dtype={'msno' : 'category',
                                                 'source_system_tab' : 'category',
                                                  'source_screen_name' : 'category',
                                                  'source_type' : 'category',
                                                  'target' : np.uint8,
                                                  'song_id' : 'category'})
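
Reading the high-cardinality id columns as category keeps these frames manageable in memory; a quick check (a sketch, output not shown):

In [ ]:
# category dtype stores each unique string once, plus small integer codes per row
print(df_train.memory_usage(deep=True).sum() / 2**20, 'MB')
print(df_test.memory_usage(deep=True).sum() / 2**20, 'MB')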

In [14]:
draw_category_col(df_train, 'source_screen_name')


null count : 413817
Local playlist more     3227226
Online playlist more    1292861
Radio                    474435
Album more               419612
Search                   297928
Artist more              252333
Discover Feature         243922
Discover Chart           213426
Others profile more      201719
Discover Genre            82137
My library                75980
Explore                   72295
Unknown                   54157
Discover New              15932
Search Trends             13572
Search Home               13482
My library_Search          6451
Self profile more           212
Concert                      47
Payment                      12
Name: source_screen_name, dtype: int64

In [15]:
draw_category_col(df_train, 'source_system_tab')


null count : 18366
my library      3683275
discover        2175531
search           622997
radio            476666
listen with      212198
explore          167668
null               6478
notification       6178
settings           2199
Name: source_system_tab, dtype: int64

In [16]:
draw_category_col(df_train, 'source_type')


null count : 21521
local-library             2260483
online-playlist           1965119
local-playlist            1079312
radio                      483062
album                      476742
top-hits-for-artist        423023
song                       244374
song-based-playlist        210278
listen-with                192764
topic-article-playlist      11194
artist                       3038
my-daily-playlist             646
Name: source_type, dtype: int64

In [9]:
df_members = pd.read_csv(INPUT_DATA_PATH + 'members.csv',dtype={'city' : 'category',
                                                      'bd' : np.uint8,
                                                      'gender' : 'category',
                                                      'registered_via' : 'category'},
                                                      parse_dates=['registration_init_time','expiration_date'])

# Convert date to number of days
df_members['membership_days'] = (df_members['expiration_date'] - df_members['registration_init_time']).dt.days.astype(int)

In [10]:
# Vectorized .dt accessors: equivalent to mapping over each timestamp, but much faster
df_members['registration_init_time_year'] = df_members['registration_init_time'].dt.year
df_members['registration_init_time_month'] = df_members['registration_init_time'].dt.month
df_members['registration_init_time_day'] = df_members['registration_init_time'].dt.day
df_members['expiration_date_year'] = df_members['expiration_date'].dt.year
df_members['expiration_date_month'] = df_members['expiration_date'].dt.month
df_members['expiration_date_day'] = df_members['expiration_date'].dt.day

df_members = df_members.drop(['registration_init_time','expiration_date'], axis=1)
display(df_members)

make_pickle('pickle/df_members', df_members)


msno city bd gender registered_via membership_days registration_init_time_year registration_init_time_month registration_init_time_day expiration_date_year expiration_date_month expiration_date_day
0 XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw= 1 0 NaN 7 2223 2011 8 20 2017 9 20
1 UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM= 1 0 NaN 7 725 2015 6 28 2017 6 22
2 D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A= 1 0 NaN 4 457 2016 4 11 2017 7 12
3 mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI= 1 0 NaN 9 1 2015 9 6 2015 9 7
4 q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ= 1 0 NaN 4 138 2017 1 26 2017 6 13
5 zgPOEyUn5a/Fvuzb3m69ajzxjkbblVtObglW89FzLdo= 13 43 female 9 1921 2012 7 3 2017 10 6
6 Sw9AT8QoR4wWiNUqHZUH6g5ahzGUx4lo1g+Y3xE2f2M= 1 0 NaN 4 3 2016 3 26 2016 3 29
7 pg6bT2XZkSP1TDBy4qn3HBPY/HffKQ/bg8WIISQYBSY= 1 0 NaN 7 609 2016 1 30 2017 9 30
8 kfk1AdTNH2dNqF5LzIs4e0vwGPejw2jrnFjJlcYnEgk= 1 0 NaN 7 2454 2011 1 11 2017 9 30
9 tscijwx4dbEp0NXGl+iFtHJ8zrj+TkcMrduOQk9t+gE= 1 0 NaN 7 482 2016 2 17 2017 6 13
10 GoFYKhcq8Q5Gjz1M5L0azHZOhcY+Za/T7fRqIGgBuYA= 12 28 male 9 4095 2006 5 31 2017 8 16
11 Hiw71XZQSoEe1I7pBhPWi3kIrP7Ss7pwExZgtqFmtYw= 1 0 NaN 7 731 2015 9 9 2017 9 9
12 V6QgFYHgsfZ2G6hFz1z2u3RrLt5BoLsZKBr+h0ZnFsA= 1 0 NaN 7 2083 2012 1 5 2017 9 18
13 RoSfblbwJN/izEnFIVw8TgOpm8R/NEpUC84Oz/b32HQ= 15 33 female 3 765 2015 9 15 2017 10 19
14 N+dxmo1qvkKAlzYtcxGzjrqcpVyX9J7AMlXFuYASKuY= 1 0 NaN 3 1221 2014 4 12 2017 8 15
15 vUzJAyFEudsnkWwfcLIKT5mJAV+uVRTW2uajCrghFwI= 13 20 male 3 1113 2014 4 10 2017 4 27
16 Ev+ouoRTpAjrtxNVeThP6t9xnDG1puQaG28DvJ2db+A= 13 30 male 9 3727 2007 3 30 2017 6 12
17 /NFMlPP8FvyxVLHcSA4af/BfbRvGXwD+IYpZqVylb7I= 1 0 NaN 7 1737 2012 12 28 2017 9 30
18 XeAoaR5gMiy74l4HUhHt10o36U7VevuMZOymd0EWqD0= 1 0 NaN 4 172 2016 12 24 2017 6 14
19 v8+kU3kqYo9P8kIcuLrliJ/57jCTZG75cXvyYdagbhU= 1 0 NaN 9 367 2016 9 23 2017 9 25
20 +QyD+0xRgzi2552jFOFk5FfDJsqfe5D6xmHpHf6J5Bk= 1 0 NaN 4 3 2017 1 24 2017 1 27
21 2g1ydVe2XxKPTMf2P9BhFfZOUC0Kteti8KyMr6iVaj0= 5 29 male 4 257 2016 11 24 2017 8 8
22 IptzXjBNx4yZaetmolc/moHn2SYT2oZM7SX9mGyMKDo= 1 0 NaN 9 3 2016 12 9 2016 12 12
23 v1/E8gN8p5TFweTX3HAc4mm31LxqP6c1ZGGJiKyCuH4= 1 0 NaN 4 200 2017 1 20 2017 8 8
24 ed0hCzvhKVxHMIodLtgkzb1G4ryiet50rO/GE3W3pEA= 1 0 NaN 4 3 2016 12 9 2016 12 12
25 tXcypQ/1TwiThcv2HfBhStBxDlza66KHCcbLAXFxey4= 1 0 NaN 4 289 2016 8 26 2017 6 11
26 KxIHfr086p8f8yKp1ZUJF4EsA7qfpPguo0bNG2PkQBA= 1 0 NaN 4 3 2017 1 4 2017 1 7
27 R01NRe7BNIWmterVXYzXQ82GZRu8rVHjRYyEpsIFxwQ= 1 0 NaN 7 1335 2014 1 28 2017 9 24
28 ox62m0QICnrFa8/ZM5jJ5WBidWIieREUjUdCiWExLSU= 6 26 female 7 1338 2014 1 18 2017 9 17
29 95tnnY+Zq8f4jpk5SaDWcmnngD9dFe/OwxAqhss6T8k= 4 25 female 3 7 2015 6 24 2015 7 1
... ... ... ... ... ... ... ... ... ... ... ... ...
34373 XIdeDgI5AMOM/Afw3+sHbZidBnntHR8ylmmjNv348gM= 13 17 female 9 596 2016 1 20 2017 9 7
34374 7LDn/wAl6BG8aTXmNV6imxo/epsPwm5J6Iq62fw4NQA= 5 25 female 3 2217 2012 7 25 2018 8 20
34375 PJg7fHQwFtVB8A1JPBM5YkirpqllPnn71c8u4rVfkyY= 1 0 NaN 4 3 2017 1 7 2017 1 10
34376 mQLnd3+nlBM8xWtb3oG1OxtOka9qkdFeSyf3ij1gnY4= 5 18 male 9 851 2015 12 16 2018 4 15
34377 gPFwVN+u056/4qjUnTvXM4Umu4dZkXGSrV6V/RHxTCE= 5 49 female 9 829 2015 10 23 2018 1 29
34378 m4SCHKyzxbyWwmjLokwGYHMuNWMKVu+ZsRRnpATEXV4= 6 27 male 9 3819 2007 5 6 2017 10 19
34379 Nvn1wyXW5ljzQ5ytMuxrMX5DJb6qvMVkLX92yq2L1Gg= 1 0 NaN 7 2360 2011 4 20 2017 10 5
34380 4rm/qx08CT1zc6Cf51s7CGislHdUbozhWKTQwkeXQzA= 1 0 NaN 7 477 2016 6 15 2017 10 5
34381 RzhTfn+9ZL/agg3KU+w3OV771vPZBdME/zVw8L+JqE4= 22 17 female 9 2286 2011 6 30 2017 10 2
34382 h+y94tam2umL5tkLQ2Bb7e7KZkrvAn0VYTi0T8VfJZA= 6 21 female 3 807 2015 5 1 2017 7 16
34383 cZnVSva/eyG76AnN5FVhcd3E6rDlxZYodHmoNmL5jRI= 1 0 NaN 7 337 2016 10 28 2017 9 30
34384 mZv23MYsFZN0pupsxqlHCUFqNgZPw/4osbyJkei/SRA= 15 25 female 9 1437 2013 10 16 2017 9 22
34385 gvPKk3epmZYeBHTsfhss+QnaeR1yLQbsZMvw1WcWuFg= 1 0 NaN 7 728 2015 8 29 2017 8 26
34386 w9cnrfPt9BJZXB2qCjuneN+9gJZ1rXxrCR6lpazzZn0= 1 0 NaN 7 1450 2013 10 13 2017 10 2
34387 Hm+tHiwBnztX5L6FORYQI6Qr8bCUc7P276HQWganPwg= 9 26 male 9 1665 2013 3 2 2017 9 22
34388 V64FfynlRd8b7s76Lga3I2M9pJXc16jQi3TyedJNNTU= 21 17 female 3 1296 2013 12 25 2017 7 13
34389 38pbfUNqxyF3kAQ5Ops549OfoCa3vVAbryBlyQ5fElA= 4 20 male 3 1360 2014 9 22 2018 6 13
34390 buUH+5Pt7Oe5MRJmHzAoaGqApNIlepdLtMPbaYRLsF8= 12 24 female 7 1613 2013 5 4 2017 10 3
34391 Lhhn8qbJj1UojqarOLrSI0RLb+llCBYi3qsdYWK31WA= 1 0 NaN 4 3 2016 12 26 2016 12 29
34392 MztOLNMRfOazE4FOBfQPUAoCSkaXJzI/42gRSwm0+hE= 22 43 male 9 2938 2009 8 30 2017 9 15
34393 b2sH2FEdfr+2iyi3FlbQ3aMQwSmx/Mu0QLQHy7PinLA= 4 19 male 9 813 2015 6 25 2017 9 15
34394 gwRDIcbF3glSxxzs0ReSAhL5qezHN2Z4MMr29IBxeDo= 18 10 male 9 3500 2008 3 1 2017 9 30
34395 Azz7DTejzI4MPY2+mLnzDu8GcN/gdoBkC4ncxx+Y+Ss= 1 16 NaN 9 725 2016 1 6 2017 12 31
34396 UUpdbYb6xeTNAt7dz8EQ7MNOpwH+gk9jMpIx5Hh/WI0= 13 25 female 3 1985 2012 1 4 2017 6 11
34397 9atvTKXo3kVao2jCDpsRDBmwXEMREmDTj3o9BQWEDQk= 15 31 NaN 3 1653 2013 8 6 2018 2 14
34398 Wwd/cudKVuLJ3txRVxlg2Zaeliu+LRUfiBmfrnxhRCY= 1 0 NaN 7 1399 2013 11 11 2017 9 10
34399 g3JGnJX6Hg50lFbrNWfsHwCUmApIkiv2M8sXOaeXoIQ= 4 18 male 3 937 2014 10 24 2017 5 18
34400 IMaPMJuyN+ip9Vqi+z2XuXbFAP2kbHr+EvvCNkFfj+o= 1 0 NaN 7 1498 2013 8 2 2017 9 8
34401 WAnCAJjUty9Stv8yKtV7ZC7PN+ilOy5FX3aIJgGPANM= 1 0 NaN 7 701 2015 10 20 2017 9 20
34402 xH8KpzKGeNNq6dOvy51c/8VzqOiGG+m6vabhsPSDHX4= 1 0 NaN 4 3 2016 8 15 2016 8 18

34403 rows × 12 columns

Pickling pickle/df_members.
Out[10]:
'pickle/df_members'

In [12]:
with open('pickle/df_members', 'rb') as f:
    df_members = pickle.load(f)
print('load df_members finish')

# Merge the members dataframe into the test dataframe
df_test = pd.merge(left = df_test,right = df_members,how='left',on='msno')
df_test.msno = df_test.msno.astype('category')
print('merge df_test + df_members finish')

# Merge the member dataframe into the train dataframe
df_train = pd.merge(left = df_train,right = df_members,how='left',on='msno')
df_train.msno = df_train.msno.astype('category')
print('merge df_train + df_members finish')

# Release memory
del df_members


load df_members finish
merge df_test + df_members finish
merge df_train + df_members finish

In [13]:
df_train.head()


Out[13]:
msno song_id source_system_tab source_screen_name source_type target city bd gender registered_via membership_days registration_init_time_year registration_init_time_month registration_init_time_day expiration_date_year expiration_date_month expiration_date_day
0 FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= explore Explore online-playlist 1 1 0 NaN 7 2103 2012 1 2 2017 10 5
1 Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8= bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM= my library Local playlist more local-playlist 1 13 24 female 9 2301 2011 5 25 2017 9 11
2 Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8= JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY= my library Local playlist more local-playlist 1 13 24 female 9 2301 2011 5 25 2017 9 11
3 Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8= 2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs= my library Local playlist more local-playlist 1 13 24 female 9 2301 2011 5 25 2017 9 11
4 FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg= 3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc= explore Explore online-playlist 1 1 0 NaN 7 2103 2012 1 2 2017 10 5

In [92]:
draw_category_col(df_train, 'city')


1     2621293
13    1144863
5      830128
4      545820
15     484078
22     460330
6      275781
14     236639
12     147133
9       95237
8       85734
18      81780
11      71542
10      69158
21      67097
3       63555
17      44688
7       31621
16       7887
19       6961
20       6093
Name: city, dtype: int64

In [94]:
draw_category_col(df_train, 'gender')


male      2297623
female    2118316
Name: gender, dtype: int64

In [95]:
draw_category_col(df_train, 'registered_via')


9     2814535
7     2551743
3     1250790
4      745251
13      15099
16          0
Name: registered_via, dtype: int64

In [14]:
draw_dist_plot(df_train, 'membership_days')
display(df_train[df_train['membership_days'] < 0])
# Drop rows with invalid membership (the negative rows shown above; this also drops zero-day rows)
df_train = df_train[df_train['membership_days'] > 0]


msno song_id source_system_tab source_screen_name source_type target city bd gender registered_via membership_days registration_init_time_year registration_init_time_month registration_init_time_day expiration_date_year expiration_date_month expiration_date_day
2487533 1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA= WznMG5LmzE4k7q1OQLPAV2s96k8ZIrVvG/rihErlYWk= NaN NaN NaN 0 1 0 NaN 9 -16191 2014 5 1 1970 1 1
2487534 1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA= DdKsqy3JAygpcHwihcjBKzzp8SDYhdtXbEZmhKDrOSo= NaN NaN NaN 0 1 0 NaN 9 -16191 2014 5 1 1970 1 1
2487535 1Y+bNz3FxSoJnKOcR/Q8VJGXZbWIstrW0HfBe5LZzKA= xEjg9Bs0QcYD3BBQrzPUk89Eb2jBCWu/aki+pOy6H0w= NaN NaN NaN 0 1 0 NaN 9 -16191 2014 5 1 1970 1 1

In [15]:
# Load the songs dataframe
df_songs = pd.read_csv(INPUT_DATA_PATH + 'songs.csv',dtype={'genre_ids': 'category',
                                                  'language' : 'category',
                                                  'artist_name' : 'category',
                                                  'composer' : 'category',
                                                  'lyricist' : 'category',
                                                  'song_id' : 'category'})

# Merge the test dataframe with the songs dataframe
df_test = pd.merge(left = df_test,right = df_songs,how = 'left',on='song_id')
# Impute missing song_length with 200000 ms (~3.3 minutes, a typical track length)
df_test.song_length.fillna(200000,inplace=True)
df_test.song_length = df_test.song_length.astype(np.uint32)
df_test.song_id = df_test.song_id.astype('category')

# Merge the train dataframe with the songs dataframe
df_train = pd.merge(left = df_train,right = df_songs,how = 'left',on='song_id')
df_train.song_length.fillna(200000,inplace=True)
df_train.song_length = df_train.song_length.astype(np.uint32)
df_train.song_id = df_train.song_id.astype('category')

# Release memory
del df_songs

In [12]:
draw_category_col(df_train, 'language')


null count : 150
3.0     4041551
52.0    1863598
31.0     655586
-1.0     308653
17.0     244949
10.0     171702
24.0      78569
59.0       4191
45.0       2397
38.0        210
Name: language, dtype: int64

In [6]:
def draw_count_mean(df_train, col, thresholds=[10]):
    count_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .count() \
        .reset_index()

    mean_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .mean() \
        .reset_index()

    category_stat_df = count_per_category.merge(mean_per_category, on=col)
    category_stat_df.columns = [col, 'count', 'avg']
    print('total')
    category_stat_df.hist()
    plt.show()
    
    for threshold in thresholds:
        print('count >= {}'.format(threshold))
        category_stat_df[category_stat_df['count'] >= threshold].hist()
        plt.show()

In [7]:
draw_count_mean(df_train, 'artist_name')


total
count >= 10

In [8]:
draw_count_mean(df_train, 'msno')


total
count >= 10

In [9]:
draw_count_mean(df_train, 'song_id')


total
count >= 10

In [10]:
draw_count_mean(df_train, 'genre_ids')


total
count >= 10

In [11]:
draw_count_mean(df_train, 'composer')


total
count >= 10

In [12]:
draw_count_mean(df_train, 'lyricist')


total
count >= 10

In [6]:
def get_count_mean_std_per_category(df_train, col):
    count_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .count() \
        .reset_index()

    mean_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .mean() \
        .reset_index()

    std_per_category = df_train[[col, 'target']] \
        .groupby(col) \
        .std() \
        .reset_index()

    category_stat_df = count_per_category.merge(mean_per_category, on=col)
    category_stat_df = category_stat_df.merge(std_per_category, on=col)
    category_stat_df.columns = [col, col+'_count', col+'_avg', col+'_std']
    gc.collect()
    # keep only categories seen at least 100 times, so avg/std are reliable
    return category_stat_df[category_stat_df[col+'_count'] >= 100]

def add_category_column_count_mean_std(df1, df2, col):
    category_df = get_count_mean_std_per_category(df1, col)

    if col + '_count' in df1:
        df1.drop(col + '_count', axis=1, inplace=True)
    if col + '_avg' in df1:
        df1.drop(col + '_avg', axis=1, inplace=True)
    if col + '_std' in df1:
        df1.drop(col + '_std', axis=1, inplace=True)
    df1 = df1.merge(category_df, on=col, how='left')

    if col + '_count' in df2:
        df2.drop(col + '_count', axis=1, inplace=True)
    if col + '_avg' in df2:
        df2.drop(col + '_avg', axis=1, inplace=True)
    if col + '_std' in df2:
        df2.drop(col + '_std', axis=1, inplace=True)
    df2 = df2.merge(category_df, on=col, how='left')

    gc.collect()
    return df1, df2
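
The three separate groupby passes above can be collapsed into a single groupby().agg call; a minimal equivalent sketch (the _v2 name is mine, not from the notebook):

In [ ]:
# Sketch: single-pass count/mean/std per category, same >=100 filter as above.
def get_count_mean_std_per_category_v2(df, col, min_count=100):
    stats = df.groupby(col)['target'].agg(['count', 'mean', 'std']).reset_index()
    stats.columns = [col, col + '_count', col + '_avg', col + '_std']
    return stats[stats[col + '_count'] >= min_count]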

In [7]:
df_train, df_test = add_category_column_count_mean_std(df_train, 
                                                       df_test, 
                                                       'artist_name')
df_train, df_test = add_category_column_count_mean_std(df_train, 
                                                       df_test, 
                                                       'msno')
# df_train, df_test = add_category_column_count_mean_std(df_train, 
#                                                        df_test, 
#                                                        'song_id')
# df_train, df_test = add_category_column_count_mean_std(df_train, 
#                                                        df_test, 
#                                                        'genre_ids')
# df_train, df_test = add_category_column_count_mean_std(df_train, 
#                                                        df_test, 
#                                                        'composer')
# df_train, df_test = add_category_column_count_mean_std(df_train, 
#                                                        df_test, 
#                                                        'lyricist')
gc.collect()


Out[7]:
124

In [4]:
print(df_train.columns)


Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'city', 'bd', 'gender', 'registered_via',
       'membership_days', 'registration_init_time_year',
       'registration_init_time_month', 'registration_init_time_day',
       'expiration_date_year', 'expiration_date_month', 'expiration_date_day',
       'song_length', 'genre_ids', 'artist_name', 'composer', 'lyricist',
       'language', 'artist_name_count', 'artist_name_avg', 'artist_name_std',
       'msno_count', 'msno_avg', 'msno_std'],
      dtype='object')

In [5]:
# Load the song extra info
df_song_extra_info = pd.read_csv(INPUT_DATA_PATH + 'song_extra_info.csv')
print(df_song_extra_info.head())


                                        song_id             name          isrc
0  LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=               我們  TWUM71200043
1  ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=  Let Me Love You  QMZSY1600015
2  u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=              原諒我  TWA530887303
3  92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=          Classic  USSM11301446
4  0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=             愛投羅網  TWA471306001

In [6]:
df_song_extra_info['isrc_cc'] = df_song_extra_info.isrc.str.slice(0,2)  # ISRC country code
df_song_extra_info['isrc_xxx'] = df_song_extra_info.isrc.str.slice(2,5) # ISRC registrant (issuer) code
df_song_extra_info['isrc_yy'] = df_song_extra_info.isrc.str.slice(5,7).astype(float)  # ISRC two-digit year of reference
df_song_extra_info['isrc_id'] = df_song_extra_info.isrc.str.slice(7)  # ISRC designation code
gc.collect()
print(df_song_extra_info.head())


                                        song_id             name  \
0  LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=               我們   
1  ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=  Let Me Love You   
2  u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=              原諒我   
3  92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=          Classic   
4  0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=             愛投羅網   

           isrc isrc_cc isrc_xxx  isrc_yy isrc_id  
0  TWUM71200043      TW      UM7     12.0   00043  
1  QMZSY1600015      QM      ZSY     16.0   00015  
2  TWA530887303      TW      A53      8.0   87303  
3  USSM11301446      US      SM1     13.0   01446  
4  TWA471306001      TW      A47     13.0   06001  

In [7]:
# Two-digit years above 17 are 19xx, the rest 20xx (the dataset is from 2017, so nothing is newer).
# The order matters: the first update pushes 1918+ values out of range of the second condition.
df_song_extra_info.loc[df_song_extra_info['isrc_yy'] > 17, 'isrc_yy'] += 1900  # 1900s songs
df_song_extra_info.loc[df_song_extra_info['isrc_yy'] < 18, 'isrc_yy'] += 2000  # 2000s songs
df_song_extra_info.rename(columns={'isrc_yy': 'isrc_yyyy'}, inplace=True)
print(df_song_extra_info.head())


                                        song_id             name  \
0  LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=               我們   
1  ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=  Let Me Love You   
2  u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=              原諒我   
3  92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=          Classic   
4  0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=             愛投羅網   

           isrc isrc_cc isrc_xxx  isrc_yyyy isrc_id  
0  TWUM71200043      TW      UM7     2012.0   00043  
1  QMZSY1600015      QM      ZSY     2016.0   00015  
2  TWA530887303      TW      A53     2008.0   87303  
3  USSM11301446      US      SM1     2013.0   01446  
4  TWA471306001      TW      A47     2013.0   06001  
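
A quick sanity check of the century pivot on hand-picked two-digit years (illustrative values only):

In [ ]:
# '18'..'99' map to 1918..1999; '00'..'17' map to 2000..2017.
check = pd.DataFrame({'isrc_yy': [0., 16., 17., 18., 99.]})
check.loc[check['isrc_yy'] > 17, 'isrc_yy'] += 1900
check.loc[check['isrc_yy'] < 18, 'isrc_yy'] += 2000
print(check['isrc_yy'].tolist())  # [2000.0, 2016.0, 2017.0, 1918.0, 1999.0]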

In [8]:
df_train = df_train.merge(df_song_extra_info, on='song_id', how='left')
df_test = df_test.merge(df_song_extra_info, on='song_id', how='left')

In [11]:
df_train.msno = df_train.msno.astype('category')
df_train.song_id = df_train.song_id.astype('category')
df_train.isrc_cc = df_train.isrc_cc.astype('category')
df_train.isrc_xxx = df_train.isrc_xxx.astype('category')

df_test.msno = df_test.msno.astype('category')
df_test.song_id = df_test.song_id.astype('category')
df_test.isrc_cc = df_test.isrc_cc.astype('category')
df_test.isrc_xxx = df_test.isrc_xxx.astype('category')

In [6]:
sns.heatmap(df_train[['target', 'isrc_cc', 'isrc_xxx', 'isrc_yyyy']].corr(), 
            cmap='coolwarm', center=0)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa0c1e836a0>
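
DataFrame.corr() silently ignores the categorical columns, so the heatmap above effectively only shows target against isrc_yyyy. A sketch that includes them via their integer codes (ordinal codes are a crude stand-in for a proper encoding):

In [ ]:
# Convert categories to integer codes so .corr() can see them.
tmp = df_train[['target', 'isrc_cc', 'isrc_xxx', 'isrc_yyyy']].copy()
tmp['isrc_cc'] = tmp['isrc_cc'].cat.codes
tmp['isrc_xxx'] = tmp['isrc_xxx'].cat.codes
sns.heatmap(tmp.corr(), cmap='coolwarm', center=0)
plt.show()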

In [4]:
# load collaborative filtering result 
with open('pickle/cf_result_df_train_pred', 'rb') as f:
    cf_df_train = pickle.load(f)
with open('pickle/cf_result_df_test_pred', 'rb') as f:
    cf_df_test = pickle.load(f)
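
The CF predictions are produced in a separate notebook and only loaded here; the frames are expected to hold (msno, song_id, prediction) rows. For context, a hypothetical sketch of how such scores could be generated with a plain truncated-SVD matrix factorization (an assumption, not the actual pipeline behind these pickles):

In [ ]:
# Hypothetical CF sketch: factorize the user-song matrix, score pairs by dot product.
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

u = df_train['msno'].cat.codes.values
s = df_train['song_id'].cat.codes.values
mat = csr_matrix((df_train['target'].values.astype(float), (u, s)))

svd = TruncatedSVD(n_components=16, random_state=1)
user_f = svd.fit_transform(mat)   # (n_users, 16) user factors
song_f = svd.components_          # (16, n_songs) song factors

cf_score = (user_f[u] * song_f[:, s].T).sum(axis=1)  # one score per training row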

In [5]:
df_train_org = df_train
df_test_org = df_test

In [6]:
cf_df_train['cf_result'] = cf_df_train['prediction']
cf_df_test['cf_result'] = cf_df_test['prediction']

In [7]:
# attach cf result to df_train, df_test
df_train = df_train_org.merge(cf_df_train[['msno', 'song_id', 'cf_result']], on=['msno', 'song_id'], how='left')
df_test = df_test_org.merge(cf_df_test[['msno', 'song_id', 'cf_result']], on=['msno', 'song_id'], how='left')

In [8]:
df_train.msno = df_train.msno.astype('category')
df_train.song_id = df_train.song_id.astype('category')
df_test.msno = df_test.msno.astype('category')
df_test.song_id = df_test.song_id.astype('category')

In [9]:
print(len(df_train), len(df_test))


7371556 2556790

In [4]:
sns.heatmap(df_train.corr(), cmap='coolwarm', center=0)


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa104334ba8>

In [82]:
def draw_heatmap(df, col):
    sns.heatmap(category_to_numeric(df[[col, 'target']], col).corr(), 
                cmap='coolwarm', center=0)
    plt.show()

draw_heatmap(df_train, 'source_system_tab')
draw_heatmap(df_train, 'source_screen_name')
draw_heatmap(df_train, 'source_type')
draw_heatmap(df_train, 'language')



In [12]:
make_pickle('pickle/df_train', df_train, force=True)
make_pickle('pickle/df_test', df_test, force=True)


Pickling pickle/df_train.
Pickling pickle/df_test.
Out[12]:
'pickle/df_test'

In [4]:
df_train.drop(['name', 'isrc', 'isrc_id'], axis=1, inplace=True)
df_test.drop(['name', 'isrc', 'isrc_id'], axis=1, inplace=True)

In [3]:
with open('pickle/df_train', 'rb') as f:
    df_train = pickle.load(f)
with open('pickle/df_test', 'rb') as f:
    df_test = pickle.load(f)

In [11]:
gc.collect()

# Create a Cross Validation with n splits
n_splits = 10
kf = KFold(n_splits=n_splits)

# This array will store the predictions made.
predictions = np.zeros(shape=[len(df_test)])

# For each KFold split (KFold yields positional indices, so index with .iloc)
for train_indices, validate_indices in kf.split(df_train):
    train_data = lgb.Dataset(df_train.drop(['target'],axis=1).iloc[train_indices],
                             label=df_train['target'].iloc[train_indices])
    val_data = lgb.Dataset(df_train.drop(['target'],axis=1).iloc[validate_indices],
                           label=df_train['target'].iloc[validate_indices])
    
    # Create the parameters for LGBM
    params = {
        'verbose': 1,
        'objective': 'binary',
        'metric' : 'auc',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 2048,
        'max_bin': 512,
        'max_depth': 30,
        'min_data_in_leaf':256,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'num_rounds': 1000,
        'num_threads' : 8,
        } 
    
    # Train the model
    bst = lgb.train(params, train_data, valid_sets=[val_data],
                   early_stopping_rounds=20, verbose_eval=10)
    
    # Make the predictions storing them on the predictions array
    predictions += bst.predict(df_test.drop(['id'],axis=1))
    
    # draw feature importance
#     lgb.plot_importance(bst)
#     plt.show()
    
    # Release the model from memory for the next iteration
    del bst
    del train_data
    del val_data
    gc.collect()

print('Training process finished. Generating Output...')

# Average the per-fold predictions by dividing by the number of folds.
predictions = predictions/n_splits

# Read the sample_submission CSV
submission = pd.read_csv(INPUT_DATA_PATH + 'sample_submission.csv')
# Set the target to our predictions
submission.target=predictions
# Save the submission file
submission.to_csv('lgbm-66870/submission.csv',index=False)

print('Output created.')


/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/engine.py:98: UserWarning: Found `num_rounds` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-11-55705c9f8bc5> in <module>()
     37     # Train the model
     38     bst = lgb.train(params, train_data, valid_sets=[val_data],
---> 39                    early_stopping_rounds=20, verbose_eval=10)
     40 
     41     # Make the predictions storing them on the predictions array

~/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
    176     """construct booster"""
    177     try:
--> 178         booster = Booster(params=params, train_set=train_set)
    179         if is_valid_contain_train:
    180             booster.set_train_data_name(train_data_name)

~/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/basic.py in __init__(self, params, train_set, model_file, silent)
   1266             self.handle = ctypes.c_void_p()
   1267             _safe_call(_LIB.LGBM_BoosterCreate(
-> 1268                 train_set.construct().handle,
   1269                 c_str(params_str),
   1270                 ctypes.byref(self.handle)))

~/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/basic.py in construct(self)
    820                                 weight=self.weight, group=self.group, predictor=self._predictor,
    821                                 silent=self.silent, feature_name=self.feature_name,
--> 822                                 categorical_feature=self.categorical_feature, params=self.params)
    823             if self.free_raw_data:
    824                 self.data = None

~/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/basic.py in _lazy_init(self, data, label, max_bin, reference, weight, group, predictor, silent, feature_name, categorical_feature, params)
    624             self.pandas_categorical = reference.pandas_categorical
    625             categorical_feature = reference.categorical_feature
--> 626         data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
    627         label = _label_from_pandas(label)
    628         self.data_has_header = False

~/develop/anaconda3/envs/kaggle/lib/python3.6/site-packages/lightgbm/basic.py in _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical)
    256             msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
    257             raise ValueError(msg + ', '.join(bad_fields))
--> 258         data = data.values.astype('float')
    259     else:
    260         if feature_name == 'auto':

MemoryError: 
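
The run above died with a MemoryError inside _data_from_pandas, where LightGBM copies the whole frame to a float array. A few mitigations that could be tried before re-running (a sketch; these settings were not tested here):

In [ ]:
# Sketch of memory mitigations, none of which were run in this notebook.
# 1. Downcast float64 columns before building the lgb.Dataset
#    (the traceback shows data.values.astype('float'); float32 input halves that copy).
for c in df_train.select_dtypes(include=['float64']).columns:
    df_train[c] = df_train[c].astype(np.float32)

# 2. Shrink the histogram resolution and tree size in the params.
params_small = dict(params, max_bin=255, num_leaves=256)

# 3. Drop unused frames before training starts.
del df_train_org, df_test_org
gc.collect()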
