Rossmann Store Sales


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

import pylab
import csv
import datetime
import math
import re
import time
import random
import os

from pandas.tseries.offsets import *
from operator import *

from sklearn.cross_validation import train_test_split

%matplotlib inline

# plt.style.use('ggplot') # Good looking plots

np.set_printoptions(precision=4, threshold=10000, linewidth=100, edgeitems=999, suppress=True)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 6)

start_time = time.time()


C:\Users\Administrator\Anaconda2\envs\gl-env\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [3]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

Setting seed


In [4]:
seed = 42

Reading sales data


In [5]:
nrows = None

df_train = pd.read_csv('train.csv', 
                       nrows=nrows,
                       parse_dates=['Date'],
                       date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')))

nrows = nrows

df_submit = pd.read_csv('test.csv', 
                        nrows=nrows,
                        parse_dates=['Date'],
                        date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')))


C:\Users\Administrator\Anaconda2\envs\gl-env\lib\site-packages\IPython\core\interactiveshell.py:2723: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [6]:
### Setting a variable to easily distinguish train (1) from submit (0) set
df_train['Set'] = 1
df_submit['Set'] = 0

In [7]:
### Combine train and test set
frames = [df_train, df_submit]
df = pd.concat(frames)

In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1058297 entries, 0 to 41087
Data columns (total 11 columns):
Customers        1017209 non-null float64
Date             1058297 non-null datetime64[ns]
DayOfWeek        1058297 non-null int64
Id               41088 non-null float64
Open             1058286 non-null float64
Promo            1058297 non-null int64
Sales            1017209 non-null float64
SchoolHoliday    1058297 non-null int64
Set              1058297 non-null int64
StateHoliday     1058297 non-null object
Store            1058297 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 96.9+ MB

In [10]:
df_train.head()


Out[10]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday Set
0 1 5 2015-07-31 5263 555 1 1 0 1 1
1 2 5 2015-07-31 6064 625 1 1 0 1 1
2 3 5 2015-07-31 8314 821 1 1 0 1 1
3 4 5 2015-07-31 13995 1498 1 1 0 1 1
4 5 5 2015-07-31 4822 559 1 1 0 1 1

In [11]:
df_submit.head()


Out[11]:
Id Store DayOfWeek Date Open Promo StateHoliday SchoolHoliday Set
0 1 1 4 2015-09-17 1.0 1 0 0 0
1 2 3 4 2015-09-17 1.0 1 0 0 0
2 3 7 4 2015-09-17 1.0 1 0 0 0
3 4 8 4 2015-09-17 1.0 1 0 0 0
4 5 9 4 2015-09-17 1.0 1 0 0 0

In [23]:
features_x = ['Store', 'Date', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'StateHoliday']
features_y = ['SalesLog']

In [24]:
### Remove rows where store is open, but no sales.
df = df.loc[~((df['Open'] == 1) & (df['Sales'] == 0))]

In [25]:
df.loc[df['Set'] == 1, 'SalesLog'] = np.log1p(df.loc[df['Set'] == 1]['Sales']) # = np.log(df['Sales'] + 1)

In [26]:
df['StateHoliday'] = df['StateHoliday'].astype('category').cat.codes

In [27]:
var_name = 'Date'

df[var_name + 'Day'] = pd.Index(df[var_name]).day
df[var_name + 'Week'] = pd.Index(df[var_name]).week
df[var_name + 'Month'] = pd.Index(df[var_name]).month
df[var_name + 'Year'] = pd.Index(df[var_name]).year
df[var_name + 'DayOfYear'] = pd.Index(df[var_name]).dayofyear

df[var_name + 'Day'] = df[var_name + 'Day'].fillna(0)
df[var_name + 'Week'] = df[var_name + 'Week'].fillna(0)
df[var_name + 'Month'] = df[var_name + 'Month'].fillna(0)
df[var_name + 'Year'] = df[var_name + 'Year'].fillna(0)
df[var_name + 'DayOfYear'] = df[var_name + 'DayOfYear'].fillna(0)

features_x.remove(var_name)
features_x.append(var_name + 'Day')
features_x.append(var_name + 'Week')
features_x.append(var_name + 'Month')
features_x.append(var_name + 'Year')
features_x.append(var_name + 'DayOfYear')

In [31]:
features_x


Out[31]:
['Store',
 'DayOfWeek',
 'Open',
 'Promo',
 'SchoolHoliday',
 'StateHoliday',
 'DateDay',
 'DateWeek',
 'DateMonth',
 'DateYear',
 'DateDayOfYear']

In [32]:
df.head(100)


Out[32]:
Customers Date DayOfWeek Id Open Promo Sales SchoolHoliday Set StateHoliday Store SalesLog DateDay DateWeek DateMonth DateYear DateDayOfYear DateInt
0 555.0 2015-07-31 5 NaN 1.0 1 5263.0 1 1 1 1 8.568646 31 31 7 2015 212 1438300800000000000
1 625.0 2015-07-31 5 NaN 1.0 1 6064.0 1 1 1 2 8.710290 31 31 7 2015 212 1438300800000000000
2 821.0 2015-07-31 5 NaN 1.0 1 8314.0 1 1 1 3 9.025816 31 31 7 2015 212 1438300800000000000
3 1498.0 2015-07-31 5 NaN 1.0 1 13995.0 1 1 1 4 9.546527 31 31 7 2015 212 1438300800000000000
4 559.0 2015-07-31 5 NaN 1.0 1 4822.0 1 1 1 5 8.481151 31 31 7 2015 212 1438300800000000000
5 589.0 2015-07-31 5 NaN 1.0 1 5651.0 1 1 1 6 8.639765 31 31 7 2015 212 1438300800000000000
6 1414.0 2015-07-31 5 NaN 1.0 1 15344.0 1 1 1 7 9.638545 31 31 7 2015 212 1438300800000000000
7 833.0 2015-07-31 5 NaN 1.0 1 8492.0 1 1 1 8 9.046998 31 31 7 2015 212 1438300800000000000
8 687.0 2015-07-31 5 NaN 1.0 1 8565.0 1 1 1 9 9.055556 31 31 7 2015 212 1438300800000000000
9 681.0 2015-07-31 5 NaN 1.0 1 7185.0 1 1 1 10 8.879890 31 31 7 2015 212 1438300800000000000
10 1236.0 2015-07-31 5 NaN 1.0 1 10457.0 1 1 1 11 9.255123 31 31 7 2015 212 1438300800000000000
11 962.0 2015-07-31 5 NaN 1.0 1 8959.0 1 1 1 12 9.100526 31 31 7 2015 212 1438300800000000000
12 568.0 2015-07-31 5 NaN 1.0 1 8821.0 0 1 1 13 9.085004 31 31 7 2015 212 1438300800000000000
13 710.0 2015-07-31 5 NaN 1.0 1 6544.0 1 1 1 14 8.786457 31 31 7 2015 212 1438300800000000000
14 766.0 2015-07-31 5 NaN 1.0 1 9191.0 1 1 1 15 9.126089 31 31 7 2015 212 1438300800000000000
15 979.0 2015-07-31 5 NaN 1.0 1 10231.0 1 1 1 16 9.233275 31 31 7 2015 212 1438300800000000000
16 946.0 2015-07-31 5 NaN 1.0 1 8430.0 1 1 1 17 9.039671 31 31 7 2015 212 1438300800000000000
17 936.0 2015-07-31 5 NaN 1.0 1 10071.0 1 1 1 18 9.217515 31 31 7 2015 212 1438300800000000000
18 718.0 2015-07-31 5 NaN 1.0 1 8234.0 1 1 1 19 9.016149 31 31 7 2015 212 1438300800000000000
19 974.0 2015-07-31 5 NaN 1.0 1 9593.0 0 1 1 20 9.168893 31 31 7 2015 212 1438300800000000000
20 682.0 2015-07-31 5 NaN 1.0 1 9515.0 1 1 1 21 9.160730 31 31 7 2015 212 1438300800000000000
21 633.0 2015-07-31 5 NaN 1.0 1 6566.0 0 1 1 22 8.789812 31 31 7 2015 212 1438300800000000000
22 560.0 2015-07-31 5 NaN 1.0 1 7273.0 1 1 1 23 8.892062 31 31 7 2015 212 1438300800000000000
23 1082.0 2015-07-31 5 NaN 1.0 1 14190.0 1 1 1 24 9.560363 31 31 7 2015 212 1438300800000000000
24 1586.0 2015-07-31 5 NaN 1.0 1 14180.0 1 1 1 25 9.559658 31 31 7 2015 212 1438300800000000000
25 611.0 2015-07-31 5 NaN 1.0 1 7818.0 1 1 1 26 8.964312 31 31 7 2015 212 1438300800000000000
26 1263.0 2015-07-31 5 NaN 1.0 1 13213.0 1 1 1 27 9.489032 31 31 7 2015 212 1438300800000000000
27 663.0 2015-07-31 5 NaN 1.0 1 7301.0 1 1 1 28 8.895904 31 31 7 2015 212 1438300800000000000
28 737.0 2015-07-31 5 NaN 1.0 1 9411.0 1 1 1 29 9.149741 31 31 7 2015 212 1438300800000000000
29 625.0 2015-07-31 5 NaN 1.0 1 4776.0 1 1 1 30 8.471568 31 31 7 2015 212 1438300800000000000
30 645.0 2015-07-31 5 NaN 1.0 1 7248.0 1 1 1 31 8.888619 31 31 7 2015 212 1438300800000000000
31 635.0 2015-07-31 5 NaN 1.0 1 6395.0 0 1 1 32 8.763428 31 31 7 2015 212 1438300800000000000
32 915.0 2015-07-31 5 NaN 1.0 1 10789.0 1 1 1 33 9.286375 31 31 7 2015 212 1438300800000000000
33 1162.0 2015-07-31 5 NaN 1.0 1 11144.0 1 1 1 34 9.318746 31 31 7 2015 212 1438300800000000000
34 777.0 2015-07-31 5 NaN 1.0 1 12412.0 1 1 1 35 9.426500 31 31 7 2015 212 1438300800000000000
35 1251.0 2015-07-31 5 NaN 1.0 1 12422.0 0 1 1 36 9.427305 31 31 7 2015 212 1438300800000000000
36 888.0 2015-07-31 5 NaN 1.0 1 9091.0 1 1 1 37 9.115150 31 31 7 2015 212 1438300800000000000
37 608.0 2015-07-31 5 NaN 1.0 1 7434.0 1 1 1 38 8.913954 31 31 7 2015 212 1438300800000000000
38 929.0 2015-07-31 5 NaN 1.0 1 8336.0 1 1 1 39 9.028459 31 31 7 2015 212 1438300800000000000
39 678.0 2015-07-31 5 NaN 1.0 1 5464.0 1 1 1 40 8.606119 31 31 7 2015 212 1438300800000000000
40 494.0 2015-07-31 5 NaN 1.0 1 6938.0 0 1 1 41 8.844913 31 31 7 2015 212 1438300800000000000
41 1235.0 2015-07-31 5 NaN 1.0 1 11946.0 1 1 1 42 9.388235 31 31 7 2015 212 1438300800000000000
42 683.0 2015-07-31 5 NaN 1.0 1 8096.0 1 1 1 43 8.999249 31 31 7 2015 212 1438300800000000000
43 665.0 2015-07-31 5 NaN 1.0 1 6670.0 1 1 1 44 8.805525 31 31 7 2015 212 1438300800000000000
44 442.0 2015-07-31 5 NaN 1.0 1 6301.0 1 1 1 45 8.748622 31 31 7 2015 212 1438300800000000000
45 775.0 2015-07-31 5 NaN 1.0 1 7628.0 0 1 1 46 8.939712 31 31 7 2015 212 1438300800000000000
46 1021.0 2015-07-31 5 NaN 1.0 1 9379.0 1 1 1 47 9.146335 31 31 7 2015 212 1438300800000000000
47 390.0 2015-07-31 5 NaN 1.0 1 3874.0 1 1 1 48 8.262301 31 31 7 2015 212 1438300800000000000
48 666.0 2015-07-31 5 NaN 1.0 1 10310.0 1 1 1 49 9.240967 31 31 7 2015 212 1438300800000000000
49 441.0 2015-07-31 5 NaN 1.0 1 5729.0 1 1 1 50 8.653471 31 31 7 2015 212 1438300800000000000
50 610.0 2015-07-31 5 NaN 1.0 1 9198.0 0 1 1 51 9.126850 31 31 7 2015 212 1438300800000000000
51 527.0 2015-07-31 5 NaN 1.0 1 8031.0 0 1 1 52 8.991189 31 31 7 2015 212 1438300800000000000
52 690.0 2015-07-31 5 NaN 1.0 1 7540.0 1 1 1 53 8.928110 31 31 7 2015 212 1438300800000000000
53 889.0 2015-07-31 5 NaN 1.0 1 10732.0 1 1 1 54 9.281078 31 31 7 2015 212 1438300800000000000
54 673.0 2015-07-31 5 NaN 1.0 1 6096.0 1 1 1 55 8.715552 31 31 7 2015 212 1438300800000000000
55 619.0 2015-07-31 5 NaN 1.0 1 10022.0 1 1 1 56 9.212638 31 31 7 2015 212 1438300800000000000
56 1025.0 2015-07-31 5 NaN 1.0 1 11594.0 1 1 1 57 9.358329 31 31 7 2015 212 1438300800000000000
57 603.0 2015-07-31 5 NaN 1.0 1 8581.0 0 1 1 58 9.057422 31 31 7 2015 212 1438300800000000000
58 628.0 2015-07-31 5 NaN 1.0 1 7148.0 1 1 1 59 8.874728 31 31 7 2015 212 1438300800000000000
59 776.0 2015-07-31 5 NaN 1.0 1 9762.0 1 1 1 60 9.186355 31 31 7 2015 212 1438300800000000000
60 665.0 2015-07-31 5 NaN 1.0 1 5572.0 1 1 1 61 8.625689 31 31 7 2015 212 1438300800000000000
61 772.0 2015-07-31 5 NaN 1.0 1 7495.0 1 1 1 62 8.922125 31 31 7 2015 212 1438300800000000000
62 846.0 2015-07-31 5 NaN 1.0 1 11525.0 1 1 1 63 9.352361 31 31 7 2015 212 1438300800000000000
63 935.0 2015-07-31 5 NaN 1.0 1 13251.0 1 1 1 64 9.491904 31 31 7 2015 212 1438300800000000000
64 561.0 2015-07-31 5 NaN 1.0 1 7156.0 1 1 1 65 8.875846 31 31 7 2015 212 1438300800000000000
65 636.0 2015-07-31 5 NaN 1.0 1 8306.0 1 1 1 66 9.024854 31 31 7 2015 212 1438300800000000000
66 912.0 2015-07-31 5 NaN 1.0 1 11035.0 1 1 1 67 9.308918 31 31 7 2015 212 1438300800000000000
67 1341.0 2015-07-31 5 NaN 1.0 1 11187.0 1 1 1 68 9.322597 31 31 7 2015 212 1438300800000000000
68 1371.0 2015-07-31 5 NaN 1.0 1 11204.0 1 1 1 69 9.324115 31 31 7 2015 212 1438300800000000000
69 741.0 2015-07-31 5 NaN 1.0 1 8613.0 1 1 1 70 9.061144 31 31 7 2015 212 1438300800000000000
70 869.0 2015-07-31 5 NaN 1.0 1 11545.0 1 1 1 71 9.354094 31 31 7 2015 212 1438300800000000000
71 520.0 2015-07-31 5 NaN 1.0 1 5251.0 0 1 1 72 8.566364 31 31 7 2015 212 1438300800000000000
72 537.0 2015-07-31 5 NaN 1.0 1 6026.0 1 1 1 73 8.704005 31 31 7 2015 212 1438300800000000000
73 929.0 2015-07-31 5 NaN 1.0 1 7518.0 1 1 1 74 8.925188 31 31 7 2015 212 1438300800000000000
74 807.0 2015-07-31 5 NaN 1.0 1 7444.0 1 1 1 75 8.915298 31 31 7 2015 212 1438300800000000000
75 1228.0 2015-07-31 5 NaN 1.0 1 12887.0 0 1 1 76 9.464052 31 31 7 2015 212 1438300800000000000
76 780.0 2015-07-31 5 NaN 1.0 1 10335.0 1 1 1 77 9.243388 31 31 7 2015 212 1438300800000000000
77 610.0 2015-07-31 5 NaN 1.0 1 5712.0 1 1 1 78 8.650500 31 31 7 2015 212 1438300800000000000
78 643.0 2015-07-31 5 NaN 1.0 1 7150.0 1 1 1 79 8.875007 31 31 7 2015 212 1438300800000000000
79 675.0 2015-07-31 5 NaN 1.0 1 9810.0 1 1 1 80 9.191259 31 31 7 2015 212 1438300800000000000
80 640.0 2015-07-31 5 NaN 1.0 1 9279.0 0 1 1 81 9.135617 31 31 7 2015 212 1438300800000000000
81 1035.0 2015-07-31 5 NaN 1.0 1 11492.0 1 1 1 82 9.349493 31 31 7 2015 212 1438300800000000000
82 327.0 2015-07-31 5 NaN 1.0 1 3866.0 1 1 1 83 8.260234 31 31 7 2015 212 1438300800000000000
83 1439.0 2015-07-31 5 NaN 1.0 1 14949.0 1 1 1 84 9.612467 31 31 7 2015 212 1438300800000000000
84 971.0 2015-07-31 5 NaN 1.0 1 7791.0 1 1 1 85 8.960853 31 31 7 2015 212 1438300800000000000
85 678.0 2015-07-31 5 NaN 1.0 1 5572.0 1 1 1 86 8.625689 31 31 7 2015 212 1438300800000000000
86 715.0 2015-07-31 5 NaN 1.0 1 6379.0 1 1 1 87 8.760923 31 31 7 2015 212 1438300800000000000
87 776.0 2015-07-31 5 NaN 1.0 1 7863.0 1 1 1 88 8.970051 31 31 7 2015 212 1438300800000000000
88 716.0 2015-07-31 5 NaN 1.0 1 7295.0 0 1 1 89 8.895082 31 31 7 2015 212 1438300800000000000
89 1295.0 2015-07-31 5 NaN 1.0 1 11495.0 1 1 1 90 9.349754 31 31 7 2015 212 1438300800000000000
90 554.0 2015-07-31 5 NaN 1.0 1 6164.0 1 1 1 91 8.726643 31 31 7 2015 212 1438300800000000000
91 701.0 2015-07-31 5 NaN 1.0 1 8227.0 1 1 1 92 9.015298 31 31 7 2015 212 1438300800000000000
92 715.0 2015-07-31 5 NaN 1.0 1 6548.0 1 1 1 93 8.787068 31 31 7 2015 212 1438300800000000000
93 694.0 2015-07-31 5 NaN 1.0 1 8372.0 1 1 1 94 9.032768 31 31 7 2015 212 1438300800000000000
94 920.0 2015-07-31 5 NaN 1.0 1 8445.0 1 1 1 95 9.041448 31 31 7 2015 212 1438300800000000000
95 612.0 2015-07-31 5 NaN 1.0 1 6370.0 1 1 1 96 8.759512 31 31 7 2015 212 1438300800000000000
96 802.0 2015-07-31 5 NaN 1.0 1 8380.0 1 1 1 97 9.033723 31 31 7 2015 212 1438300800000000000
97 512.0 2015-07-31 5 NaN 1.0 1 7012.0 1 1 1 98 8.855521 31 31 7 2015 212 1438300800000000000
98 507.0 2015-07-31 5 NaN 1.0 1 6571.0 0 1 1 99 8.790573 31 31 7 2015 212 1438300800000000000
99 764.0 2015-07-31 5 NaN 1.0 1 10280.0 0 1 1 100 9.238053 31 31 7 2015 212 1438300800000000000

In [33]:
df['DateInt'] = df['Date'].astype(np.int64)

In [34]:
df_store = pd.read_csv('store.csv', 
                       nrows=nrows)

In [35]:
df_store.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
Store                        1115 non-null int64
StoreType                    1115 non-null object
Assortment                   1115 non-null object
CompetitionDistance          1112 non-null float64
CompetitionOpenSinceMonth    761 non-null float64
CompetitionOpenSinceYear     761 non-null float64
Promo2                       1115 non-null int64
Promo2SinceWeek              571 non-null float64
Promo2SinceYear              571 non-null float64
PromoInterval                571 non-null object
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB

In [37]:
### Convert Storetype and Assortment to numerical categories
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes

In [38]:
df_store.head(100)


Out[38]:
Store StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval
0 1 2 0 1270.0 9.0 2008.0 0 NaN NaN NaN
1 2 0 0 570.0 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct
2 3 0 0 14130.0 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
3 4 2 2 620.0 9.0 2009.0 0 NaN NaN NaN
4 5 0 0 29910.0 4.0 2015.0 0 NaN NaN NaN
5 6 0 0 310.0 12.0 2013.0 0 NaN NaN NaN
6 7 0 2 24000.0 4.0 2013.0 0 NaN NaN NaN
7 8 0 0 7520.0 10.0 2014.0 0 NaN NaN NaN
8 9 0 2 2030.0 8.0 2000.0 0 NaN NaN NaN
9 10 0 0 3160.0 9.0 2009.0 0 NaN NaN NaN
10 11 0 2 960.0 11.0 2011.0 1 1.0 2012.0 Jan,Apr,Jul,Oct
11 12 0 2 1070.0 NaN NaN 1 13.0 2010.0 Jan,Apr,Jul,Oct
12 13 3 0 310.0 NaN NaN 1 45.0 2009.0 Feb,May,Aug,Nov
13 14 0 0 1300.0 3.0 2014.0 1 40.0 2011.0 Jan,Apr,Jul,Oct
14 15 3 2 4110.0 3.0 2010.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
15 16 0 2 3270.0 NaN NaN 0 NaN NaN NaN
16 17 0 0 50.0 12.0 2005.0 1 26.0 2010.0 Jan,Apr,Jul,Oct
17 18 3 2 13840.0 6.0 2010.0 1 14.0 2012.0 Jan,Apr,Jul,Oct
18 19 0 2 3240.0 NaN NaN 1 22.0 2011.0 Mar,Jun,Sept,Dec
19 20 3 0 2340.0 5.0 2009.0 1 40.0 2014.0 Jan,Apr,Jul,Oct
20 21 2 2 550.0 10.0 1999.0 1 45.0 2009.0 Jan,Apr,Jul,Oct
21 22 0 0 1040.0 NaN NaN 1 22.0 2012.0 Jan,Apr,Jul,Oct
22 23 3 0 4060.0 8.0 2005.0 0 NaN NaN NaN
23 24 0 2 4590.0 3.0 2000.0 1 40.0 2011.0 Jan,Apr,Jul,Oct
24 25 2 0 430.0 4.0 2003.0 0 NaN NaN NaN
25 26 3 0 2300.0 NaN NaN 0 NaN NaN NaN
26 27 0 0 60.0 1.0 2005.0 1 5.0 2011.0 Jan,Apr,Jul,Oct
27 28 0 0 1200.0 10.0 2014.0 1 6.0 2015.0 Mar,Jun,Sept,Dec
28 29 3 2 2170.0 NaN NaN 0 NaN NaN NaN
29 30 0 0 40.0 2.0 2014.0 1 10.0 2014.0 Mar,Jun,Sept,Dec
30 31 3 2 9800.0 7.0 2012.0 0 NaN NaN NaN
31 32 0 0 2910.0 NaN NaN 1 45.0 2009.0 Feb,May,Aug,Nov
32 33 0 2 1320.0 5.0 2013.0 0 NaN NaN NaN
33 34 2 0 2240.0 9.0 2009.0 0 NaN NaN NaN
34 35 3 2 7660.0 10.0 2000.0 1 1.0 2012.0 Jan,Apr,Jul,Oct
35 36 0 2 540.0 6.0 2003.0 1 40.0 2014.0 Jan,Apr,Jul,Oct
36 37 2 0 4230.0 12.0 2014.0 0 NaN NaN NaN
37 38 3 0 1090.0 4.0 2007.0 0 NaN NaN NaN
38 39 0 0 260.0 10.0 2006.0 1 31.0 2013.0 Feb,May,Aug,Nov
39 40 0 0 180.0 NaN NaN 1 45.0 2009.0 Feb,May,Aug,Nov
40 41 3 2 1180.0 NaN NaN 1 31.0 2013.0 Jan,Apr,Jul,Oct
41 42 0 2 290.0 NaN NaN 1 40.0 2011.0 Jan,Apr,Jul,Oct
42 43 3 0 4880.0 NaN NaN 1 37.0 2009.0 Jan,Apr,Jul,Oct
43 44 0 0 540.0 6.0 2011.0 0 NaN NaN NaN
44 45 3 0 9710.0 2.0 2014.0 0 NaN NaN NaN
45 46 2 0 1200.0 9.0 2005.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
46 47 0 2 270.0 4.0 2013.0 1 14.0 2013.0 Jan,Apr,Jul,Oct
47 48 0 0 1060.0 5.0 2012.0 0 NaN NaN NaN
48 49 3 2 18010.0 9.0 2007.0 0 NaN NaN NaN
49 50 3 0 6260.0 11.0 2009.0 0 NaN NaN NaN
50 51 0 2 10570.0 7.0 2013.0 1 9.0 2011.0 Jan,Apr,Jul,Oct
51 52 3 2 450.0 4.0 2014.0 1 39.0 2010.0 Jan,Apr,Jul,Oct
52 53 0 2 30360.0 9.0 2013.0 0 NaN NaN NaN
53 54 3 2 7170.0 8.0 2014.0 1 5.0 2013.0 Feb,May,Aug,Nov
54 55 0 0 720.0 11.0 2004.0 0 NaN NaN NaN
55 56 3 2 6620.0 3.0 2012.0 1 10.0 2014.0 Mar,Jun,Sept,Dec
56 57 3 2 420.0 6.0 2014.0 0 NaN NaN NaN
57 58 0 2 7340.0 5.0 2008.0 1 27.0 2012.0 Jan,Apr,Jul,Oct
58 59 0 2 2840.0 6.0 2007.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
59 60 3 2 5540.0 10.0 2009.0 0 NaN NaN NaN
60 61 0 2 350.0 12.0 2007.0 1 1.0 2012.0 Jan,Apr,Jul,Oct
61 62 0 0 2050.0 NaN NaN 0 NaN NaN NaN
62 63 2 2 3700.0 6.0 2010.0 1 18.0 2010.0 Feb,May,Aug,Nov
63 64 3 2 22560.0 NaN NaN 1 14.0 2013.0 Jan,Apr,Jul,Oct
64 65 0 2 13840.0 5.0 2010.0 1 1.0 2012.0 Jan,Apr,Jul,Oct
65 66 3 0 7660.0 NaN NaN 1 37.0 2009.0 Jan,Apr,Jul,Oct
66 67 0 2 410.0 2.0 2006.0 0 NaN NaN NaN
67 68 0 2 250.0 NaN NaN 1 35.0 2012.0 Mar,Jun,Sept,Dec
68 69 2 2 1130.0 NaN NaN 1 40.0 2011.0 Jan,Apr,Jul,Oct
69 70 2 2 4840.0 NaN NaN 0 NaN NaN NaN
70 71 0 0 17500.0 8.0 2008.0 1 37.0 2009.0 Mar,Jun,Sept,Dec
71 72 0 0 2200.0 12.0 2009.0 1 13.0 2010.0 Jan,Apr,Jul,Oct
72 73 0 2 1650.0 9.0 2008.0 0 NaN NaN NaN
73 74 0 0 330.0 NaN NaN 0 NaN NaN NaN
74 75 3 2 22440.0 12.0 2013.0 0 NaN NaN NaN
75 76 3 2 19960.0 3.0 2006.0 0 NaN NaN NaN
76 77 3 2 1090.0 8.0 2009.0 1 10.0 2014.0 Jan,Apr,Jul,Oct
77 78 0 0 3510.0 11.0 2006.0 1 5.0 2013.0 Feb,May,Aug,Nov
78 79 0 0 3320.0 NaN NaN 0 NaN NaN NaN
79 80 3 0 7910.0 NaN NaN 0 NaN NaN NaN
80 81 0 0 2370.0 3.0 2011.0 1 40.0 2014.0 Jan,Apr,Jul,Oct
81 82 0 0 22390.0 4.0 2008.0 1 37.0 2009.0 Jan,Apr,Jul,Oct
82 83 0 0 2710.0 NaN NaN 0 NaN NaN NaN
83 84 0 2 11810.0 8.0 2014.0 0 NaN NaN NaN
84 85 1 0 1870.0 10.0 2011.0 0 NaN NaN NaN
85 86 0 0 480.0 2.0 2005.0 1 31.0 2013.0 Jan,Apr,Jul,Oct
86 87 0 0 560.0 12.0 2010.0 0 NaN NaN NaN
87 88 0 0 10690.0 10.0 2005.0 0 NaN NaN NaN
88 89 0 0 2380.0 7.0 2004.0 1 40.0 2014.0 Jan,Apr,Jul,Oct
89 90 0 0 330.0 11.0 2007.0 0 NaN NaN NaN
90 91 2 0 2410.0 NaN NaN 1 35.0 2011.0 Jan,Apr,Jul,Oct
91 92 2 0 240.0 NaN NaN 1 45.0 2009.0 Feb,May,Aug,Nov
92 93 0 0 16690.0 NaN NaN 1 14.0 2011.0 Jan,Apr,Jul,Oct
93 94 3 2 14620.0 NaN NaN 0 NaN NaN NaN
94 95 0 0 1890.0 10.0 2014.0 0 NaN NaN NaN
95 96 0 0 8780.0 2.0 2005.0 1 37.0 2009.0 Jan,Apr,Jul,Oct
96 97 3 2 8980.0 NaN NaN 0 NaN NaN NaN
97 98 3 2 15140.0 12.0 2006.0 1 1.0 2012.0 Jan,Apr,Jul,Oct
98 99 2 2 2030.0 11.0 2003.0 1 22.0 2012.0 Mar,Jun,Sept,Dec
99 100 3 0 17930.0 NaN NaN 0 NaN NaN NaN

In [39]:
### Convert competition open year and month to float
def convertCompetitionOpen(df):
    try:
        date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']), int(df['CompetitionOpenSinceMonth']))
        return pd.to_datetime(date)
    except:
        return np.nan

df_store['CompetitionOpenInt'] = df_store.apply(lambda df: convertCompetitionOpen(df), axis=1).astype(np.int64)

In [62]:
df_store.head(100)


Out[62]:
Store StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear CompetitionOpenInt Promo2SinceFloat PromoInterval0 PromoInterval1 PromoInterval2 PromoInterval3
0 1 2 0 1270.0 9.0 2008.0 0 NaN NaN 1220227200000000000 -9223372036854775808 NaN NaN NaN NaN
1 2 0 0 570.0 11.0 2007.0 1 13.0 2010.0 1193875200000000000 1269820800000000000 1.0 4.0 7.0 10.0
2 3 0 0 14130.0 12.0 2006.0 1 14.0 2011.0 1164931200000000000 1301875200000000000 1.0 4.0 7.0 10.0
3 4 2 2 620.0 9.0 2009.0 0 NaN NaN 1251763200000000000 -9223372036854775808 NaN NaN NaN NaN
4 5 0 0 29910.0 4.0 2015.0 0 NaN NaN 1427846400000000000 -9223372036854775808 NaN NaN NaN NaN
5 6 0 0 310.0 12.0 2013.0 0 NaN NaN 1385856000000000000 -9223372036854775808 NaN NaN NaN NaN
6 7 0 2 24000.0 4.0 2013.0 0 NaN NaN 1364774400000000000 -9223372036854775808 NaN NaN NaN NaN
7 8 0 0 7520.0 10.0 2014.0 0 NaN NaN 1412121600000000000 -9223372036854775808 NaN NaN NaN NaN
8 9 0 2 2030.0 8.0 2000.0 0 NaN NaN 965088000000000000 -9223372036854775808 NaN NaN NaN NaN
9 10 0 0 3160.0 9.0 2009.0 0 NaN NaN 1251763200000000000 -9223372036854775808 NaN NaN NaN NaN
10 11 0 2 960.0 11.0 2011.0 1 1.0 2012.0 1320105600000000000 1325462400000000000 1.0 4.0 7.0 10.0
11 12 0 2 1070.0 NaN NaN 1 13.0 2010.0 -9223372036854775808 1269820800000000000 1.0 4.0 7.0 10.0
12 13 3 0 310.0 NaN NaN 1 45.0 2009.0 -9223372036854775808 1257724800000000000 2.0 5.0 8.0 11.0
13 14 0 0 1300.0 3.0 2014.0 1 40.0 2011.0 1393632000000000000 1317600000000000000 1.0 4.0 7.0 10.0
14 15 3 2 4110.0 3.0 2010.0 1 14.0 2011.0 1267401600000000000 1301875200000000000 1.0 4.0 7.0 10.0
15 16 0 2 3270.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
16 17 0 0 50.0 12.0 2005.0 1 26.0 2010.0 1133395200000000000 1277683200000000000 1.0 4.0 7.0 10.0
17 18 3 2 13840.0 6.0 2010.0 1 14.0 2012.0 1275350400000000000 1333324800000000000 1.0 4.0 7.0 10.0
18 19 0 2 3240.0 NaN NaN 1 22.0 2011.0 -9223372036854775808 1306713600000000000 3.0 6.0 9.0 12.0
19 20 3 0 2340.0 5.0 2009.0 1 40.0 2014.0 1241136000000000000 1412553600000000000 1.0 4.0 7.0 10.0
20 21 2 2 550.0 10.0 1999.0 1 45.0 2009.0 938736000000000000 1257724800000000000 1.0 4.0 7.0 10.0
21 22 0 0 1040.0 NaN NaN 1 22.0 2012.0 -9223372036854775808 1338163200000000000 1.0 4.0 7.0 10.0
22 23 3 0 4060.0 8.0 2005.0 0 NaN NaN 1122854400000000000 -9223372036854775808 NaN NaN NaN NaN
23 24 0 2 4590.0 3.0 2000.0 1 40.0 2011.0 951868800000000000 1317600000000000000 1.0 4.0 7.0 10.0
24 25 2 0 430.0 4.0 2003.0 0 NaN NaN 1049155200000000000 -9223372036854775808 NaN NaN NaN NaN
25 26 3 0 2300.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
26 27 0 0 60.0 1.0 2005.0 1 5.0 2011.0 1104537600000000000 1296432000000000000 1.0 4.0 7.0 10.0
27 28 0 0 1200.0 10.0 2014.0 1 6.0 2015.0 1412121600000000000 1423440000000000000 3.0 6.0 9.0 12.0
28 29 3 2 2170.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
29 30 0 0 40.0 2.0 2014.0 1 10.0 2014.0 1391212800000000000 1394409600000000000 3.0 6.0 9.0 12.0
30 31 3 2 9800.0 7.0 2012.0 0 NaN NaN 1341100800000000000 -9223372036854775808 NaN NaN NaN NaN
31 32 0 0 2910.0 NaN NaN 1 45.0 2009.0 -9223372036854775808 1257724800000000000 2.0 5.0 8.0 11.0
32 33 0 2 1320.0 5.0 2013.0 0 NaN NaN 1367366400000000000 -9223372036854775808 NaN NaN NaN NaN
33 34 2 0 2240.0 9.0 2009.0 0 NaN NaN 1251763200000000000 -9223372036854775808 NaN NaN NaN NaN
34 35 3 2 7660.0 10.0 2000.0 1 1.0 2012.0 970358400000000000 1325462400000000000 1.0 4.0 7.0 10.0
35 36 0 2 540.0 6.0 2003.0 1 40.0 2014.0 1054425600000000000 1412553600000000000 1.0 4.0 7.0 10.0
36 37 2 0 4230.0 12.0 2014.0 0 NaN NaN 1417392000000000000 -9223372036854775808 NaN NaN NaN NaN
37 38 3 0 1090.0 4.0 2007.0 0 NaN NaN 1175385600000000000 -9223372036854775808 NaN NaN NaN NaN
38 39 0 0 260.0 10.0 2006.0 1 31.0 2013.0 1159660800000000000 1375660800000000000 2.0 5.0 8.0 11.0
39 40 0 0 180.0 NaN NaN 1 45.0 2009.0 -9223372036854775808 1257724800000000000 2.0 5.0 8.0 11.0
40 41 3 2 1180.0 NaN NaN 1 31.0 2013.0 -9223372036854775808 1375660800000000000 1.0 4.0 7.0 10.0
41 42 0 2 290.0 NaN NaN 1 40.0 2011.0 -9223372036854775808 1317600000000000000 1.0 4.0 7.0 10.0
42 43 3 0 4880.0 NaN NaN 1 37.0 2009.0 -9223372036854775808 1252886400000000000 1.0 4.0 7.0 10.0
43 44 0 0 540.0 6.0 2011.0 0 NaN NaN 1306886400000000000 -9223372036854775808 NaN NaN NaN NaN
44 45 3 0 9710.0 2.0 2014.0 0 NaN NaN 1391212800000000000 -9223372036854775808 NaN NaN NaN NaN
45 46 2 0 1200.0 9.0 2005.0 1 14.0 2011.0 1125532800000000000 1301875200000000000 1.0 4.0 7.0 10.0
46 47 0 2 270.0 4.0 2013.0 1 14.0 2013.0 1364774400000000000 1365379200000000000 1.0 4.0 7.0 10.0
47 48 0 0 1060.0 5.0 2012.0 0 NaN NaN 1335830400000000000 -9223372036854775808 NaN NaN NaN NaN
48 49 3 2 18010.0 9.0 2007.0 0 NaN NaN 1188604800000000000 -9223372036854775808 NaN NaN NaN NaN
49 50 3 0 6260.0 11.0 2009.0 0 NaN NaN 1257033600000000000 -9223372036854775808 NaN NaN NaN NaN
50 51 0 2 10570.0 7.0 2013.0 1 9.0 2011.0 1372636800000000000 1298851200000000000 1.0 4.0 7.0 10.0
51 52 3 2 450.0 4.0 2014.0 1 39.0 2010.0 1396310400000000000 1285545600000000000 1.0 4.0 7.0 10.0
52 53 0 2 30360.0 9.0 2013.0 0 NaN NaN 1377993600000000000 -9223372036854775808 NaN NaN NaN NaN
53 54 3 2 7170.0 8.0 2014.0 1 5.0 2013.0 1406851200000000000 1359936000000000000 2.0 5.0 8.0 11.0
54 55 0 0 720.0 11.0 2004.0 0 NaN NaN 1099267200000000000 -9223372036854775808 NaN NaN NaN NaN
55 56 3 2 6620.0 3.0 2012.0 1 10.0 2014.0 1330560000000000000 1394409600000000000 3.0 6.0 9.0 12.0
56 57 3 2 420.0 6.0 2014.0 0 NaN NaN 1401580800000000000 -9223372036854775808 NaN NaN NaN NaN
57 58 0 2 7340.0 5.0 2008.0 1 27.0 2012.0 1209600000000000000 1341187200000000000 1.0 4.0 7.0 10.0
58 59 0 2 2840.0 6.0 2007.0 1 14.0 2011.0 1180656000000000000 1301875200000000000 1.0 4.0 7.0 10.0
59 60 3 2 5540.0 10.0 2009.0 0 NaN NaN 1254355200000000000 -9223372036854775808 NaN NaN NaN NaN
60 61 0 2 350.0 12.0 2007.0 1 1.0 2012.0 1196467200000000000 1325462400000000000 1.0 4.0 7.0 10.0
61 62 0 0 2050.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
62 63 2 2 3700.0 6.0 2010.0 1 18.0 2010.0 1275350400000000000 1272844800000000000 2.0 5.0 8.0 11.0
63 64 3 2 22560.0 NaN NaN 1 14.0 2013.0 -9223372036854775808 1365379200000000000 1.0 4.0 7.0 10.0
64 65 0 2 13840.0 5.0 2010.0 1 1.0 2012.0 1272672000000000000 1325462400000000000 1.0 4.0 7.0 10.0
65 66 3 0 7660.0 NaN NaN 1 37.0 2009.0 -9223372036854775808 1252886400000000000 1.0 4.0 7.0 10.0
66 67 0 2 410.0 2.0 2006.0 0 NaN NaN 1138752000000000000 -9223372036854775808 NaN NaN NaN NaN
67 68 0 2 250.0 NaN NaN 1 35.0 2012.0 -9223372036854775808 1346025600000000000 3.0 6.0 9.0 12.0
68 69 2 2 1130.0 NaN NaN 1 40.0 2011.0 -9223372036854775808 1317600000000000000 1.0 4.0 7.0 10.0
69 70 2 2 4840.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
70 71 0 0 17500.0 8.0 2008.0 1 37.0 2009.0 1217548800000000000 1252886400000000000 3.0 6.0 9.0 12.0
71 72 0 0 2200.0 12.0 2009.0 1 13.0 2010.0 1259625600000000000 1269820800000000000 1.0 4.0 7.0 10.0
72 73 0 2 1650.0 9.0 2008.0 0 NaN NaN 1220227200000000000 -9223372036854775808 NaN NaN NaN NaN
73 74 0 0 330.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
74 75 3 2 22440.0 12.0 2013.0 0 NaN NaN 1385856000000000000 -9223372036854775808 NaN NaN NaN NaN
75 76 3 2 19960.0 3.0 2006.0 0 NaN NaN 1141171200000000000 -9223372036854775808 NaN NaN NaN NaN
76 77 3 2 1090.0 8.0 2009.0 1 10.0 2014.0 1249084800000000000 1394409600000000000 1.0 4.0 7.0 10.0
77 78 0 0 3510.0 11.0 2006.0 1 5.0 2013.0 1162339200000000000 1359936000000000000 2.0 5.0 8.0 11.0
78 79 0 0 3320.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
79 80 3 0 7910.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
80 81 0 0 2370.0 3.0 2011.0 1 40.0 2014.0 1298937600000000000 1412553600000000000 1.0 4.0 7.0 10.0
81 82 0 0 22390.0 4.0 2008.0 1 37.0 2009.0 1207008000000000000 1252886400000000000 1.0 4.0 7.0 10.0
82 83 0 0 2710.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
83 84 0 2 11810.0 8.0 2014.0 0 NaN NaN 1406851200000000000 -9223372036854775808 NaN NaN NaN NaN
84 85 1 0 1870.0 10.0 2011.0 0 NaN NaN 1317427200000000000 -9223372036854775808 NaN NaN NaN NaN
85 86 0 0 480.0 2.0 2005.0 1 31.0 2013.0 1107216000000000000 1375660800000000000 1.0 4.0 7.0 10.0
86 87 0 0 560.0 12.0 2010.0 0 NaN NaN 1291161600000000000 -9223372036854775808 NaN NaN NaN NaN
87 88 0 0 10690.0 10.0 2005.0 0 NaN NaN 1128124800000000000 -9223372036854775808 NaN NaN NaN NaN
88 89 0 0 2380.0 7.0 2004.0 1 40.0 2014.0 1088640000000000000 1412553600000000000 1.0 4.0 7.0 10.0
89 90 0 0 330.0 11.0 2007.0 0 NaN NaN 1193875200000000000 -9223372036854775808 NaN NaN NaN NaN
90 91 2 0 2410.0 NaN NaN 1 35.0 2011.0 -9223372036854775808 1314576000000000000 1.0 4.0 7.0 10.0
91 92 2 0 240.0 NaN NaN 1 45.0 2009.0 -9223372036854775808 1257724800000000000 2.0 5.0 8.0 11.0
92 93 0 0 16690.0 NaN NaN 1 14.0 2011.0 -9223372036854775808 1301875200000000000 1.0 4.0 7.0 10.0
93 94 3 2 14620.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
94 95 0 0 1890.0 10.0 2014.0 0 NaN NaN 1412121600000000000 -9223372036854775808 NaN NaN NaN NaN
95 96 0 0 8780.0 2.0 2005.0 1 37.0 2009.0 1107216000000000000 1252886400000000000 1.0 4.0 7.0 10.0
96 97 3 2 8980.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN
97 98 3 2 15140.0 12.0 2006.0 1 1.0 2012.0 1164931200000000000 1325462400000000000 1.0 4.0 7.0 10.0
98 99 2 2 2030.0 11.0 2003.0 1 22.0 2012.0 1067644800000000000 1338163200000000000 3.0 6.0 9.0 12.0
99 100 3 0 17930.0 NaN NaN 0 NaN NaN -9223372036854775808 -9223372036854775808 NaN NaN NaN NaN

In [40]:
### Convert competition open year and month to float

def convertPromo2(df):
    try:
        date = '{}{}1'.format(int(df['Promo2SinceYear']), int(df['Promo2SinceWeek']))
        return pd.to_datetime(date, format='%Y%W%w')
    except:
        return np.nan

df_store['Promo2SinceFloat'] = df_store.apply(lambda df: convertPromo2(df), axis=1).astype(np.int64)

In [41]:
s = df_store['PromoInterval'].str.split(',').apply(pd.Series, 1)
s.columns = ['PromoInterval0', 'PromoInterval1', 'PromoInterval2', 'PromoInterval3']
df_store = df_store.join(s)

In [42]:
def monthToNum(date):
    return{
            'Jan' : 1,
            'Feb' : 2,
            'Mar' : 3,
            'Apr' : 4,
            'May' : 5,
            'Jun' : 6,
            'Jul' : 7,
            'Aug' : 8,
            'Sept' : 9, 
            'Oct' : 10,
            'Nov' : 11,
            'Dec' : 12
    }[date]

df_store['PromoInterval0'] = df_store['PromoInterval0'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval1'] = df_store['PromoInterval1'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval2'] = df_store['PromoInterval2'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval3'] = df_store['PromoInterval3'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)

In [43]:
del df_store['PromoInterval']

In [44]:
store_features = ['Store', 'StoreType', 'Assortment', 
                  'CompetitionDistance', 'CompetitionOpenInt',
                  'PromoInterval0']

### Features not helping
# PromoInterval1, PromoInterval2, PromoInterval3

features_x = list(set(features_x + store_features))

In [45]:
df = pd.merge(df, df_store[store_features], how='left', on=['Store'])

In [46]:
### Convert every NAN to -1
for feature in features_x:
    df[feature] = df[feature].fillna(-1)

In [47]:
list_stores_to_check = [105,163,172,364,378,523,589,663,676,681,700,708,730,764,837,845,861,882,969,986]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [48]:
list_stores_to_check = [192,263,500,797,815,825]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [49]:
list_stores_to_check = [274,524,709,1029]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, '.')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [50]:
list_stores_to_check = [274,524,709,1029]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [51]:
list_stores_to_check = [299,453,530,732,931]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [52]:
store_dates_to_remove = {   105:1.368e18, 163:1.368e18,
                            172:1.366e18, 364:1.37e18,
                            378:1.39e18, 523:1.39e18,
                            589:1.37e18, 663:1.39e18,
                            676:1.366e18, 681:1.37e18,
                            700:1.373e18, 708:1.368e18,
                            709:1.423e18, 730:1.39e18,
                            764:1.368e18, 837:1.396e18,
                            845:1.368e18, 861:1.368e18,
                            882:1.368e18, 969:1.366e18,
                            986:1.368e18, 192:1.421e18,
                            263:1.421e18, 500:1.421e18,
                            797:1.421e18, 815:1.421e18,
                            825:1.421e18}

for key,value in store_dates_to_remove.iteritems():
    df.loc[(df['Store'] == key) & (df['DateInt'] < value), 'Delete'] = True

In [53]:
list_stores_to_check = [105,163,172,364,378,523,589,663,676,681,700,708,730,764,837,845,861,882,969,986]

plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]

j = 1
for i in list_stores_to_check:
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] == True)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] == True)]['Sales']
    
    X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] != True)]
    y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] != True)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(len(list_stores_to_check),1,j)
    plt.plot(X1['DateInt'], y1, 'r-')
    plt.plot(X2['DateInt'], y2, '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(i)
    j += 1



In [54]:
### Delete the data where sales in the first period is much different from the rest
df = df.loc[df['Delete'] != True]

In [55]:
def mad_based_outlier(points, thresh=3.5):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [56]:
for i in df['Store'].unique():
    df.loc[(df['Set'] == 1) & (df['Store'] == i) & (df['Open'] == 1), 'Outlier'] = \
        mad_based_outlier(df.loc[(df['Set'] == 1) & (df['Store'] == i) & (df['Open'] == 1)]['Sales'], 3)

In [57]:
no_stores_to_check = 10

plt.rcParams["figure.figsize"] = [20,no_stores_to_check*5]

for i in range(1,no_stores_to_check+1):
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]['Sales']

    # Outliers
    X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]
    y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(10,5,i)
    plt.plot(X1['Date'], y1, '-')
    plt.plot(X2['Date'], y2, 'r.')
    plt.title(i)
    plt.axis('off')



In [58]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == False)][features_x],
                                                    df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == False)][features_y],
                                                    test_size=0.1, random_state=seed)

In [59]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

In [60]:
num_round = 20000
evallist = [(dtrain, 'train'), (dtest, 'test')]

In [63]:
param = {'bst:max_depth':12,
         'bst:eta':0.01,
         'subsample':0.8,
         'colsample_bytree':0.7,
         'silent':1,
         'objective':'reg:linear',
         'nthread':6,
         'seed':seed}

plst = param.items()

bst = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=250)


Will train until test error hasn't decreased in 250 rounds.
[0]	train-rmspe:0.999863	test-rmspe:0.999863
[250]	train-rmspe:0.491216	test-rmspe:0.487971
[500]	train-rmspe:0.198972	test-rmspe:0.188309
[750]	train-rmspe:0.166821	test-rmspe:0.156818
[1000]	train-rmspe:0.137129	test-rmspe:0.132996
[1250]	train-rmspe:0.122311	test-rmspe:0.121035
[1500]	train-rmspe:0.109952	test-rmspe:0.112465
[1750]	train-rmspe:0.100481	test-rmspe:0.106788
[2000]	train-rmspe:0.093883	test-rmspe:0.102796
[2250]	train-rmspe:0.088570	test-rmspe:0.100460
[2500]	train-rmspe:0.083871	test-rmspe:0.098587
[2750]	train-rmspe:0.080263	test-rmspe:0.097114
[3000]	train-rmspe:0.077273	test-rmspe:0.095896
[3250]	train-rmspe:0.074326	test-rmspe:0.094886
[3500]	train-rmspe:0.071833	test-rmspe:0.094089
[3750]	train-rmspe:0.069701	test-rmspe:0.093385
[4000]	train-rmspe:0.067834	test-rmspe:0.092811
[4250]	train-rmspe:0.066103	test-rmspe:0.092347
[4500]	train-rmspe:0.064509	test-rmspe:0.091941
[4750]	train-rmspe:0.062960	test-rmspe:0.091569
[5000]	train-rmspe:0.061581	test-rmspe:0.091271
[5250]	train-rmspe:0.060224	test-rmspe:0.091016
[5500]	train-rmspe:0.058971	test-rmspe:0.090792
[5750]	train-rmspe:0.057782	test-rmspe:0.090605
[6000]	train-rmspe:0.056656	test-rmspe:0.090459
[6250]	train-rmspe:0.055568	test-rmspe:0.090350
[6500]	train-rmspe:0.054547	test-rmspe:0.090252
[6750]	train-rmspe:0.053527	test-rmspe:0.090143
[7000]	train-rmspe:0.052577	test-rmspe:0.090067
[7250]	train-rmspe:0.051698	test-rmspe:0.090008
[7500]	train-rmspe:0.050825	test-rmspe:0.089956
[7750]	train-rmspe:0.050012	test-rmspe:0.089897
[8000]	train-rmspe:0.049207	test-rmspe:0.089844
[8250]	train-rmspe:0.048413	test-rmspe:0.089800
[8500]	train-rmspe:0.047679	test-rmspe:0.089768
[8750]	train-rmspe:0.046973	test-rmspe:0.089742
[9000]	train-rmspe:0.046274	test-rmspe:0.089741
Stopping. Best iteration:
[8751]	train-rmspe:0.046970	test-rmspe:0.089741


In [64]:
dpred = xgb.DMatrix(df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True)][features_x])

In [65]:
ypred_bst = bst.predict(dpred)

In [66]:
df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True), 'SalesLog'] = ypred_bst
df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True), 'Sales'] = np.exp(ypred_bst) - 1

In [67]:
### You see the result being lower than before, but most of them are still pretty high
no_stores_to_check = 10

plt.rcParams["figure.figsize"] = [20,no_stores_to_check*5]

for i in range(1,no_stores_to_check+1):
    stor = i

    # Normal sales
    X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]
    y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]['Sales']

    # Outliers
    X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]
    y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]['Sales']

    Xt = df.loc[(df['Store'] == stor)]
    
    plt.subplot(10,5,i)
    plt.plot(X1['Date'], y1, '-')
    plt.plot(X2['Date'], y2, 'r.')
    plt.title(i)
    plt.axis('off')



In [68]:
### Get total sales, customers and open days per store
store_data_sales = df.groupby([df['Store']])['Sales'].sum()
store_data_customers = df.groupby([df['Store']])['Customers'].sum()
store_data_open = df.groupby([df['Store']])['Open'].count()

In [69]:
### Calculate sales per day, customers per day and sales per customers per day
store_data_sales_per_day = store_data_sales / store_data_open
store_data_customers_per_day = store_data_customers / store_data_open
store_data_sales_per_customer_per_day = store_data_sales_per_day / store_data_customers_per_day

In [70]:
df_store = pd.merge(df_store, store_data_sales_per_day.reset_index(name='SalesPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, store_data_customers_per_day.reset_index(name='CustomersPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, store_data_sales_per_customer_per_day.reset_index(name='SalesPerCustomersPerDay'), how='left', on=['Store'])

In [71]:
store_features = ['Store', 'SalesPerDay', 'CustomersPerDay', 'SalesPerCustomersPerDay']

features_x = list(set(features_x + store_features))

In [72]:
df = pd.merge(df, df_store[store_features], how='left', on=['Store'])

In [73]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[(df['Set'] == 1) & (df['Open'] == 1)][features_x],
                                                    df.loc[(df['Set'] == 1) & (df['Open'] == 1)][features_y],
                                                    test_size=0.1, random_state=seed)

In [74]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

In [75]:
num_round = 20000
evallist = [(dtrain, 'train'), (dtest, 'test')]

In [76]:
param = {'bst:max_depth':12,
         'bst:eta':0.0095,
         'subsample':0.8,
         'colsample_bytree':0.7,
         'silent':1, 
         'objective':'reg:linear',
         'nthread':6,
         'seed':seed}

plst = param.items()

bst1 = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=250)


Will train until test error hasn't decreased in 250 rounds.
[0]	train-rmspe:0.999866	test-rmspe:0.999865
[250]	train-rmspe:0.531137	test-rmspe:0.529227
[500]	train-rmspe:0.137918	test-rmspe:0.131062
[750]	train-rmspe:0.110675	test-rmspe:0.111607
[1000]	train-rmspe:0.100810	test-rmspe:0.106202
[1250]	train-rmspe:0.092724	test-rmspe:0.102566
[1500]	train-rmspe:0.087041	test-rmspe:0.099832
[1750]	train-rmspe:0.082683	test-rmspe:0.097939
[2000]	train-rmspe:0.078698	test-rmspe:0.096432
[2250]	train-rmspe:0.075622	test-rmspe:0.095216
[2500]	train-rmspe:0.072704	test-rmspe:0.094271
[2750]	train-rmspe:0.070362	test-rmspe:0.093525
[3000]	train-rmspe:0.068244	test-rmspe:0.092888
[3250]	train-rmspe:0.066329	test-rmspe:0.092319
[3500]	train-rmspe:0.064502	test-rmspe:0.091857
[3750]	train-rmspe:0.062939	test-rmspe:0.091523
[4000]	train-rmspe:0.061489	test-rmspe:0.091201
[4250]	train-rmspe:0.060084	test-rmspe:0.090915
[4500]	train-rmspe:0.058776	test-rmspe:0.090721
[4750]	train-rmspe:0.057532	test-rmspe:0.090546
[5000]	train-rmspe:0.056319	test-rmspe:0.090379
[5250]	train-rmspe:0.055231	test-rmspe:0.090255
[5500]	train-rmspe:0.054148	test-rmspe:0.090183
[5750]	train-rmspe:0.053105	test-rmspe:0.090085
[6000]	train-rmspe:0.052128	test-rmspe:0.090017
[6250]	train-rmspe:0.051182	test-rmspe:0.089965
[6500]	train-rmspe:0.050276	test-rmspe:0.089904
[6750]	train-rmspe:0.049418	test-rmspe:0.089879
[7000]	train-rmspe:0.048590	test-rmspe:0.089835
[7250]	train-rmspe:0.047803	test-rmspe:0.089802
[7500]	train-rmspe:0.047017	test-rmspe:0.089793
[7750]	train-rmspe:0.046294	test-rmspe:0.089776
[8000]	train-rmspe:0.045572	test-rmspe:0.089759
[8250]	train-rmspe:0.044861	test-rmspe:0.089749
[8500]	train-rmspe:0.044187	test-rmspe:0.089727
[8750]	train-rmspe:0.043539	test-rmspe:0.089736
Stopping. Best iteration:
[8575]	train-rmspe:0.043983	test-rmspe:0.089723


In [81]:
xgb.plot_importance(bst1)


Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b46080>

In [78]:
X_submit = df.loc[df['Set'] == 0]

In [79]:
dsubmit = xgb.DMatrix(X_submit[features_x])

In [82]:
ypred_bst = bst1.predict(dsubmit)

df_ypred = X_submit['Id'].reset_index()
del df_ypred['index']
df_ypred['Id'] = df_ypred['Id'].astype('int')

# Scale back the sales a bit
df_ypred['Sales'] = (np.exp(ypred_bst) - 1) * 0.985

df_ypred.sort_values('Id', inplace=True)
df_ypred[['Id', 'Sales']].to_csv('rossmann_best_no_ext_data_scaled.csv', index=False)

In [ ]: