In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import pylab
import csv
import datetime
import math
import re
import time
import random
import os
from pandas.tseries.offsets import *
from operator import *
from sklearn.cross_validation import train_test_split
%matplotlib inline
# plt.style.use('ggplot') # Good looking plots
np.set_printoptions(precision=4, threshold=10000, linewidth=100, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 6)
start_time = time.time()
In [3]:
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
w = np.zeros(y.shape, dtype=float)
ind = y != 0
w[ind] = 1./(y[ind]**2)
return w
def rmspe(yhat, y):
w = ToWeight(y)
rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
return rmspe
def rmspe_xg(yhat, y):
# y = y.values
y = y.get_label()
y = np.exp(y) - 1
yhat = np.exp(yhat) - 1
w = ToWeight(y)
rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
return "rmspe", rmspe
In [4]:
seed = 42
In [5]:
nrows = None
df_train = pd.read_csv('train.csv',
nrows=nrows,
parse_dates=['Date'],
date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')))
nrows = nrows
df_submit = pd.read_csv('test.csv',
nrows=nrows,
parse_dates=['Date'],
date_parser=(lambda dt: pd.to_datetime(dt, format='%Y-%m-%d')))
In [6]:
### Setting a variable to easily distinguish train (1) from submit (0) set
df_train['Set'] = 1
df_submit['Set'] = 0
In [7]:
### Combine train and test set
frames = [df_train, df_submit]
df = pd.concat(frames)
In [8]:
df.info()
In [10]:
df_train.head()
Out[10]:
In [11]:
df_submit.head()
Out[11]:
In [23]:
features_x = ['Store', 'Date', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'StateHoliday']
features_y = ['SalesLog']
In [24]:
### Remove rows where store is open, but no sales.
df = df.loc[~((df['Open'] == 1) & (df['Sales'] == 0))]
In [25]:
df.loc[df['Set'] == 1, 'SalesLog'] = np.log1p(df.loc[df['Set'] == 1]['Sales']) # = np.log(df['Sales'] + 1)
In [26]:
df['StateHoliday'] = df['StateHoliday'].astype('category').cat.codes
In [27]:
var_name = 'Date'
df[var_name + 'Day'] = pd.Index(df[var_name]).day
df[var_name + 'Week'] = pd.Index(df[var_name]).week
df[var_name + 'Month'] = pd.Index(df[var_name]).month
df[var_name + 'Year'] = pd.Index(df[var_name]).year
df[var_name + 'DayOfYear'] = pd.Index(df[var_name]).dayofyear
df[var_name + 'Day'] = df[var_name + 'Day'].fillna(0)
df[var_name + 'Week'] = df[var_name + 'Week'].fillna(0)
df[var_name + 'Month'] = df[var_name + 'Month'].fillna(0)
df[var_name + 'Year'] = df[var_name + 'Year'].fillna(0)
df[var_name + 'DayOfYear'] = df[var_name + 'DayOfYear'].fillna(0)
features_x.remove(var_name)
features_x.append(var_name + 'Day')
features_x.append(var_name + 'Week')
features_x.append(var_name + 'Month')
features_x.append(var_name + 'Year')
features_x.append(var_name + 'DayOfYear')
In [31]:
features_x
Out[31]:
In [32]:
df.head(100)
Out[32]:
In [33]:
df['DateInt'] = df['Date'].astype(np.int64)
In [34]:
df_store = pd.read_csv('store.csv',
nrows=nrows)
In [35]:
df_store.info()
In [37]:
### Convert Storetype and Assortment to numerical categories
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
In [38]:
df_store.head(100)
Out[38]:
In [39]:
### Convert competition open year and month to float
def convertCompetitionOpen(df):
try:
date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']), int(df['CompetitionOpenSinceMonth']))
return pd.to_datetime(date)
except:
return np.nan
df_store['CompetitionOpenInt'] = df_store.apply(lambda df: convertCompetitionOpen(df), axis=1).astype(np.int64)
In [62]:
df_store.head(100)
Out[62]:
In [40]:
### Convert competition open year and month to float
def convertPromo2(df):
try:
date = '{}{}1'.format(int(df['Promo2SinceYear']), int(df['Promo2SinceWeek']))
return pd.to_datetime(date, format='%Y%W%w')
except:
return np.nan
df_store['Promo2SinceFloat'] = df_store.apply(lambda df: convertPromo2(df), axis=1).astype(np.int64)
In [41]:
s = df_store['PromoInterval'].str.split(',').apply(pd.Series, 1)
s.columns = ['PromoInterval0', 'PromoInterval1', 'PromoInterval2', 'PromoInterval3']
df_store = df_store.join(s)
In [42]:
def monthToNum(date):
return{
'Jan' : 1,
'Feb' : 2,
'Mar' : 3,
'Apr' : 4,
'May' : 5,
'Jun' : 6,
'Jul' : 7,
'Aug' : 8,
'Sept' : 9,
'Oct' : 10,
'Nov' : 11,
'Dec' : 12
}[date]
df_store['PromoInterval0'] = df_store['PromoInterval0'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval1'] = df_store['PromoInterval1'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval2'] = df_store['PromoInterval2'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
df_store['PromoInterval3'] = df_store['PromoInterval3'].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
In [43]:
del df_store['PromoInterval']
In [44]:
store_features = ['Store', 'StoreType', 'Assortment',
'CompetitionDistance', 'CompetitionOpenInt',
'PromoInterval0']
### Features not helping
# PromoInterval1, PromoInterval2, PromoInterval3
features_x = list(set(features_x + store_features))
In [45]:
df = pd.merge(df, df_store[store_features], how='left', on=['Store'])
In [46]:
### Convert every NAN to -1
for feature in features_x:
df[feature] = df[feature].fillna(-1)
In [47]:
list_stores_to_check = [105,163,172,364,378,523,589,663,676,681,700,708,730,764,837,845,861,882,969,986]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, '-')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [48]:
list_stores_to_check = [192,263,500,797,815,825]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, '-')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [49]:
list_stores_to_check = [274,524,709,1029]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, '.')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [50]:
list_stores_to_check = [274,524,709,1029]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, '-')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [51]:
list_stores_to_check = [299,453,530,732,931]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, '-')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [52]:
store_dates_to_remove = { 105:1.368e18, 163:1.368e18,
172:1.366e18, 364:1.37e18,
378:1.39e18, 523:1.39e18,
589:1.37e18, 663:1.39e18,
676:1.366e18, 681:1.37e18,
700:1.373e18, 708:1.368e18,
709:1.423e18, 730:1.39e18,
764:1.368e18, 837:1.396e18,
845:1.368e18, 861:1.368e18,
882:1.368e18, 969:1.366e18,
986:1.368e18, 192:1.421e18,
263:1.421e18, 500:1.421e18,
797:1.421e18, 815:1.421e18,
825:1.421e18}
for key,value in store_dates_to_remove.iteritems():
df.loc[(df['Store'] == key) & (df['DateInt'] < value), 'Delete'] = True
In [53]:
list_stores_to_check = [105,163,172,364,378,523,589,663,676,681,700,708,730,764,837,845,861,882,969,986]
plt.rcParams["figure.figsize"] = [20,len(list_stores_to_check)*5]
j = 1
for i in list_stores_to_check:
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] == True)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] == True)]['Sales']
X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] != True)]
y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Delete'] != True)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(len(list_stores_to_check),1,j)
plt.plot(X1['DateInt'], y1, 'r-')
plt.plot(X2['DateInt'], y2, '-')
plt.minorticks_on()
plt.grid(True, which='both')
plt.title(i)
j += 1
In [54]:
### Delete the data where sales in the first period is much different from the rest
df = df.loc[df['Delete'] != True]
In [55]:
def mad_based_outlier(points, thresh=3.5):
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points, axis=0)
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
In [56]:
for i in df['Store'].unique():
df.loc[(df['Set'] == 1) & (df['Store'] == i) & (df['Open'] == 1), 'Outlier'] = \
mad_based_outlier(df.loc[(df['Set'] == 1) & (df['Store'] == i) & (df['Open'] == 1)]['Sales'], 3)
In [57]:
no_stores_to_check = 10
plt.rcParams["figure.figsize"] = [20,no_stores_to_check*5]
for i in range(1,no_stores_to_check+1):
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]['Sales']
# Outliers
X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]
y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(10,5,i)
plt.plot(X1['Date'], y1, '-')
plt.plot(X2['Date'], y2, 'r.')
plt.title(i)
plt.axis('off')
In [58]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == False)][features_x],
df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == False)][features_y],
test_size=0.1, random_state=seed)
In [59]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
In [60]:
num_round = 20000
evallist = [(dtrain, 'train'), (dtest, 'test')]
In [63]:
param = {'bst:max_depth':12,
'bst:eta':0.01,
'subsample':0.8,
'colsample_bytree':0.7,
'silent':1,
'objective':'reg:linear',
'nthread':6,
'seed':seed}
plst = param.items()
bst = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=250)
In [64]:
dpred = xgb.DMatrix(df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True)][features_x])
In [65]:
ypred_bst = bst.predict(dpred)
In [66]:
df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True), 'SalesLog'] = ypred_bst
df.loc[(df['Set'] == 1) & (df['Open'] == 1) & (df['Outlier'] == True), 'Sales'] = np.exp(ypred_bst) - 1
In [67]:
### You see the result being lower than before, but most of them are still pretty high
no_stores_to_check = 10
plt.rcParams["figure.figsize"] = [20,no_stores_to_check*5]
for i in range(1,no_stores_to_check+1):
stor = i
# Normal sales
X1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]
y1 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == False)]['Sales']
# Outliers
X2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]
y2 = df.loc[(df['Set'] == 1) & (df['Store'] == stor) & (df['Open'] == 1) & (df['Outlier'] == True)]['Sales']
Xt = df.loc[(df['Store'] == stor)]
plt.subplot(10,5,i)
plt.plot(X1['Date'], y1, '-')
plt.plot(X2['Date'], y2, 'r.')
plt.title(i)
plt.axis('off')
In [68]:
### Get total sales, customers and open days per store
store_data_sales = df.groupby([df['Store']])['Sales'].sum()
store_data_customers = df.groupby([df['Store']])['Customers'].sum()
store_data_open = df.groupby([df['Store']])['Open'].count()
In [69]:
### Calculate sales per day, customers per day and sales per customers per day
store_data_sales_per_day = store_data_sales / store_data_open
store_data_customers_per_day = store_data_customers / store_data_open
store_data_sales_per_customer_per_day = store_data_sales_per_day / store_data_customers_per_day
In [70]:
df_store = pd.merge(df_store, store_data_sales_per_day.reset_index(name='SalesPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, store_data_customers_per_day.reset_index(name='CustomersPerDay'), how='left', on=['Store'])
df_store = pd.merge(df_store, store_data_sales_per_customer_per_day.reset_index(name='SalesPerCustomersPerDay'), how='left', on=['Store'])
In [71]:
store_features = ['Store', 'SalesPerDay', 'CustomersPerDay', 'SalesPerCustomersPerDay']
features_x = list(set(features_x + store_features))
In [72]:
df = pd.merge(df, df_store[store_features], how='left', on=['Store'])
In [73]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[(df['Set'] == 1) & (df['Open'] == 1)][features_x],
df.loc[(df['Set'] == 1) & (df['Open'] == 1)][features_y],
test_size=0.1, random_state=seed)
In [74]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
In [75]:
num_round = 20000
evallist = [(dtrain, 'train'), (dtest, 'test')]
In [76]:
param = {'bst:max_depth':12,
'bst:eta':0.0095,
'subsample':0.8,
'colsample_bytree':0.7,
'silent':1,
'objective':'reg:linear',
'nthread':6,
'seed':seed}
plst = param.items()
bst1 = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=250)
In [81]:
xgb.plot_importance(bst1)
Out[81]:
In [78]:
X_submit = df.loc[df['Set'] == 0]
In [79]:
dsubmit = xgb.DMatrix(X_submit[features_x])
In [82]:
ypred_bst = bst1.predict(dsubmit)
df_ypred = X_submit['Id'].reset_index()
del df_ypred['index']
df_ypred['Id'] = df_ypred['Id'].astype('int')
# Scale back the sales a bit
df_ypred['Sales'] = (np.exp(ypred_bst) - 1) * 0.985
df_ypred.sort_values('Id', inplace=True)
df_ypred[['Id', 'Sales']].to_csv('rossmann_best_no_ext_data_scaled.csv', index=False)
In [ ]: