In [1]:
"""
Created on Fri Nov 27 23:01:16 2015
@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
py.sign_in('lemonsong', '3lcplsq1a3')
import plotly.graph_objs as go
#import datetime
from sklearn.utils import shuffle
from sklearn import preprocessing
In [2]:
## Load data
In [3]:
types = {'CompetitionOpenSinceYear': np.dtype(int),
'CompetitionOpenSinceMonth': np.dtype(int),
'StateHoliday': np.dtype(str),
'Promo2SinceWeek': np.dtype(int),
'SchoolHoliday': np.dtype(float),
'PromoInterval': np.dtype(str)}
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
#dateparse = lambda x: pd.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")
dftrain = pd.read_csv('train.csv',parse_dates=[2], date_parser=dateparse, dtype=types)
dfstore = pd.read_csv('store.csv')
df = pd.merge(dftrain, dfstore, on='Store')
dftest = pd.read_csv('test.csv',parse_dates=[3], date_parser=dateparse, dtype=types)
dt = pd.merge(dftest, dfstore, on='Store')
In [4]:
# function to calculate how many weeks the store has taken promo2 as Promo2Week
def Promo2WeekCal(sinceweek,sinceyear,week,year):
return (year-sinceyear)*52-sinceweek+week
# dictionary for map PromoInterval
month2int_dict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sept':9, 'Oct':10, 'Nov':11, 'Dec':12}
# function to calculate how many month the promo2 has going on
def Promo2MonthCal(x,y):
x=x.split(',')
for i in range(0,len(x)):
x[i]=month2int_dict[x[i]]
x.insert(0,x[0]-3)
x.append(x[-1]+3)
for i in range(0,len(x)-1):
if y>=x[i] and y<x[i+1]:
return int(y-x[i])
def dummyvariable(dataframe,colname):
dummy = pd.get_dummies(dataframe[colname],prefix=colname)
dataframe = dataframe.join(dummy.ix[:, colname:]).drop(colname,1)
return dataframe
def CompetitorCal(distance):
if np.isnan(distance):
return int(0)
else:
return int(1)
def prepro(dataframe,features0,features1):
# Assume store open, if not provided
dataframe['Open'].fillna(1, inplace=True)
# Only consider data open==1 and sales > 0
dataframe = dataframe[(dataframe['Open'] ==1) & (dataframe['Sales'] > 0)]
dataframe = dataframe[(dataframe['StateHoliday'] =="0") | (dataframe['StateHoliday'] == "a")]
# Create year, month, day and week
dataframe['Year'] = dataframe.Date.dt.year
dataframe['Month'] = dataframe.Date.dt.month
dataframe['Day'] = dataframe.Date.dt.day
dataframe['Week'] = dataframe.Date.dt.week
# Create dummy variables for StoreType
dataframe = dummyvariable(dataframe,'StoreType')
# Create dummy variables for Assortment
dataframe = dummyvariable(dataframe,'Assortment')
# Create dummy variables for StateHoliday
dataframe = dummyvariable(dataframe,'StateHoliday')
#dataframe = dataframe[('StateHoliday_b'==0) & ('StateHoliday_c'==0)]
# Calculate how many month the store has competed with its competitor
dataframe['CompetitionMonth']=12 * (dataframe.Year - dataframe.CompetitionOpenSinceYear) + (dataframe.Month - dataframe.CompetitionOpenSinceMonth)
#
dataframe['HaveCompetitor'] = dataframe.apply( lambda x:CompetitorCal(x['CompetitionDistance']), axis=1)
mean_CompetitionDistance=dataframe[['CompetitionDistance']].mean(axis=0)
dataframe['CompetitionDistance']=dataframe[['CompetitionDistance']].fillna(mean_CompetitionDistance)
mean_CompetitionMonth=dataframe[['CompetitionMonth']].mean(axis=0)
dataframe['CompetitionMonth']=dataframe[['CompetitionMonth']].fillna(mean_CompetitionMonth)
# calculate how many weeks the store has taken promo2 as Promo2Week
dataframe['Promo2Week']=dataframe.query('Promo2==1').apply( lambda x:Promo2WeekCal(x['Promo2SinceWeek'],x['Promo2SinceYear'],x['Week'],x['Year']), axis=1)
# calculate how many month the promo2 has going on since las promo2 as Promo2Month
dataframe['Promo2Month']=dataframe.query('Promo2==1').apply( lambda x:Promo2MonthCal(x['PromoInterval'],x['Month']), axis=1)
dataframe0 = dataframe[(dataframe['Promo2'] == 0)]
dataframe1 = dataframe[(dataframe['Promo2'] == 1)]
x0=dataframe0[features0]
x1=dataframe1[features1]
return x0,x1
# replace "Date" with "Year","Month","Day","Week"
# replace "StoreType" with "StoreType_a","StoreType_b","StoreType_c","StoreType_d"
# replace "Assortment" with "Assortment_a","Assortment_b","Assortment_c"
# replace "StateHoliday" with "StateHoliday_0","StateHoliday_a","StateHoliday_b","StateHoliday_c",
# delete "StateHoliday_b","StateHoliday_c" because Kaggle test data does not have these two type of date
# replace "CompetitionOpenSinceMonth","CompetitionOpenSinceYear" with "CompetitionMonth"
# replace "PromoInterval" with "Promo2Month"
# replace ""Promo2SinceWeek","Promo2SinceYear" with "Promo2Week"
# delete target variabel "Sales"
# delete "Promo2"
# delete "Promo2Month","Promo2Week" for records with "Promo2"==0
# delete "Customer" because Kaggle test does not have this information
features0=["Store","DayOfWeek","Promo","SchoolHoliday",
"HaveCompetitor","CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",'Customers','Sales'
]
features1=["Store","DayOfWeek","Promo","SchoolHoliday",
"HaveCompetitor","CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",
"Promo2Month","Promo2Week",'Customers','Sales'
]
x0,x1 = prepro(df,features0,features1)
In [5]:
def preprotest(dataframe,features0,features1):
# Assume store open, if not provided
dataframe['Open'].fillna(1, inplace=True)
# Create year, month, day and week
dataframe['Year'] = dataframe.Date.dt.year
dataframe['Month'] = dataframe.Date.dt.month
dataframe['Day'] = dataframe.Date.dt.day
dataframe['Week'] = dataframe.Date.dt.week
# Create dummy variables for StoreType
dataframe = dummyvariable(dataframe,'StoreType')
# Create dummy variables for Assortment
dataframe = dummyvariable(dataframe,'Assortment')
# Create dummy variables for StateHoliday
dataframe = dummyvariable(dataframe,'StateHoliday')
# Calculate how many month the store has competed with its competitor
dataframe['CompetitionMonth']=12 * (dataframe.Year - dataframe.CompetitionOpenSinceYear) + (dataframe.Month - dataframe.CompetitionOpenSinceMonth)
#
dataframe['HaveCompetitor'] = dataframe.apply( lambda x:CompetitorCal(x['CompetitionDistance']), axis=1)
mean_CompetitionDistance=dataframe[['CompetitionDistance']].mean(axis=0)
dataframe['CompetitionDistance']=dataframe[['CompetitionDistance']].fillna(mean_CompetitionDistance)
mean_CompetitionMonth=dataframe[['CompetitionMonth']].mean(axis=0)
dataframe['CompetitionMonth']=dataframe[['CompetitionMonth']].fillna(mean_CompetitionMonth)
# calculate how many weeks the store has taken promo2 as Promo2Week
dataframe['Promo2Week']=dataframe.query('Promo2==1').apply( lambda x:Promo2WeekCal(x['Promo2SinceWeek'],x['Promo2SinceYear'],x['Week'],x['Year']), axis=1)
# calculate how many month the promo2 has going on since las promo2 as Promo2Month
dataframe['Promo2Month']=dataframe.query('Promo2==1').apply( lambda x:Promo2MonthCal(x['PromoInterval'],x['Month']), axis=1)
# Only consider data open==1
dataframe0 = dataframe[(dataframe['Open'] ==1) & (dataframe['Promo2'] == 0)]
dataframe1 = dataframe[(dataframe['Open'] ==1) & (dataframe['Promo2'] == 1)]
x0=dataframe0[features_test0]
x1=dataframe1[features_test1]
return x0,x1
# replace "Date" with "Year","Month","Day","Week"
# replace "StoreType" with "StoreType_a","StoreType_b","StoreType_c","StoreType_d"
# replace "Assortment" with "Assortment_a","Assortment_b","Assortment_c"
# replace "StateHoliday" with "StateHoliday_0","StateHoliday_a","StateHoliday_b","StateHoliday_c",
# delete "StateHoliday_b","StateHoliday_c" because Kaggle test data does not have these two type of date
# replace "CompetitionOpenSinceMonth","CompetitionOpenSinceYear" with "CompetitionMonth"
# replace "PromoInterval" with "Promo2Month"
# replace ""Promo2SinceWeek","Promo2SinceYear" with "Promo2Week"
# delete target variabel "Sales"
# delete "Promo2"
# delete "Promo2Month","Promo2Week" for records with "Promo2"==0
# delete "Customer" because Kaggle test does not have this information
features_test0=["Id","Store","DayOfWeek","Promo","SchoolHoliday",
"HaveCompetitor","CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth"
]
features_test1=["Id","Store","DayOfWeek","Promo","SchoolHoliday",
"HaveCompetitor","CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",
"Promo2Month","Promo2Week"
]
In [6]:
dt0,dt1 = preprotest(dt,features_test0,features_test1)
In [7]:
x0.to_csv("train0forkagglewtcustomer.csv", index=False)
x1.to_csv("train1forkagglewtcustomer.csv", index=False)
In [8]:
dt0.to_csv("pre0.csv", index=False)
dt1.to_csv("pre1.csv", index=False)
In [ ]: