In [1]:
"""
Created on Fri Nov 27 23:01:16 2015

@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
py.sign_in('lemonsong', '3lcplsq1a3')
import plotly.graph_objs as go

#import datetime

from sklearn.utils import shuffle
from sklearn import preprocessing

In [2]:
## Load data

In [3]:
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
#dateparse = lambda x: pd.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")

dftrain = pd.read_csv('train.csv',parse_dates=[2], date_parser=dateparse, dtype=types)
dfstore = pd.read_csv('store.csv')
df = pd.merge(dftrain, dfstore, on='Store')
dftest = pd.read_csv('test.csv',parse_dates=[3], date_parser=dateparse, dtype=types)
dt = pd.merge(dftest, dfstore, on='Store')

Preprocess data


In [4]:
# function to calculate how many weeks the store has taken promo2 as Promo2Week
def Promo2WeekCal(sinceweek,sinceyear,week,year):
    return (year-sinceyear)*52-sinceweek+week
    
# dictionary for map PromoInterval
month2int_dict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,              'Jul':7, 'Aug':8, 'Sept':9, 'Oct':10, 'Nov':11, 'Dec':12}
# function to calculate how many month the promo2 has going on
def Promo2MonthCal(x,y):
    x=x.split(',')
    for i in range(0,len(x)):
        x[i]=month2int_dict[x[i]]
    x.insert(0,x[0]-3)
    x.append(x[-1]+3)
    for i in range(0,len(x)-1):
        if y>=x[i] and y<x[i+1]:
            return int(y-x[i])
    
def dummyvariable(dataframe,colname):
    dummy = pd.get_dummies(dataframe[colname],prefix=colname)
    dataframe = dataframe.join(dummy.ix[:, colname:]).drop(colname,1)
    return dataframe

def CompetitorCal(distance):
    if np.isnan(distance):
        return int(0)
    else:
        return int(1)
    
def prepro(dataframe,features0,features1):
    # Assume store open, if not provided
    dataframe['Open'].fillna(1, inplace=True)
    # Only consider data open==1 and sales > 0
    dataframe = dataframe[(dataframe['Open'] ==1) & (dataframe['Sales'] > 0)]
    dataframe = dataframe[(dataframe['StateHoliday'] =="0") | (dataframe['StateHoliday'] == "a")]
    # Create year, month, day and week
    dataframe['Year'] = dataframe.Date.dt.year
    dataframe['Month'] = dataframe.Date.dt.month
    dataframe['Day'] = dataframe.Date.dt.day
    dataframe['Week'] = dataframe.Date.dt.week

    
    # Create dummy variables for StoreType
    dataframe = dummyvariable(dataframe,'StoreType')
    # Create dummy variables for Assortment
    dataframe = dummyvariable(dataframe,'Assortment')
    # Create dummy variables for StateHoliday
    dataframe = dummyvariable(dataframe,'StateHoliday')
    
    #dataframe = dataframe[('StateHoliday_b'==0) & ('StateHoliday_c'==0)]

    
    # Calculate how many month the store has competed with its competitor
    dataframe['CompetitionMonth']=12 * (dataframe.Year - dataframe.CompetitionOpenSinceYear) +         (dataframe.Month - dataframe.CompetitionOpenSinceMonth)
        
    #
    dataframe['HaveCompetitor'] = dataframe.apply(        lambda x:CompetitorCal(x['CompetitionDistance']), axis=1)
        
    mean_CompetitionDistance=dataframe[['CompetitionDistance']].mean(axis=0)
    dataframe['CompetitionDistance']=dataframe[['CompetitionDistance']].fillna(mean_CompetitionDistance)
    mean_CompetitionMonth=dataframe[['CompetitionMonth']].mean(axis=0)
    dataframe['CompetitionMonth']=dataframe[['CompetitionMonth']].fillna(mean_CompetitionMonth)

    # calculate how many weeks the store has taken promo2 as Promo2Week
    dataframe['Promo2Week']=dataframe.query('Promo2==1').apply(        lambda x:Promo2WeekCal(x['Promo2SinceWeek'],x['Promo2SinceYear'],x['Week'],x['Year']), axis=1)
        
    # calculate how many month the promo2 has going on since las promo2 as Promo2Month
    dataframe['Promo2Month']=dataframe.query('Promo2==1').apply(        lambda x:Promo2MonthCal(x['PromoInterval'],x['Month']), axis=1)
    
    dataframe0 = dataframe[(dataframe['Promo2'] == 0)]
    dataframe1 = dataframe[(dataframe['Promo2'] == 1)]


    x0=dataframe0[features0]
    x1=dataframe1[features1]

    return x0,x1


    
# replace "Date" with "Year","Month","Day","Week"
# replace "StoreType" with "StoreType_a","StoreType_b","StoreType_c","StoreType_d"   
# replace "Assortment" with "Assortment_a","Assortment_b","Assortment_c"
# replace "StateHoliday" with "StateHoliday_0","StateHoliday_a","StateHoliday_b","StateHoliday_c",
# delete "StateHoliday_b","StateHoliday_c" because Kaggle test data does not have these two type of date
# replace "CompetitionOpenSinceMonth","CompetitionOpenSinceYear" with "CompetitionMonth"
# replace "PromoInterval" with "Promo2Month"
# replace ""Promo2SinceWeek","Promo2SinceYear" with "Promo2Week"
# delete target variabel "Sales"
# delete "Promo2"
# delete "Promo2Month","Promo2Week" for records with "Promo2"==0
# delete "Customer" because Kaggle test does not have this information

features0=["Store","DayOfWeek","Promo","SchoolHoliday",
          "HaveCompetitor","CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
           "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth",'Customers','Sales'
          ]
features1=["Store","DayOfWeek","Promo","SchoolHoliday",
          "HaveCompetitor","CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
         "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth",
          "Promo2Month","Promo2Week",'Customers','Sales'
          ]
x0,x1 = prepro(df,features0,features1)

In [5]:
def preprotest(dataframe,features0,features1):
    # Assume store open, if not provided
    dataframe['Open'].fillna(1, inplace=True)

    # Create year, month, day and week
    dataframe['Year'] = dataframe.Date.dt.year
    dataframe['Month'] = dataframe.Date.dt.month
    dataframe['Day'] = dataframe.Date.dt.day
    dataframe['Week'] = dataframe.Date.dt.week

    
    # Create dummy variables for StoreType
    dataframe = dummyvariable(dataframe,'StoreType')
    # Create dummy variables for Assortment
    dataframe = dummyvariable(dataframe,'Assortment')
    # Create dummy variables for StateHoliday
    dataframe = dummyvariable(dataframe,'StateHoliday')
    
    # Calculate how many month the store has competed with its competitor
    dataframe['CompetitionMonth']=12 * (dataframe.Year - dataframe.CompetitionOpenSinceYear) +         (dataframe.Month - dataframe.CompetitionOpenSinceMonth)
        
    #
    dataframe['HaveCompetitor'] = dataframe.apply(        lambda x:CompetitorCal(x['CompetitionDistance']), axis=1)
        
    mean_CompetitionDistance=dataframe[['CompetitionDistance']].mean(axis=0)
    dataframe['CompetitionDistance']=dataframe[['CompetitionDistance']].fillna(mean_CompetitionDistance)
    mean_CompetitionMonth=dataframe[['CompetitionMonth']].mean(axis=0)
    dataframe['CompetitionMonth']=dataframe[['CompetitionMonth']].fillna(mean_CompetitionMonth)

    # calculate how many weeks the store has taken promo2 as Promo2Week
    dataframe['Promo2Week']=dataframe.query('Promo2==1').apply(        lambda x:Promo2WeekCal(x['Promo2SinceWeek'],x['Promo2SinceYear'],x['Week'],x['Year']), axis=1)
        
    # calculate how many month the promo2 has going on since las promo2 as Promo2Month
    dataframe['Promo2Month']=dataframe.query('Promo2==1').apply(        lambda x:Promo2MonthCal(x['PromoInterval'],x['Month']), axis=1)
        
    # Only consider data open==1 
    dataframe0 = dataframe[(dataframe['Open'] ==1) & (dataframe['Promo2'] == 0)]
    dataframe1 = dataframe[(dataframe['Open'] ==1) & (dataframe['Promo2'] == 1)]

    x0=dataframe0[features_test0]
    x1=dataframe1[features_test1]

    return x0,x1
    
# replace "Date" with "Year","Month","Day","Week"
# replace "StoreType" with "StoreType_a","StoreType_b","StoreType_c","StoreType_d"   
# replace "Assortment" with "Assortment_a","Assortment_b","Assortment_c"
# replace "StateHoliday" with "StateHoliday_0","StateHoliday_a","StateHoliday_b","StateHoliday_c",
# delete "StateHoliday_b","StateHoliday_c" because Kaggle test data does not have these two type of date
# replace "CompetitionOpenSinceMonth","CompetitionOpenSinceYear" with "CompetitionMonth"
# replace "PromoInterval" with "Promo2Month"
# replace ""Promo2SinceWeek","Promo2SinceYear" with "Promo2Week"
# delete target variabel "Sales"
# delete "Promo2"
# delete "Promo2Month","Promo2Week" for records with "Promo2"==0
# delete "Customer" because Kaggle test does not have this information


features_test0=["Id","Store","DayOfWeek","Promo","SchoolHoliday",
          "HaveCompetitor","CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
           "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth"
          ]
features_test1=["Id","Store","DayOfWeek","Promo","SchoolHoliday",
          "HaveCompetitor","CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
            "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth",
          "Promo2Month","Promo2Week"
          ]

In [6]:
dt0,dt1 = preprotest(dt,features_test0,features_test1)

In [7]:
x0.to_csv("train0forkagglewtcustomer.csv", index=False)
x1.to_csv("train1forkagglewtcustomer.csv", index=False)

In [8]:
dt0.to_csv("pre0.csv", index=False)
dt1.to_csv("pre1.csv", index=False)

In [ ]: