In [170]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 23:01:16 2015

@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import os
import plotly.plotly as py
# Credentials are taken from the environment so that no secret is stored in the
# notebook: export PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel.
# The API key that used to be written here in clear text should be treated as
# leaked and regenerated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
import plotly.graph_objs as go

#import datetime

from sklearn.utils import shuffle
from sklearn import preprocessing
from numpy import float32
from sklearn.preprocessing import Imputer

In [171]:
def getxy(x):
    """Split a frame into model features and the ``Customers`` target.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that contains a ``Customers`` column.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is a new DataFrame holding every
        column except ``Customers`` and ``y`` is the ``Customers`` column.

    The input frame is left untouched. Dropping the column in place made the
    function fail on a second call with the same frame (re-running a cell) and
    modified frames that were row selections of another frame.
    """
    y = x['Customers']
    features = x.drop('Customers', axis=1)
    return features, y

In [44]:
# (Year, Month) pairs of the rows kept from train0forcustomers.csv.
# NOTE: later cells reload `data` with a different selection; on a
# top-to-bottom run the last of those cells is the one that feeds the models.
periods = [(2013, 8), (2014, 8), (2013, 9), (2014, 9),
           (2015, 6), (2014, 5), (2015, 4)]
data = pd.read_csv("train0forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data['Year'] == yr) & (data['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data = data[in_period].copy()

x, y = getxy(data)

In [45]:
# (Year, Month) pairs of the rows kept from train1forcustomers.csv.
# NOTE: later cells reload `data1` with a different selection; on a
# top-to-bottom run the last of those cells is the one that feeds the models.
periods = [(2013, 8), (2014, 8), (2013, 9), (2014, 9),
           (2015, 6), (2014, 5), (2015, 4)]
data1 = pd.read_csv("train1forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data1['Year'] == yr) & (data1['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data1 = data1[in_period].copy()
x1, y1 = getxy(data1)

In [96]:
# (Year, Month) pairs of the rows kept from train0forcustomers.csv.
# NOTE: a later cell reloads `data` with a different selection; on a
# top-to-bottom run that later cell is the one that feeds the models.
periods = [(2015, 6), (2014, 5), (2015, 4)]
data = pd.read_csv("train0forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data['Year'] == yr) & (data['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data = data[in_period].copy()

x, y = getxy(data)

In [97]:
# (Year, Month) pairs of the rows kept from train1forcustomers.csv.
# NOTE: a later cell reloads `data1` with a different selection; on a
# top-to-bottom run that later cell is the one that feeds the models.
periods = [(2015, 6), (2014, 5), (2015, 4)]
data1 = pd.read_csv("train1forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data1['Year'] == yr) & (data1['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data1 = data1[in_period].copy()
x1, y1 = getxy(data1)

In [187]:
# (Year, Month) pairs of the rows kept from train0forcustomers.csv.
# This is the last cell that assigns `data`, `x` and `y` before the split, so on
# a top-to-bottom run it is the selection the models below are trained on.
periods = [(2013, 7), (2014, 7), (2013, 8), (2014, 8),
           (2013, 9), (2014, 9), (2015, 7)]
data = pd.read_csv("train0forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data['Year'] == yr) & (data['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data = data[in_period].copy()

x, y = getxy(data)

In [189]:
# (Year, Month) pairs of the rows kept from train1forcustomers.csv.
# This is the last cell that assigns `data1`, `x1` and `y1` before the split, so
# on a top-to-bottom run it is the selection the models below are trained on.
periods = [(2013, 7), (2014, 7), (2013, 8), (2014, 8),
           (2013, 9), (2014, 9), (2015, 7)]
data1 = pd.read_csv("train1forcustomers.csv")
# A row is kept when its (Year, Month) matches any listed pair.
in_period = np.logical_or.reduce(
    [((data1['Year'] == yr) & (data1['Month'] == mo)).values for yr, mo in periods])
# .copy() detaches the selection from the full frame, so the in-place column
# drop performed by getxy() acts on an independent DataFrame, not on a slice.
data1 = data1[in_period].copy()
x1, y1 = getxy(data1)

Split Data


In [190]:
def splitdata(x, y, train_fraction=0.7, random_state=15):
    """Shuffle features and target together and cut them into train/test parts.

    Parameters
    ----------
    x : features, sliceable by row (e.g. a DataFrame).
    y : target with the same number of rows as ``x``.
    train_fraction : float, default 0.7
        Share of the shuffled rows that goes to the training part.
    random_state : int, default 15
        Seed handed to ``shuffle`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")
    # x and y are permuted with the same ordering, so rows stay paired.
    x_shuffled, y_shuffled = shuffle(x, y, random_state=random_state)
    offset = int(x_shuffled.shape[0] * train_fraction)
    x_train, y_train = x_shuffled[:offset], y_shuffled[:offset]
    x_test, y_test = x_shuffled[offset:], y_shuffled[offset:]
    return x_train, y_train, x_test, y_test

In [191]:
# Shuffled train/test split of the features and target from train0forcustomers.csv.
x_train, y_train, x_test, y_test = splitdata(x=x, y=y)

In [192]:
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(x_train.columns)


Index([u'Store', u'DayOfWeek', u'Promo', u'SchoolHoliday', u'HaveCompetitor',
       u'CompetitionDistance', u'Year', u'Month', u'Day', u'Week',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'StateHoliday_0',
       u'StateHoliday_a', u'CompetitionMonth'],
      dtype='object')

In [193]:
# Shuffled train/test split of the features and target from train1forcustomers.csv.
x_train1, y_train1, x_test1, y_test1 = splitdata(x=x1, y=y1)

Build Model

DT

In [194]:
from sklearn import tree
# max_features=None considers every feature at each split, which is what 'auto'
# meant for a regressor; the 'auto' spelling is rejected by newer scikit-learn.
# random_state fixes the tie-breaking between equally good splits so the fitted
# tree (and its score) is repeatable from run to run.
clf2 = tree.DecisionTreeRegressor(max_features=None, random_state=15)
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)

In [195]:
from sklearn import tree
# max_features=None considers every feature at each split, which is what 'auto'
# meant for a regressor; the 'auto' spelling is rejected by newer scikit-learn.
# random_state fixes the tie-breaking between equally good splits so the fitted
# tree (and its score) is repeatable from run to run.
clf12 = tree.DecisionTreeRegressor(max_features=None, random_state=15)
clf12.fit(x_train1, y_train1)
y_pred12 = clf12.predict(x_test1)
KNN

In [199]:
from sklearn.neighbors import KNeighborsRegressor
# Distance-weighted 5-neighbour regressor for the train0 split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf3 = KNeighborsRegressor(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
).fit(x_train, y_train)
y_pred3 = clf3.predict(x_test)

In [200]:
from sklearn.neighbors import KNeighborsRegressor
# Distance-weighted 10-neighbour regressor for the train1 split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf13 = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    algorithm='auto',
).fit(x_train1, y_train1)
y_pred13 = clf13.predict(x_test1)
RF

In [201]:
from sklearn.ensemble import RandomForestRegressor
# A forest of 100 trees. random_state is fixed so the randomised training is
# repeatable; without it every run produced a slightly different model, and
# therefore a different score and different exported predictions.
clf4 = RandomForestRegressor(n_estimators=100, random_state=15)
clf4.fit(x_train, y_train)
y_pred4 = clf4.predict(x_test)

In [202]:
from sklearn.ensemble import RandomForestRegressor
# A forest of 100 trees. random_state is fixed so the randomised training is
# repeatable; without it every run produced a slightly different model, and
# therefore a different score and different exported predictions.
clf14 = RandomForestRegressor(n_estimators=100, random_state=15)
clf14.fit(x_train1, y_train1)
y_pred14 = clf14.predict(x_test1)

Evaluation


In [181]:
def rmspe(y, yhat):
    """Root mean square percentage error of ``yhat`` against ``y``.

    Computes ``sqrt(mean((yhat / y - 1) ** 2))`` over the positions where the
    true value ``y`` is non-zero.

    Parameters
    ----------
    y : array-like
        True values.
    yhat : array-like
        Predicted values, same shape as ``y``.

    Returns
    -------
    float
        The RMSPE, or ``nan`` when no true value is non-zero.
    """
    # Work on float arrays: avoids integer floor division under Python 2 and
    # makes the comparison purely positional.
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    # A percentage error is undefined for a zero target; including such rows
    # would turn the whole score into inf/nan.
    nonzero = y != 0
    if not nonzero.any():
        return np.nan
    return np.sqrt(np.mean((yhat[nonzero] / y[nonzero] - 1) ** 2))

def rmspe_xg(yhat, y):
    """RMSPE for predictions and targets that are passed through ``np.expm1``.

    Both arguments are transformed with ``np.expm1`` (the inverse of
    ``np.log1p``) before being scored with ``rmspe``.

    Parameters
    ----------
    yhat : array-like
        Predicted values.
    y : array-like
        True values.

    Returns
    -------
    tuple
        ``("rmspe", value)``.
    """
    # The leftover debugging print of the whole target array was removed: it
    # flooded the output on every call and was a Python 2-only statement.
    return "rmspe", rmspe(np.expm1(y), np.expm1(yhat))

In [182]:
def compare(y_test,y_pred,y_test1,y_pred1):
    """Score two (truth, prediction) pairs as a single pooled sample.

    The second truth/prediction pair is appended to the first one and the
    RMSPE of the combined arrays is returned.
    """
    pooled_truth = np.append(y_test, y_test1)
    pooled_pred = np.append(y_pred, y_pred1)
    return rmspe(pooled_truth, pooled_pred)
DT

In [196]:
# Hold-out RMSPE of the decision tree on the train0 split (runs on Python 2 and 3).
print(rmspe(y_test, y_pred2))


0.115054822918

In [197]:
# Hold-out RMSPE of the decision tree on the train1 split (runs on Python 2 and 3).
print(rmspe(y_test1, y_pred12))


0.118947533529

In [198]:
# RMSPE of the two decision trees over both hold-out sets pooled together.
print(compare(y_test, y_pred2, y_test1, y_pred12))


0.116955351932
KNN

In [203]:
# Hold-out RMSPE of the KNN regressor on the train0 split (runs on Python 2 and 3).
print(rmspe(y_test, y_pred3))


0.245184479992

In [204]:
# Hold-out RMSPE of the KNN regressor on the train1 split (runs on Python 2 and 3).
print(rmspe(y_test1, y_pred13))


0.233931210729

In [205]:
# RMSPE of the two KNN regressors over both hold-out sets pooled together.
print(compare(y_test, y_pred3, y_test1, y_pred13))


0.23980304971
RF

In [206]:
# Hold-out RMSPE of the random forest on the train0 split (runs on Python 2 and 3).
print(rmspe(y_test, y_pred4))


0.0859897399445

In [207]:
# Hold-out RMSPE of the random forest on the train1 split (runs on Python 2 and 3).
print(rmspe(y_test1, y_pred14))


0.0893187668543

In [209]:
# RMSPE of the two random forests over both hold-out sets pooled together.
print(compare(y_test, y_pred4, y_test1, y_pred14))


0.0876170220362

Make Prediction


In [210]:
def makeprediction(testfile,feature,clf):
    """Predict ``Customers`` for every row of a CSV file.

    Parameters
    ----------
    testfile : str
        Path of the CSV file to score; every column is cast to float32.
    feature : list of str
        Names of the columns handed to the model.
    clf : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        All columns of the file plus a ``Customers`` column of predictions.
    """
    pre_x = pd.read_csv(testfile).astype(float32)
    # Sanity check only: prints whether every value is finite (no NaN/inf).
    # Prediction goes ahead either way.
    print(np.all(np.isfinite(pre_x)))

    pre_y = clf.predict(pre_x[feature])
    # Give the predictions the frame's own index so the column-wise concat
    # lines them up row for row.
    customers = pd.Series(pre_y, index=pre_x.index, name='Customers')
    return pd.concat([pre_x, customers], axis=1)
# Columns passed to the model when scoring pre0.csv.
feature0 = [
    'Store', 'DayOfWeek', 'Promo', 'SchoolHoliday', 'HaveCompetitor',
    'CompetitionDistance',
    'Year', 'Month', 'Day', 'Week',
    'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
    'Assortment_a', 'Assortment_b', 'Assortment_c',
    'StateHoliday_0', 'StateHoliday_a',
    'CompetitionMonth',
]
# Columns passed to the model when scoring pre1.csv: the same set followed by
# the two Promo2 columns. Concatenation builds a new, independent list.
feature1 = feature0 + ['Promo2Month', 'Promo2Week']

In [211]:
# Score pre0.csv with the random forest fitted on the train0 split.
prediction0 = makeprediction(testfile='pre0.csv', feature=feature0, clf=clf4)


True

In [212]:
# Score pre1.csv with the random forest fitted on the train1 split.
prediction1 = makeprediction(testfile='pre1.csv', feature=feature1, clf=clf14)


True

In [213]:
# Write both prediction frames to disk without the row index column.
prediction0.to_csv(path_or_buf="pre0wtcustomers.csv", index=False)
prediction1.to_csv(path_or_buf="pre1wtcustomers.csv", index=False)

In [ ]: