In [170]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 23:01:16 2015
@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
import os
# Plotly credentials are read from the environment so the API key is never
# stored in the notebook. A KeyError here means PLOTLY_USERNAME or
# PLOTLY_API_KEY is not set. NOTE(review): the key that used to be hard-coded
# on this line should be considered leaked and rotated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
import plotly.graph_objs as go
#import datetime
from sklearn.utils import shuffle
from sklearn import preprocessing
from numpy import float32
from sklearn.preprocessing import Imputer
In [171]:
def getxy(x):
    """Split a frame into model inputs and the ``Customers`` target.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that contains a ``Customers`` column.

    Returns
    -------
    (features, y)
        ``features`` is a new frame with every column of ``x`` except
        ``Customers``; ``y`` is the ``Customers`` column.

    Raises
    ------
    KeyError
        If ``x`` has no ``Customers`` column.

    The input frame is left untouched. Dropping the column in place made the
    call non-repeatable on the same frame and mutated filtered slices of the
    loaded data.
    """
    y = x['Customers']
    features = x.drop('Customers', axis=1)
    return features, y
In [44]:
# Load train0forcustomers.csv and keep only rows from the listed (Year, Month) periods.
data = pd.read_csv("train0forcustomers.csv")
periods = [(2013, 8), (2014, 8), (2013, 9), (2014, 9),
           (2015, 6), (2014, 5), (2015, 4)]
# A row is selected when its Year/Month pair equals any entry of `periods`.
selected = np.logical_or.reduce(
    [(data['Year'] == year) & (data['Month'] == month) for year, month in periods])
data = data[selected]
# Separate the Customers target from the remaining columns.
x, y = getxy(data)
In [45]:
# Load train1forcustomers.csv and keep only rows from the listed (Year, Month) periods.
data1 = pd.read_csv("train1forcustomers.csv")
periods1 = [(2013, 8), (2014, 8), (2013, 9), (2014, 9),
            (2015, 6), (2014, 5), (2015, 4)]
# A row is selected when its Year/Month pair equals any entry of `periods1`.
selected1 = np.logical_or.reduce(
    [(data1['Year'] == year) & (data1['Month'] == month) for year, month in periods1])
data1 = data1[selected1]
# Separate the Customers target from the remaining columns.
x1, y1 = getxy(data1)
In [96]:
# Reload train0forcustomers.csv with a narrower window: three (Year, Month) periods.
data = pd.read_csv("train0forcustomers.csv")
periods = [(2015, 6), (2014, 5), (2015, 4)]
# A row is selected when its Year/Month pair equals any entry of `periods`.
selected = np.logical_or.reduce(
    [(data['Year'] == year) & (data['Month'] == month) for year, month in periods])
data = data[selected]
# Separate the Customers target from the remaining columns.
x, y = getxy(data)
In [97]:
# Reload train1forcustomers.csv with a narrower window: three (Year, Month) periods.
data1 = pd.read_csv("train1forcustomers.csv")
periods1 = [(2015, 6), (2014, 5), (2015, 4)]
# A row is selected when its Year/Month pair equals any entry of `periods1`.
selected1 = np.logical_or.reduce(
    [(data1['Year'] == year) & (data1['Month'] == month) for year, month in periods1])
data1 = data1[selected1]
# Separate the Customers target from the remaining columns.
x1, y1 = getxy(data1)
In [187]:
# Reload train0forcustomers.csv keeping July-September of 2013 and 2014 plus July 2015.
data = pd.read_csv("train0forcustomers.csv")
periods = [(2013, 7), (2014, 7), (2013, 8), (2014, 8),
           (2013, 9), (2014, 9), (2015, 7)]
# A row is selected when its Year/Month pair equals any entry of `periods`.
selected = np.logical_or.reduce(
    [(data['Year'] == year) & (data['Month'] == month) for year, month in periods])
data = data[selected]
# Separate the Customers target from the remaining columns.
x, y = getxy(data)
In [189]:
# Reload train1forcustomers.csv keeping July-September of 2013 and 2014 plus July 2015.
data1 = pd.read_csv("train1forcustomers.csv")
periods1 = [(2013, 7), (2014, 7), (2013, 8), (2014, 8),
            (2013, 9), (2014, 9), (2015, 7)]
# A row is selected when its Year/Month pair equals any entry of `periods1`.
selected1 = np.logical_or.reduce(
    [(data1['Year'] == year) & (data1['Month'] == month) for year, month in periods1])
data1 = data1[selected1]
# Separate the Customers target from the remaining columns.
x1, y1 = getxy(data1)
In [190]:
def splitdata(x, y, train_fraction=0.7, random_state=15):
    """Shuffle ``x`` and ``y`` together and split them into train and test parts.

    Parameters
    ----------
    x, y
        Inputs and targets with the same number of rows; both are passed to
        ``sklearn.utils.shuffle`` so rows stay paired.
    train_fraction : float, optional
        Share of the shuffled rows placed in the training part (default 0.7,
        the value that used to be hard-coded).
    random_state : int, optional
        Seed forwarded to ``shuffle`` (default 15, the previous fixed seed).

    Returns
    -------
    (x_train, y_train, x_test, y_test)

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")
    x_shuffled, y_shuffled = shuffle(x, y, random_state=random_state)
    # Row index separating the training part from the test part.
    offset = int(x_shuffled.shape[0] * train_fraction)
    x_train, x_test = x_shuffled[:offset], x_shuffled[offset:]
    y_train, y_test = y_shuffled[:offset], y_shuffled[offset:]
    return x_train, y_train, x_test, y_test
In [191]:
# Shuffle and split the features/target taken from train0forcustomers.csv.
x_train, y_train, x_test, y_test = splitdata(x=x, y=y)
In [192]:
# Show which feature columns the models are trained on.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(x_train.columns)
In [193]:
# Shuffle and split the features/target taken from train1forcustomers.csv.
x_train1, y_train1, x_test1, y_test1 = splitdata(x=x1, y=y1)
In [194]:
from sklearn import tree

# Decision-tree regressor fitted on x_train / y_train.
# max_features=None considers every feature at each split, which is what
# 'auto' meant for tree regressors; newer scikit-learn releases reject 'auto'.
# random_state pins the choice between equally good splits so reruns match
# (same seed value as the shuffle in splitdata).
clf2 = tree.DecisionTreeRegressor(max_features=None, random_state=15)
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)
In [195]:
from sklearn import tree

# Decision-tree regressor fitted on x_train1 / y_train1.
# max_features=None considers every feature at each split, which is what
# 'auto' meant for tree regressors; newer scikit-learn releases reject 'auto'.
# random_state pins the choice between equally good splits so reruns match
# (same seed value as the shuffle in splitdata).
clf12 = tree.DecisionTreeRegressor(max_features=None, random_state=15)
clf12.fit(x_train1, y_train1)
y_pred12 = clf12.predict(x_test1)
In [199]:
from sklearn.neighbors import KNeighborsRegressor

# 5-nearest-neighbour regressor with distance-weighted votes, fitted on
# x_train / y_train; fit() returns the estimator itself, so it can be chained.
clf3 = KNeighborsRegressor(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
).fit(x_train, y_train)
y_pred3 = clf3.predict(x_test)
In [200]:
from sklearn.neighbors import KNeighborsRegressor

# 10-nearest-neighbour regressor with distance-weighted votes, fitted on
# x_train1 / y_train1; fit() returns the estimator itself, so it can be chained.
clf13 = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    algorithm='auto',
).fit(x_train1, y_train1)
y_pred13 = clf13.predict(x_test1)
In [201]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees fitted on x_train / y_train.
# The forest is stochastic (bootstrap samples, feature permutation), so the
# seed is fixed to make scores and exported predictions repeatable; 15 is the
# same seed value splitdata uses for its shuffle.
clf4 = RandomForestRegressor(n_estimators=100, random_state=15)
clf4.fit(x_train, y_train)
y_pred4 = clf4.predict(x_test)
In [202]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees fitted on x_train1 / y_train1.
# The forest is stochastic (bootstrap samples, feature permutation), so the
# seed is fixed to make scores and exported predictions repeatable; 15 is the
# same seed value splitdata uses for its shuffle.
clf14 = RandomForestRegressor(n_estimators=100, random_state=15)
clf14.fit(x_train1, y_train1)
y_pred14 = clf14.predict(x_test1)
In [181]:
def rmspe(y, yhat):
    """Root mean square percentage error of ``yhat`` against ``y``.

    Parameters
    ----------
    y : array-like
        True values.
    yhat : array-like
        Predicted values, paired with ``y`` by position.

    Returns
    -------
    float
        ``sqrt(mean((yhat / y - 1) ** 2))`` computed over the entries whose
        true value is non-zero, or ``nan`` when there is no such entry.

    Entries with a true value of 0 are left out: a percentage error is
    undefined there, and dividing by them previously turned the whole score
    into ``inf`` or ``nan``.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    nonzero = y != 0
    if not nonzero.any():
        return np.nan
    return np.sqrt(np.mean((yhat[nonzero] / y[nonzero] - 1) ** 2))
def rmspe_xg(yhat, y):
    """RMSPE for values stored on a ``log1p`` scale.

    Both arguments are mapped back with ``expm1`` before ``rmspe`` is applied.

    Parameters
    ----------
    yhat : array-like
        Predictions on the ``log1p`` scale.
    y : array-like or object with ``get_label()``
        True values on the ``log1p`` scale. An object exposing ``get_label()``
        is unwrapped first.
        NOTE(review): the name and the (name, value) return suggest this is
        meant as an xgboost custom evaluation function, which receives a
        DMatrix here - confirm against the training code.

    Returns
    -------
    tuple
        ``("rmspe", score)``.
    """
    if hasattr(y, 'get_label'):
        y = y.get_label()
    y = np.expm1(y)
    yhat = np.expm1(yhat)
    # The debugging print of the full label array was removed; it flooded the
    # output on every evaluation and was a Python-2-only statement.
    return "rmspe", rmspe(y, yhat)
In [182]:
def compare(y_test, y_pred, y_test1, y_pred1):
    """Return the RMSPE over two evaluation sets pooled into one.

    The two arrays of true values are flattened and joined end to end, the two
    arrays of predictions likewise, and ``rmspe`` is computed on the result.
    """
    pooled_truth = np.concatenate((np.ravel(y_test), np.ravel(y_test1)))
    pooled_pred = np.concatenate((np.ravel(y_pred), np.ravel(y_pred1)))
    return rmspe(pooled_truth, pooled_pred)
In [196]:
# RMSPE of the decision tree clf2 on x_test.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test, y_pred2))
In [197]:
# RMSPE of the decision tree clf12 on x_test1.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test1, y_pred12))
In [198]:
# RMSPE of the two decision trees over both test sets pooled together.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(compare(y_test, y_pred2, y_test1, y_pred12))
In [203]:
# RMSPE of the k-nearest-neighbours model clf3 on x_test.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test, y_pred3))
In [204]:
# RMSPE of the k-nearest-neighbours model clf13 on x_test1.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test1, y_pred13))
In [205]:
# RMSPE of the two k-nearest-neighbours models over both test sets pooled together.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(compare(y_test, y_pred3, y_test1, y_pred13))
In [206]:
# RMSPE of the random forest clf4 on x_test.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test, y_pred4))
In [207]:
# RMSPE of the random forest clf14 on x_test1.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(rmspe(y_test1, y_pred14))
In [209]:
# RMSPE of the two random forests over both test sets pooled together.
# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(compare(y_test, y_pred4, y_test1, y_pred14))
In [210]:
def makeprediction(testfile, feature, clf):
    """Predict ``Customers`` for the rows of a CSV file.

    Parameters
    ----------
    testfile : str
        Path of the CSV file to read; every column is cast to ``float32``.
    feature : list of str
        Columns of the file passed to ``clf.predict``, in this order.
    clf
        Fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the predictions appended as a ``Customers``
        column.
    """
    pre_x = pd.read_csv(testfile).astype(float32)
    # Diagnostic only: reports whether the loaded values are finite (no NaN/inf).
    # Parenthesised print is valid on both Python 2 and Python 3.
    print(np.all(np.isfinite(pre_x)))
    pre_y = clf.predict(pre_x[feature])
    # Give the predictions the frame's own index so concat pairs rows one-to-one.
    customers = pd.Series(pre_y, index=pre_x.index, name='Customers')
    return pd.concat([pre_x, customers], axis=1)
# Column names handed to makeprediction for pre0.csv.
# NOTE(review): the order presumably has to match the columns the model was
# trained on - confirm against x_train.columns.
feature0 = [
    "Store",
    "DayOfWeek",
    "Promo",
    "SchoolHoliday",
    'HaveCompetitor',
    "CompetitionDistance",
    "Year",
    "Month",
    "Day",
    "Week",
    "StoreType_a",
    "StoreType_b",
    "StoreType_c",
    "StoreType_d",
    "Assortment_a",
    "Assortment_b",
    "Assortment_c",
    "StateHoliday_0",
    "StateHoliday_a",
    "CompetitionMonth",
]
# Column names for pre1.csv: the same columns followed by the two Promo2 ones.
# Concatenation builds a separate list, so feature0 is not modified.
feature1 = feature0 + ["Promo2Month", "Promo2Week"]
In [211]:
# Predict Customers for pre0.csv with the random forest clf4.
prediction0 = makeprediction(testfile='pre0.csv', feature=feature0, clf=clf4)
In [212]:
# Predict Customers for pre1.csv with the random forest clf14.
prediction1 = makeprediction(testfile='pre1.csv', feature=feature1, clf=clf14)
In [213]:
# Write both prediction frames to CSV without the row index.
for frame, out_path in ((prediction0, "pre0wtcustomers.csv"),
                        (prediction1, "pre1wtcustomers.csv")):
    frame.to_csv(out_path, index=False)
In [ ]: