In [191]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 23:01:16 2015
@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
# Rossmann store-sales modelling notebook (Python 2, legacy pandas/plotly APIs).
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
# SECURITY NOTE(review): plotly username and API key are hardcoded in source.
# Revoke this key and load credentials from the environment / ~/.plotly instead.
py.sign_in('lemonsong', '3lcplsq1a3')
import plotly.graph_objs as go
#import datetime
from sklearn.utils import shuffle
from sklearn import preprocessing
from numpy import float32
# NOTE(review): re, parse, random, preprocessing and Imputer appear unused
# in this view of the file.
from sklearn.preprocessing import Imputer
In [265]:
def getxy(x):
    """Split a modelling frame into (features, target).

    The 'Sales' column is removed from ``x`` IN PLACE (same effect as the
    original ``drop(..., inplace=True)``) and returned as the target series.
    """
    target = x.pop('Sales')
    return x, target
In [325]:
# Training data, pre-split into two files: train0 / train1 correspond to the
# Promo2==0 / Promo2==1 store segments (cf. the Promo2 labels used in the
# plots below); both retain the Customers column.
data = pd.read_csv("train0forkagglewtcustomer.csv")
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
In [311]:
# (Year, Month) pairs kept for training: Jul-Sep of 2013/2014 plus the months
# nearest the prediction window (May 2014, Jun-Jul 2015).
_KEEP_PERIODS = [(2013, 7), (2014, 7),
                 (2013, 8), (2014, 8),
                 (2013, 9), (2014, 9),
                 (2015, 6), (2014, 5),
                 (2015, 7)]

def _filter_periods(df, periods):
    """Return only the rows of ``df`` whose (Year, Month) is in ``periods``.

    Replaces the original copy-pasted chain of nine `&`/`|` clauses with a
    single mask built from the pair list (same selection, same row order).
    """
    mask = pd.Series(False, index=df.index)
    for year, month in periods:
        mask |= (df['Year'] == year) & (df['Month'] == month)
    return df[mask]

# Promo2==0 stores
data = _filter_periods(pd.read_csv("train0forkagglewtcustomer.csv"), _KEEP_PERIODS)
# Promo2==1 stores
data1 = _filter_periods(pd.read_csv("train1forkagglewtcustomer.csv"), _KEEP_PERIODS)
In [312]:
# ``data``/``data1`` are already DataFrames, so the pd.DataFrame(...) wrappers
# are no-ops. Persist the filtered Promo2==0 subset for external (BigML) use.
data=pd.DataFrame(data)
data.to_csv("bigml0.csv", index=False)
data1=pd.DataFrame(data1)
In [219]:
# Alternative, smaller training subset: only the months nearest the
# submission window (May 2014, Jun/Jul 2015), for both Promo2 splits.
# NOTE(review): this cell and the next both overwrite data/data1; whichever
# ran last defines the training window (& binds tighter than |, so the
# pairwise conditions group as intended).
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2015) & (data['Month']==6) | (data['Year']==2014) & (data['Month']==5) |
(data['Year']==2015) & (data['Month']==7) ]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2015) & (data1['Month']==6) | (data1['Year']==2014) & (data1['Month']==5) |
(data1['Year']==2015) & (data1['Month']==7) ]
In [242]:
# Alternative subset: Jul-Sep of 2013 and 2014 only (same-season history).
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2013) & (data['Month']==7) | (data['Year']==2014) & (data['Month']==7) |\
(data['Year']==2013) & (data['Month']==8) | (data['Year']==2014) & (data['Month']==8) |\
(data['Year']==2013) & (data['Month']==9) | (data['Year']==2014) & (data['Month']==9)]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2013) & (data1['Month']==7) | (data1['Year']==2014) & (data1['Month']==7) |\
(data1['Year']==2013) & (data1['Month']==8) | (data1['Year']==2014) & (data1['Month']==8) |\
(data1['Year']==2013) & (data1['Month']==9) | (data1['Year']==2014) & (data1['Month']==9)]
In [267]:
# Split each Promo2 segment into feature matrix and Sales target.
# NOTE(review): getxy drops 'Sales' from data/data1 IN PLACE, so cells further
# down that index data[fet] with 'Sales' only work after re-reading the CSVs.
x,y=getxy(data)
x1,y1=getxy(data1)
In [268]:
def splitdata(x, y):
    """Shuffle (x, y) together and split 70/30 into train and test.

    A fixed random_state makes the split reproducible across runs.
    Returns (x_train, y_train, x_test, y_test).
    """
    # shuffle() permutes both inputs with the same row order; the originals
    # were confusingly named `train, test` -- these are the shuffled x and y.
    x_shuffled, y_shuffled = shuffle(x, y, random_state=15)
    cut = int(x_shuffled.shape[0] * 0.7)
    return (x_shuffled[:cut], y_shuffled[:cut],
            x_shuffled[cut:], y_shuffled[cut:])
In [269]:
# 70/30 train/test split per Promo2 segment (fixed seed inside splitdata).
x_train, y_train,x_test, y_test = splitdata(x,y)
In [270]:
# Python 2 print statement; lists the feature columns used for Promo2==0.
print x_train.columns
In [271]:
x_train1, y_train1,x_test1, y_test1 = splitdata(x1,y1)
In [272]:
# Model pair 1: decision trees, fitted separately on the Promo2==0 (clf2)
# and Promo2==1 (clf12) splits.
# NOTE(review): imports are repeated per cell and the same fit/predict
# pattern is copy-pasted for every model below -- a helper would remove
# the duplication.
from sklearn import tree
clf2 = tree.DecisionTreeRegressor(max_features='auto')
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)
In [273]:
from sklearn import tree
clf12 = tree.DecisionTreeRegressor(max_features='auto')
clf12.fit(x_train1, y_train1)
y_pred12 = clf12.predict(x_test1)
In [274]:
# Model pair 2: distance-weighted k-NN; note the different k per segment
# (5 for Promo2==0, 10 for Promo2==1).
from sklearn.neighbors import KNeighborsRegressor
clf3 = KNeighborsRegressor(n_neighbors=5,weights='distance',algorithm='auto')
clf3.fit(x_train, y_train)
y_pred3=clf3.predict(x_test)
In [275]:
from sklearn.neighbors import KNeighborsRegressor
clf13 = KNeighborsRegressor(n_neighbors=10,weights='distance',algorithm='auto')
clf13.fit(x_train1, y_train1)
y_pred13=clf13.predict(x_test1)
In [276]:
# Model pair 3: random forests, 300 trees each.
from sklearn.ensemble import RandomForestRegressor
clf4 = RandomForestRegressor(n_estimators=300)
clf4.fit(x_train, y_train)
y_pred4=clf4.predict(x_test)
In [277]:
from sklearn.ensemble import RandomForestRegressor
clf14 = RandomForestRegressor(n_estimators=300)
clf14.fit(x_train1, y_train1)
y_pred14=clf14.predict(x_test1)
In [289]:
def getfeature_importance(df, clf):
    """Pair each column of ``df`` with the fitted model's importance score.

    Parameters
    ----------
    df : DataFrame whose columns are the model's input features, in the
        same order as the matrix ``clf`` was fitted on.
    clf : fitted estimator exposing ``feature_importances_``.

    Returns
    -------
    DataFrame with 'Feature' and 'Importance' columns, sorted ascending
    by importance (least important first, as in the original).
    """
    feature_importance = pd.concat(
        [pd.Series(list(df.columns), name='Feature'),
         pd.Series(clf.feature_importances_, name='Importance')],
        axis=1,
    )
    # FIX: DataFrame.sort() was deprecated and removed from pandas;
    # sort_values() is the supported equivalent.
    return feature_importance.sort_values('Importance', ascending=True)
In [290]:
# Feature importances from the two random forests, merged side by side.
# After the outer merge, Importance_x = Promo2==0 model, Importance_y =
# Promo2==1 model (NaN where a feature exists in only one segment).
feature_importance=getfeature_importance(x_train,clf4)
feature_importance1=getfeature_importance(x_train1,clf14)
In [291]:
featureimportance = pd.merge(feature_importance,feature_importance1,on="Feature", how='outer')
print featureimportance
featureimportance.to_csv("featureimportance.csv", index=False)
In [292]:
# Grouped horizontal bar chart comparing the two importance columns,
# rendered through the plotly cloud API.
%matplotlib inline
trace1 = go.Bar(
y=featureimportance.Feature,
x=featureimportance.Importance_x,
name='Promo2==0',
orientation = 'h',
marker = dict(
color = 'rgba(55, 128, 191, 0.6)',
line = dict(
color = 'rgba(55, 128, 191, 1.0)',
width = 1,
)
)
)
trace2 = go.Bar(
y=featureimportance.Feature,
x=featureimportance.Importance_y,
name='Promo2==1',
orientation = 'h',
marker = dict(
color = 'rgba(255, 153, 51, 0.6)',
line = dict(
color = 'rgba(255, 153, 51, 1.0)',
width = 1,
)
)
)
# NOTE(review): `data` (the training DataFrame) is shadowed here by a list of
# plotly traces -- rename one of them to avoid hidden-state bugs on re-runs.
data = [trace1, trace2]
layout = go.Layout(
barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='marker-h-bar')
In [293]:
# Embed the hosted chart back into the notebook.
import plotly.tools as tls
tls.embed("https://plot.ly/~lemonsong/43/promo20-vs-promo21/")
Out[293]:
In [29]:
# Stack the three per-model test predictions column-wise, then take the
# row mean: a simple equal-weight ensemble, per Promo2 segment.
predcollect=pd.concat([pd.Series(y_pred2,name='dt'),pd.Series(y_pred3,name='knn'),pd.Series(y_pred4,name='rf')], axis=1)
In [30]:
pred1collect=pd.concat([pd.Series(y_pred12,name='dt'),pd.Series(y_pred13,name='knn'),pd.Series(y_pred14,name='rf')], axis=1)
In [31]:
predavg= predcollect.mean(axis=1)
In [32]:
pred1avg= pred1collect.mean(axis=1)
In [254]:
def rmspe(y, yhat):
    """Root Mean Square Percentage Error (the Rossmann competition metric).

    Assumes no zeros in ``y`` -- the error is relative to the actuals.
    """
    return np.sqrt(np.mean((yhat / y - 1) ** 2))

def rmspe_xg(yhat, y):
    """RMSPE eval wrapper for xgboost: both inputs are log1p-transformed.

    Returns the ("rmspe", value) pair xgboost expects from a feval.
    """
    y = np.expm1(y)
    yhat = np.expm1(yhat)
    # FIX: removed leftover debug `print y` (Python-2-only syntax that also
    # flooded the training log with the full target array).
    return "rmspe", rmspe(y, yhat)
Function to calculate the overall RMSPE across both the Promo2==0 and Promo2==1 test sets
In [255]:
def compare(y_test, y_pred, y_test1, y_pred1):
    """Overall RMSPE with the Promo2==0 and Promo2==1 test sets pooled.

    Concatenates actuals and predictions from both segments before scoring,
    so the result weights every row equally.
    """
    pooled_actual = np.append(np.asarray(y_test), y_test1)
    pooled_pred = np.append(np.asarray(y_pred), y_pred1)
    return rmspe(pooled_actual, pooled_pred)
Promo2==0
In [256]:
# Evaluation (Python 2 print statements). Decision tree, per segment and pooled:
print rmspe(y_test,y_pred2)
Promo2==1
In [257]:
print rmspe(y_test1,y_pred12)
Promo2==0 & Promo2==1
In [258]:
print compare(y_test,y_pred2,y_test1,y_pred12)
In [259]:
# k-NN:
print rmspe(y_test,y_pred3)
In [260]:
print rmspe(y_test1,y_pred13)
In [261]:
print compare(y_test,y_pred3,y_test1,y_pred13)
In [262]:
# Random forest:
print rmspe(y_test,y_pred4)
In [263]:
print rmspe(y_test1,y_pred14)
In [264]:
print compare(y_test,y_pred4,y_test1,y_pred14)
In [33]:
# Equal-weight ensemble average:
print rmspe(y_test,predavg)
In [34]:
print rmspe(y_test1,pred1avg)
In [35]:
print compare(y_test,predavg,y_test1,pred1avg)
In [287]:
# Export the fitted trees (truncated to depth 8) for graphviz inspection.
tree.export_graphviz(clf2,out_file='tree0.dot',max_depth=8)
In [288]:
tree.export_graphviz(clf12,out_file='tree1.dot',max_depth=8)
In [ ]:
In [294]:
def makeprediction(testfile, feature, clf):
    """Predict Sales for every row of a prediction CSV.

    Parameters
    ----------
    testfile : path to a CSV of prediction rows (all-numeric columns).
    feature : list of column names to feed the model, in training order.
    clf : fitted estimator with a ``predict`` method.

    Returns
    -------
    The test frame (cast to float32) with a 'Sales' prediction column
    appended via index-aligned concat.
    """
    pre_x = pd.read_csv(testfile).astype(float32)
    # Sanity check echoed to the log: the model cannot handle NaN/inf inputs.
    # (print() form works under both Python 2 and 3; the original used the
    # py2-only statement form, plus dead commented-out train_x handling.)
    print(np.all(np.isfinite(pre_x)))
    pre_y = clf.predict(pre_x[feature])
    prediction = pd.concat([pre_x, pd.Series(pre_y, name='Sales')], axis=1)
    return prediction
# Feature list for the Promo2==0 model (no Promo2 timing columns).
feature0=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
"CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",'Customers'
]
# Feature list for the Promo2==1 model: feature0 plus the Promo2Month /
# Promo2Week recurrence features.
feature1=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
"CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",
"Promo2Month","Promo2Week",'Customers'
]
In [295]:
# Random-forest-only predictions on the Customers-augmented prediction files.
prediction0=makeprediction('pre0wtcustomers.csv',feature0,clf4)
In [296]:
prediction1=makeprediction('pre1wtcustomers.csv',feature1,clf14)
In [ ]:
# Per-model predictions for the averaged ensemble.
# NOTE(review): these read pre0.csv/pre1.csv, a DIFFERENT pair of files than
# the cells above; they were left unexecuted (In [ ]) in the saved notebook.
prediction02=makeprediction('pre0.csv',feature0,clf2)
prediction03=makeprediction('pre0.csv',feature0,clf3)
prediction04=makeprediction('pre0.csv',feature0,clf4)
In [ ]:
prediction12=makeprediction('pre1.csv',feature1,clf12)
prediction13=makeprediction('pre1.csv',feature1,clf13)
prediction14=makeprediction('pre1.csv',feature1,clf14)
In [ ]:
def mergeavg(predition2, predition3, predition4):
    """Row-wise mean of three model prediction vectors (dt / knn / rf).

    Each input is wrapped in a named Series, stacked column-wise, and the
    equal-weight average per row is returned.
    """
    parts = [('dt', predition2), ('knn', predition3), ('rf', predition4)]
    stacked = pd.concat(
        [pd.Series(values, name=label) for label, values in parts],
        axis=1)
    return stacked.mean(axis=1)
In [ ]:
# Ensemble-average the three per-model prediction sets. NOTE(review): this
# OVERWRITES the random-forest-only prediction0/prediction1 created earlier.
prediction0=mergeavg(prediction02.Sales,prediction03.Sales,prediction04.Sales)
In [ ]:
prediction1=mergeavg(prediction12.Sales,prediction13.Sales,prediction14.Sales)
In [ ]:
# Attach the averaged Sales predictions back to the submission Ids
# (index-aligned concat against the freshly re-read prediction file).
def generatepreforsub(filename,pred):
pre_x = pd.read_csv(filename).astype(float32)
prediction = pd.concat([pre_x.Id, pd.Series(pred,name='Sales')], axis=1)
return prediction
In [ ]:
prediction0=generatepreforsub('pre0.csv',prediction0)
In [ ]:
prediction1=generatepreforsub('pre1.csv',prediction1)
In [313]:
# Assemble the Kaggle submission: stack both Promo2 segments, outer-merge
# onto the submission skeleton, then set Sales=0 for Ids with no prediction
# (presumably closed-store rows -- see the note after this cell).
prediction_sub0=pd.DataFrame(prediction0[["Id","Sales"]],columns=["Id","Sales"])
prediction_sub1=pd.DataFrame(prediction1[["Id","Sales"]],columns=["Id","Sales"])
prediction_sub=pd.concat([prediction_sub0,prediction_sub1])
print len(prediction_sub)
submission = pd.read_csv("submission.csv")
submission = pd.merge(submission,prediction_sub,on="Id", how='outer')
submission.fillna(0, inplace=True)
In [298]:
submission.to_csv("submission4.csv", index=False)
Only include test rows (and their predictions) where Open==1 or Open is null
In [326]:
# Persist per-segment predictions for the analysis section below.
prediction0.to_csv("prediction0.csv", index=False)
prediction1.to_csv("prediction1.csv", index=False)
In [327]:
# Columns retained for the post-hoc analysis (history + predictions).
fet=["Store","DayOfWeek","Promo","SchoolHoliday","StateHoliday_0","StateHoliday_a",
     "Year","Month","Day",
     "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
     "Assortment_a","Assortment_b","Assortment_c",
     "Customers","Sales"]
# Tag every frame with its Promo2 group before stacking them.
# BUG FIX: the original assigned to prediction_ana0["Promo2"] four times in a
# row, so prediction_ana1/data_ana0/data_ana1 never got a Promo2 column (NaN
# after concat) and prediction_ana0 ended up mislabelled as Promo2==1.
prediction_ana0 = pd.DataFrame(prediction0[fet])
prediction_ana0["Promo2"] = 0
print(prediction_ana0.head())
prediction_ana1 = pd.DataFrame(prediction1[fet])
prediction_ana1["Promo2"] = 1
data_ana0 = pd.DataFrame(data[fet])
data_ana0["Promo2"] = 0
data_ana1 = pd.DataFrame(data1[fet])
data_ana1["Promo2"] = 1
# Predictions and training history stacked into one analysis frame.
prediction_ana = pd.concat([prediction_ana0, prediction_ana1, data_ana0, data_ana1])
In [328]:
# Rebuild a datetime64 'Date' column from the integer Year/Month/Day parts:
# years since 1970 as datetime64[Y], plus month and day offsets as timedeltas.
# NOTE(review): `y` shadows the target Series defined earlier; also,
# prediction_ana carries duplicate index labels after the concat above, so
# assigning via pd.Series relies on index alignment -- verify on re-run.
y = np.array(prediction_ana['Year']-1970, dtype='<M8[Y]')
m = np.array(prediction_ana['Month']-1, dtype='<m8[M]')
d = np.array(prediction_ana['Day']-1, dtype='<m8[D]')
prediction_ana['Date'] = pd.Series(y+m+d)
In [329]:
# Inspect the assembled analysis frame (Python 2 prints).
print prediction_ana.dtypes
In [330]:
print prediction_ana.head()
In [331]:
# 'Date' now encodes Year/Month/Day, so drop the redundant integer parts.
prediction_ana.drop(["Day","Month","Year"], axis=1, inplace=True)
In [332]:
gr_date=prediction_ana.groupby(['Date'])
In [333]:
# Daily means of Customers and Sales for the time-series plot below.
gr_date_sales=gr_date.agg({'Customers' : 'mean', 'Sales' : 'mean'})
In [334]:
print gr_date_sales.head()
In [335]:
# Dual-axis time series: mean Customers (left axis) vs mean Sales (right
# axis, 'y2') per day, rendered via the plotly cloud API and embedded.
trace1 = go.Scatter(
x=gr_date_sales.index,
y=gr_date_sales.Customers,
name='Customers',
line=dict(
color='#ae32e4',
width = 1 ,
)
)
trace2 = go.Scatter(
x=gr_date_sales.index,
y=gr_date_sales.Sales,
name='Sales',
mode = 'lines+markers',
yaxis='y2',
line=dict(
color='#3268e4',
width = 1
),
opacity=0.8
)
# NOTE(review): `data` is shadowed yet again by a list of plotly traces.
data = [trace1, trace2]
layout = go.Layout(
title='Time Series of Prediction',
yaxis=dict(
title='Customers'
),
yaxis2=dict(
title='Sales',
titlefont=dict(
color='rgb(174,50,228)'
),
tickfont=dict(
color='rgb(174,50,228)'
),
overlaying='y',
side='right'
)
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='multiple-axes-double')
tls.embed("https://plot.ly/~lemonsong/54/time-series-of-prediction/")
Out[335]:
In [336]:
gr_assortment=prediction_ana
In [337]:
#gr_assortment.query('Assortment_a==1')['Assortment']='basic'
gr_assortment.ix[gr_assortment.Assortment_a==1, 'Assortment'] = 'basic'
gr_assortment.ix[gr_assortment.Assortment_b==1, 'Assortment'] = 'extra'
gr_assortment.ix[gr_assortment.Assortment_c==1, 'Assortment'] = 'extended'
gr_assortment.drop(['Assortment_a','Assortment_b','Assortment_c'], axis=1, inplace=True)
In [338]:
print gr_assortment.columns
In [339]:
# Average customers per store, by assortment type and day of week
# (sum of customers divided by the store-row count in each group).
gr_assortment1=gr_assortment.groupby(['Assortment', 'DayOfWeek'])
gr_assortment1=gr_assortment1.agg({ 'Customers' : 'sum','Store':'count'}).reset_index()
gr_assortment1['Coustomers_by_store']=gr_assortment1['Customers']/gr_assortment1['Store']
gr_assortment1
Out[339]:
In [340]:
# Pivot to an Assortment x DayOfWeek grid for the heatmap.
# NOTE(review): positional pivot() arguments were removed in pandas 2.0;
# use pivot(index=..., columns=..., values=...) when upgrading.
gr_assortment2=gr_assortment1.pivot('Assortment', 'DayOfWeek', 'Coustomers_by_store')
print gr_assortment2
In [341]:
# Heatmap of average customers per store by assortment type and weekday.
data = [
    go.Heatmap(
        z=gr_assortment2.values,
        x=gr_assortment2.columns,
        y=gr_assortment2.index,
        # FIX: the first colorscale entry was the malformed string
        # '"rgb(228, 174, 50)"' (embedded double quotes), which is not a
        # valid color specification.
        colorscale=[[0, 'rgb(228, 174, 50)'], [1, 'rgb(174, 50, 228)']]
    )
]
layout = go.Layout(
    title='Average Customers',
    yaxis=dict(
        title='Assortment',
    ),
    xaxis=dict(
        type="category",
        title='WeekOfDay',
    )
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='labelled-heatmap')
tls.embed("https://plot.ly/~lemonsong/80/average-sales/")
Out[341]:
In [342]:
# NOTE(review): gr_store aliases prediction_ana (no copy); it depends on the
# 'Assortment' label column added earlier through the same alias.
gr_store=prediction_ana
In [343]:
gr_store=gr_store.groupby(['Store'])
gr_store_sales=gr_store.agg({'Customers' : 'sum', 'Sales' : 'sum','Promo':'sum','Promo2':'sum'}).reset_index()
In [344]:
# Re-attach each store's assortment label; drop_duplicates collapses the
# one-row-per-day fanout produced by the left join.
gr_store1=pd.merge(gr_store_sales,prediction_ana[['Store','Assortment']],on="Store", how='left').drop_duplicates()
In [345]:
gr_store1.head()
Out[345]:
In [346]:
# Totals per assortment type for the pie charts below.
gr_store1_assort=gr_store1.groupby(['Assortment'])
gr_store_sales_agg=gr_store1_assort.agg({'Customers' : 'sum', 'Sales' : 'sum','Store':'count','Promo':'sum','Promo2':'sum'}).reset_index()
In [347]:
gr_store_sales_agg
Out[347]:
In [348]:
# Three donut charts in one figure: store count, total customers and total
# sales, each broken down by assortment type.
fig = {
    "data": [
        {
            "values": gr_store_sales_agg.Store,
            "labels": gr_store_sales_agg.Assortment,
            "domain": {"x": [0, .33]},
            "name": "Store",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
        {
            "values": gr_store_sales_agg.Customers,
            "labels": gr_store_sales_agg.Assortment,
            "text": "Customers",
            "textposition": "inside",
            "domain": {"x": [.33, .66]},
            "name": "Customers",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
        {
            "values": gr_store_sales_agg.Sales,
            "labels": gr_store_sales_agg.Assortment,
            "text": "Sales",
            "textposition": "inside",
            "domain": {"x": [.66, 1]},
            "name": "Sales",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
    ],
    "layout": {
        "title": "Percentage by Assortment Type",
        # One centred label inside each donut hole.
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Store",
                "x": 0.10,
                "y": 0.5
            },
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Customers",
                "x": 0.5,
                "y": 0.5
            },
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Sales",
                "x": 0.9,
                "y": 0.5
            }
        ]
    }
}
# FIX: the hosted-plot filename was 'Global Emissions 1990-2011', a leftover
# copy-paste from the plotly donut-chart example; name it after this chart.
url = py.plot(fig, filename='percentage-by-assortment')
In [ ]:
In [ ]: