In [191]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 23:01:16 2015
@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
# Rossmann store-sales modelling notebook (Python 2, legacy pandas/plotly APIs).
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
# SECURITY NOTE(review): plotly username and API key are hardcoded in source.
# Revoke this key and load credentials from the environment / ~/.plotly instead.
py.sign_in('lemonsong', '3lcplsq1a3')
import plotly.graph_objs as go
#import datetime
from sklearn.utils import shuffle
from sklearn import preprocessing
from numpy import float32
# NOTE(review): re, parse, random, preprocessing and Imputer appear unused
# in this view of the file.
from sklearn.preprocessing import Imputer
In [265]:
def getxy(x):
    """Split a modelling frame into (features, target).

    The 'Sales' column is removed from ``x`` IN PLACE (same effect as the
    original ``drop(..., inplace=True)``) and returned as the target series.
    """
    target = x.pop('Sales')
    return x, target
In [325]:
# Training data, pre-split into two files: train0 / train1 correspond to the
# Promo2==0 / Promo2==1 store segments (cf. the Promo2 labels used in the
# plots below); both retain the Customers column.
data = pd.read_csv("train0forkagglewtcustomer.csv")
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
In [311]:
# (Year, Month) pairs kept for training: Jul-Sep of 2013/2014 plus the months
# nearest the prediction window (May 2014, Jun-Jul 2015).
_KEEP_PERIODS = [(2013, 7), (2014, 7),
                 (2013, 8), (2014, 8),
                 (2013, 9), (2014, 9),
                 (2015, 6), (2014, 5),
                 (2015, 7)]

def _filter_periods(df, periods):
    """Return only the rows of ``df`` whose (Year, Month) is in ``periods``.

    Replaces the original copy-pasted chain of nine `&`/`|` clauses with a
    single mask built from the pair list (same selection, same row order).
    """
    mask = pd.Series(False, index=df.index)
    for year, month in periods:
        mask |= (df['Year'] == year) & (df['Month'] == month)
    return df[mask]

# Promo2==0 stores
data = _filter_periods(pd.read_csv("train0forkagglewtcustomer.csv"), _KEEP_PERIODS)
# Promo2==1 stores
data1 = _filter_periods(pd.read_csv("train1forkagglewtcustomer.csv"), _KEEP_PERIODS)
In [312]:
# ``data``/``data1`` are already DataFrames, so the pd.DataFrame(...) wrappers
# are no-ops. Persist the filtered Promo2==0 subset for external (BigML) use.
data=pd.DataFrame(data)
data.to_csv("bigml0.csv", index=False)
data1=pd.DataFrame(data1)
In [219]:
# Alternative, smaller training subset: only the months nearest the
# submission window (May 2014, Jun/Jul 2015), for both Promo2 splits.
# NOTE(review): this cell and the next both overwrite data/data1; whichever
# ran last defines the training window (& binds tighter than |, so the
# pairwise conditions group as intended).
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2015) & (data['Month']==6) | (data['Year']==2014) & (data['Month']==5) |
(data['Year']==2015) & (data['Month']==7) ]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2015) & (data1['Month']==6) | (data1['Year']==2014) & (data1['Month']==5) |
(data1['Year']==2015) & (data1['Month']==7) ]
In [242]:
# Alternative subset: Jul-Sep of 2013 and 2014 only (same-season history).
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2013) & (data['Month']==7) | (data['Year']==2014) & (data['Month']==7) |\
(data['Year']==2013) & (data['Month']==8) | (data['Year']==2014) & (data['Month']==8) |\
(data['Year']==2013) & (data['Month']==9) | (data['Year']==2014) & (data['Month']==9)]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2013) & (data1['Month']==7) | (data1['Year']==2014) & (data1['Month']==7) |\
(data1['Year']==2013) & (data1['Month']==8) | (data1['Year']==2014) & (data1['Month']==8) |\
(data1['Year']==2013) & (data1['Month']==9) | (data1['Year']==2014) & (data1['Month']==9)]
In [267]:
# Split each Promo2 segment into feature matrix and Sales target.
# NOTE(review): getxy drops 'Sales' from data/data1 IN PLACE, so cells further
# down that index data[fet] with 'Sales' only work after re-reading the CSVs.
x,y=getxy(data)
x1,y1=getxy(data1)
In [268]:
def splitdata(x, y):
    """Shuffle (x, y) together and split 70/30 into train and test.

    A fixed random_state makes the split reproducible across runs.
    Returns (x_train, y_train, x_test, y_test).
    """
    # shuffle() permutes both inputs with the same row order; the originals
    # were confusingly named `train, test` -- these are the shuffled x and y.
    x_shuffled, y_shuffled = shuffle(x, y, random_state=15)
    cut = int(x_shuffled.shape[0] * 0.7)
    return (x_shuffled[:cut], y_shuffled[:cut],
            x_shuffled[cut:], y_shuffled[cut:])
In [269]:
# 70/30 train/test split per Promo2 segment (fixed seed inside splitdata).
x_train, y_train,x_test, y_test = splitdata(x,y)
In [270]:
# Python 2 print statement; lists the feature columns used for Promo2==0.
print x_train.columns
In [271]:
x_train1, y_train1,x_test1, y_test1 = splitdata(x1,y1)
In [272]:
# Model pair 1: decision trees, fitted separately on the Promo2==0 (clf2)
# and Promo2==1 (clf12) splits.
# NOTE(review): imports are repeated per cell and the same fit/predict
# pattern is copy-pasted for every model below -- a helper would remove
# the duplication.
from sklearn import tree
clf2 = tree.DecisionTreeRegressor(max_features='auto')
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)
In [273]:
from sklearn import tree
clf12 = tree.DecisionTreeRegressor(max_features='auto')
clf12.fit(x_train1, y_train1)
y_pred12 = clf12.predict(x_test1)
In [274]:
# Model pair 2: distance-weighted k-NN; note the different k per segment
# (5 for Promo2==0, 10 for Promo2==1).
from sklearn.neighbors import KNeighborsRegressor
clf3 = KNeighborsRegressor(n_neighbors=5,weights='distance',algorithm='auto')
clf3.fit(x_train, y_train)
y_pred3=clf3.predict(x_test)
In [275]:
from sklearn.neighbors import KNeighborsRegressor
clf13 = KNeighborsRegressor(n_neighbors=10,weights='distance',algorithm='auto')
clf13.fit(x_train1, y_train1)
y_pred13=clf13.predict(x_test1)
In [276]:
# Model pair 3: random forests, 300 trees each.
from sklearn.ensemble import RandomForestRegressor
clf4 = RandomForestRegressor(n_estimators=300)
clf4.fit(x_train, y_train)
y_pred4=clf4.predict(x_test)
In [277]:
from sklearn.ensemble import RandomForestRegressor
clf14 = RandomForestRegressor(n_estimators=300)
clf14.fit(x_train1, y_train1)
y_pred14=clf14.predict(x_test1)
In [289]:
def getfeature_importance(df, clf):
    """Pair each column of ``df`` with the fitted model's importance score.

    Parameters
    ----------
    df : DataFrame whose columns are the model's input features, in the
        same order as the matrix ``clf`` was fitted on.
    clf : fitted estimator exposing ``feature_importances_``.

    Returns
    -------
    DataFrame with 'Feature' and 'Importance' columns, sorted ascending
    by importance (least important first, as in the original).
    """
    feature_importance = pd.concat(
        [pd.Series(list(df.columns), name='Feature'),
         pd.Series(clf.feature_importances_, name='Importance')],
        axis=1,
    )
    # FIX: DataFrame.sort() was deprecated and removed from pandas;
    # sort_values() is the supported equivalent.
    return feature_importance.sort_values('Importance', ascending=True)
In [290]:
# Feature importances from the two random forests, merged side by side.
# After the outer merge, Importance_x = Promo2==0 model, Importance_y =
# Promo2==1 model (NaN where a feature exists in only one segment).
feature_importance=getfeature_importance(x_train,clf4)
feature_importance1=getfeature_importance(x_train1,clf14)
In [291]:
featureimportance = pd.merge(feature_importance,feature_importance1,on="Feature", how='outer')
print featureimportance
featureimportance.to_csv("featureimportance.csv", index=False)
In [292]:
# Grouped horizontal bar chart comparing the two importance columns,
# rendered through the plotly cloud API.
%matplotlib inline
trace1 = go.Bar(
y=featureimportance.Feature,
x=featureimportance.Importance_x,
name='Promo2==0',
orientation = 'h',
marker = dict(
color = 'rgba(55, 128, 191, 0.6)',
line = dict(
color = 'rgba(55, 128, 191, 1.0)',
width = 1,
)
)
)
trace2 = go.Bar(
y=featureimportance.Feature,
x=featureimportance.Importance_y,
name='Promo2==1',
orientation = 'h',
marker = dict(
color = 'rgba(255, 153, 51, 0.6)',
line = dict(
color = 'rgba(255, 153, 51, 1.0)',
width = 1,
)
)
)
# NOTE(review): `data` (the training DataFrame) is shadowed here by a list of
# plotly traces -- rename one of them to avoid hidden-state bugs on re-runs.
data = [trace1, trace2]
layout = go.Layout(
barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='marker-h-bar')
In [293]:
# Embed the hosted chart back into the notebook.
import plotly.tools as tls
tls.embed("https://plot.ly/~lemonsong/43/promo20-vs-promo21/")
Out[293]:
In [29]:
# Stack the three per-model test predictions column-wise, then take the
# row mean: a simple equal-weight ensemble, per Promo2 segment.
predcollect=pd.concat([pd.Series(y_pred2,name='dt'),pd.Series(y_pred3,name='knn'),pd.Series(y_pred4,name='rf')], axis=1)
In [30]:
pred1collect=pd.concat([pd.Series(y_pred12,name='dt'),pd.Series(y_pred13,name='knn'),pd.Series(y_pred14,name='rf')], axis=1)
In [31]:
predavg= predcollect.mean(axis=1)
In [32]:
pred1avg= pred1collect.mean(axis=1)
In [254]:
def rmspe(y, yhat):
    """Root Mean Square Percentage Error (the Rossmann competition metric).

    Assumes no zeros in ``y`` -- the error is relative to the actuals.
    """
    return np.sqrt(np.mean((yhat / y - 1) ** 2))

def rmspe_xg(yhat, y):
    """RMSPE eval wrapper for xgboost: both inputs are log1p-transformed.

    Returns the ("rmspe", value) pair xgboost expects from a feval.
    """
    y = np.expm1(y)
    yhat = np.expm1(yhat)
    # FIX: removed leftover debug `print y` (Python-2-only syntax that also
    # flooded the training log with the full target array).
    return "rmspe", rmspe(y, yhat)
Function to calculate the overall RMSPE across both the Promo2==0 and Promo2==1 test sets
In [255]:
def compare(y_test, y_pred, y_test1, y_pred1):
    """Overall RMSPE with the Promo2==0 and Promo2==1 test sets pooled.

    Concatenates actuals and predictions from both segments before scoring,
    so the result weights every row equally.
    """
    pooled_actual = np.append(np.asarray(y_test), y_test1)
    pooled_pred = np.append(np.asarray(y_pred), y_pred1)
    return rmspe(pooled_actual, pooled_pred)
Promo2==0
In [256]:
# Evaluation (Python 2 print statements). Decision tree, per segment and pooled:
print rmspe(y_test,y_pred2)
Promo2==1
In [257]:
print rmspe(y_test1,y_pred12)
Promo2==0 & Promo2==1
In [258]:
print compare(y_test,y_pred2,y_test1,y_pred12)
In [259]:
# k-NN:
print rmspe(y_test,y_pred3)
In [260]:
print rmspe(y_test1,y_pred13)
In [261]:
print compare(y_test,y_pred3,y_test1,y_pred13)
In [262]:
# Random forest:
print rmspe(y_test,y_pred4)
In [263]:
print rmspe(y_test1,y_pred14)
In [264]:
print compare(y_test,y_pred4,y_test1,y_pred14)
In [33]:
# Equal-weight ensemble average:
print rmspe(y_test,predavg)
In [34]:
print rmspe(y_test1,pred1avg)
In [35]:
print compare(y_test,predavg,y_test1,pred1avg)
In [287]:
# Export the fitted trees (truncated to depth 8) for graphviz inspection.
tree.export_graphviz(clf2,out_file='tree0.dot',max_depth=8)
In [288]:
tree.export_graphviz(clf12,out_file='tree1.dot',max_depth=8)
In [ ]:
In [294]:
def makeprediction(testfile, feature, clf):
    """Predict Sales for every row of a prediction CSV.

    Parameters
    ----------
    testfile : path to a CSV of prediction rows (all-numeric columns).
    feature : list of column names to feed the model, in training order.
    clf : fitted estimator with a ``predict`` method.

    Returns
    -------
    The test frame (cast to float32) with a 'Sales' prediction column
    appended via index-aligned concat.
    """
    pre_x = pd.read_csv(testfile).astype(float32)
    # Sanity check echoed to the log: the model cannot handle NaN/inf inputs.
    # (print() form works under both Python 2 and 3; the original used the
    # py2-only statement form, plus dead commented-out train_x handling.)
    print(np.all(np.isfinite(pre_x)))
    pre_y = clf.predict(pre_x[feature])
    prediction = pd.concat([pre_x, pd.Series(pre_y, name='Sales')], axis=1)
    return prediction
# Feature list for the Promo2==0 model (no Promo2 timing columns).
feature0=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
"CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",'Customers'
]
# Feature list for the Promo2==1 model: feature0 plus the Promo2Month /
# Promo2Week recurrence features.
feature1=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
"CompetitionDistance",
"Year","Month","Day","Week",
"StoreType_a","StoreType_b","StoreType_c","StoreType_d",
"Assortment_a","Assortment_b","Assortment_c",
"StateHoliday_0","StateHoliday_a",
"CompetitionMonth",
"Promo2Month","Promo2Week",'Customers'
]
In [295]:
# Random-forest-only predictions on the Customers-augmented prediction files.
prediction0=makeprediction('pre0wtcustomers.csv',feature0,clf4)
In [296]:
prediction1=makeprediction('pre1wtcustomers.csv',feature1,clf14)
In [ ]:
# Per-model predictions for the averaged ensemble.
# NOTE(review): these read pre0.csv/pre1.csv, a DIFFERENT pair of files than
# the cells above; they were left unexecuted (In [ ]) in the saved notebook.
prediction02=makeprediction('pre0.csv',feature0,clf2)
prediction03=makeprediction('pre0.csv',feature0,clf3)
prediction04=makeprediction('pre0.csv',feature0,clf4)
In [ ]:
prediction12=makeprediction('pre1.csv',feature1,clf12)
prediction13=makeprediction('pre1.csv',feature1,clf13)
prediction14=makeprediction('pre1.csv',feature1,clf14)
In [ ]:
def mergeavg(predition2, predition3, predition4):
    """Row-wise mean of three model prediction vectors (dt / knn / rf).

    Each input is wrapped in a named Series, stacked column-wise, and the
    equal-weight average per row is returned.
    """
    parts = [('dt', predition2), ('knn', predition3), ('rf', predition4)]
    stacked = pd.concat(
        [pd.Series(values, name=label) for label, values in parts],
        axis=1)
    return stacked.mean(axis=1)
In [ ]:
# Ensemble-average the three per-model prediction sets. NOTE(review): this
# OVERWRITES the random-forest-only prediction0/prediction1 created earlier.
prediction0=mergeavg(prediction02.Sales,prediction03.Sales,prediction04.Sales)
In [ ]:
prediction1=mergeavg(prediction12.Sales,prediction13.Sales,prediction14.Sales)
In [ ]:
# Attach the averaged Sales predictions back to the submission Ids
# (index-aligned concat against the freshly re-read prediction file).
def generatepreforsub(filename,pred):
pre_x = pd.read_csv(filename).astype(float32)
prediction = pd.concat([pre_x.Id, pd.Series(pred,name='Sales')], axis=1)
return prediction
In [ ]:
prediction0=generatepreforsub('pre0.csv',prediction0)
In [ ]:
prediction1=generatepreforsub('pre1.csv',prediction1)
In [313]:
# Assemble the Kaggle submission: stack both Promo2 segments, outer-merge
# onto the submission skeleton, then set Sales=0 for Ids with no prediction
# (presumably closed-store rows -- see the note after this cell).
prediction_sub0=pd.DataFrame(prediction0[["Id","Sales"]],columns=["Id","Sales"])
prediction_sub1=pd.DataFrame(prediction1[["Id","Sales"]],columns=["Id","Sales"])
prediction_sub=pd.concat([prediction_sub0,prediction_sub1])
print len(prediction_sub)
submission = pd.read_csv("submission.csv")
submission = pd.merge(submission,prediction_sub,on="Id", how='outer')
submission.fillna(0, inplace=True)
In [298]:
submission.to_csv("submission4.csv", index=False)
Only include test rows (and their predictions) where Open==1 or Open is null
In [326]:
# Persist per-segment predictions for the analysis section below.
prediction0.to_csv("prediction0.csv", index=False)
prediction1.to_csv("prediction1.csv", index=False)
In [327]:
# Columns retained for the post-hoc analysis (history + predictions).
fet=["Store","DayOfWeek","Promo","SchoolHoliday","StateHoliday_0","StateHoliday_a",
     "Year","Month","Day",
     "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
     "Assortment_a","Assortment_b","Assortment_c",
     "Customers","Sales"]
# Tag every frame with its Promo2 group before stacking them.
# BUG FIX: the original assigned to prediction_ana0["Promo2"] four times in a
# row, so prediction_ana1/data_ana0/data_ana1 never got a Promo2 column (NaN
# after concat) and prediction_ana0 ended up mislabelled as Promo2==1.
prediction_ana0 = pd.DataFrame(prediction0[fet])
prediction_ana0["Promo2"] = 0
print(prediction_ana0.head())
prediction_ana1 = pd.DataFrame(prediction1[fet])
prediction_ana1["Promo2"] = 1
data_ana0 = pd.DataFrame(data[fet])
data_ana0["Promo2"] = 0
data_ana1 = pd.DataFrame(data1[fet])
data_ana1["Promo2"] = 1
# Predictions and training history stacked into one analysis frame.
prediction_ana = pd.concat([prediction_ana0, prediction_ana1, data_ana0, data_ana1])
In [328]:
# Rebuild a datetime64 'Date' column from the integer Year/Month/Day parts:
# years since 1970 as datetime64[Y], plus month and day offsets as timedeltas.
# NOTE(review): `y` shadows the target Series defined earlier; also,
# prediction_ana carries duplicate index labels after the concat above, so
# assigning via pd.Series relies on index alignment -- verify on re-run.
y = np.array(prediction_ana['Year']-1970, dtype='<M8[Y]')
m = np.array(prediction_ana['Month']-1, dtype='<m8[M]')
d = np.array(prediction_ana['Day']-1, dtype='<m8[D]')
prediction_ana['Date'] = pd.Series(y+m+d)
In [329]:
# Inspect the assembled analysis frame (Python 2 prints).
print prediction_ana.dtypes
In [330]:
print prediction_ana.head()
In [331]:
# 'Date' now encodes Year/Month/Day, so drop the redundant integer parts.
prediction_ana.drop(["Day","Month","Year"], axis=1, inplace=True)
In [332]:
gr_date=prediction_ana.groupby(['Date'])
In [333]:
# Daily means of Customers and Sales for the time-series plot below.
gr_date_sales=gr_date.agg({'Customers' : 'mean', 'Sales' : 'mean'})
In [334]:
print gr_date_sales.head()
In [335]:
# Dual-axis time series: mean Customers (left axis) vs mean Sales (right
# axis, 'y2') per day, rendered via the plotly cloud API and embedded.
trace1 = go.Scatter(
x=gr_date_sales.index,
y=gr_date_sales.Customers,
name='Customers',
line=dict(
color='#ae32e4',
width = 1 ,
)
)
trace2 = go.Scatter(
x=gr_date_sales.index,
y=gr_date_sales.Sales,
name='Sales',
mode = 'lines+markers',
yaxis='y2',
line=dict(
color='#3268e4',
width = 1
),
opacity=0.8
)
# NOTE(review): `data` is shadowed yet again by a list of plotly traces.
data = [trace1, trace2]
layout = go.Layout(
title='Time Series of Prediction',
yaxis=dict(
title='Customers'
),
yaxis2=dict(
title='Sales',
titlefont=dict(
color='rgb(174,50,228)'
),
tickfont=dict(
color='rgb(174,50,228)'
),
overlaying='y',
side='right'
)
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='multiple-axes-double')
tls.embed("https://plot.ly/~lemonsong/54/time-series-of-prediction/")
Out[335]:
In [336]:
gr_assortment=prediction_ana
In [337]:
#gr_assortment.query('Assortment_a==1')['Assortment']='basic'
gr_assortment.ix[gr_assortment.Assortment_a==1, 'Assortment'] = 'basic'
gr_assortment.ix[gr_assortment.Assortment_b==1, 'Assortment'] = 'extra'
gr_assortment.ix[gr_assortment.Assortment_c==1, 'Assortment'] = 'extended'
gr_assortment.drop(['Assortment_a','Assortment_b','Assortment_c'], axis=1, inplace=True)
In [338]:
print gr_assortment.columns
In [339]:
# Average customers per store, by assortment type and day of week
# (sum of customers divided by the store-row count in each group).
gr_assortment1=gr_assortment.groupby(['Assortment', 'DayOfWeek'])
gr_assortment1=gr_assortment1.agg({ 'Customers' : 'sum','Store':'count'}).reset_index()
gr_assortment1['Coustomers_by_store']=gr_assortment1['Customers']/gr_assortment1['Store']
gr_assortment1
Out[339]:
In [340]:
# Pivot to an Assortment x DayOfWeek grid for the heatmap.
# NOTE(review): positional pivot() arguments were removed in pandas 2.0;
# use pivot(index=..., columns=..., values=...) when upgrading.
gr_assortment2=gr_assortment1.pivot('Assortment', 'DayOfWeek', 'Coustomers_by_store')
print gr_assortment2
In [341]:
# Heatmap of average customers per store by assortment type and weekday.
data = [
    go.Heatmap(
        z=gr_assortment2.values,
        x=gr_assortment2.columns,
        y=gr_assortment2.index,
        # FIX: the first colorscale entry was the malformed string
        # '"rgb(228, 174, 50)"' (embedded double quotes), which is not a
        # valid color specification.
        colorscale=[[0, 'rgb(228, 174, 50)'], [1, 'rgb(174, 50, 228)']]
    )
]
layout = go.Layout(
    title='Average Customers',
    yaxis=dict(
        title='Assortment',
    ),
    xaxis=dict(
        type="category",
        title='WeekOfDay',
    )
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='labelled-heatmap')
tls.embed("https://plot.ly/~lemonsong/80/average-sales/")
Out[341]:
In [342]:
# NOTE(review): gr_store aliases prediction_ana (no copy); it depends on the
# 'Assortment' label column added earlier through the same alias.
gr_store=prediction_ana
In [343]:
gr_store=gr_store.groupby(['Store'])
gr_store_sales=gr_store.agg({'Customers' : 'sum', 'Sales' : 'sum','Promo':'sum','Promo2':'sum'}).reset_index()
In [344]:
# Re-attach each store's assortment label; drop_duplicates collapses the
# one-row-per-day fanout produced by the left join.
gr_store1=pd.merge(gr_store_sales,prediction_ana[['Store','Assortment']],on="Store", how='left').drop_duplicates()
In [345]:
gr_store1.head()
Out[345]:
In [346]:
# Totals per assortment type for the pie charts below.
gr_store1_assort=gr_store1.groupby(['Assortment'])
gr_store_sales_agg=gr_store1_assort.agg({'Customers' : 'sum', 'Sales' : 'sum','Store':'count','Promo':'sum','Promo2':'sum'}).reset_index()
In [347]:
gr_store_sales_agg
Out[347]:
In [348]:
# Three donut charts in one figure: store count, total customers and total
# sales, each broken down by assortment type.
fig = {
    "data": [
        {
            "values": gr_store_sales_agg.Store,
            "labels": gr_store_sales_agg.Assortment,
            "domain": {"x": [0, .33]},
            "name": "Store",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
        {
            "values": gr_store_sales_agg.Customers,
            "labels": gr_store_sales_agg.Assortment,
            "text": "Customers",
            "textposition": "inside",
            "domain": {"x": [.33, .66]},
            "name": "Customers",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
        {
            "values": gr_store_sales_agg.Sales,
            "labels": gr_store_sales_agg.Assortment,
            "text": "Sales",
            "textposition": "inside",
            "domain": {"x": [.66, 1]},
            "name": "Sales",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        },
    ],
    "layout": {
        "title": "Percentage by Assortment Type",
        # One centred label inside each donut hole.
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Store",
                "x": 0.10,
                "y": 0.5
            },
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Customers",
                "x": 0.5,
                "y": 0.5
            },
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Sales",
                "x": 0.9,
                "y": 0.5
            }
        ]
    }
}
# FIX: the hosted-plot filename was 'Global Emissions 1990-2011', a leftover
# copy-paste from the plotly donut-chart example; name it after this chart.
url = py.plot(fig, filename='percentage-by-assortment')
In [ ]:
In [ ]: