In [191]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 23:01:16 2015

@author: yilin
"""
# useful code: https://www.kaggle.com/cast42/rossmann-store-sales/xgboost-in-python-with-rmspe-v2/code
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="paper", font="monospace")
import plotly
import plotly.plotly as py
py.sign_in('lemonsong', '3lcplsq1a3')
import plotly.graph_objs as go

#import datetime

from sklearn.utils import shuffle
from sklearn import preprocessing
from numpy import float32
from sklearn.preprocessing import Imputer

In [265]:
def getxy(x):
    y = x.Sales
    x.drop('Sales', axis=1, inplace=True)
    #x.drop('Store', axis=1, inplace=True)
    return x,y

In [325]:
data = pd.read_csv("train0forkagglewtcustomer.csv")
data1 = pd.read_csv("train1forkagglewtcustomer.csv")

In [311]:
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2013) & (data['Month']==7) | (data['Year']==2014) & (data['Month']==7)  |\
            (data['Year']==2013) & (data['Month']==8) | (data['Year']==2014) & (data['Month']==8) |\
            (data['Year']==2013) & (data['Month']==9) | (data['Year']==2014) & (data['Month']==9) |\
           (data['Year']==2015) & (data['Month']==6) | (data['Year']==2014) & (data['Month']==5) |
           (data['Year']==2015) & (data['Month']==7) ]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2013) & (data1['Month']==7) | (data1['Year']==2014) & (data1['Month']==7)  |\
            (data1['Year']==2013) & (data1['Month']==8) | (data1['Year']==2014) & (data1['Month']==8) |\
            (data1['Year']==2013) & (data1['Month']==9) | (data1['Year']==2014) & (data1['Month']==9) |\
           (data1['Year']==2015) & (data1['Month']==6) | (data1['Year']==2014) & (data1['Month']==5) |
           (data1['Year']==2015) & (data1['Month']==7) ]
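
The chained masks above keep a fixed set of (Year, Month) pairs. A minimal equivalent sketch using tuple membership (a stylistic alternative, not the code that was actually run):

# same nine (Year, Month) pairs as in the boolean filter above
keep_months = [(2013, 7), (2013, 8), (2013, 9),
               (2014, 5), (2014, 7), (2014, 8), (2014, 9),
               (2015, 6), (2015, 7)]
mask = pd.Series(zip(data['Year'], data['Month']), index=data.index).isin(keep_months)
data = data[mask]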

In [312]:
data=pd.DataFrame(data)
data.to_csv("bigml0.csv", index=False)
data1=pd.DataFrame(data1)

In [219]:
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2015) & (data['Month']==6) | (data['Year']==2014) & (data['Month']==5) |
           (data['Year']==2015) & (data['Month']==7) ]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2015) & (data1['Month']==6) | (data1['Year']==2014) & (data1['Month']==5) |
           (data1['Year']==2015) & (data1['Month']==7) ]

In [242]:
data = pd.read_csv("train0forkagglewtcustomer.csv")
data = data[(data['Year']==2013) & (data['Month']==7) | (data['Year']==2014) & (data['Month']==7)  |\
            (data['Year']==2013) & (data['Month']==8) | (data['Year']==2014) & (data['Month']==8) |\
            (data['Year']==2013) & (data['Month']==9) | (data['Year']==2014) & (data['Month']==9)]
data1 = pd.read_csv("train1forkagglewtcustomer.csv")
data1 = data1[(data1['Year']==2013) & (data1['Month']==7) | (data1['Year']==2014) & (data1['Month']==7)  |\
            (data1['Year']==2013) & (data1['Month']==8) | (data1['Year']==2014) & (data1['Month']==8) |\
            (data1['Year']==2013) & (data1['Month']==9) | (data1['Year']==2014) & (data1['Month']==9)]

In [267]:
x,y=getxy(data)
x1,y1=getxy(data1)

Split Data


In [268]:
def splitdata(x,y):# Split data into train (70%) and test (30%)
    # shuffle permutes x and y in unison; 'train' holds the shuffled
    # features and 'test' holds the shuffled target
    train, test = shuffle(x,y, random_state=15)
    offset = int(train.shape[0] * 0.7)
    x_train, y_train = train[:offset], test[:offset]
    x_test, y_test = train[offset:], test[offset:]
    return x_train, y_train,x_test, y_test
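
The same 70/30 split could also be obtained directly from scikit-learn; a minimal sketch assuming the era-appropriate sklearn.cross_validation module (sklearn.model_selection in later releases), not the helper actually used here:

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=15)

Note that train_test_split returns the pieces in a different order (x_train, x_test, y_train, y_test) than splitdata does.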

In [269]:
x_train, y_train,x_test, y_test = splitdata(x,y)

In [270]:
print x_train.columns


Index([u'Store', u'DayOfWeek', u'Promo', u'SchoolHoliday', u'HaveCompetitor',
       u'CompetitionDistance', u'Year', u'Month', u'Day', u'Week',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'StateHoliday_0',
       u'StateHoliday_a', u'CompetitionMonth', u'Customers'],
      dtype='object')

In [271]:
x_train1, y_train1,x_test1, y_test1 = splitdata(x1,y1)

Build Model

DT

In [272]:
from sklearn import tree
clf2 = tree.DecisionTreeRegressor(max_features='auto')
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)

In [273]:
from sklearn import tree
clf12 = tree.DecisionTreeRegressor(max_features='auto')
clf12.fit(x_train1, y_train1)
y_pred12 = clf12.predict(x_test1)

KNN

In [274]:
from sklearn.neighbors import KNeighborsRegressor
clf3 = KNeighborsRegressor(n_neighbors=5,weights='distance',algorithm='auto')
clf3.fit(x_train, y_train)
y_pred3=clf3.predict(x_test)

In [275]:
from sklearn.neighbors import KNeighborsRegressor
clf13 = KNeighborsRegressor(n_neighbors=10,weights='distance',algorithm='auto')
clf13.fit(x_train1, y_train1)
y_pred13=clf13.predict(x_test1)

RF

In [276]:
from sklearn.ensemble import RandomForestRegressor
clf4 = RandomForestRegressor(n_estimators=300)
clf4.fit(x_train, y_train)
y_pred4=clf4.predict(x_test)

In [277]:
from sklearn.ensemble import RandomForestRegressor
clf14 = RandomForestRegressor(n_estimators=300)
clf14.fit(x_train1, y_train1)
y_pred14=clf14.predict(x_test1)

Feature Importance


In [289]:
def getfeature_importance(df,clf):
    feature_importance= pd.concat([pd.Series(list(df.columns),name='Feature'),\
                               pd.Series(clf.feature_importances_,name='Importance')],\
                              axis=1).sort(['Importance'], ascending=[1])
    return feature_importance
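
Note that DataFrame.sort was deprecated and later removed from pandas; on a newer pandas the same ordering would be written with sort_values, roughly as sketched below (an assumption about the newer API, not the call used in this notebook):

feature_importance = feature_importance.sort_values('Importance', ascending=True)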

In [290]:
feature_importance=getfeature_importance(x_train,clf4)
feature_importance1=getfeature_importance(x_train1,clf14)

In [291]:
featureimportance = pd.merge(feature_importance,feature_importance1,on="Feature", how='outer')

print featureimportance
featureimportance.to_csv("featureimportance.csv", index=False)


                Feature  Importance_x  Importance_y
0        HaveCompetitor      0.000033      0.000005
1        StateHoliday_a      0.000065      0.000018
2        StateHoliday_0      0.000071      0.000018
3         SchoolHoliday      0.000902      0.001177
4           StoreType_c      0.001294      0.001040
5                 Month      0.001297      0.001600
6           StoreType_a      0.001703      0.001779
7          Assortment_a      0.002244      0.002869
8          Assortment_c      0.002447      0.002820
9                  Year      0.003039      0.004429
10                 Week      0.003800      0.004933
11                  Day      0.005641      0.009144
12            DayOfWeek      0.009455      0.014090
13          StoreType_b      0.013134      0.013617
14         Assortment_b      0.014945      0.013282
15     CompetitionMonth      0.017033      0.019052
16          StoreType_d      0.033185      0.054346
17                Store      0.035426      0.031710
18                Promo      0.035536      0.066964
19  CompetitionDistance      0.048419      0.060506
20            Customers      0.770330      0.670496
21          Promo2Month           NaN      0.002017
22           Promo2Week           NaN      0.024087

In [292]:
%matplotlib inline

trace1 = go.Bar(
    y=featureimportance.Feature,
    x=featureimportance.Importance_x,
    name='Promo2==0',
    orientation = 'h',
    marker = dict(
        color = 'rgba(55, 128, 191, 0.6)',
        line = dict(
            color = 'rgba(55, 128, 191, 1.0)',
            width = 1,
        )
    )
)
trace2 = go.Bar(
    y=featureimportance.Feature,
    x=featureimportance.Importance_y,
    name='Promo2==1',
    orientation = 'h',
    marker = dict(
        color = 'rgba(255, 153, 51, 0.6)',
        line = dict(
            color = 'rgba(255, 153, 51, 1.0)',
            width = 1,
        )
    )
)
data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='marker-h-bar')

In [293]:
import plotly.tools as tls

tls.embed("https://plot.ly/~lemonsong/43/promo20-vs-promo21/")


Out[293]:
Predict based on the average of the three algorithms

In [29]:
predcollect=pd.concat([pd.Series(y_pred2,name='dt'),pd.Series(y_pred3,name='knn'),pd.Series(y_pred4,name='rf')], axis=1)

In [30]:
pred1collect=pd.concat([pd.Series(y_pred12,name='dt'),pd.Series(y_pred13,name='knn'),pd.Series(y_pred14,name='rf')], axis=1)

In [31]:
predavg= predcollect.mean(axis=1)

In [32]:
pred1avg= pred1collect.mean(axis=1)

Evaluation


In [254]:
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y)
    yhat = np.expm1(yhat)
    print y
    return "rmspe", rmspe(y,yhat)

Function to calculate RMSPE over the combined Promo2==0 and Promo2==1 test sets


In [255]:
def compare(y_test,y_pred,y_test1,y_pred1):
    y_test=np.append(y_test,y_test1)
    y_pred=np.append(y_pred,y_pred1)
    return rmspe(y_test,y_pred)

DT

Promo2==0


In [256]:
print rmspe(y_test,y_pred2)


0.103550818995

Promo2==1


In [257]:
print rmspe(y_test1,y_pred12)


0.130232097084

Promo2==0 & Promo2==1


In [258]:
print compare(y_test,y_pred2,y_test1,y_pred12)


0.117093827408

KNN

In [259]:
print rmspe(y_test,y_pred3)


0.0827110382687

In [260]:
print rmspe(y_test1,y_pred13)


0.0901338125419

In [261]:
print compare(y_test,y_pred3,y_test1,y_pred13)


0.0863466452436

RF

In [262]:
print rmspe(y_test,y_pred4)


0.0703702331174

In [263]:
print rmspe(y_test1,y_pred14)


0.0799880772887

In [264]:
print compare(y_test,y_pred4,y_test1,y_pred14)


0.0751315384619

Average method

Predict sales as the average of the predictions from the three algorithms


In [33]:
print rmspe(y_test,predavg)


0.850845818724

In [34]:
print rmspe(y_test1,pred1avg)


0.823322014024

In [35]:
print compare(y_test,predavg,y_test1,pred1avg)


0.0699363310493

Export Decision Tree


In [287]:
tree.export_graphviz(clf2,out_file='tree0.dot',max_depth=8)

In [288]:
tree.export_graphviz(clf12,out_file='tree1.dot',max_depth=8)
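
The exported .dot files can be rendered to images with Graphviz; a sketch assuming the dot executable is installed and on the PATH:

import subprocess
subprocess.call(['dot', '-Tpng', 'tree0.dot', '-o', 'tree0.png'])
subprocess.call(['dot', '-Tpng', 'tree1.dot', '-o', 'tree1.png'])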

In [ ]:

Make Prediction


In [294]:
def makeprediction(testfile,feature,clf):
    #train_x = pd.read_csv(trainfile).astype(float32)
    pre_x = pd.read_csv(testfile).astype(float32)
    #print np.all(np.isfinite(train_x))
    print np.all(np.isfinite(pre_x))
    
    
    #train_x,train_y=getxy(train_x)
    
    
    pre_y = clf.predict(pre_x[feature])
    prediction = pd.concat([pre_x, pd.Series(pre_y,name='Sales')], axis=1)

    return prediction
feature0=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
          "CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
           "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth",'Customers'
          ]
feature1=["Store","DayOfWeek","Promo","SchoolHoliday",'HaveCompetitor',
          "CompetitionDistance",
          "Year","Month","Day","Week",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
            "StateHoliday_0","StateHoliday_a",
          "CompetitionMonth",
          "Promo2Month","Promo2Week",'Customers'
          ]

In [295]:
prediction0=makeprediction('pre0wtcustomers.csv',feature0,clf4)


True

In [296]:
prediction1=makeprediction('pre1wtcustomers.csv',feature1,clf14)


True

Average method

When making the submission from the average of the three algorithms' predictions, use this part


In [ ]:
prediction02=makeprediction('pre0.csv',feature0,clf2)
prediction03=makeprediction('pre0.csv',feature0,clf3)
prediction04=makeprediction('pre0.csv',feature0,clf4)

In [ ]:
prediction12=makeprediction('pre1.csv',feature1,clf12)
prediction13=makeprediction('pre1.csv',feature1,clf13)
prediction14=makeprediction('pre1.csv',feature1,clf14)

In [ ]:
def mergeavg(predition2,predition3,predition4):
    predcollect=pd.concat([pd.Series(predition2,name='dt'),pd.Series(predition3,name='knn'),pd.Series(predition4,name='rf')], axis=1)
    predavg= predcollect.mean(axis=1)
    return predavg

In [ ]:
prediction0=mergeavg(prediction02.Sales,prediction03.Sales,prediction04.Sales)

In [ ]:
prediction1=mergeavg(prediction12.Sales,prediction13.Sales,prediction14.Sales)

In [ ]:
def generatepreforsub(filename,pred):
    pre_x = pd.read_csv(filename).astype(float32)
    prediction = pd.concat([pre_x.Id, pd.Series(pred,name='Sales')], axis=1)
    return prediction

In [ ]:
prediction0=generatepreforsub('pre0.csv',prediction0)

In [ ]:
prediction1=generatepreforsub('pre1.csv',prediction1)

Make Submission


In [313]:
prediction_sub0=pd.DataFrame(prediction0[["Id","Sales"]],columns=["Id","Sales"])
prediction_sub1=pd.DataFrame(prediction1[["Id","Sales"]],columns=["Id","Sales"])

prediction_sub=pd.concat([prediction_sub0,prediction_sub1])
print len(prediction_sub)
submission = pd.read_csv("submission.csv")
submission = pd.merge(submission,prediction_sub,on="Id", how='outer')
# test rows with no matching prediction (closed stores) default to Sales = 0
submission.fillna(0, inplace=True)


35104

In [298]:
submission.to_csv("submission4.csv", index=False)

Generate Data for Advanced Analysis

Only include test rows and their predictions where Open==1 or Open is null


In [326]:
prediction0.to_csv("prediction0.csv", index=False)
prediction1.to_csv("prediction1.csv", index=False)

In [327]:
fet=["Store","DayOfWeek","Promo","SchoolHoliday","StateHoliday_0","StateHoliday_a",
          "Year","Month","Day",
          "StoreType_a","StoreType_b","StoreType_c","StoreType_d",
          "Assortment_a","Assortment_b","Assortment_c",
            "Customers","Sales"]
prediction_ana0=pd.DataFrame(prediction0[fet])
prediction_ana0["Promo2"]=0
print prediction_ana0.head()
prediction_ana1=pd.DataFrame(prediction1[fet])
prediction_ana0["Promo2"]=1
data_ana0=pd.DataFrame(data[fet])
prediction_ana0["Promo2"]=0
data_ana1=pd.DataFrame(data1[fet])
prediction_ana0["Promo2"]=1
prediction_ana=pd.concat([prediction_ana0,prediction_ana1,data_ana0,data_ana1])


   Store  DayOfWeek  Promo  SchoolHoliday  StateHoliday_0  StateHoliday_a  \
0      1          4      1              0               1               0   
1      1          3      1              0               1               0   
2      1          2      1              0               1               0   
3      1          1      1              0               1               0   
4      1          6      0              0               1               0   

   Year  Month  Day  StoreType_a  StoreType_b  StoreType_c  StoreType_d  \
0  2015      9   17            0            0            1            0   
1  2015      9   16            0            0            1            0   
2  2015      9   15            0            0            1            0   
3  2015      9   14            0            0            1            0   
4  2015      9   12            0            0            1            0   

   Assortment_a  Assortment_b  Assortment_c   Customers        Sales  Promo2  
0             1             0             0  484.890015  3674.843333       0  
1             1             0             0  516.349976  3914.896667       0  
2             1             0             0  527.690002  3905.466667       0  
3             1             0             0  579.429993  5384.690000       0  
4             1             0             0  513.880005  4214.560000       0  

Create Date column


In [328]:
y = np.array(prediction_ana['Year']-1970, dtype='<M8[Y]')
m = np.array(prediction_ana['Month']-1, dtype='<m8[M]')
d = np.array(prediction_ana['Day']-1, dtype='<m8[D]')
prediction_ana['Date'] = pd.Series(y+m+d)
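
On a newer pandas (0.18.1 or later) the same Date column could be assembled with pd.to_datetime; a sketch, not the code that was actually run here:

ymd = prediction_ana[['Year', 'Month', 'Day']].astype(int).rename(
    columns={'Year': 'year', 'Month': 'month', 'Day': 'day'})
prediction_ana['Date'] = pd.to_datetime(ymd)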

In [329]:
print prediction_ana.dtypes


Assortment_a             float64
Assortment_b             float64
Assortment_c             float64
Customers                float64
Day                      float64
DayOfWeek                float64
Month                    float64
Promo                    float64
Promo2                   float64
Sales                    float64
SchoolHoliday            float64
StateHoliday_0           float64
StateHoliday_a           float64
Store                    float64
StoreType_a              float64
StoreType_b              float64
StoreType_c              float64
StoreType_d              float64
Year                     float64
Date              datetime64[ns]
dtype: object

In [330]:
print prediction_ana.head()


   Assortment_a  Assortment_b  Assortment_c   Customers  Day  DayOfWeek  \
0             1             0             0  484.890015   17          4   
1             1             0             0  516.349976   16          3   
2             1             0             0  527.690002   15          2   
3             1             0             0  579.429993   14          1   
4             1             0             0  513.880005   12          6   

   Month  Promo  Promo2        Sales  SchoolHoliday  StateHoliday_0  \
0      9      1       1  3674.843333              0               1   
1      9      1       1  3914.896667              0               1   
2      9      1       1  3905.466667              0               1   
3      9      1       1  5384.690000              0               1   
4      9      0       1  4214.560000              0               1   

   StateHoliday_a  Store  StoreType_a  StoreType_b  StoreType_c  StoreType_d  \
0               0      1            0            0            1            0   
1               0      1            0            0            1            0   
2               0      1            0            0            1            0   
3               0      1            0            0            1            0   
4               0      1            0            0            1            0   

   Year       Date  
0  2015 2015-09-17  
1  2015 2015-09-16  
2  2015 2015-09-15  
3  2015 2015-09-14  
4  2015 2015-09-12  

In [331]:
prediction_ana.drop(["Day","Month","Year"], axis=1, inplace=True)

Sales and Customers Prediction by Date


In [332]:
gr_date=prediction_ana.groupby(['Date'])

In [333]:
gr_date_sales=gr_date.agg({'Customers' : 'mean', 'Sales' : 'mean'})

In [334]:
print gr_date_sales.head()


             Customers        Sales
Date                               
2013-01-01  832.227273  7404.090909
2013-01-02  767.874747  6994.882828
2013-01-03  773.235830  7062.363360
2013-01-04  761.178138  6975.305668
2013-01-05  766.191684  7009.895538

In [335]:
trace1 = go.Scatter(
    x=gr_date_sales.index,
    y=gr_date_sales.Customers,
    name='Customers',
    line=dict(
        color='#ae32e4',
        width = 1 ,
        
    )
)
trace2 = go.Scatter(
    x=gr_date_sales.index,
    y=gr_date_sales.Sales,
    name='Sales',
    mode = 'lines+markers',
    yaxis='y2',
    line=dict(
        color='#3268e4',
        width = 1 
        ),
    opacity=0.8
)
data = [trace1, trace2]
layout = go.Layout(
    title='Time Series of Prediction',
    yaxis=dict(
        title='Customers'
    ),
    yaxis2=dict(
        title='Sales',
        titlefont=dict(
            color='rgb(174,50,228)'
        ),
        tickfont=dict(
            color='rgb(174,50,228)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='multiple-axes-double')
tls.embed("https://plot.ly/~lemonsong/54/time-series-of-prediction/")


Out[335]:

In [336]:
gr_assortment=prediction_ana  # a reference to prediction_ana (not a copy), so the edits below also apply to prediction_ana

In [337]:
#gr_assortment.query('Assortment_a==1')['Assortment']='basic'
gr_assortment.ix[gr_assortment.Assortment_a==1, 'Assortment'] = 'basic'
gr_assortment.ix[gr_assortment.Assortment_b==1, 'Assortment'] = 'extra'
gr_assortment.ix[gr_assortment.Assortment_c==1, 'Assortment'] = 'extended'

gr_assortment.drop(['Assortment_a','Assortment_b','Assortment_c'], axis=1, inplace=True)
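
The same labelling can also be written with .loc, which remains available in later pandas releases where .ix was deprecated; a sketch (it would have to run before the dummy columns are dropped):

gr_assortment.loc[gr_assortment.Assortment_a == 1, 'Assortment'] = 'basic'
gr_assortment.loc[gr_assortment.Assortment_b == 1, 'Assortment'] = 'extra'
gr_assortment.loc[gr_assortment.Assortment_c == 1, 'Assortment'] = 'extended'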

In [338]:
print gr_assortment.columns


Index([u'Customers', u'DayOfWeek', u'Promo', u'Promo2', u'Sales',
       u'SchoolHoliday', u'StateHoliday_0', u'StateHoliday_a', u'Store',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d', u'Date',
       u'Assortment'],
      dtype='object')

In [339]:
gr_assortment1=gr_assortment.groupby(['Assortment', 'DayOfWeek'])
gr_assortment1=gr_assortment1.agg({ 'Customers' : 'sum','Store':'count'}).reset_index()
gr_assortment1['Customers_by_store']=gr_assortment1['Customers']/gr_assortment1['Store']
gr_assortment1


Out[339]:
   Assortment  DayOfWeek        Customers  Store  Customers_by_store
0       basic          1  63852110.939697  75512          845.588925
1       basic          2  60317516.748489  78987          763.638532
2       basic          3  56956555.221161  77892          731.224711
3       basic          4  54820759.229858  73927          741.552602
4       basic          5  57778947.551147  75730          762.959825
5       basic          6  48806490.477905  78964          618.085336
6       basic          7   1793533.239441   1039         1726.211010
7    extended          1  55911744.629669  66735          837.817407
8    extended          2  52371614.009567  69720          751.170597
9    extended          3  49878672.819870  68774          725.254788
10   extended          4  48556544.109848  65433          742.080359
11   extended          5  51473334.070618  66774          770.858928
12   extended          6  47731133.938538  69685          684.955642
13   extended          7    991295.099731   1510          656.486821
14      extra          1   2561143.219238   1203         2128.963607
15      extra          2   2521112.621094   1236         2039.735130
16      extra          3   2509097.530151   1227         2044.904263
17      extra          4   2567180.448975   1220         2104.246270
18      extra          5   2677645.900391   1195         2240.707866
19      extra          6   2224878.769287   1230         1808.844528
20      extra          7   2662687.199341   1233         2159.519221

In [340]:
gr_assortment2=gr_assortment1.pivot('Assortment', 'DayOfWeek', 'Customers_by_store')
print gr_assortment2


DayOfWeek             1            2            3            4            5  \
Assortment                                                                    
basic        845.588925   763.638532   731.224711   741.552602   762.959825   
extended     837.817407   751.170597   725.254788   742.080359   770.858928   
extra       2128.963607  2039.735130  2044.904263  2104.246270  2240.707866   

DayOfWeek             6            7  
Assortment                            
basic        618.085336  1726.211010  
extended     684.955642   656.486821  
extra       1808.844528  2159.519221  

In [341]:
data = [
    go.Heatmap(
        z=gr_assortment2.values,
        x=gr_assortment2.columns,
        y=gr_assortment2.index,
        colorscale=[[0, 'rgb(228, 174, 50)'], [1, 'rgb(174, 50, 228)']]
    )
]
layout = go.Layout(
    title='Average Customers',
    yaxis=dict(
        title='Assortment',
    ),
    xaxis=dict(
        type="category",
        title='WeekOfDay',
    )
)
fig = go.Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='labelled-heatmap')
tls.embed("https://plot.ly/~lemonsong/80/average-sales/")


Out[341]:

In [342]:
gr_store=prediction_ana

In [343]:
gr_store=gr_store.groupby(['Store'])
gr_store_sales=gr_store.agg({'Customers' : 'sum', 'Sales' : 'sum','Promo':'sum','Promo2':'sum'}).reset_index()

In [344]:
gr_store1=pd.merge(gr_store_sales,prediction_ana[['Store','Assortment']],on="Store", how='left').drop_duplicates()

In [345]:
gr_store1.head()


Out[345]:
      Store  Promo       Customers  Promo2           Sales Assortment
0         1    369   461401.890045      41  3886087.213333      basic
822       2    354   457855.000000     NaN  3883858.000000      basic
1606      3    369   612658.549927     NaN  5696661.756667      basic
2426      4    353  1036254.000000     NaN  7556507.000000   extended
3210      5    351   418588.000000     NaN  3642818.000000      basic

In [346]:
gr_store1_assort=gr_store1.groupby(['Assortment'])
gr_store_sales_agg=gr_store1_assort.agg({'Customers' : 'sum', 'Sales' : 'sum','Store':'count','Promo':'sum','Promo2':'sum'}).reset_index()

In [347]:
gr_store_sales_agg


Out[347]:
  Assortment   Promo     Customers  Promo2         Sales  Store
0      basic  206917  3.443259e+08    6600  3.053790e+09    593
1   extended  182888  3.069143e+08    7978  2.982794e+09    513
2      extra    3272  1.772375e+07     236  7.396227e+07      9

In [348]:
fig = {
  "data": [
    {
      "values": gr_store_sales_agg.Store,
      "labels": gr_store_sales_agg.Assortment,
      "domain": {"x": [0, .33]},
      "name": "Store",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    },     
    {
      "values": gr_store_sales_agg.Customers,
      "labels":gr_store_sales_agg.Assortment,
      "text":"Customers",
      "textposition":"inside",
      "domain": {"x": [.33, .66]},
      "name": "Customers",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    }, 
    {
      "values": gr_store_sales_agg.Sales,
      "labels":gr_store_sales_agg.Assortment,
      "text":"Sales",
      "textposition":"inside",
      "domain": {"x": [.66, 1]},
      "name": "Sales",
      "hoverinfo":"label+percent+name",
      "hole": .4,
      "type": "pie"
    },
  ],
  "layout": {
        "title":"Percentage by Assortment Type",
        "annotations": [
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Store",
                "x": 0.10,
                "y": 0.5
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Customers",
                "x": 0.5,
                "y": 0.5
            },
            {
                "font": {
                    "size": 20
                },
                "showarrow": False,
                "text": "Sales",
                "x": 0.9,
                "y": 0.5
            }
        ]
    }
}

url = py.plot(fig, filename='percentage-by-assortment')

In [ ]:


In [ ]: