In [1]:
%matplotlib inline
import pandas as pd
import argparse
import os
import matplotlib.pyplot as plt
import numpy as np

In [125]:
def rmspe(tru, pred):
    if tru==0.0 or isinstance(tru, str) or np.isnan(tru) or np.isnan(pred):
        return 0.0
    return (float(pred-tru)/tru)**2
def eval(trues, preds):
    return np.mean([rmspe(t,p) for t,p in zip(trues, preds)])

In [275]:
train_file = "/home/gv/ashabou/data/ds/rossmann/train.csv"
test_file = "/home/gv/ashabou/data/ds/rossmann/test.csv"
store_file = "/home/gv/ashabou/data/ds/rossmann/store.csv"

In [276]:
train = pd.read_csv( train_file )
test = pd.read_csv( test_file )
store = pd.read_csv( store_file )

In [277]:
print train.head(5)
print test.head(5)
print train.count()
print test.count()
print store.head(10)
print store.count()


   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263        555     1      1            0   
1      2          5  2015-07-31   6064        625     1      1            0   
2      3          5  2015-07-31   8314        821     1      1            0   
3      4          5  2015-07-31  13995       1498     1      1            0   
4      5          5  2015-07-31   4822        559     1      1            0   

   SchoolHoliday  
0              1  
1              1  
2              1  
3              1  
4              1  

[5 rows x 9 columns]
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17     1      1            0              0
1   2      3          4  2015-09-17     1      1            0              0
2   3      7          4  2015-09-17     1      1            0              0
3   4      8          4  2015-09-17     1      1            0              0
4   5      9          4  2015-09-17     1      1            0              0

[5 rows x 8 columns]
Store            1017209
DayOfWeek        1017209
Date             1017209
Sales            1017209
Customers        1017209
Open             1017209
Promo            1017209
StateHoliday     1017209
SchoolHoliday    1017209
dtype: int64
Id               41088
Store            41088
DayOfWeek        41088
Date             41088
Open             41077
Promo            41088
StateHoliday     41088
SchoolHoliday    41088
dtype: int64
   Store StoreType Assortment  CompetitionDistance  CompetitionOpenSinceMonth  \
0      1         c          a                 1270                          9   
1      2         a          a                  570                         11   
2      3         a          a                14130                         12   
3      4         c          c                  620                          9   
4      5         a          a                29910                          4   
5      6         a          a                  310                         12   
6      7         a          c                24000                          4   
7      8         a          a                 7520                         10   
8      9         a          c                 2030                          8   
9     10         a          a                 3160                          9   

   CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \
0                      2008       0              NaN              NaN   
1                      2007       1               13             2010   
2                      2006       1               14             2011   
3                      2009       0              NaN              NaN   
4                      2015       0              NaN              NaN   
5                      2013       0              NaN              NaN   
6                      2013       0              NaN              NaN   
7                      2014       0              NaN              NaN   
8                      2000       0              NaN              NaN   
9                      2009       0              NaN              NaN   

     PromoInterval  
0              NaN  
1  Jan,Apr,Jul,Oct  
2  Jan,Apr,Jul,Oct  
3              NaN  
4              NaN  
5              NaN  
6              NaN  
7              NaN  
8              NaN  
9              NaN  

[10 rows x 10 columns]
Store                        1115
StoreType                    1115
Assortment                   1115
CompetitionDistance          1112
CompetitionOpenSinceMonth     761
CompetitionOpenSinceYear      761
Promo2                       1115
Promo2SinceWeek               571
Promo2SinceYear               571
PromoInterval                 571
dtype: int64

In [6]:
train['Store'].hist()
print train['Store'].min()
print train['Store'].max()
test['Store'].hist()
print test['Store'].min()
print test['Store'].max()


1
1115
1
1115

In [7]:
train['DayOfWeek'].hist()
test['DayOfWeek'].hist()


Out[7]:
<matplotlib.axes.AxesSubplot at 0x7f3747b72750>

In [8]:
train['Sales'].loc[train['Sales']>0].hist(bins=100)
print (train['Sales']==0).sum()


172871

In [9]:
train['Customers'].loc[train['Customers']>0].hist(bins=100)
print (train['Customers']==0).sum()
print train['Customers'].loc[train['Customers']>0].describe()


172869
count    844340.000000
mean        762.775369
std         401.195377
min           3.000000
25%         519.000000
50%         676.000000
75%         893.000000
max        7388.000000
Name: Customers, dtype: float64

In [10]:
#0 sales while there are some customers
train.loc[(train['Customers']>0) & (train['Sales']==0)]


Out[10]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
478649 1100 2 2014-04-29 0 3 1 1 0 0
889932 948 4 2013-04-25 0 5 1 1 0 0

2 rows × 9 columns


In [11]:
plt.figure()
train[['Sales', 'Customers']].plot(x='Sales', y='Customers', style='bo')
plt.show()



In [12]:
#from pandas.tools.plotting import scatter_matrix
#_ = scatter_matrix(train[['Sales', 'Store', 'Customers']], figsize=(14, 10))

In [13]:
plt.figure()
train.query('Open==1')[['Sales', 'Store']].plot(x='Sales', y='Store', style='bo')
plt.show()



In [14]:
train.query('Open==1')[['Sales', 'DayOfWeek']].plot(x='Sales', y='DayOfWeek', style='bo')


Out[14]:
<matplotlib.axes.AxesSubplot at 0x7f3747a0fed0>

In [15]:
train.query('Open==1')[['Sales', 'Promo']].plot(x='Sales', y='Promo', style='bo')


Out[15]:
<matplotlib.axes.AxesSubplot at 0x7f3747ad8290>

In [16]:
train.query('Open==1')[['Store', 'Promo']].plot(x='Store', y='Promo', style='bo')


Out[16]:
<matplotlib.axes.AxesSubplot at 0x7f37477fd4d0>

In [17]:
train.query('Open==1')[['Store', 'Promo']].groupby('Store').sum().plot()
print train.query('Open==1')[['Store', 'Promo']].groupby('Store').sum().sort('Promo',ascending=0).head(10)


       Promo
Store       
85       360
1097     360
262      360
423      360
335      360
769      360
562      360
494      360
682      360
733      360

[10 rows x 1 columns]

In [18]:
train.query('Open==1')[['Store', 'Sales']].groupby('Store').sum().plot()
print train.query('Open==1')[['Store', 'Sales']].groupby('Store').sum().sort('Sales',ascending=0).head(5)


          Sales
Store          
262    19516842
817    17057867
562    16927322
1114   16202585
251    14896870

[5 rows x 1 columns]

In [19]:
#not stores that have more propmo have more sales
print train.query('Open==1')[['Store', 'Sales','Promo']].groupby('Store').sum().sort('Sales',ascending=0).head(5)


          Sales  Promo
Store                 
262    19516842    360
817    17057867    353
562    16927322    360
1114   16202585    353
251    14896870    350

[5 rows x 2 columns]

In [20]:
train.query('Open==1')[['Store', 'Sales','DayOfWeek']].groupby(['Store','DayOfWeek']).sum()


Out[20]:
Sales
Store DayOfWeek
1 1 662780
2 627874
3 601354
4 552772
5 609716
6 662358
2 1 775930
2 719473
3 770213
4 630869
5 602377
6 384996
3 1 1067314
2 1024409
3 941448
4 860118
5 920730
6 594242
4 1 1389002
2 1260927
3 1177438
4 1152255
5 1221683
6 1355202
5 1 781874
2 684699
3 677571
4 598243
5 621831
6 278600
6 1 850893
2 789803
3 738733
4 696573
5 745972
6 475784
7 1 1380861
2 1313785
3 1115744
4 1089027
5 1204452
6 826333
8 1 902885
2 828902
3 732428
4 806263
5 670348
6 402031
9 1 1018249
2 881441
3 811510
4 760322
5 833736
6 806803
10 1 839376
2 773559
3 704621
4 678242
5 722925
6 646919
...

6723 rows × 1 columns


In [21]:
train['Year'] = train['Date'].map(lambda x: int(x.split("-")[0]))
train['Month'] = train['Date'].map(lambda x: int(x.split("-")[1]))
train['Day'] = train['Date'].map(lambda x: int(x.split("-")[2]))
train.head(5)


Out[21]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday Year Month Day
0 1 5 2015-07-31 5263 555 1 1 0 1 2015 7 31
1 2 5 2015-07-31 6064 625 1 1 0 1 2015 7 31
2 3 5 2015-07-31 8314 821 1 1 0 1 2015 7 31
3 4 5 2015-07-31 13995 1498 1 1 0 1 2015 7 31
4 5 5 2015-07-31 4822 559 1 1 0 1 2015 7 31

5 rows × 12 columns


In [22]:
print set(train['Year'].tolist())


set([2013, 2014, 2015])

In [23]:
print set(train.query("Open==1")['Month'].tolist())


set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

In [71]:
print "2013-->", set(train.query("Open==1 & Year==2013")['Month'].tolist())
print "2014-->", set(train.query("Open==1 & Year==2014")['Month'].tolist())
print "2015-->", set(train.query("Open==1 & Year==2015")['Month'].tolist())


2013--> set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
2014--> set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
2015--> set([1, 2, 3, 4, 5, 6, 7])

In [24]:
test['Year'] = test['Date'].map(lambda x: int(x.split("-")[0]))
test['Month'] = test['Date'].map(lambda x: int(x.split("-")[1]))
test['Day'] = test['Date'].map(lambda x: int(x.split("-")[2]))
test.head(5)


Out[24]:
Id Store DayOfWeek Date Open Promo StateHoliday SchoolHoliday Year Month Day
0 1 1 4 2015-09-17 1 1 0 0 2015 9 17
1 2 3 4 2015-09-17 1 1 0 0 2015 9 17
2 3 7 4 2015-09-17 1 1 0 0 2015 9 17
3 4 8 4 2015-09-17 1 1 0 0 2015 9 17
4 5 9 4 2015-09-17 1 1 0 0 2015 9 17

5 rows × 11 columns


In [25]:
print set(test['Year'].tolist())


set([2015])

In [26]:
print set(test.query("Open==1")['Month'].tolist())


set([8, 9])

In [27]:
data = train.query("Open==1").query('Month==6')[['Sales','Year','Store']].groupby(['Store','Year']).sum()
print data.head(20)
data.query('Year==2013').plot()
data.query('Year==2014').plot()
data.query('Year==2015').plot()


             Sales
Store Year        
1     2013  113411
      2014  107905
      2015  110011
2     2013  115871
      2014  122758
      2015  132800
3     2013  170462
      2014  171301
      2015  189918
4     2013  235227
      2014  235443
      2015  248643
5     2013  105280
      2014  112854
      2015  124041
6     2013  155385
      2014  124986
      2015  125511
7     2013  211771
      2014  220951

[20 rows x 1 columns]
Out[27]:
<matplotlib.axes.AxesSubplot at 0x7f3747615d90>

In [38]:
data = train.query("Open==1").query('Month==8 | Month==9')[['Sales','Store','Month','Year']].groupby(['Store','Month']).median()
data.rename(columns={'Sales': 'Median'}, inplace=True)
print data.head(10)


             Median  Year
Store Month              
1     8        4174  2013
      9        4175  2014
2     8        4467  2013
      9        4709  2014
3     8        6051  2013
      9        5978  2014
4     8        9338  2013
      9        8995  2014
5     8        4458  2013
      9        4314  2014

[10 rows x 2 columns]

In [96]:
valid = train.copy().query('(Month==8 | Month==9) & Year==2014')
valid['tmp'] = train['Open'].map(lambda x: 1 if x == 1 else 0)
print valid.head(5)


        Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \
322400      1          2  2014-09-30   4919        550     1      1   
322401      2          2  2014-09-30   8461        839     1      1   
322402      3          2  2014-09-30  12260       1084     1      1   
322403      4          2  2014-09-30  12659       1486     1      1   
322404      5          2  2014-09-30   6633        676     1      1   

       StateHoliday  SchoolHoliday  Year  Month  Day  tmp  
322400            0              0  2014      9   30    1  
322401            0              0  2014      9   30    1  
322402            0              0  2014      9   30    1  
322403            0              0  2014      9   30    1  
322404            0              0  2014      9   30    1  

[5 rows x 13 columns]

In [97]:
ttrain = train.query("Open==1").query('(Month==8 | Month==9) & Year==2013')[['Sales','Store','Month']].groupby(['Store','Month']).median()
ttrain.rename(columns={'Sales': 'Median'}, inplace=True)
print ttrain.head(10)
out = valid.join(ttrain, on=['Store','Month'])
print out.head(5)


             Median
Store Month        
1     8        4494
      9        4212
2     8        4399
      9        4574
3     8        5991
      9        5847
4     8        8709
      9        8701
5     8        4816
      9        4314

[10 rows x 1 columns]
        Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \
322400      1          2  2014-09-30   4919        550     1      1   
322401      2          2  2014-09-30   8461        839     1      1   
322402      3          2  2014-09-30  12260       1084     1      1   
322403      4          2  2014-09-30  12659       1486     1      1   
322404      5          2  2014-09-30   6633        676     1      1   

       StateHoliday  SchoolHoliday  Year  Month  Day  tmp  Median  
322400            0              0  2014      9   30    1    4212  
322401            0              0  2014      9   30    1    4574  
322402            0              0  2014      9   30    1    5847  
322403            0              0  2014      9   30    1    8701  
322404            0              0  2014      9   30    1    4314  

[5 rows x 14 columns]

In [98]:
out['preds'] = out['tmp']*out['Median']
print out.shape
print out[['Sales','preds']].head(5)
print eval(out['Sales'].tolist(), out['preds'].tolist())


(57035, 15)
        Sales  preds
322400   4919   4212
322401   8461   4574
322402  12260   5847
322403  12659   8701
322404   6633   4314

[5 rows x 2 columns]
0.0789875948933

In [291]:
test['tmp'] = test['Open'].map(lambda x: 1 if x == 1 else 0)
print test.head(5)


   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday  \
0   1      1          4  2015-09-17     1      1            0              0   
1   2      3          4  2015-09-17     1      1            0              0   
2   3      7          4  2015-09-17     1      1            0              0   
3   4      8          4  2015-09-17     1      1            0              0   
4   5      9          4  2015-09-17     1      1            0              0   

   tmp  
0    1  
1    1  
2    1  
3    1  
4    1  

[5 rows x 9 columns]

In [68]:
out = test[['Id','Store','tmp','Month']].join(data, on=['Store','Month'])#.sort('Id')
print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)


   Id  Store  tmp  Month  Median  Year
0   1      1    1      9    4175  2014
1   2      3    1      9    5978  2014
2   3      7    1      9    8023  2014
3   4      8    1      9    5135  2014
4   5      9    1      9    5770  2014
5   6     10    1      9    5088  2014
6   7     11    1      9    6979  2014
7   8     12    1      9    6271  2014
8   9     13    1      9    4059  2013
9  10     14    1      9    5050  2014

[10 rows x 6 columns]
          Id  Store  tmp  Month  Median  Year
14552  14553      1    1      8    4174  2013
15408  15409      1    0      8    4174  2013
16264  16265      1    1      8    4174  2013
17120  17121      1    1      8    4174  2013
17976  17977      1    1      8    4174  2013
18832  18833      1    1      8    4174  2013
19688  19689      1    1      8    4174  2013
20544  20545      1    1      8    4174  2013
21400  21401      1    0      8    4174  2013
22256  22257      1    1      8    4174  2013

[10 rows x 6 columns]
        Id  Store  tmp  Month  Median  Year
0        1      1    1      9    4175  2014
856    857      1    1      9    4175  2014
1712  1713      1    1      9    4175  2014
2568  2569      1    1      9    4175  2014
3424  3425      1    0      9    4175  2014
4280  4281      1    1      9    4175  2014
5136  5137      1    1      9    4175  2014
5992  5993      1    1      9    4175  2014
6848  6849      1    1      9    4175  2014
7704  7705      1    1      9    4175  2014

[10 rows x 6 columns]

In [69]:
out['Sales'] = out['tmp']*out['Median']

print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)


   Id  Store  tmp  Month  Median  Year  Sales
0   1      1    1      9    4175  2014   4175
1   2      3    1      9    5978  2014   5978
2   3      7    1      9    8023  2014   8023
3   4      8    1      9    5135  2014   5135
4   5      9    1      9    5770  2014   5770
5   6     10    1      9    5088  2014   5088
6   7     11    1      9    6979  2014   6979
7   8     12    1      9    6271  2014   6271
8   9     13    1      9    4059  2013   4059
9  10     14    1      9    5050  2014   5050

[10 rows x 7 columns]
          Id  Store  tmp  Month  Median  Year  Sales
14552  14553      1    1      8    4174  2013   4174
15408  15409      1    0      8    4174  2013      0
16264  16265      1    1      8    4174  2013   4174
17120  17121      1    1      8    4174  2013   4174
17976  17977      1    1      8    4174  2013   4174
18832  18833      1    1      8    4174  2013   4174
19688  19689      1    1      8    4174  2013   4174
20544  20545      1    1      8    4174  2013   4174
21400  21401      1    0      8    4174  2013      0
22256  22257      1    1      8    4174  2013   4174

[10 rows x 7 columns]
        Id  Store  tmp  Month  Median  Year  Sales
0        1      1    1      9    4175  2014   4175
856    857      1    1      9    4175  2014   4175
1712  1713      1    1      9    4175  2014   4175
2568  2569      1    1      9    4175  2014   4175
3424  3425      1    0      9    4175  2014      0
4280  4281      1    1      9    4175  2014   4175
5136  5137      1    1      9    4175  2014   4175
5992  5993      1    1      9    4175  2014   4175
6848  6849      1    1      9    4175  2014   4175
7704  7705      1    1      9    4175  2014   4175

[10 rows x 7 columns]

In [70]:
out = out[['Id','Sales']]
print out.head(10)


   Id  Sales
0   1   4175
1   2   5978
2   3   8023
3   4   5135
4   5   5770
5   6   5088
6   7   6979
7   8   6271
8   9   4059
9  10   5050

[10 rows x 2 columns]

In [141]:
out.to_csv('/home/gv/ashabou/mycsv.csv', index=False)

In [103]:
data_2013 = train.query('(Month==8 | Month==9) & Year==2013')[['Sales','Store', 'Day','Month']].sort(['Store','Month', 'Day'])
data_2013.index=range(1, len(data_2013) + 1)
print data_2013.head(10)
data_2014 = train.query('(Month==8 | Month==9) & Year==2014')[['Sales','Store','Day','Month']].sort(['Store','Month', 'Day'])
data_2014.index=range(1, len(data_2014) + 1)
print data_2014.head(10)


    Sales  Store  Day  Month
1    4994      1    1      8
2    4494      1    2      8
3    4461      1    3      8
4       0      1    4      8
5    4086      1    5      8
6    3582      1    6      8
7    4143      1    7      8
8    3680      1    8      8
9    3257      1    9      8
10   3768      1   10      8

[10 rows x 4 columns]
    Sales  Store  Day  Month
1    5038      1    1      8
2    4731      1    2      8
3       0      1    3      8
4    5655      1    4      8
5    5433      1    5      8
6    5337      1    6      8
7    4154      1    7      8
8    4451      1    8      8
9    4174      1    9      8
10      0      1   10      8

[10 rows x 4 columns]

In [109]:
join1314 = data_2013.join(data_2014.groupby(['Store','Month','Day']).mean().rename(columns={'Sales': 'Median'}), on=['Store','Month','Day'])
print join1314.head(5)


   Sales  Store  Day  Month  Median
1   4994      1    1      8    5038
2   4494      1    2      8    4731
3   4461      1    3      8       0
4      0      1    4      8    5655
5   4086      1    5      8    5433

[5 rows x 5 columns]

In [126]:
sales = join1314['Sales'][1:]
sales.index =range(1, len(sales) + 1) 
median = join1314['Median'][:-1]
median.index =range(1, len(median) + 1) 
ct = pd.concat([sales, median], axis=1)
print ct.head(5)
ct.plot()
print sales.describe()
print median.describe()

print eval(sales.tolist(), median.tolist())


   Sales  Median
1   4494    5038
2   4461    4731
3      0       0
4   4086    5655
5   3582    5433

[5 rows x 2 columns]
count    68014.000000
mean      5529.621784
std       3490.589233
min          0.000000
25%       3707.000000
50%       5463.000000
75%       7394.000000
max      32926.000000
Name: Sales, dtype: float64
count    57034.000000
mean      5755.464460
std       3616.194141
min          0.000000
25%       3869.000000
50%       5695.000000
75%       7681.000000
max      33913.000000
Name: Median, dtype: float64
0.126957561374

In [130]:
join_indexed = join1314.groupby(['Store','Month','Day']).median()
print join_indexed.head(5)


                 Sales  Median
Store Month Day               
1     8     1     4994    5038
            2     4494    4731
            3     4461       0
            4        0    5655
            5     4086    5433

[5 rows x 2 columns]

In [202]:
out = test[['Id','Store','tmp','Month','Day']].join(join_indexed, on=['Store','Month','Day'])#.sort('Id')
out.rename(columns={'Sales': 'y2013', 'Median':'y2014'}, inplace=True)
print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)


   Id  Store  tmp  Month  Day  y2013  y2014
0   1      1    1      9   17   3685   4383
1   2      3    1      9   17   5517   8034
2   3      7    1      9   17   7767   8614
3   4      8    1      9   17   4473   6038
4   5      9    1      9   17   4830   6152
5   6     10    1      9   17   4394   5714
6   7     11    1      9   17   6658   8334
7   8     12    1      9   17   5117   7610
8   9     13    1      9   17   3225    NaN
9  10     14    1      9   17   4229   5298

[10 rows x 7 columns]
          Id  Store  tmp  Month  Day  y2013  y2014
14552  14553      1    1      8   31   5317      0
15408  15409      1    0      8   30   5469   4094
16264  16265      1    1      8   29   4804   3869
17120  17121      1    1      8   28   4583   3920
17976  17977      1    1      8   27   5156   3148
18832  18833      1    1      8   26   5482   3396
19688  19689      1    1      8   25      0   3414
20544  20545      1    1      8   24   4752      0
21400  21401      1    0      8   23   3493   3582
22256  22257      1    1      8   22   3703   3971

[10 rows x 7 columns]
        Id  Store  tmp  Month  Day  y2013  y2014
0        1      1    1      9   17   3685   4383
856    857      1    1      9   16   4249   4381
1712  1713      1    1      9   15      0   4611
2568  2569      1    1      9   14   4663      0
3424  3425      1    0      9   13   5187   4270
4280  4281      1    1      9   12   5079   3906
5136  5137      1    1      9   11   5103   3392
5992  5993      1    1      9   10   5403   3275
6848  6849      1    1      9    9   5893   3377
7704  7705      1    1      9    8      0   3676

[10 rows x 7 columns]

In [203]:
out['Sales']=[0]*len(out['tmp'])
print out.head(30)


    Id  Store  tmp  Month  Day  y2013  y2014  Sales
0    1      1    1      9   17   3685   4383      0
1    2      3    1      9   17   5517   8034      0
2    3      7    1      9   17   7767   8614      0
3    4      8    1      9   17   4473   6038      0
4    5      9    1      9   17   4830   6152      0
5    6     10    1      9   17   4394   5714      0
6    7     11    1      9   17   6658   8334      0
7    8     12    1      9   17   5117   7610      0
8    9     13    1      9   17   3225    NaN      0
9   10     14    1      9   17   4229   5298      0
10  11     15    1      9   17   4554   6488      0
11  12     16    1      9   17   6407   7908      0
12  13     19    1      9   17   4994   6694      0
13  14     20    1      9   17   6639    NaN      0
14  15     21    1      9   17   3114   5710      0
15  16     22    1      9   17   3950    NaN      0
16  17     23    1      9   17   3436   5617      0
17  18     24    1      9   17   6660   9067      0
18  19     25    1      9   17   8897  10277      0
19  20     27    1      9   17   7138  10609      0
20  21     29    1      9   17   4413   7053      0
21  22     30    1      9   17   4370   4844      0
22  23     31    1      9   17   3984   5287      0
23  24     32    1      9   17   3020    NaN      0
24  25     33    1      9   17   6094   6724      0
25  26     35    1      9   17   6735   9648      0
26  27     36    1      9   17   7794    NaN      0
27  28     38    1      9   17   3548   5198      0
28  29     39    1      9   17   3202   5132      0
29  30     40    1      9   17   3445   4522      0

[30 rows x 8 columns]

In [205]:
def computeValue(x,y):
    if np.isnan(x) or x==0:
        return y
    if np.isnan(y) or y==0:
        return x
    return (x+y)/2.0
out = out.sort(['Store','Month','Day'])
out['Sales']=[computeValue(x,y) for x,y in zip(out['y2014'].tolist()[1:-1], out['y2013'].tolist()[2:])]+\
            [computeValue(x,y) for x,y in zip(out['y2014'].tolist()[-2:], out['y2013'].tolist()[-2:])]
out['Sales'] = out['Sales']*out['tmp']
print out.head(1000)


          Id  Store  tmp  Month  Day  y2013  y2014   Sales
40232  40233      1    1      8    1   4994   5038  4596.0
39376  39377      1    0      8    2   4494   4731     0.0
38520  38521      1    1      8    3   4461      0  4870.5
37664  37665      1    1      8    4      0   5655  4507.5
36808  36809      1    1      8    5   4086   5433  4740.0
35952  35953      1    1      8    6   3582   5337  3917.0
35096  35097      1    1      8    7   4143   4154  3854.0
34240  34241      1    1      8    8   3680   4451  3971.0
33384  33385      1    0      8    9   3257   4174     0.0
32528  32529      1    1      8   10   3768      0  4606.0
31672  31673      1    1      8   11      0   3886  4242.0
30816  30817      1    1      8   12   5326   3185  4396.5
29960  29961      1    1      8   13   5299   4069  4240.5
29104  29105      1    1      8   14   4724   3906  4149.0
28248  28249      1    1      8   15   4575   3746  4251.5
27392  27393      1    0      8   16   4552   4425     0.0
26536  26537      1    1      8   17   4078      0  4788.5
25680  25681      1    1      8   18      0   5623  4341.0
24824  24825      1    1      8   19   3954   5190  3688.0
23968  23969      1    1      8   20   3492   4280  4015.0
23112  23113      1    1      8   21   3096   4327  3732.0
22256  22257      1    1      8   22   3703   3971  4167.0
21400  21401      1    0      8   23   3493   3582     0.0
20544  20545      1    1      8   24   4752      0  4448.0
19688  19689      1    1      8   25      0   3414  4276.0
18832  18833      1    1      8   26   5482   3396  3865.5
17976  17977      1    1      8   27   5156   3148  4362.0
17120  17121      1    1      8   28   4583   3920  4669.0
16264  16265      1    1      8   29   4804   3869  4705.5
15408  15409      1    0      8   30   5469   4094     0.0
14552  14553      1    1      8   31   5317      0  4829.5
13696  13697      1    1      9    1      0   5464  4468.0
12840  12841      1    1      9    2   4195   5008  4160.5
11984  11985      1    1      9    3   3928   4978  4114.5
11128  11129      1    1      9    4   3343   4346  4271.0
10272  10273      1    1      9    5   3883   4706  4311.0
9416    9417      1    0      9    6   3836   3959     0.0
8560    8561      1    1      9    7   4663      0  4784.5
7704    7705      1    1      9    8      0   3676  4390.0
6848    6849      1    1      9    9   5893   3377  4189.0
5992    5993      1    1      9   10   5403   3275  4235.5
5136    5137      1    1      9   11   5103   3392  4546.5
4280    4281      1    1      9   12   5079   3906  4466.5
3424    3425      1    0      9   13   5187   4270     0.0
2568    2569      1    1      9   14   4663      0  4430.0
1712    1713      1    1      9   15      0   4611  4033.0
856      857      1    1      9   16   4249   4381  6560.5
0          1      1    1      9   17   3685   4383  7751.5
40233  40234      3    1      8    1   8738   7893  4351.5
39377  39378      3    0      8    2   7610   4157     0.0
38521  38522      3    1      8    3   4546      0  7523.0
37665  37666      3    1      8    4      0   9055  7440.0
36809  36810      3    1      8    5   5991   8714  6740.5
35953  35954      3    1      8    6   6166   7499  6963.0
35097  35098      3    1      8    7   5982   8010  6377.0
34241  34242      3    1      8    8   5916   6763  3507.0
33385  33386      3    0      8    9   5991   3131     0.0
32529  32530      3    1      8   10   3883      0  8538.0
31673  31674      3    1      8   11      0   6114  7717.5
30817  30818      3    1      8   12  10962   5431  7251.5
         ...    ...  ...    ...  ...    ...    ...     ...

[1000 rows x 8 columns]

In [206]:
print out.query('tmp==0').shape
print out.query('y2013==0').shape
print out.query('y2014==0').shape
print out.query('Sales==0').shape


(5995, 8)
(6049, 8)
(4630, 8)
(6184, 8)

In [207]:
out_csv = out[['Id','Sales']].sort('Id')
print out_csv.head(10)


   Id    Sales
0   1   7751.5
1   2  10985.0
2   3   5510.0
3   4   7107.0
4   5   6024.5
5   6   8783.0
6   7   8225.0
7   8   5269.0
8   9   5980.0
9  10   7468.0

[10 rows x 2 columns]

In [208]:
out_csv.to_csv('/home/gv/ashabou/mycsv2.csv', index=False)

In [246]:
data = train.query('Year==2013 | Year==2014')[['Sales','Store', 'DayOfWeek','Promo','SchoolHoliday','StateHoliday']].sort(['Store','DayOfWeek'])
print data.head(10)
print set(data['Promo'].tolist()),  set(data['SchoolHoliday'].tolist()),  set(data['StateHoliday'].tolist())
data['StateHoliday'][data['StateHoliday']==0]='0'
print set(data['Promo'].tolist()),  set(data['SchoolHoliday'].tolist()),  set(data['StateHoliday'].tolist())


        Sales  Store  DayOfWeek  Promo  SchoolHoliday StateHoliday
238250   6463      1          1      0              1            0
244795   9331      1          1      0              1            0
251340   8069      1          1      1              0            0
257885   4884      1          1      0              0            0
264430   7380      1          1      1              0            0
270975   6527      1          1      1              0            0
277520   3582      1          1      0              0            0
284065   5474      1          1      1              0            0
290610   5857      1          1      1              0            0
297155   4260      1          1      0              1            0

[10 rows x 6 columns]
set([0, 1]) set([0, 1]) set([0, '0', 'c', 'b', 'a'])
set([0, 1]) set([0, 1]) set(['a', '0', 'c', 'b'])

In [256]:
data_indexed = data.groupby(['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday']).median()
data_indexed.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed.head(5)
data_valid = data.join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])
print data_valid.head(10)
print eval(data_valid['Sales'].tolist(), data_valid['Median'].tolist())
out = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])#.sort('Id')
print out.head(10)
print set(test['Promo'].tolist()),  set(test['SchoolHoliday'].tolist()),  set(test['StateHoliday'].tolist())


                                                  Median
Store DayOfWeek Promo SchoolHoliday StateHoliday        
1     1         0     0             0               4004
                                    a                  0
                      1             0               4260
                                    b                  0
                1     0             0               5830

[5 rows x 1 columns]
        Sales  Store  DayOfWeek  Promo  SchoolHoliday StateHoliday  Median
238250   6463      1          1      0              1            0    4260
244795   9331      1          1      0              1            0    4260
251340   8069      1          1      1              0            0    5830
257885   4884      1          1      0              0            0    4004
264430   7380      1          1      1              0            0    5830
270975   6527      1          1      1              0            0    5830
277520   3582      1          1      0              0            0    4004
284065   5474      1          1      1              0            0    5830
290610   5857      1          1      1              0            0    5830
297155   4260      1          1      0              1            0    4260

[10 rows x 7 columns]
0.0439394927477
   Id  Store  tmp  Day  DayOfWeek  Promo  SchoolHoliday StateHoliday  Median
0   1      1    1   17          4      1              0            0  4945.0
1   2      3    1   17          4      1              0            0  7891.0
2   3      7    1   17          4      1              0            0  8664.0
3   4      8    1   17          4      1              0            0  6685.0
4   5      9    1   17          4      1              0            0  5880.5
5   6     10    1   17          4      1              0            0  5681.0
6   7     11    1   17          4      1              0            0  8114.0
7   8     12    1   17          4      1              0            0  7963.0
8   9     13    1   17          4      1              0            0  5271.0
9  10     14    1   17          4      1              0            0  6011.5

[10 rows x 9 columns]
set([0, 1]) set([0, 1]) set(['a', '0'])

In [271]:
idx_nan = pd.isnull(out['Median']).nonzero()[0]
print idx_nan
data_indexed2 = data[['Store','DayOfWeek','Promo','Sales']].groupby(['Store','DayOfWeek','Promo']).median()
data_indexed2.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed2.head(5)
out2 = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed2, on=['Store','DayOfWeek','Promo'])
print pd.isnull(out2['Median']).nonzero()
out3 = out.copy()
out3['Median'][idx_nan] = out2['Median'][idx_nan]
print pd.isnull(out3['Median']).nonzero()


[28256 28261 28263 28271 28274 28278 28282 28287 28288 28291 28302 28306
 28310 28315 28322 28323 28328 28333 28334 28342 28344 28347 28349 28350
 28351 28356 28357 28360 28365 28368 28372 28373 28379 28380 28385 28387
 28389 28393 28394 28395 28404 28412 28415 28425 28433 28447 28450 28458
 28459 28461 28464 28465 28467 28468 28475 28483 28492 28511 28514 28517
 28529 28542 28554 28559 28564 28565 28571 28577 28578 28579 28580 28584
 28600 28610 28614 28617 28621 28623 28627 28629 28630 28640 28641 28645
 28649 28658 28663 28664 28666 28669 28677 28689 28691 28693 28703 28711
 28716 28719 28720 28724 28725 28733 28736 28737 28738 28739 28745 28747
 28749 28757 28764 28765 28766 28771 28775 28783 28789 28790 28793 28795
 28796 28797 28800 28803 28817 28820 28824 28828 28840 28845 28849 28852
 28863 28869 28870 28871 28877 28881 28885 28894 28902 28908 28909 28924
 28929 28934 28936 28941 28943 28944 28945 28950 28955 28958 28959 28969
 28976 28986 29000 29003 29010 29016 29018 29022 29025 29030 29036 29041
 29044 29050 29056 29065 29067 29077 29085 29086 29092 29094 29097 29098]
                       Median
Store DayOfWeek Promo        
1     1         0      4014.0
                1      5821.0
      2         0      3789.0
                1      5397.0
      3         0      3805.5

[5 rows x 1 columns]
(array([], dtype=int64),)
(array([], dtype=int64),)

In [272]:
out3['Sales'] = out3['Median']*out3['tmp']
print out3.sort('Store').head(10)


          Id  Store  tmp  Day  DayOfWeek  Promo  SchoolHoliday StateHoliday  \
0          1      1    1   17          4      1              0            0   
2568    2569      1    1   14          1      1              0            0   
38520  38521      1    1    3          1      1              1            0   
21400  21401      1    0   23          7      0              1            0   
31672  31673      1    1   11          2      0              1            0   
22256  22257      1    1   22          6      0              1            0   
30816  30817      1    1   12          3      0              1            0   
5992    5993      1    1   10          4      0              0            0   
23112  23113      1    1   21          5      1              1            0   
11984  11985      1    1    3          4      1              1            0   

       Median   Sales  
0      4945.0  4945.0  
2568   5830.0  5830.0  
38520  5655.0  5655.0  
21400     0.0     0.0  
31672  3582.0  3582.0  
22256  4423.0  4423.0  
30816  3805.0  3805.0  
5992   3795.0  3795.0  
23112  4494.0  4494.0  
11984  4520.5  4520.5  

[10 rows x 10 columns]

In [273]:
out3[['Id','Sales']].to_csv('/home/gv/ashabou/mycsv3.csv', index=False)

In [281]:
print store.head(10)
print store.count()
print 'nb stores=', len(train['Store'].unique())


   Store StoreType Assortment  CompetitionDistance  CompetitionOpenSinceMonth  \
0      1         c          a                 1270                          9   
1      2         a          a                  570                         11   
2      3         a          a                14130                         12   
3      4         c          c                  620                          9   
4      5         a          a                29910                          4   
5      6         a          a                  310                         12   
6      7         a          c                24000                          4   
7      8         a          a                 7520                         10   
8      9         a          c                 2030                          8   
9     10         a          a                 3160                          9   

   CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \
0                      2008       0              NaN              NaN   
1                      2007       1               13             2010   
2                      2006       1               14             2011   
3                      2009       0              NaN              NaN   
4                      2015       0              NaN              NaN   
5                      2013       0              NaN              NaN   
6                      2013       0              NaN              NaN   
7                      2014       0              NaN              NaN   
8                      2000       0              NaN              NaN   
9                      2009       0              NaN              NaN   

     PromoInterval  
0              NaN  
1  Jan,Apr,Jul,Oct  
2  Jan,Apr,Jul,Oct  
3              NaN  
4              NaN  
5              NaN  
6              NaN  
7              NaN  
8              NaN  
9              NaN  

[10 rows x 10 columns]
Store                        1115
StoreType                    1115
Assortment                   1115
CompetitionDistance          1112
CompetitionOpenSinceMonth     761
CompetitionOpenSinceYear      761
Promo2                       1115
Promo2SinceWeek               571
Promo2SinceYear               571
PromoInterval                 571
dtype: int64
nb stores= 1115

In [286]:
data = train[['Store', 'DayOfWeek','Promo','Sales']].merge(store[['Store','StoreType','Assortment','Promo2']], on='Store', how='left')
print data.head(10)


   Store  DayOfWeek  Promo  Sales StoreType Assortment  Promo2
0      1          5      1   5263         c          a       0
1      1          4      1   5020         c          a       0
2      1          3      1   4782         c          a       0
3      1          2      1   5011         c          a       0
4      1          1      1   6102         c          a       0
5      1          7      0      0         c          a       0
6      1          6      0   4364         c          a       0
7      1          5      0   3706         c          a       0
8      1          4      0   3769         c          a       0
9      1          3      0   3464         c          a       0

[10 rows x 7 columns]

In [287]:
data_indexed = data.groupby(['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2']).median()
data_indexed.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed.head(5)
data_valid = data.join(data_indexed, on=['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2'])
print data_valid.head(10)
print eval(data_valid['Sales'].tolist(), data_valid['Median'].tolist())
#out = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])#.sort('Id')
#print out.head(10)
#print set(test['Promo'].tolist()),  set(test['SchoolHoliday'].tolist()),  set(test['StateHoliday'].tolist())


                                                   Median
Store DayOfWeek Promo StoreType Assortment Promo2        
1     1         0     c         a          0       3961.0
                1     c         a          0       5683.5
      2         0     c         a          0       3762.0
                1     c         a          0       5275.5
      3         0     c         a          0       3797.0

[5 rows x 1 columns]
   Store  DayOfWeek  Promo  Sales StoreType Assortment  Promo2  Median
0      1          5      1   5263         c          a       0  4919.0
1      1          4      1   5020         c          a       0  4716.5
2      1          3      1   4782         c          a       0  4951.5
3      1          2      1   5011         c          a       0  5275.5
4      1          1      1   6102         c          a       0  5683.5
5      1          7      0      0         c          a       0     0.0
6      1          6      0   4364         c          a       0  4785.0
7      1          5      0   3706         c          a       0  4127.0
8      1          4      0   3769         c          a       0  3717.0
9      1          3      0   3464         c          a       0  3797.0

[10 rows x 8 columns]
0.0410819300894

In [296]:
data_test = test[['Id', 'Store', 'tmp', 'DayOfWeek','Promo']].merge(store[['Store','StoreType','Assortment','Promo2']], on='Store', how='left')
print data.head(10)
out = data_test.join(data_indexed, on=['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2'])#.sort('Id')
print out.head(10)
out['Sales'] = out['Median']*out['tmp']
print out.sort('Store').head(10)
print 'NAN=', pd.isnull(out['Median']).nonzero()[0]
out[['Id','Sales']].to_csv('/home/gv/ashabou/mycsv4.csv', index=False)


   Store  DayOfWeek  Promo  Sales StoreType Assortment  Promo2
0      1          5      1   5263         c          a       0
1      1          4      1   5020         c          a       0
2      1          3      1   4782         c          a       0
3      1          2      1   5011         c          a       0
4      1          1      1   6102         c          a       0
5      1          7      0      0         c          a       0
6      1          6      0   4364         c          a       0
7      1          5      0   3706         c          a       0
8      1          4      0   3769         c          a       0
9      1          3      0   3464         c          a       0

[10 rows x 7 columns]
     Id  Store  tmp  DayOfWeek  Promo StoreType Assortment  Promo2  Median
0     1      1    1          4      1         c          a       0  4716.5
1   857      1    1          3      1         c          a       0  4951.5
2  1713      1    1          2      1         c          a       0  5275.5
3  2569      1    1          1      1         c          a       0  5683.5
4  3425      1    0          7      0         c          a       0     0.0
5  4281      1    1          6      0         c          a       0  4785.0
6  5137      1    1          5      0         c          a       0  4127.0
7  5993      1    1          4      0         c          a       0  3717.0
8  6849      1    1          3      0         c          a       0  3797.0
9  7705      1    1          2      0         c          a       0  3762.0

[10 rows x 9 columns]
       Id  Store  tmp  DayOfWeek  Promo StoreType Assortment  Promo2  Median  \
0       1      1    1          4      1         c          a       0  4716.5   
26  22257      1    1          6      0         c          a       0  4785.0   
27  23113      1    1          5      1         c          a       0  4919.0   
28  23969      1    1          4      1         c          a       0  4716.5   
29  24825      1    1          3      1         c          a       0  4951.5   
30  25681      1    1          2      1         c          a       0  5275.5   
31  26537      1    1          1      1         c          a       0  5683.5   
32  27393      1    0          7      0         c          a       0     0.0   
33  28249      1    1          6      0         c          a       0  4785.0   
34  29105      1    1          5      0         c          a       0  4127.0   

     Sales  
0   4716.5  
26  4785.0  
27  4919.0  
28  4716.5  
29  4951.5  
30  5275.5  
31  5683.5  
32     0.0  
33  4785.0  
34  4127.0  

[10 rows x 10 columns]
NAN= []

In [ ]: