In [1]:
%matplotlib inline
import pandas as pd
import argparse
import os
import matplotlib.pyplot as plt
import numpy as np
In [125]:
def rmspe(tru, pred):
if tru==0.0 or isinstance(tru, str) or np.isnan(tru) or np.isnan(pred):
return 0.0
return (float(pred-tru)/tru)**2
def eval(trues, preds):
return np.mean([rmspe(t,p) for t,p in zip(trues, preds)])
In [275]:
train_file = "/home/gv/ashabou/data/ds/rossmann/train.csv"
test_file = "/home/gv/ashabou/data/ds/rossmann/test.csv"
store_file = "/home/gv/ashabou/data/ds/rossmann/store.csv"
In [276]:
train = pd.read_csv( train_file )
test = pd.read_csv( test_file )
store = pd.read_csv( store_file )
In [277]:
print train.head(5)
print test.head(5)
print train.count()
print test.count()
print store.head(10)
print store.count()
In [6]:
train['Store'].hist()
print train['Store'].min()
print train['Store'].max()
test['Store'].hist()
print test['Store'].min()
print test['Store'].max()
In [7]:
train['DayOfWeek'].hist()
test['DayOfWeek'].hist()
Out[7]:
In [8]:
train['Sales'].loc[train['Sales']>0].hist(bins=100)
print (train['Sales']==0).sum()
In [9]:
train['Customers'].loc[train['Customers']>0].hist(bins=100)
print (train['Customers']==0).sum()
print train['Customers'].loc[train['Customers']>0].describe()
In [10]:
#0 sales while there are some customers
train.loc[(train['Customers']>0) & (train['Sales']==0)]
Out[10]:
In [11]:
plt.figure()
train[['Sales', 'Customers']].plot(x='Sales', y='Customers', style='bo')
plt.show()
In [12]:
#from pandas.tools.plotting import scatter_matrix
#_ = scatter_matrix(train[['Sales', 'Store', 'Customers']], figsize=(14, 10))
In [13]:
plt.figure()
train.query('Open==1')[['Sales', 'Store']].plot(x='Sales', y='Store', style='bo')
plt.show()
In [14]:
train.query('Open==1')[['Sales', 'DayOfWeek']].plot(x='Sales', y='DayOfWeek', style='bo')
Out[14]:
In [15]:
train.query('Open==1')[['Sales', 'Promo']].plot(x='Sales', y='Promo', style='bo')
Out[15]:
In [16]:
train.query('Open==1')[['Store', 'Promo']].plot(x='Store', y='Promo', style='bo')
Out[16]:
In [17]:
train.query('Open==1')[['Store', 'Promo']].groupby('Store').sum().plot()
print train.query('Open==1')[['Store', 'Promo']].groupby('Store').sum().sort('Promo',ascending=0).head(10)
In [18]:
train.query('Open==1')[['Store', 'Sales']].groupby('Store').sum().plot()
print train.query('Open==1')[['Store', 'Sales']].groupby('Store').sum().sort('Sales',ascending=0).head(5)
In [19]:
#not stores that have more propmo have more sales
print train.query('Open==1')[['Store', 'Sales','Promo']].groupby('Store').sum().sort('Sales',ascending=0).head(5)
In [20]:
train.query('Open==1')[['Store', 'Sales','DayOfWeek']].groupby(['Store','DayOfWeek']).sum()
Out[20]:
In [21]:
train['Year'] = train['Date'].map(lambda x: int(x.split("-")[0]))
train['Month'] = train['Date'].map(lambda x: int(x.split("-")[1]))
train['Day'] = train['Date'].map(lambda x: int(x.split("-")[2]))
train.head(5)
Out[21]:
In [22]:
print set(train['Year'].tolist())
In [23]:
print set(train.query("Open==1")['Month'].tolist())
In [71]:
print "2013-->", set(train.query("Open==1 & Year==2013")['Month'].tolist())
print "2014-->", set(train.query("Open==1 & Year==2014")['Month'].tolist())
print "2015-->", set(train.query("Open==1 & Year==2015")['Month'].tolist())
In [24]:
test['Year'] = test['Date'].map(lambda x: int(x.split("-")[0]))
test['Month'] = test['Date'].map(lambda x: int(x.split("-")[1]))
test['Day'] = test['Date'].map(lambda x: int(x.split("-")[2]))
test.head(5)
Out[24]:
In [25]:
print set(test['Year'].tolist())
In [26]:
print set(test.query("Open==1")['Month'].tolist())
In [27]:
data = train.query("Open==1").query('Month==6')[['Sales','Year','Store']].groupby(['Store','Year']).sum()
print data.head(20)
data.query('Year==2013').plot()
data.query('Year==2014').plot()
data.query('Year==2015').plot()
Out[27]:
In [38]:
data = train.query("Open==1").query('Month==8 | Month==9')[['Sales','Store','Month','Year']].groupby(['Store','Month']).median()
data.rename(columns={'Sales': 'Median'}, inplace=True)
print data.head(10)
In [96]:
valid = train.copy().query('(Month==8 | Month==9) & Year==2014')
valid['tmp'] = train['Open'].map(lambda x: 1 if x == 1 else 0)
print valid.head(5)
In [97]:
ttrain = train.query("Open==1").query('(Month==8 | Month==9) & Year==2013')[['Sales','Store','Month']].groupby(['Store','Month']).median()
ttrain.rename(columns={'Sales': 'Median'}, inplace=True)
print ttrain.head(10)
out = valid.join(ttrain, on=['Store','Month'])
print out.head(5)
In [98]:
out['preds'] = out['tmp']*out['Median']
print out.shape
print out[['Sales','preds']].head(5)
print eval(out['Sales'].tolist(), out['preds'].tolist())
In [291]:
test['tmp'] = test['Open'].map(lambda x: 1 if x == 1 else 0)
print test.head(5)
In [68]:
out = test[['Id','Store','tmp','Month']].join(data, on=['Store','Month'])#.sort('Id')
print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)
In [69]:
out['Sales'] = out['tmp']*out['Median']
print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)
In [70]:
out = out[['Id','Sales']]
print out.head(10)
In [141]:
out.to_csv('/home/gv/ashabou/mycsv.csv', index=False)
In [103]:
data_2013 = train.query('(Month==8 | Month==9) & Year==2013')[['Sales','Store', 'Day','Month']].sort(['Store','Month', 'Day'])
data_2013.index=range(1, len(data_2013) + 1)
print data_2013.head(10)
data_2014 = train.query('(Month==8 | Month==9) & Year==2014')[['Sales','Store','Day','Month']].sort(['Store','Month', 'Day'])
data_2014.index=range(1, len(data_2014) + 1)
print data_2014.head(10)
In [109]:
join1314 = data_2013.join(data_2014.groupby(['Store','Month','Day']).mean().rename(columns={'Sales': 'Median'}), on=['Store','Month','Day'])
print join1314.head(5)
In [126]:
sales = join1314['Sales'][1:]
sales.index =range(1, len(sales) + 1)
median = join1314['Median'][:-1]
median.index =range(1, len(median) + 1)
ct = pd.concat([sales, median], axis=1)
print ct.head(5)
ct.plot()
print sales.describe()
print median.describe()
print eval(sales.tolist(), median.tolist())
In [130]:
join_indexed = join1314.groupby(['Store','Month','Day']).median()
print join_indexed.head(5)
In [202]:
out = test[['Id','Store','tmp','Month','Day']].join(join_indexed, on=['Store','Month','Day'])#.sort('Id')
out.rename(columns={'Sales': 'y2013', 'Median':'y2014'}, inplace=True)
print out.head(10)
print out.query('Store==1 & Month==8').head(10)
print out.query('Store==1 & Month==9').head(10)
In [203]:
out['Sales']=[0]*len(out['tmp'])
print out.head(30)
In [205]:
def computeValue(x,y):
if np.isnan(x) or x==0:
return y
if np.isnan(y) or y==0:
return x
return (x+y)/2.0
out = out.sort(['Store','Month','Day'])
out['Sales']=[computeValue(x,y) for x,y in zip(out['y2014'].tolist()[1:-1], out['y2013'].tolist()[2:])]+\
[computeValue(x,y) for x,y in zip(out['y2014'].tolist()[-2:], out['y2013'].tolist()[-2:])]
out['Sales'] = out['Sales']*out['tmp']
print out.head(1000)
In [206]:
print out.query('tmp==0').shape
print out.query('y2013==0').shape
print out.query('y2014==0').shape
print out.query('Sales==0').shape
In [207]:
out_csv = out[['Id','Sales']].sort('Id')
print out_csv.head(10)
In [208]:
out_csv.to_csv('/home/gv/ashabou/mycsv2.csv', index=False)
In [246]:
data = train.query('Year==2013 | Year==2014')[['Sales','Store', 'DayOfWeek','Promo','SchoolHoliday','StateHoliday']].sort(['Store','DayOfWeek'])
print data.head(10)
print set(data['Promo'].tolist()), set(data['SchoolHoliday'].tolist()), set(data['StateHoliday'].tolist())
data['StateHoliday'][data['StateHoliday']==0]='0'
print set(data['Promo'].tolist()), set(data['SchoolHoliday'].tolist()), set(data['StateHoliday'].tolist())
In [256]:
data_indexed = data.groupby(['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday']).median()
data_indexed.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed.head(5)
data_valid = data.join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])
print data_valid.head(10)
print eval(data_valid['Sales'].tolist(), data_valid['Median'].tolist())
out = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])#.sort('Id')
print out.head(10)
print set(test['Promo'].tolist()), set(test['SchoolHoliday'].tolist()), set(test['StateHoliday'].tolist())
In [271]:
idx_nan = pd.isnull(out['Median']).nonzero()[0]
print idx_nan
data_indexed2 = data[['Store','DayOfWeek','Promo','Sales']].groupby(['Store','DayOfWeek','Promo']).median()
data_indexed2.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed2.head(5)
out2 = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed2, on=['Store','DayOfWeek','Promo'])
print pd.isnull(out2['Median']).nonzero()
out3 = out.copy()
out3['Median'][idx_nan] = out2['Median'][idx_nan]
print pd.isnull(out3['Median']).nonzero()
In [272]:
out3['Sales'] = out3['Median']*out3['tmp']
print out3.sort('Store').head(10)
In [273]:
out3[['Id','Sales']].to_csv('/home/gv/ashabou/mycsv3.csv', index=False)
In [281]:
print store.head(10)
print store.count()
print 'nb stores=', len(train['Store'].unique())
In [286]:
data = train[['Store', 'DayOfWeek','Promo','Sales']].merge(store[['Store','StoreType','Assortment','Promo2']], on='Store', how='left')
print data.head(10)
In [287]:
data_indexed = data.groupby(['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2']).median()
data_indexed.rename(columns={'Sales': 'Median'}, inplace=True)
print data_indexed.head(5)
data_valid = data.join(data_indexed, on=['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2'])
print data_valid.head(10)
print eval(data_valid['Sales'].tolist(), data_valid['Median'].tolist())
#out = test[['Id','Store','tmp','Day','DayOfWeek','Promo','SchoolHoliday','StateHoliday']].join(data_indexed, on=['Store','DayOfWeek','Promo','SchoolHoliday','StateHoliday'])#.sort('Id')
#print out.head(10)
#print set(test['Promo'].tolist()), set(test['SchoolHoliday'].tolist()), set(test['StateHoliday'].tolist())
In [296]:
data_test = test[['Id', 'Store', 'tmp', 'DayOfWeek','Promo']].merge(store[['Store','StoreType','Assortment','Promo2']], on='Store', how='left')
print data.head(10)
out = data_test.join(data_indexed, on=['Store','DayOfWeek','Promo','StoreType','Assortment','Promo2'])#.sort('Id')
print out.head(10)
out['Sales'] = out['Median']*out['tmp']
print out.sort('Store').head(10)
print 'NAN=', pd.isnull(out['Median']).nonzero()[0]
out[['Id','Sales']].to_csv('/home/gv/ashabou/mycsv4.csv', index=False)
In [ ]: