Practical Deep Learning for Coders, v3

Lesson6_rossmann



In [ ]:

    
# Reload the autoreload extension and select mode 2, so that edited modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2



In [ ]:

    
from fastai.tabular import *

Rossmann

连锁店销量预估

Data preparation 数据准备

To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb. One important step that deals with time series is this:

为了从Kaggle竞赛数据集中创建经过特征工程处理的train_clean和test_clean，运行rossman_data_clean.ipynb，其中一个处理时间序列数据的重要步骤如下：

add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)



In [ ]:

    
# Rossmann folder inside fastai's configured data directory.
path = Config().data_path() / 'rossmann'
# Feature-engineered training frame; the pickle is produced locally by
# rossman_data_clean.ipynb (only unpickle files you generated yourself).
train_df = pd.read_pickle(path / 'train_clean')



In [ ]:

    
# Transpose the first five rows so every column of the wide frame is shown as a row.
train_df.head().T









    Out[ ]:







  
    
      
      0
      1
      2
      3
      4
    
  
  
    
      index
      0
      1
      2
      3
      4
    
    
      Store
      1
      2
      3
      4
      5
    
    
      DayOfWeek
      5
      5
      5
      5
      5
    
    
      Date
      2015-07-31
      2015-07-31
      2015-07-31
      2015-07-31
      2015-07-31
    
    
      Sales
      5263
      6064
      8314
      13995
      4822
    
    
      Customers
      555
      625
      821
      1498
      559
    
    
      Open
      1
      1
      1
      1
      1
    
    
      Promo
      1
      1
      1
      1
      1
    
    
      StateHoliday
      False
      False
      False
      False
      False
    
    
      SchoolHoliday
      1
      1
      1
      1
      1
    
    
      Year
      2015
      2015
      2015
      2015
      2015
    
    
      Month
      7
      7
      7
      7
      7
    
    
      Week
      31
      31
      31
      31
      31
    
    
      Day
      31
      31
      31
      31
      31
    
    
      Dayofweek
      4
      4
      4
      4
      4
    
    
      Dayofyear
      212
      212
      212
      212
      212
    
    
      Is_month_end
      True
      True
      True
      True
      True
    
    
      Is_month_start
      False
      False
      False
      False
      False
    
    
      Is_quarter_end
      False
      False
      False
      False
      False
    
    
      Is_quarter_start
      False
      False
      False
      False
      False
    
    
      Is_year_end
      False
      False
      False
      False
      False
    
    
      Is_year_start
      False
      False
      False
      False
      False
    
    
      Elapsed
      1438300800
      1438300800
      1438300800
      1438300800
      1438300800
    
    
      StoreType
      c
      a
      a
      c
      a
    
    
      Assortment
      a
      a
      a
      c
      a
    
    
      CompetitionDistance
      1270
      570
      14130
      620
      29910
    
    
      CompetitionOpenSinceMonth
      9
      11
      12
      9
      4
    
    
      CompetitionOpenSinceYear
      2008
      2007
      2006
      2009
      2015
    
    
      Promo2
      0
      1
      1
      0
      0
    
    
      Promo2SinceWeek
      1
      13
      14
      1
      1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      Min_Sea_Level_PressurehPa
      1015
      1017
      1017
      1014
      1016
    
    
      Max_VisibilityKm
      31
      10
      31
      10
      10
    
    
      Mean_VisibilityKm
      15
      10
      14
      10
      10
    
    
      Min_VisibilitykM
      10
      10
      10
      10
      10
    
    
      Max_Wind_SpeedKm_h
      24
      14
      14
      23
      14
    
    
      Mean_Wind_SpeedKm_h
      11
      11
      5
      16
      11
    
    
      Max_Gust_SpeedKm_h
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      Precipitationmm
      0
      0
      0
      0
      0
    
    
      CloudCover
      1
      4
      2
      6
      4
    
    
      Events
      Fog
      Fog
      Fog
      NaN
      NaN
    
    
      WindDirDegrees
      13
      309
      354
      282
      290
    
    
      StateName
      Hessen
      Thueringen
      NordrheinWestfalen
      Berlin
      Sachsen
    
    
      CompetitionOpenSince
      2008-09-15
      2007-11-15
      2006-12-15
      2009-09-15
      2015-04-15
    
    
      CompetitionDaysOpen
      2510
      2815
      3150
      2145
      107
    
    
      CompetitionMonthsOpen
      24
      24
      24
      24
      3
    
    
      Promo2Since
      1900-01-01
      2010-03-29
      2011-04-04
      1900-01-01
      1900-01-01
    
    
      Promo2Days
      0
      1950
      1579
      0
      0
    
    
      Promo2Weeks
      0
      25
      25
      0
      0
    
    
      AfterSchoolHoliday
      0
      0
      0
      0
      0
    
    
      BeforeSchoolHoliday
      0
      0
      0
      0
      0
    
    
      AfterStateHoliday
      57
      67
      57
      67
      57
    
    
      BeforeStateHoliday
      0
      0
      0
      0
      0
    
    
      AfterPromo
      0
      0
      0
      0
      0
    
    
      BeforePromo
      0
      0
      0
      0
      0
    
    
      SchoolHoliday_bw
      5
      5
      5
      5
      5
    
    
      StateHoliday_bw
      0
      0
      0
      0
      0
    
    
      Promo_bw
      5
      5
      5
      5
      5
    
    
      SchoolHoliday_fw
      7
      1
      5
      1
      1
    
    
      StateHoliday_fw
      0
      0
      0
      0
      0
    
    
      Promo_fw
      5
      1
      5
      1
      1
    
  

93 rows × 5 columns



In [ ]:

    
# Number of rows in the training frame; the bare name on the last line displays it.
n = len(train_df)
n









    Out[ ]:





844338

Experimenting with a sample 样例数据实验



In [ ]:

    
# Draw 2000 distinct row positions at random and sort them ascending; the first
# 1000 sorted positions become the toy training sample, the remaining 1000 the
# toy test sample.
# NOTE: no random seed is set, so the sampled rows differ on every run.
idx = np.random.permutation(n)[:2000]
idx.sort()

# Columns kept for the toy experiment: three categorical, two continuous, plus the target.
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
small_cols = small_cat_vars + small_cont_vars + ['Sales']

# .copy() makes each sample an independent frame. The Categorify / FillMissing
# cells below modify these frames in place; on a chained slice of train_df that
# raises pandas' SettingWithCopyWarning and leaves it ambiguous what is mutated.
small_train_df = train_df.iloc[idx[:1000]][small_cols].copy()
small_test_df = train_df.iloc[idx[1000:]][small_cols].copy()



In [ ]:

    
# Peek at the toy training sample before any preprocessing.
small_train_df.head()









    Out[ ]:







  
    
      
      Store
      DayOfWeek
      PromoInterval
      CompetitionDistance
      Mean_Humidity
      Sales
    
  
  
    
      267
      268
      5
      NaN
      4520.0
      67
      7492
    
    
      604
      606
      5
      NaN
      2260.0
      61
      7187
    
    
      983
      986
      5
      Feb,May,Aug,Nov
      620.0
      61
      7051
    
    
      1636
      525
      4
      NaN
      1870.0
      55
      9673
    
    
      2348
      123
      3
      NaN
      16760.0
      50
      10007



In [ ]:

    
# Peek at the toy test sample before any preprocessing.
small_test_df.head()









    Out[ ]:







  
    
      
      Store
      DayOfWeek
      PromoInterval
      CompetitionDistance
      Mean_Humidity
      Sales
    
  
  
    
      420510
      829
      3
      NaN
      110.0
      55
      6802
    
    
      420654
      973
      3
      Jan,Apr,Jul,Oct
      330.0
      59
      6644
    
    
      420990
      194
      2
      Feb,May,Aug,Nov
      16970.0
      55
      4720
    
    
      421308
      512
      2
      Mar,Jun,Sept,Dec
      590.0
      72
      6248
    
    
      421824
      1029
      2
      NaN
      1590.0
      64
      8004



In [ ]:

    
# Build the Categorify preprocessor from the toy column lists, call it on the
# training sample first, then on the test sample with test=True (presumably so
# the categories found in training are reused — the NaN Store values in the
# next output are consistent with that).
categorify = Categorify(cat_names=small_cat_vars, cont_names=small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)



In [ ]:

    
# Inspect the test sample after Categorify: in the output below some Store
# values have become NaN.
small_test_df.head()









    Out[ ]:







  
    
      
      Store
      DayOfWeek
      PromoInterval
      CompetitionDistance
      Mean_Humidity
      Sales
    
  
  
    
      420510
      NaN
      3
      NaN
      110.0
      55
      6802
    
    
      420654
      973.0
      3
      Jan,Apr,Jul,Oct
      330.0
      59
      6644
    
    
      420990
      NaN
      2
      Feb,May,Aug,Nov
      16970.0
      55
      4720
    
    
      421308
      512.0
      2
      Mar,Jun,Sept,Dec
      590.0
      72
      6248
    
    
      421824
      1029.0
      2
      NaN
      1590.0
      64
      8004



In [ ]:

    
# List the category labels now attached to the PromoInterval column.
small_train_df.PromoInterval.cat.categories









    Out[ ]:





Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')



In [ ]:

    
# Integer codes behind the first five PromoInterval values; in the output below
# the rows that displayed NaN carry the code -1.
small_train_df['PromoInterval'].cat.codes[:5]









    Out[ ]:





267    -1
604    -1
983     0
1636   -1
2348   -1
dtype: int8



In [ ]:

    
# Build the FillMissing preprocessor from the toy column lists, call it on the
# training sample first, then on the test sample with test=True.
fill_missing = FillMissing(cat_names=small_cat_vars, cont_names=small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)



In [ ]:

    
# Show the training rows flagged by the CompetitionDistance_na column that is
# present after the FillMissing step; the flag column is used directly as the mask.
small_train_df.loc[small_train_df['CompetitionDistance_na']]









    Out[ ]:







  
    
      
      Store
      DayOfWeek
      PromoInterval
      CompetitionDistance
      Mean_Humidity
      Sales
      CompetitionDistance_na
    
  
  
    
      185749
      622
      2
      NaN
      2300.0
      93
      4508
      True

Preparing full data set 准备全部数据集



In [ ]:

    
# Load the full cleaned training and test frames from the Rossmann data folder.
train_df = pd.read_pickle(path / 'train_clean')
test_df = pd.read_pickle(path / 'test_clean')



In [ ]:

    
# Row counts of the full training and test frames.
len(train_df),len(test_df)









    Out[ ]:





(844338, 41088)



In [ ]:

    
# Preprocessor classes handed to TabularList.from_df (via procs=) when the
# full data set is assembled further down.
procs=[FillMissing, Categorify, Normalize]



In [ ]:

    
# Columns passed to the model as categorical variables (cat_names).
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
    'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
    'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events',
    'Promo_fw', 'Promo_bw',
    'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
]

# Columns passed to the model as continuous variables (cont_names).
cont_vars = [
    'CompetitionDistance',
    'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
    'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h',
    'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday',
    'Promo', 'SchoolHoliday',
]



In [ ]:

    
# Target column to predict.
dep_var = 'Sales'
# Independent copy of the training frame restricted to the model inputs, the
# target and the Date column.
df = train_df.loc[:, cat_vars + cont_vars + [dep_var, 'Date']].copy()



In [ ]:

    
# Earliest and latest Date present in the test frame.
test_df['Date'].min(), test_df['Date'].max()









    Out[ ]:





('2015-08-01', '2015-09-17')



In [ ]:

    
# Date stored at index len(test_df) of the training frame ...
cutoff_date = train_df['Date'][len(test_df)]
# ... and the largest index whose Date equals it. The validation split below is
# derived from this value.
cut = train_df.loc[train_df['Date'] == cutoff_date].index.max()
cut









    Out[ ]:





41395



In [ ]:

    
# `cut` is the largest index whose Date equals the cutoff date, i.e. an
# inclusive bound. range(cut) stops one short and leaves that last row of the
# cutoff date in the training split, so use cut + 1 to keep every row of the
# validation period together.
# NOTE(review): assumes train_df has a default 0..n-1 index so that labels and
# positions coincide (the head() output above suggests so) — confirm.
valid_idx = range(cut + 1)



In [ ]:

    
# First five values of the target column.
df[dep_var].head()









    Out[ ]:





0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64



In [ ]:

    
# Test items share the column lists with the training data but get no procs of their own.
test_items = TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars)

# Assemble the DataBunch step by step:
#   1. wrap the training frame with the preprocessors in `procs`,
#   2. hold out the rows listed in valid_idx for validation,
#   3. label with the Sales column as floats; log=True presumably stores
#      log(Sales), which matches the np.exp applied to predictions later,
#   4. attach the test items and build the DataBunch.
data = (
    TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
    .add_test(test_items)
    .databunch()
)



In [ ]:

    
# Open the documentation for FloatList, the label class used when building `data`.
doc(FloatList)

Model 模型



In [ ]:

    
# Upper bound for the network output: log of 1.2x the largest Sales value in
# the training frame.
max_log_y = np.log(1.2 * train_df['Sales'].max())
# Output range [0, max_log_y], created on fastai's default device.
y_range = torch.tensor([0, max_log_y], device=defaults.device)



In [ ]:

    
# Tabular learner with two hidden layers (1000 and 500 units), per-layer
# dropout given by `ps`, dropout on the embeddings, outputs limited to y_range,
# and exp_rmspe reported as the metric.
learn = tabular_learner(
    data,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    y_range=y_range,
    metrics=exp_rmspe,
)



In [ ]:

    
# Display the model architecture (embeddings, batch norm and linear layers).
learn.model









    Out[ ]:





TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1116, 81)
    (1): Embedding(8, 5)
    (2): Embedding(4, 3)
    (3): Embedding(13, 7)
    (4): Embedding(32, 11)
    (5): Embedding(3, 3)
    (6): Embedding(26, 10)
    (7): Embedding(27, 10)
    (8): Embedding(5, 4)
    (9): Embedding(4, 3)
    (10): Embedding(4, 3)
    (11): Embedding(24, 9)
    (12): Embedding(9, 5)
    (13): Embedding(13, 7)
    (14): Embedding(53, 15)
    (15): Embedding(22, 9)
    (16): Embedding(7, 5)
    (17): Embedding(7, 5)
    (18): Embedding(4, 3)
    (19): Embedding(4, 3)
    (20): Embedding(9, 5)
    (21): Embedding(9, 5)
    (22): Embedding(3, 3)
    (23): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=233, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.001)
    (4): Linear(in_features=1000, out_features=500, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01)
    (8): Linear(in_features=500, out_features=1, bias=True)
  )
)



In [ ]:

    
# Number of continuous variables in the training dataset; the output matches the
# 16 features of the bn_cont layer shown above.
len(data.train_ds.cont_names)









    Out[ ]:





16



In [ ]:

    
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()









    



LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.



In [ ]:

    
# Plot the graph recorded by the learning-rate finder.
learn.recorder.plot()



In [ ]:

    
# First training run: 5 epochs, maximum learning rate 1e-3, weight decay 0.2.
learn.fit_one_cycle(5, max_lr=1e-3, wd=0.2)









    




Total time: 11:27 

  
    epoch
    train_loss
    valid_loss
    exp_rmspe
  
  
    1
    0.023587
    0.020941
    0.140551
  
  
    2
    0.017678
    0.023431
    0.132211
  
  
    3
    0.017453
    0.016929
    0.120169
  
  
    4
    0.012608
    0.016296
    0.109245
  
  
    5
    0.010222
    0.011238
    0.105433



In [ ]:

    
# Checkpoint the learner under the name '1'.
learn.save('1')



In [ ]:

    
# Plot the recorded losses.
# NOTE(review): the exact effect of last=-1 is not visible here — check the
# fastai Recorder.plot_losses documentation.
learn.recorder.plot_losses(last=-1)



In [ ]:

    
# Restore the checkpoint saved as '1'; the trailing ';' suppresses the cell output.
learn.load('1');



In [ ]:

    
# Second training run from the restored checkpoint: 5 epochs at a lower maximum
# learning rate of 3e-4.
learn.fit_one_cycle(5, max_lr=3e-4)









    




Total time: 11:32 

  
    epoch
    train_loss
    valid_loss
    exp_rmspe
  
  
    1
    0.012223
    0.014312
    0.116988
  
  
    2
    0.012001
    0.017789
    0.117619
  
  
    3
    0.011402
    0.035596
    0.114396
  
  
    4
    0.010067
    0.015125
    0.113652
  
  
    5
    0.009148
    0.031326
    0.116344



In [ ]:

    
# Third training run: another 5 epochs with the same 3e-4 maximum learning rate.
learn.fit_one_cycle(5, max_lr=3e-4)









    




Total time: 11:31 

  
    epoch
    train_loss
    valid_loss
    exp_rmspe
  
  
    1
    0.011840
    0.013236
    0.110483
  
  
    2
    0.010765
    0.057664
    0.129586
  
  
    3
    0.010101
    0.042744
    0.111584
  
  
    4
    0.008820
    0.116893
    0.135458
  
  
    5
    0.009144
    0.017969
    0.126323

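(For reference: 10th place in the Kaggle competition scored an RMSPE of 0.108; lower is better, so compare it with the exp_rmspe column above.)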



In [ ]:

    
# Predict on the test set. Labels were built with log=True, so the predictions
# are exponentiated before being stored as Sales.
test_preds = learn.get_preds(ds_type=DatasetType.Test)
test_df["Sales"] = np.exp(test_preds[0].data).numpy().T[0]

# Cast both submission columns to integers and write the submission file
# without the index column.
submission_cols = ["Id", "Sales"]
test_df[submission_cols] = test_df[submission_cols].astype("int")
test_df[submission_cols].to_csv("rossmann_submission.csv", index=False)

	0	1	2	3	4
index	0	1	2	3	4
Store	1	2	3	4	5
DayOfWeek	5	5	5	5	5
Date	2015-07-31	2015-07-31	2015-07-31	2015-07-31	2015-07-31
Sales	5263	6064	8314	13995	4822
Customers	555	625	821	1498	559
Open	1	1	1	1	1
Promo	1	1	1	1	1
StateHoliday	False	False	False	False	False
SchoolHoliday	1	1	1	1	1
Year	2015	2015	2015	2015	2015
Month	7	7	7	7	7
Week	31	31	31	31	31
Day	31	31	31	31	31
Dayofweek	4	4	4	4	4
Dayofyear	212	212	212	212	212
Is_month_end	True	True	True	True	True
Is_month_start	False	False	False	False	False
Is_quarter_end	False	False	False	False	False
Is_quarter_start	False	False	False	False	False
Is_year_end	False	False	False	False	False
Is_year_start	False	False	False	False	False
Elapsed	1438300800	1438300800	1438300800	1438300800	1438300800
StoreType	c	a	a	c	a
Assortment	a	a	a	c	a
CompetitionDistance	1270	570	14130	620	29910
CompetitionOpenSinceMonth	9	11	12	9	4
CompetitionOpenSinceYear	2008	2007	2006	2009	2015
Promo2	0	1	1	0	0
Promo2SinceWeek	1	13	14	1	1
...	...	...	...	...	...
Min_Sea_Level_PressurehPa	1015	1017	1017	1014	1016
Max_VisibilityKm	31	10	31	10	10
Mean_VisibilityKm	15	10	14	10	10
Min_VisibilitykM	10	10	10	10	10
Max_Wind_SpeedKm_h	24	14	14	23	14
Mean_Wind_SpeedKm_h	11	11	5	16	11
Max_Gust_SpeedKm_h	NaN	NaN	NaN	NaN	NaN
Precipitationmm	0	0	0	0	0
CloudCover	1	4	2	6	4
Events	Fog	Fog	Fog	NaN	NaN
WindDirDegrees	13	309	354	282	290
StateName	Hessen	Thueringen	NordrheinWestfalen	Berlin	Sachsen
CompetitionOpenSince	2008-09-15	2007-11-15	2006-12-15	2009-09-15	2015-04-15
CompetitionDaysOpen	2510	2815	3150	2145	107
CompetitionMonthsOpen	24	24	24	24	3
Promo2Since	1900-01-01	2010-03-29	2011-04-04	1900-01-01	1900-01-01
Promo2Days	0	1950	1579	0	0
Promo2Weeks	0	25	25	0	0
AfterSchoolHoliday	0	0	0	0	0
BeforeSchoolHoliday	0	0	0	0	0
AfterStateHoliday	57	67	57	67	57
BeforeStateHoliday	0	0	0	0	0
AfterPromo	0	0	0	0	0
BeforePromo	0	0	0	0	0
SchoolHoliday_bw	5	5	5	5	5
StateHoliday_bw	0	0	0	0	0
Promo_bw	5	5	5	5	5
SchoolHoliday_fw	7	1	5	1	1
StateHoliday_fw	0	0	0	0	0
Promo_fw	5	1	5	1	1

	Store	DayOfWeek	PromoInterval	CompetitionDistance	Mean_Humidity	Sales
267	268	5	NaN	4520.0	67	7492
604	606	5	NaN	2260.0	61	7187
983	986	5	Feb,May,Aug,Nov	620.0	61	7051
1636	525	4	NaN	1870.0	55	9673
2348	123	3	NaN	16760.0	50	10007

	Store	DayOfWeek	PromoInterval	CompetitionDistance	Mean_Humidity	Sales
420510	829	3	NaN	110.0	55	6802
420654	973	3	Jan,Apr,Jul,Oct	330.0	59	6644
420990	194	2	Feb,May,Aug,Nov	16970.0	55	4720
421308	512	2	Mar,Jun,Sept,Dec	590.0	72	6248
421824	1029	2	NaN	1590.0	64	8004

epoch	train_loss	valid_loss	exp_rmspe
1	0.023587	0.020941	0.140551
2	0.017678	0.023431	0.132211
3	0.017453	0.016929	0.120169
4	0.012608	0.016296	0.109245
5	0.010222	0.011238	0.105433

epoch	train_loss	valid_loss	exp_rmspe
1	0.012223	0.014312	0.116988
2	0.012001	0.017789	0.117619
3	0.011402	0.035596	0.114396
4	0.010067	0.015125	0.113652
5	0.009148	0.031326	0.116344

epoch	train_loss	valid_loss	exp_rmspe
1	0.011840	0.013236	0.110483
2	0.010765	0.057664	0.129586
3	0.010101	0.042744	0.111584
4	0.008820	0.116893	0.135458
5	0.009144	0.017969	0.126323