In [ ]:
%reload_ext autoreload
%autoreload 2
In [ ]:
from fastai.tabular import *
To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb.
One important step in that notebook, which deals with the time-series nature of the data, is this:
add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
In [ ]:
# Rossmann files live in the 'rossmann' folder under fastai's configured data
# directory; load the pre-cleaned training frame from there.
path = Config().data_path() / 'rossmann'
train_df = pd.read_pickle(path / 'train_clean')
In [ ]:
# Show the first rows transposed, so each column of the frame is listed as a row.
train_df.head().T
Out[ ]:
In [ ]:
# Number of rows in the training frame (echoed as the cell output).
n = len(train_df)
n
Out[ ]:
In [ ]:
# Draw 2,000 distinct random row positions and sort them so that both small
# frames keep the row order of train_df.
idx = np.random.permutation(n)[:2000]
idx.sort()

small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_cols = small_cat_vars + small_cont_vars + ['Sales']

# First 1,000 sampled rows act as a toy training set, the rest as a toy test
# set. The explicit .copy() matters: the Categorify / FillMissing cells below
# modify these frames in place, and doing that on a slice of train_df makes
# pandas raise SettingWithCopyWarning because the write may not land where
# expected.
small_train_df = train_df.iloc[idx[:1000]][small_cols].copy()
small_test_df = train_df.iloc[idx[1000:]][small_cols].copy()
In [ ]:
# Preview the first rows of the reduced training sample.
small_train_df.head()
Out[ ]:
In [ ]:
# Preview the first rows of the held-out sample.
small_test_df.head()
Out[ ]:
In [ ]:
# Build the Categorify preprocessor for the small column sets, run it on the
# training sample, then run it on the held-out sample in test mode.
# NOTE(review): per the fastai docs, test=True reuses the categories learned
# from the training call instead of deriving new ones - confirm for the
# installed fastai version.
categorify = Categorify(cat_names=small_cat_vars, cont_names=small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
In [ ]:
# Preview the held-out sample again, now that categorify has been applied to it.
small_test_df.head()
Out[ ]:
In [ ]:
# List the category labels stored for PromoInterval in the training sample.
small_train_df.PromoInterval.cat.categories
Out[ ]:
In [ ]:
# Integer codes that back the first five PromoInterval values.
small_train_df['PromoInterval'].cat.codes[:5]
Out[ ]:
In [ ]:
# Build the FillMissing preprocessor for the small column sets, run it on the
# training sample, then run it on the held-out sample in test mode.
# NOTE(review): per the fastai docs, test=True reuses the fill values computed
# during the training call - confirm for the installed fastai version.
fill_missing = FillMissing(cat_names=small_cat_vars, cont_names=small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
In [ ]:
# Show the training rows flagged in the CompetitionDistance_na column
# (presumably added by fill_missing above for originally-missing values - confirm).
small_train_df.loc[small_train_df['CompetitionDistance_na'].eq(True)]
Out[ ]:
In [ ]:
# Reload pristine copies of the full cleaned train and test frames for the
# real experiment.
train_df = pd.read_pickle(path / 'train_clean')
test_df = pd.read_pickle(path / 'test_clean')
In [ ]:
# Row counts of the full training and test frames.
len(train_df),len(test_df)
Out[ ]:
In [ ]:
# Preprocessing steps handed to the data block below, in this order.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]
In [ ]:
# Columns treated as categorical inputs.
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
    'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
    'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events',
    'Promo_fw', 'Promo_bw',
    'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
]

# Columns treated as continuous inputs.
cont_vars = [
    'CompetitionDistance',
    'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
    'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h',
    'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday',
]
In [ ]:
# Target column, plus an independent working copy of train_df restricted to
# the model inputs, the target and the Date column.
dep_var = 'Sales'
df = train_df.loc[:, [*cat_vars, *cont_vars, dep_var, 'Date']].copy()
In [ ]:
# Earliest and latest dates covered by the test frame.
test_df['Date'].min(), test_df['Date'].max()
Out[ ]:
In [ ]:
# Take the date of the training row labelled len(test_df), then set `cut` to
# the largest index label among all rows sharing that date, so the boundary
# never splits a single day.
boundary_date = train_df['Date'][len(test_df)]
on_boundary_date = train_df['Date'] == boundary_date
cut = train_df['Date'][on_boundary_date].index.max()
cut
Out[ ]:
In [ ]:
# Row positions 0 .. cut-1 form the validation set passed to split_by_idx below.
# NOTE(review): presumably train_df is ordered newest-first, making these the
# most recent dates - confirm against the data-cleaning notebook.
valid_idx = range(0, cut)
In [ ]:
# Peek at the first values of the dependent variable.
df[dep_var].head()
Out[ ]:
In [ ]:
# Test items use the same categorical / continuous column split as training.
test_items = TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars)

# Data block pipeline: wrap the training frame with its preprocessing steps,
# hold out the rows in valid_idx for validation, label with the dependent
# variable as a float (log=True), attach the test items, then build the
# databunch.
data = (
    TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
    .add_test(test_items)
    .databunch()
)
In [ ]:
# Look up the documentation for FloatList, the label class chosen via label_cls
# when building `data`.
doc(FloatList)
In [ ]:
# Labels are built with log=True, so the output range is expressed in log
# space: from 0 up to the log of 1.2x the largest sales value in train_df.
max_log_y = np.log(train_df['Sales'].max() * 1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)
In [ ]:
# Build the tabular learner on `data`, with outputs constrained to y_range and
# exp_rmspe reported as the metric.
# NOTE(review): per the fastai docs, `layers` are the hidden layer sizes, `ps`
# their dropout probabilities and `emb_drop` the embedding dropout - confirm
# for the installed version.
learn = tabular_learner(
    data,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    y_range=y_range,
    metrics=exp_rmspe,
)
In [ ]:
# Display the model held by the learner.
learn.model
Out[ ]:
In [ ]:
# Number of continuous column names registered on the training dataset.
len(data.train_ds.cont_names)
Out[ ]:
In [ ]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()
In [ ]:
# Plot what the recorder captured (here, from the preceding lr_find run).
learn.recorder.plot()
In [ ]:
# First training run with the one-cycle policy: 5 epochs, learning rate 1e-3,
# weight decay 0.2.
learn.fit_one_cycle(5, 1e-3, wd=0.2)
In [ ]:
# Checkpoint the learner under the name '1' so later cells can return to it.
learn.save('1')
In [ ]:
# Plot the recorded losses, leaving out the first 10,000 recorded points.
learn.recorder.plot_losses(skip_start=10000)
In [ ]:
# Restore checkpoint '1'; the trailing semicolon keeps the notebook from
# echoing the call's return value.
learn.load('1');
In [ ]:
# Continue training: 5 more one-cycle epochs at a lower learning rate of 3e-4.
learn.fit_one_cycle(5, 3e-4)
In [ ]:
# A further 5 one-cycle epochs at learning rate 3e-4.
learn.fit_one_cycle(5, 3e-4)
(For reference, the 10th-place entry in the competition scored 0.108.)
In [ ]:
# Predict on the test set; element 0 of the result holds the model outputs.
test_preds = learn.get_preds(DatasetType.Test)

# Labels were built with log=True, so exponentiate the outputs to get sales
# and take the single output column.
log_sales = test_preds[0].data
test_df["Sales"] = np.exp(log_sales).numpy().T[0]

# Write Id and Sales as integers to the submission file.
submission_cols = ["Id", "Sales"]
test_df[submission_cols] = test_df[submission_cols].astype("int")
test_df[submission_cols].to_csv("rossmann_submission.csv", index=False)