Important: This notebook only works with fastai-0.7.x. Do not run any fastai-1.x code from this path in the repository, because it will load fastai-0.7.x instead.

Deep learning for Bulldozers


In [ ]:
%load_ext autoreload
%autoreload 2

In [ ]:
%matplotlib inline

from fastai.imports import *
from fastai.torch_imports import *
from fastai.dataset import *
from fastai.learner import *
from fastai.structured import *
from fastai.column_data import *

Load the data frame and the list of columns to keep that were saved in the previous lesson.


In [ ]:
# Name of the dependent-variable column (dropped from the inputs further down).
dep = 'SalePrice'
# Base directory handed to ColumnarModelData when the model data is built.
PATH = "data/bulldozers/"
# Raw bulldozers frame, read from a feather file under tmp/.
df_raw = pd.read_feather('tmp/bulldozers-raw')
# Column names to keep, loaded from a .npy file and converted to a list so it
# can be concatenated with other column lists.
keep_cols = list(np.load('tmp/keep_cols.npy'))

In [ ]:
# Treat any manufacture year before 1950 as 1950.
df_raw.loc[df_raw['YearMade'] < 1950, 'YearMade'] = 1950
# Machine age at sale time: sale year minus manufacture year.
df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']

# Keep only the selected columns, the new `age` feature and the target.
df_raw = df_raw[keep_cols + ['age', dep]].copy()
df_indep = df_raw.drop(dep, axis=1)

# Rows from index `n_trn` onwards are later used as the validation set.
n_valid = 12000
n_trn = df_raw.shape[0] - n_valid

In [ ]:
# Heuristic: a column is considered categorical when it has fewer distinct
# values than 1/50th of the number of training rows.
cat_flds = [
    col for col in df_indep.columns
    if df_raw[col].nunique() < n_trn/50
]
' '.join(cat_flds)


Out[ ]:
'YearMade Coupler_System ProductSize fiProductClassDesc ModelID saleElapsed fiSecondaryDesc Enclosure fiModelDesc Hydraulics_Flow fiModelDescriptor Hydraulics Drive_System ProductGroupDesc ProductGroup state saleDay Track_Type saleDayofyear Stick_Length age'

In [ ]:
# These columns pass the cardinality heuristic but are to be handled as
# continuous inputs, so they are taken out of the categorical list.
# Filtering into a new list (instead of calling `list.remove` in a loop) keeps
# this cell safe to re-run: a second execution no longer raises ValueError.
not_categorical = {'saleElapsed', 'saleDayofyear', 'saleDay', 'age', 'YearMade'}
cat_flds = [n for n in cat_flds if n not in not_categorical]

# Sanity check: list every non-categorical column that is not numeric
# (expected to be empty).
[n for n in df_indep.drop(cat_flds,axis=1).columns if not is_numeric_dtype(df_raw[n])]


Out[ ]:
[]

In [ ]:
# Convert every categorical field to an ordered pandas category dtype.
for n in cat_flds:
    df_raw[n] = (df_raw[n]
                 .astype('category')
                 .cat.as_ordered())

# Every input column that is not categorical is treated as continuous.
cont_flds = [col for col in df_indep.columns if col not in cat_flds]
' '.join(cont_flds)


Out[ ]:
'YearMade saleElapsed SalesID MachineID saleDay saleDayofyear age'

In [ ]:
# Order the columns as categorical, continuous, then the target.
df_raw = df_raw[cat_flds+cont_flds+[dep]]
# Use `dep` rather than repeating the 'SalePrice' literal so the target column
# is named in exactly one place.
# NOTE(review): fastai 0.7 `proc_df` presumably splits off the target,
# numericalises the categorical columns and, with do_scale=True, standardises
# the continuous ones — confirm against the library documentation.
df, y, nas, mapper = proc_df(df_raw, dep, do_scale=True)

# Validation set: every row from index `n_trn` to the end of the frame.
val_idx = list(range(n_trn, len(df)))

In [ ]:
# Build the fastai model-data object from the processed frame and targets.
# `val_idx` selects the validation rows and `cat_flds` names the categorical
# columns; `bs` is presumably the batch size — TODO confirm.
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y, cat_flds=cat_flds, bs=64)

In [ ]:
# Peek at the first rows of the processed frame.
df.head()


Out[ ]:
Coupler_System ProductSize fiProductClassDesc ModelID fiSecondaryDesc Enclosure fiModelDesc Hydraulics_Flow fiModelDescriptor Hydraulics ... state Track_Type Stick_Length YearMade saleElapsed SalesID MachineID saleDay saleDayofyear age
0 0 0 59 644 41 3 950 0 0 1 ... 1 0 0 0.913196 0.397377 -0.858580 -0.496185 -0.013101 1.352092 -0.828814
1 0 4 62 11 55 3 1725 0 0 1 ... 33 0 0 0.405756 -0.061496 -0.858578 -2.494936 1.173518 -0.907472 -0.430749
2 1 0 39 1542 0 6 331 3 0 4 ... 32 0 0 0.722906 -0.075286 -0.858577 -1.775759 1.173518 -1.187503 -0.762470
3 0 6 8 110 0 3 3674 0 0 1 ... 44 0 0 0.722906 1.179600 -0.858574 -0.434096 0.342885 -0.395690 -0.298060
4 1 0 40 3540 0 1 4208 3 0 4 ... 32 0 0 1.103486 0.863382 -0.858572 -0.364020 0.817532 0.231967 -0.828814

5 rows × 23 columns

Model


In [ ]:
def rmse(x,y):
    """Return the root mean squared error between `x` and `y`.

    Both arguments must support elementwise subtraction and the squared
    difference must expose `.mean()` (e.g. NumPy arrays). The result is a
    Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [ ]:
# Embedding cardinality per categorical field: its category count plus one.
# NOTE(review): the extra slot is presumably reserved for missing/unknown
# values — confirm.
emb_c = {name: len(df_raw[name].cat.categories) + 1 for name in cat_flds}
emb_c


Out[ ]:
{'Coupler_System': 3,
 'Drive_System': 5,
 'Enclosure': 7,
 'Hydraulics': 13,
 'Hydraulics_Flow': 4,
 'ModelID': 5219,
 'ProductGroup': 7,
 'ProductGroupDesc': 7,
 'ProductSize': 7,
 'Stick_Length': 30,
 'Track_Type': 3,
 'fiModelDesc': 5000,
 'fiModelDescriptor': 140,
 'fiProductClassDesc': 75,
 'fiSecondaryDesc': 176,
 'state': 54}

In [ ]:
# One (cardinality, embedding width) pair per categorical field; the width is
# half the cardinality rounded up, capped at 50.
emb_szs = [(card, min(50, (card + 1) // 2)) for card in emb_c.values()]
# NOTE(review): `metrics` is not passed to any of the `m.fit` calls in this
# notebook.
metrics = [rmse]

In [ ]:
# Output range handed to the learner: from 0 up to 20% above the largest target.
y_range=(0,np.max(y)*1.2)

In [ ]:
# Create the learner from the model data, embedding sizes and the number of
# continuous fields.
# NOTE(review): the remaining positional arguments look like embedding dropout
# (0.05), output size (1), hidden layer sizes ([500,250]) and per-layer
# dropout ([0.5,0.05]) — confirm against the fastai 0.7 `get_learner` signature.
m = md.get_learner(emb_szs, len(cont_flds), 0.05, 1, [500,250], [0.5,0.05],
                   y_range=y_range, use_bn=True)

In [ ]:
# Run the learner's learning-rate finder.
m.lr_find()


 63%|██████▎   | 3812/6081 [00:28<00:14, 154.49it/s, loss=0.202] 

In [ ]:
# Plot what the learner's scheduler recorded.
# NOTE(review): 1300 is presumably the number of initial iterations to skip in
# the plot — confirm.
m.sched.plot(1300)


 63%|██████▎   | 3812/6081 [00:40<00:23, 95.21it/s, loss=0.314]

In [ ]:
# Hyperparameters passed to the `m.fit` calls: learning rate and `wd`
# (presumably weight decay).
lr = 1e-3
wd = 1e-7

In [ ]:
# First training run with learning rate `lr`.
# NOTE(review): presumably 2 cycles with weight decay `wd`, the cycle length
# starting at 1 epoch and doubling each cycle — confirm.
m.fit(lr, 2, wd, cycle_len=1, cycle_mult=2)


[ 0.       0.06207  0.09731]                                     
[ 1.       0.06048  0.07684]                                     
[ 2.       0.05326  0.06389]                                     


In [ ]:
# Continue training the same learner with a longer initial cycle.
# NOTE(review): presumably 2 cycles with weight decay `wd`, the cycle length
# starting at 2 epochs and doubling each cycle — confirm.
m.fit(lr, 2, wd, cycle_len=2, cycle_mult=2)


[ 0.       0.05471  0.0523 ]                                     
[ 1.       0.04767  0.0512 ]                                     
[ 2.       0.05249  0.05747]                                     
[ 3.       0.04643  0.05393]                                     
[ 4.       0.04984  0.04934]                                     
[ 5.       0.04277  0.04869]                                     


In [ ]:
# Square root of the last value reported by the final training epoch (~0.0487).
# NOTE(review): this is an RMSE only if that value is a mean squared error on
# the validation set — confirm.
math.sqrt(0.0487)


Out[ ]:
0.22068076490713912