Important: This notebook will only work with fastai-0.7.x. Do not try to run any fastai-1.x code from this path in the repository because it will load fastai-0.7.x

A Linear Model for Bulldozers


In [ ]:
%load_ext autoreload
%autoreload 2

In [ ]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

In [ ]:
set_plot_sizes(12,14,16)

Load in our data from last lesson


In [ ]:
PATH = "data/bulldozers/"

df_raw = pd.read_feather('tmp/bulldozers-raw')

In [ ]:
df_raw['age'] = df_raw.saleYear-df_raw.YearMade

In [ ]:
df, y, nas, mapper = proc_df(df_raw, 'SalePrice', max_n_cat=10, do_scale=True)

In [ ]:
def split_vals(a,n): return a[:n], a[n:]
n_valid = 12000
n_trn = len(df)-n_valid
y_train, y_valid = split_vals(y, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)

In [ ]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

Linear regression for Bulldozers

Data scaling


In [ ]:
df.describe().transpose()


Out[ ]:
count mean std min 25% 50% 75% max
SalesID 401125.0 3.147483e-15 1.000001 -0.858580 -0.551518 -0.308344 0.355322 4.855369
MachineID 401125.0 2.895953e-15 1.000001 -2.761738 -0.292989 0.139657 0.567277 2.876309
ModelID 401125.0 -1.300773e-13 1.000001 -1.102854 -0.583548 -0.367372 0.294819 4.871330
datasource 401125.0 -6.855920e-13 1.000001 -1.524823 -0.297450 -0.297450 0.148868 4.165727
auctioneerID 401125.0 -1.064768e-13 1.000001 -0.381738 -0.321407 -0.261075 -0.140412 5.591096
YearMade 401125.0 6.797770e-16 1.000001 -3.081446 0.294188 0.328458 0.345593 0.390145
MachineHoursCurrentMeter 401125.0 8.635210e-14 1.000001 -0.074396 -0.074396 -0.074396 -0.074396 150.038499
fiModelDesc 401125.0 1.673709e+03 1263.331163 1.000000 631.000000 1395.000000 2292.000000 4999.000000
fiBaseModel 401125.0 5.591654e+02 469.310266 1.000000 206.000000 406.000000 704.000000 1950.000000
fiSecondaryDesc 401125.0 3.668504e+01 38.228243 0.000000 0.000000 29.000000 57.000000 175.000000
fiModelSeries 401125.0 9.192686e+00 27.006859 0.000000 0.000000 0.000000 0.000000 122.000000
fiModelDescriptor 401125.0 1.223343e+01 29.041950 0.000000 0.000000 0.000000 0.000000 139.000000
fiProductClassDesc 401125.0 3.226222e+01 22.596618 1.000000 11.000000 35.000000 52.000000 74.000000
state 401125.0 2.350832e+01 15.732898 1.000000 9.000000 22.000000 41.000000 53.000000
Hydraulics 401125.0 4.253801e+00 4.809368 0.000000 1.000000 1.000000 12.000000 12.000000
Tire_Size 401125.0 3.059847e+00 5.950781 0.000000 0.000000 0.000000 0.000000 17.000000
Undercarriage_Pad_Width 401125.0 4.311895e+00 7.741894 0.000000 0.000000 0.000000 0.000000 19.000000
Stick_Length 401125.0 6.535564e+00 11.923308 0.000000 0.000000 0.000000 0.000000 29.000000
saleYear 401125.0 -2.781277e-13 1.000001 -2.623435 -0.711783 0.330937 0.852297 1.199870
saleMonth 401125.0 2.443335e-14 1.000001 -1.578894 -0.994879 -0.118857 0.757165 1.633187
saleWeek 401125.0 -9.469515e-15 1.000001 -1.702718 -0.891252 -0.079785 0.866926 1.813637
saleDay 401125.0 -1.649883e-15 1.000001 -1.793028 -0.843733 -0.013101 0.817532 1.766827
saleDayofweek 401125.0 5.134902e-14 1.000001 -1.850225 -0.427508 0.283850 0.283850 2.417924
saleDayofyear 401125.0 3.377684e-15 1.000001 -1.718597 -0.926784 -0.115659 0.878936 1.786624
saleis_month_end 401125.0 3.491732e-14 1.000001 -0.186900 -0.186900 -0.186900 -0.186900 5.350455
saleis_month_start 401125.0 8.367190e-14 1.000001 -0.161180 -0.161180 -0.161180 -0.161180 6.204230
saleis_quarter_end 401125.0 4.067403e-14 1.000001 -0.124679 -0.124679 -0.124679 -0.124679 8.020586
saleis_quarter_start 401125.0 1.443002e-14 1.000001 -0.079495 -0.079495 -0.079495 -0.079495 12.579339
saleis_year_end 401125.0 -1.624260e-14 1.000001 -0.001579 -0.001579 -0.001579 -0.001579 633.343509
saleis_year_start 401125.0 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ...
Blade_Type_Angle 401125.0 4.143347e-03 0.064235 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_Coal 401125.0 2.742287e-05 0.005237 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_Landfill 401125.0 6.232471e-05 0.007894 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_No 401125.0 1.852290e-03 0.042998 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_None or Unspecified 401125.0 2.849735e-02 0.166389 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_PAT 401125.0 9.625927e-02 0.294947 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_Semi U 401125.0 2.148208e-02 0.144985 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_Straight 401125.0 3.321409e-02 0.179195 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_U 401125.0 4.641945e-03 0.067974 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_VPAT 401125.0 8.842630e-03 0.093619 0.000000 0.000000 0.000000 0.000000 1.000000
Blade_Type_nan 401125.0 8.009773e-01 0.399266 0.000000 1.000000 1.000000 1.000000 1.000000
Travel_Controls_1 Speed 401125.0 2.492988e-05 0.004993 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_2 Pedal 401125.0 2.846993e-03 0.053281 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_Differential Steer 401125.0 1.216329e-02 0.109615 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_Finger Tip 401125.0 6.541602e-03 0.080615 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_Lever 401125.0 2.094110e-03 0.045714 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_None or Unspecified 401125.0 1.743172e-01 0.379383 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_Pedal 401125.0 1.037083e-03 0.032187 0.000000 0.000000 0.000000 0.000000 1.000000
Travel_Controls_nan 401125.0 8.009748e-01 0.399268 0.000000 1.000000 1.000000 1.000000 1.000000
Differential_Type_Limited Slip 401125.0 2.817077e-03 0.053001 0.000000 0.000000 0.000000 0.000000 1.000000
Differential_Type_Locking 401125.0 4.985977e-06 0.002233 0.000000 0.000000 0.000000 0.000000 1.000000
Differential_Type_No Spin 401125.0 5.135556e-04 0.022656 0.000000 0.000000 0.000000 0.000000 1.000000
Differential_Type_Standard 401125.0 1.697052e-01 0.375374 0.000000 0.000000 0.000000 0.000000 1.000000
Differential_Type_nan 401125.0 8.269592e-01 0.378283 0.000000 1.000000 1.000000 1.000000 1.000000
Steering_Controls_Command Control 401125.0 1.338735e-03 0.036564 0.000000 0.000000 0.000000 0.000000 1.000000
Steering_Controls_Conventional 401125.0 1.712160e-01 0.376698 0.000000 0.000000 0.000000 0.000000 1.000000
Steering_Controls_Four Wheel Standard 401125.0 3.440324e-04 0.018545 0.000000 0.000000 0.000000 0.000000 1.000000
Steering_Controls_No 401125.0 2.492988e-06 0.001579 0.000000 0.000000 0.000000 0.000000 1.000000
Steering_Controls_Wheel 401125.0 3.490184e-05 0.005908 0.000000 0.000000 0.000000 0.000000 1.000000
Steering_Controls_nan 401125.0 8.270639e-01 0.378193 0.000000 1.000000 1.000000 1.000000 1.000000

192 rows × 8 columns


In [ ]:
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
m = LinearRegression().fit(X_train, y_train)
m.score(X_valid, y_valid)


Out[ ]:
0.31399415664841568

In [ ]:
m.score(X_train, y_train)


Out[ ]:
0.60966668735322171

In [ ]:
preds = m.predict(X_valid)

In [ ]:
rmse(preds, y_valid)


Out[ ]:
0.6197833720320716

In [ ]:
plt.scatter(preds, y_valid, alpha=0.1, s=2);


Feature selection from RF


In [ ]:
keep_cols = list(np.load('tmp/keep_cols.npy'))
', '.join(keep_cols)


Out[ ]:
'YearMade, Coupler_System, ProductSize, fiProductClassDesc, ModelID, Hydraulics_Flow, saleElapsed, fiSecondaryDesc, Enclosure, fiModelDesc, SalesID, fiModelDescriptor, MachineID, ProductGroupDesc, ProductGroup, Drive_System, Hydraulics, state, saleDay, Tire_Size, saleDayofyear'

In [ ]:
df_sub = df_raw[keep_cols+['age', 'SalePrice']]

In [ ]:
df, y, nas, mapper = proc_df(df_sub, 'SalePrice', max_n_cat=10, do_scale=True)

In [ ]:
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
m = LinearRegression().fit(X_train, y_train)
m.score(X_valid, y_valid)


Out[ ]:
0.67602943126238957

In [ ]:
rmse(m.predict(X_valid), y_valid)


Out[ ]:
0.42592074374245537

In [ ]:
from operator import itemgetter

In [ ]:
sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))


Out[ ]:
[('saleElapsed', -145.64883869187287),
 ('ProductSize_Compact', -0.54007475122954363),
 ('ProductSize_Mini', -0.50048058229134851),
 ('Drive_System_Two Wheel Drive', -0.26738317785805954),
 ('ProductGroupDesc_Skid Steer Loaders', -0.24922453667794642),
 ('ProductGroup_SSL', -0.24922453667794109),
 ('Enclosure_None or Unspecified', -0.24279691741993142),
 ('Enclosure_nan', -0.18422637193293984),
 ('Drive_System_nan', -0.17481049937861393),
 ('Hydraulics_Flow_Standard', -0.16350014357513842),
 ('Enclosure_EROPS AC', -0.13844787814745835),
 ('Enclosure_OROPS', -0.11883972857575564),
 ('ProductSize_nan', -0.091923065651293712),
 ('MachineID', -0.074919067087428459),
 ('Drive_System_Four Wheel Drive', -0.070763049128190758),
 ('Enclosure_EROPS', -0.054481426247864562),
 ('Hydraulics_Flow_High Flow', -0.04086147315765426),
 ('ProductGroupDesc_Track Type Tractors', -0.0040345196773952097),
 ('ProductGroup_TTT', -0.0040345196773863279),
 ('Coupler_System_nan', -0.0033786325283386986),
 ('Tire_Size', -0.0029100702750870511),
 ('saleDay', -0.0018416299990260399),
 ('SalesID', -0.00054830445029807606),
 ('state', -0.00024678007480360975),
 ('Coupler_System_None or Unspecified', -5.2370856060690585e-06),
 ('ProductGroupDesc_nan', 0.0),
 ('ProductGroup_nan', 0.0),
 ('fiModelDesc', 4.2934486685908269e-06),
 ('Hydraulics', 0.00040543370646162202),
 ('fiModelDescriptor', 0.0021681557153456765),
 ('fiSecondaryDesc', 0.0027035894408982131),
 ('Coupler_System_Yes', 0.0033838696139043556),
 ('Hydraulics_Flow_nan', 0.004228336726749049),
 ('fiProductClassDesc', 0.0056127773827328031),
 ('ModelID', 0.015501818523116384),
 ('ProductGroup_WL', 0.02704461444028361),
 ('ProductGroupDesc_Wheel Loader', 0.027044614440295156),
 ('ProductGroup_TEX', 0.052690045537289176),
 ('ProductGroupDesc_Track Excavators', 0.052690045537290064),
 ('ProductGroupDesc_Backhoe Loaders', 0.059547357765127318),
 ('ProductGroup_BL', 0.05954735776512976),
 ('ProductGroup_MG', 0.11397703861262443),
 ('ProductGroupDesc_Motor Graders', 0.1139770386126262),
 ('Drive_System_No', 0.17580439323345765),
 ('ProductSize_Small', 0.17871767761687707),
 ('Hydraulics_Flow_None or Unspecified', 0.20013328000603833),
 ('ProductSize_Medium', 0.25052367123948038),
 ('ProductSize_Large', 0.28820495005131264),
 ('Enclosure_EROPS w AC', 0.28910815935957146),
 ('Drive_System_All Wheel Drive', 0.33715233313139936),
 ('ProductSize_Large / Medium', 0.41503210026451676),
 ('Enclosure_NO ROPS', 0.44968416296443187),
 ('saleDayofyear', 7.1497706071154195),
 ('YearMade', 145.69251462398279),
 ('age', 182.96050934738616)]

In [ ]:
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)


Out[ ]:
0.634974200475658

In [ ]:
rmse(m.predict(X_valid), y_valid)


Out[ ]:
0.452103421164379

In [ ]:
m.alpha_


Out[ ]:
0.015888143202334452

In [ ]:
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs


Out[ ]:
[('ProductSize_Mini', -0.32283011739021739),
 ('ProductGroupDesc_Skid Steer Loaders', -0.29197866480532003),
 ('ProductSize_nan', -0.16481153369955914),
 ('Enclosure_OROPS', -0.10072067167233209),
 ('MachineID', -0.077025231732003333),
 ('age', -0.07146167736242677),
 ('saleDayofyear', -0.005420434471588245),
 ('ProductGroup_SSL', -0.00026018612542580649),
 ('state', -0.00020510032574012273),
 ('ModelID', -0.0),
 ('saleElapsed', -0.0),
 ('SalesID', -0.0),
 ('Hydraulics', -0.0),
 ('saleDay', -0.0),
 ('Coupler_System_None or Unspecified', -0.0),
 ('Coupler_System_Yes', -0.0),
 ('ProductSize_Compact', -0.0),
 ('ProductSize_Large', 0.0),
 ('ProductSize_Medium', 0.0),
 ('ProductSize_Small', 0.0),
 ('Hydraulics_Flow_High Flow', -0.0),
 ('Hydraulics_Flow_None or Unspecified', 0.0),
 ('Hydraulics_Flow_Standard', -0.0),
 ('Hydraulics_Flow_nan', 0.0),
 ('Enclosure_EROPS', 0.0),
 ('Enclosure_EROPS AC', -0.0),
 ('Enclosure_NO ROPS', 0.0),
 ('Enclosure_None or Unspecified', -0.0),
 ('Enclosure_nan', -0.0),
 ('ProductGroupDesc_Backhoe Loaders', -0.0),
 ('ProductGroupDesc_Track Excavators', 0.0),
 ('ProductGroupDesc_Track Type Tractors', 0.0),
 ('ProductGroupDesc_Wheel Loader', -0.0),
 ('ProductGroupDesc_nan', 0.0),
 ('ProductGroup_BL', -0.0),
 ('ProductGroup_MG', 0.0),
 ('ProductGroup_TEX', 0.0),
 ('ProductGroup_TTT', 0.0),
 ('ProductGroup_WL', -0.0),
 ('ProductGroup_nan', 0.0),
 ('Drive_System_All Wheel Drive', 0.0),
 ('Drive_System_Four Wheel Drive', 0.0),
 ('Drive_System_No', 0.0),
 ('Drive_System_Two Wheel Drive', -0.0),
 ('Drive_System_nan', -0.0),
 ('fiModelDesc', 5.509329640857254e-06),
 ('fiProductClassDesc', 0.0022891561283628222),
 ('fiSecondaryDesc', 0.0030514019161731946),
 ('fiModelDescriptor', 0.0034724295356978328),
 ('Tire_Size', 0.0095336232150601847),
 ('YearMade', 0.088383969701718024),
 ('ProductGroupDesc_Motor Graders', 0.11373349421170943),
 ('ProductSize_Large / Medium', 0.15991368864779024),
 ('Coupler_System_nan', 0.2023810180165819),
 ('Enclosure_EROPS w AC', 0.34461861098943347)]

In [ ]:
skip = [n for n,c in coefs if abs(c)<0.01]

In [ ]:
df.drop(skip, axis=1, inplace=True)

# for n,c in df.items():
#     if '_' not in n: df[n+'2'] = df[n]**2

In [ ]:
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)


Out[ ]:
0.63678168860529394

In [ ]:
rmse(m.predict(X_valid), y_valid)


Out[ ]:
0.4509826980922721

In [ ]:
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs


Out[ ]:
[('ProductSize_Mini', -0.77420353132504083),
 ('ProductSize_nan', -0.37649627504627403),
 ('ProductGroupDesc_Skid Steer Loaders', -0.31232507784064917),
 ('MachineID', -0.092899281127635194),
 ('Enclosure_OROPS', -0.034526809361612412),
 ('age', -0.024367657576137017),
 ('ProductSize_Large / Medium', 0.08968235308777571),
 ('YearMade', 0.15983396599807115),
 ('Enclosure_EROPS w AC', 0.40652833657308551),
 ('Coupler_System_nan', 0.42191856864836064),
 ('ProductGroupDesc_Motor Graders', 0.49456961585974812)]

In [ ]:
np.savez(f'{PATH}tmp/regr_resid', m.predict(X_train), m.predict(X_valid))

In [ ]: