A Linear Model for Bulldozers


In [ ]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [ ]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

In [ ]:
# Configure plot sizing via the fastai helper.
# NOTE(review): the three positional values are presumably small/medium/large font sizes — confirm against fastai.
set_plot_sizes(12,14,16)

Load in our data from last lesson


In [ ]:
# Dataset folder; in this notebook it is only used when saving predictions in the last cell.
PATH = "data/bulldozers/"

# Read the raw frame from a feather file.
# NOTE(review): this path is relative to the working directory and does not go through PATH — confirm that is intended.
df_raw = pd.read_feather('tmp/raw')

In [ ]:
# Derived feature: difference between the saleYear and YearMade columns.
df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']

In [ ]:
# Pre-process the raw frame with 'SalePrice' as the target field, scaling enabled.
# NOTE(review): nas is presumably the missing-value fill info and mapper the fitted
# scaler — confirm against fastai.structured.proc_df.
df, y, nas, mapper = proc_df(
    df_raw,
    'SalePrice',
    max_n_cat=10,
    do_scale=True,
)

In [ ]:
def split_vals(a, n):
    """Split ``a`` at position ``n``.

    Returns a ``(head, tail)`` tuple where ``head`` is ``a[:n]`` and ``tail``
    is ``a[n:]``. Works for any object that supports slicing.
    """
    head = a[:n]
    tail = a[n:]
    return head, tail
# The last 12,000 rows form the validation set; everything before them is training.
n_valid = 12_000
n_trn = len(df) - n_valid
# Cut the raw frame and the targets at the same position.
raw_train, raw_valid = split_vals(df_raw, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [ ]:
def rmse(x, y):
    """Return the root-mean-squared difference between ``x`` and ``y``.

    ``x - y`` must yield an object with a ``.mean()`` method (e.g. a NumPy
    array or pandas Series).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

Linear regression for Bulldozers

Data scaling


In [ ]:
# Summary statistics, transposed so that each feature is shown as a row.
df.describe().T

In [ ]:
# Split the processed features at n_trn: rows before it train, rows from it onward validate.
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
# Fit a linear regression on the training rows, then display its score (R^2)
# on the validation rows.
m = LinearRegression()
m.fit(X_train, y_train)
m.score(X_valid, y_valid)

In [ ]:
# Score on the training rows, for comparison with the validation score.
m.score(X_train, y_train)

In [ ]:
# Keep the validation-set predictions for the error metric and the scatter plot.
preds = m.predict(X_valid)

In [ ]:
# Root-mean-squared error of the predictions against the validation targets.
rmse(preds, y_valid)

In [ ]:
# Scatter of predictions (x axis) against the validation targets (y axis);
# small, mostly transparent markers keep dense regions readable.
# The trailing semicolon suppresses the returned object's repr in the output.
plt.scatter(preds, y_valid, alpha=0.1, s=2);

Feature selection from RF


In [ ]:
# Load the saved array of column names to keep and show them as one comma-separated string.
# NOTE(review): if the array was saved with object dtype, recent NumPy versions
# require allow_pickle=True here — confirm how tmp/keep_cols.npy was written.
keep_cols = list(np.load('tmp/keep_cols.npy'))
', '.join(keep_cols)

In [ ]:
# Restrict the raw frame to the kept columns plus the derived 'age' feature and the target.
# NOTE(review): assumes keep_cols does not already contain 'age' or 'SalePrice'
# (that would duplicate columns) — confirm.
df_sub = df_raw[keep_cols+['age', 'SalePrice']]

In [ ]:
# Re-run preprocessing on the reduced column set.
# With do_scale=True, fastai's proc_df returns (df, y, na_dict, mapper), so the
# results are unpacked in that order — the same order used for the earlier
# proc_df call on df_raw. The previous `mapper, nas` order bound each name to
# the other's object.
df, y, nas, mapper = proc_df(df_sub, 'SalePrice', max_n_cat=10, do_scale=True)

In [ ]:
# Split the re-processed (reduced) feature frame at the same cut point n_trn.
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
# Refit the linear regression on the reduced feature set and display its
# validation score.
m = LinearRegression()
m.fit(X_train, y_train)
m.score(X_valid, y_valid)

In [ ]:
# Validation RMSE of the model fitted on the reduced feature set.
rmse(m.predict(X_valid), y_valid)

In [ ]:
from operator import itemgetter

In [ ]:
# (feature name, coefficient) pairs, ordered from the smallest coefficient to the largest.
sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))

In [ ]:
# Fit a cross-validated Lasso on the training rows and display its validation score.
m = LassoCV()
m.fit(X_train, y_train)
m.score(X_valid, y_valid)

In [ ]:
# Validation RMSE of the Lasso model.
rmse(m.predict(X_valid), y_valid)

In [ ]:
# Regularisation strength selected by LassoCV's cross-validation.
m.alpha_

In [ ]:
# (feature name, coefficient) pairs for the Lasso fit, sorted by coefficient value;
# kept in `coefs` so weak features can be filtered out afterwards.
coefs = sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))
coefs

In [ ]:
# Names of features whose Lasso coefficient has an absolute value below 0.01.
skip = [name for name, coef in coefs if abs(coef) < 0.01]

In [ ]:
# Remove the near-zero-coefficient features listed in `skip`.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: columns that were already dropped are skipped
# instead of raising KeyError.
df = df.drop(skip, axis=1, errors='ignore')

# Disabled experiment: add a squared copy of every column whose name has no underscore.
# for n,c in df.items():
#     if '_' not in n: df[n+'2'] = df[n]**2

In [ ]:
# Re-split after dropping columns so the train/validation frames use the pruned feature set.
X_train, X_valid = split_vals(df, n_trn)

In [ ]:
# Refit the cross-validated Lasso on the pruned feature set and display its validation score.
m = LassoCV()
m.fit(X_train, y_train)
m.score(X_valid, y_valid)

In [ ]:
# Validation RMSE of the Lasso model fitted on the pruned feature set.
rmse(m.predict(X_valid), y_valid)

In [ ]:
# Coefficients of the refitted Lasso, as (feature name, coefficient) pairs sorted by value.
coefs = sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))
coefs

In [ ]:
# Persist the final model's predictions for the training and validation rows.
import os

# np.savez does not create missing directories, so make sure the target folder exists.
os.makedirs(f'{PATH}tmp', exist_ok=True)

# The arrays are passed positionally, so they are stored under the keys
# 'arr_0' (training predictions) and 'arr_1' (validation predictions);
# np.savez appends the '.npz' extension to the file name.
# NOTE(review): the file is named "regr_resid" but holds raw predictions, not
# residuals — confirm what downstream readers expect.
np.savez(f'{PATH}tmp/regr_resid', m.predict(X_train), m.predict(X_valid))

In [ ]: