Important: This notebook will only work with fastai-0.7.x. Do not try to run any fastai-1.x code from this path in the repository because it will load fastai-0.7.x.
In [ ]:
%load_ext autoreload
%autoreload 2
In [ ]:
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
In [ ]:
# set_plot_sizes is not defined in this notebook; it arrives through the fastai
# star-imports. NOTE(review): the three arguments are presumably small/medium/
# large matplotlib font sizes — confirm against the fastai 0.7 source.
set_plot_sizes(12,14,16)
In [ ]:
# Dataset directory prefix; in this notebook it is only used when the
# predictions are saved in the last cell.
PATH = "data/bulldozers/"
# NOTE(review): this read is relative to the working directory and does not go
# through PATH, whereas the final np.savez writes under f'{PATH}tmp/' —
# confirm which location is intended.
df_raw = pd.read_feather('tmp/bulldozers-raw')
In [ ]:
# Derived feature: sale year minus YearMade (presumably the machine's age at
# sale time, in years).
df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']
In [ ]:
# Turn the raw frame into model inputs, with 'SalePrice' split off as the
# target y. proc_df is not defined here; it comes from the fastai star-imports.
# NOTE(review): max_n_cat=10 presumably one-hot encodes low-cardinality
# categoricals and do_scale=True presumably standardises numeric columns —
# confirm against the fastai 0.7 source.
df, y, nas, mapper = proc_df(df_raw, 'SalePrice', max_n_cat=10, do_scale=True)
In [ ]:
def split_vals(a,n):
    """Split `a` at position `n`.

    Returns a 2-tuple: the first `n` items (`a[:n]`) and the remainder
    (`a[n:]`). Works on anything that supports slicing.
    """
    head = a[:n]
    tail = a[n:]
    return head, tail
# Hold out the last n_valid rows for validation; everything before them is the
# training set (split_vals slices at n_trn).
n_valid = 12000
n_trn = len(df)-n_valid
y_train, y_valid = split_vals(y, n_trn)
# Keep a matching split of the unprocessed frame as well.
raw_train, raw_valid = split_vals(df_raw, n_trn)
In [ ]:
def rmse(x,y):
    """Return the root-mean-squared difference between `x` and `y`.

    The difference `x - y` must support `** 2` and `.mean()` (e.g. NumPy
    arrays); the mean squared difference is passed to `math.sqrt`, so the
    result is a plain float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())
In [ ]:
# Summary statistics of the processed frame, transposed so that each column of
# df becomes one row of the displayed table.
df.describe().transpose()
Out[ ]:
In [ ]:
# Split the features at the same row boundary (n_trn) used for y_train/y_valid
# so the feature and target splits line up.
X_train, X_valid = split_vals(df, n_trn)
In [ ]:
# Baseline: a plain linear regression fitted on the training split.
m = LinearRegression()
m.fit(X_train, y_train)
# Evaluate on the held-out validation rows (shown as the cell output).
m.score(X_valid, y_valid)
Out[ ]:
In [ ]:
# Same score on the training split, for comparison with the validation score.
m.score(X_train, y_train)
Out[ ]:
In [ ]:
# Validation-set predictions, reused by the RMSE and scatter-plot cells.
preds = m.predict(X_valid)
In [ ]:
# Root-mean-squared error of the validation predictions.
rmse(preds, y_valid)
Out[ ]:
In [ ]:
# Predicted (x axis) versus actual (y axis) validation targets, drawn with
# small, mostly transparent markers. The trailing semicolon keeps the notebook
# from echoing scatter()'s return value.
plt.scatter(preds, y_valid, alpha=0.1, s=2);
In [ ]:
# Column names loaded from a .npy file that is not created in the visible part
# of this notebook. NOTE(review): presumably the features retained by an
# earlier feature-selection step — confirm where tmp/keep_cols.npy is written.
keep_cols = list(np.load('tmp/keep_cols.npy'))
# Display them as one comma-separated string.
', '.join(keep_cols)
Out[ ]:
In [ ]:
# Restrict the raw frame to the kept columns plus the derived 'age' feature
# and the 'SalePrice' target.
df_sub = df_raw[[*keep_cols, 'age', 'SalePrice']]
In [ ]:
# Re-run the same preprocessing on the reduced frame; this rebinds df, y, nas
# and mapper. NOTE(review): y_train/y_valid are NOT re-split after this call;
# the later fits reuse the split made from the first proc_df call, which
# assumes the target values and row order are unchanged — confirm.
df, y, nas, mapper = proc_df(df_sub, 'SalePrice', max_n_cat=10, do_scale=True)
In [ ]:
# Split the reduced feature frame at the same row boundary (n_trn) that was
# used for y_train/y_valid.
X_train, X_valid = split_vals(df, n_trn)
In [ ]:
# Refit the linear regression on the current (reduced) training features.
m = LinearRegression()
m.fit(X_train, y_train)
# Evaluate on the held-out validation rows (shown as the cell output).
m.score(X_valid, y_valid)
Out[ ]:
In [ ]:
# Validation RMSE of the model currently bound to m.
rmse(m.predict(X_valid), y_valid)
Out[ ]:
In [ ]:
from operator import itemgetter
In [ ]:
# Pair every feature name with its fitted coefficient and show the pairs in
# ascending coefficient order. sorted() consumes the zip iterator directly, so
# no intermediate list is needed.
sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))
Out[ ]:
In [ ]:
# Switch to LassoCV with its default settings, fitted on the training split.
m = LassoCV()
m.fit(X_train, y_train)
# Evaluate on the held-out validation rows (shown as the cell output).
m.score(X_valid, y_valid)
Out[ ]:
In [ ]:
# Validation RMSE of the model currently bound to m.
rmse(m.predict(X_valid), y_valid)
Out[ ]:
In [ ]:
# Penalty strength that LassoCV selected by cross-validation (the alpha_
# attribute documented by scikit-learn).
m.alpha_
Out[ ]:
In [ ]:
# (feature name, coefficient) pairs for the fitted model, in ascending
# coefficient order. sorted() accepts the zip iterator directly, so the extra
# list() wrapper is unnecessary.
coefs = sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))
# Display the pairs as the cell output.
coefs
Out[ ]:
In [ ]:
# Names of the features whose fitted coefficient has magnitude below 0.01.
skip = [name for name, weight in coefs if abs(weight) < 0.01]
In [ ]:
# Remove the columns listed in skip from df, modifying df in place.
df.drop(skip, axis=1, inplace=True)
# Disabled experiment: add a squared copy (named <col>2) of every column whose
# name contains no underscore.
# for n,c in df.items():
#     if '_' not in n: df[n+'2'] = df[n]**2
In [ ]:
# Re-split after the in-place column drop so X_train/X_valid reflect the
# reduced df; the row boundary n_trn is unchanged.
X_train, X_valid = split_vals(df, n_trn)
In [ ]:
# Fit a fresh LassoCV on the current training features.
m = LassoCV()
m.fit(X_train, y_train)
# Evaluate on the held-out validation rows (shown as the cell output).
m.score(X_valid, y_valid)
Out[ ]:
In [ ]:
# Validation RMSE of the model currently bound to m.
rmse(m.predict(X_valid), y_valid)
Out[ ]:
In [ ]:
# Recompute the (feature name, coefficient) pairs for the model currently
# bound to m, in ascending coefficient order. sorted() accepts the zip
# iterator directly, so the extra list() wrapper is unnecessary.
coefs = sorted(zip(X_valid.columns, m.coef_), key=itemgetter(1))
# Display the pairs as the cell output.
coefs
Out[ ]:
In [ ]:
# Save the model's predictions for the training and validation splits. The
# arrays are passed positionally, so np.savez stores them under its default
# keys (arr_0 = training, arr_1 = validation).
# NOTE(review): the file is named 'regr_resid' but what is written here are
# predictions, not residuals (y - prediction) — confirm what downstream
# consumers expect. This path also goes through PATH, unlike the reads of
# 'tmp/bulldozers-raw' and 'tmp/keep_cols.npy' earlier in the notebook.
np.savez(f'{PATH}tmp/regr_resid', m.predict(X_train), m.predict(X_valid))
In [ ]: