In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Apply the "ggplot" style sheet to matplotlib figures created after this point.
plt.style.use("ggplot")
In [3]:
%matplotlib` inline
In [41]:
# Load the housing dataset from a remote CSV into a DataFrame.
# NOTE(review): the URL goes through a link shortener -- confirm it still
# resolves, and consider pointing at the direct dataset location instead.
data = pd.read_csv("http://bit.do/df-housing")
In [44]:
# Keep the target (SalePrice) together with the eight columns that the
# feature/target split below works from.
df = data[
    [
        'SalePrice',
        'OverallQual',
        'GrLivArea',
        'GarageCars',
        'TotalBsmtSF',
        'FullBath',
        'YearBuilt',
        'FireplaceQu',
        'LotFrontage',
    ]
]
In [49]:
# Preview the first rows of the selected columns.
df.head()
Out[49]:
In [56]:
# Split the frame into the target series and the remaining feature columns.
y_raw = df['SalePrice']
X_raw = df.drop(columns='SalePrice')
In [59]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
In [73]:
# Column groups handed to the ColumnTransformer: one categorical feature and
# seven numeric ones. Order is kept as-is because it fixes the column order of
# the transformed output.
cat_cols = ['FireplaceQu']
num_cols = [
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
    'OverallQual',
    'LotFrontage',
]
In [74]:
# Categorical branch: replace missing values with the sentinel label
# 'MISSING', then one-hot encode into a dense array.
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and newer releases reject the old spelling with a
# TypeError. Try the current name first and fall back to the legacy one so the
# cell runs on either side of the rename.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_ohe_step = ('ohe', cat_encoder)
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)
cat_transformers = [('cat', cat_pipe, cat_cols)]
In [75]:
# Numeric branch: fill missing values with the column median, then
# standardize. Step names ('si', 'ss') are referenced by the grid-search
# parameter keys, so they must stay as they are.
num_steps = [
    ('si', SimpleImputer(strategy='median')),
    ('ss', StandardScaler()),
]
num_si_step, num_ss_step = num_steps
num_pipe = Pipeline(steps=num_steps)
num_transformers = [('num', num_pipe, num_cols)]
In [76]:
# Route each column group through its own preprocessing pipeline.
transformers = [
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
]
ct = ColumnTransformer(transformers=transformers)

# Fit on `X_raw`, the feature frame built from `df`; no variable named `X`
# is ever assigned in this notebook.
X_encoded = ct.fit_transform(X_raw)
X_encoded.shape
Out[76]:
In [78]:
from sklearn.linear_model import Ridge
In [79]:
# Chain preprocessing and the ridge regressor so both are fit in one call.
# Step names ('transform', 'ridge') are used by the grid-search parameter keys.
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])

# Train on the raw feature frame / target series (`X_raw`, `y_raw`); the
# pipeline performs imputation, encoding and scaling itself.
ml_pipe.fit(X_raw, y_raw)
Out[79]:
In [81]:
# Score on the same data the pipeline was fit on, so this is an in-sample
# (optimistic) figure; the cross-validated estimate is computed separately.
ml_pipe.score(X_raw, y_raw)
Out[81]:
In [83]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Mean score across the folds; the whole pipeline is re-fit inside each
# fold, using the raw features/target (`X_raw`, `y_raw`).
cross_val_score(ml_pipe, X_raw, y_raw, cv=kf).mean()
Out[83]:
In [85]:
from sklearn.model_selection import GridSearchCV

# Keys follow the <step>__<sub-step>__<parameter> path through the nested
# pipeline: 'transform' -> 'num' -> 'si' reaches the numeric imputer, and
# 'ridge' reaches the regressor.
param_grid = {
    'transform__num__si__strategy': ['mean', 'median'],
    'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000],
}

# Search every combination using the same K-fold splitter as the
# cross-validation cell, fitting on the raw features/target.
gs = GridSearchCV(ml_pipe, param_grid, cv=kf)
gs.fit(X_raw, y_raw)
gs.best_params_
Out[85]:
In [ ]: