Machine Learning Supervised Pipeline

Frame

Supervised Learning - Regression

  • y: Predict Sale Price
  • X: Features about the house
  • score: Mean Squared Error (the cells below report scikit-learn's default R² score; an MSE sketch follows this list)
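
The framing metric is MSE, but Ridge's score method and cross_val_score default to R². A minimal sketch of scoring with MSE instead, assuming the ml_pipe, X, and y defined further down:

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Cross-validated MSE via the scoring API (sign flipped because that API treats higher as better)
mse_cv = -cross_val_score(ml_pipe, X, y, cv=5, scoring='neg_mean_squared_error').mean()

# In-sample MSE from an already-fitted pipeline
mse_train = mean_squared_error(y, ml_pipe.predict(X))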

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [3]:
%matplotlib inline

Acquire


In [41]:
data = pd.read_csv("http://bit.do/df-housing")

In [44]:
df = data[['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt', 'FireplaceQu', 'LotFrontage']]

In [49]:
df.head()


Out[49]:
   SalePrice  OverallQual  GrLivArea  GarageCars  TotalBsmtSF  FullBath  YearBuilt FireplaceQu  LotFrontage
0     208500            7       1710           2          856         2       2003         NaN         65.0
1     181500            6       1262           2         1262         2       1976          TA         80.0
2     223500            7       1786           2          920         2       2001          TA         68.0
3     140000            7       1717           3          756         1       1915          Gd         60.0
4     250000            8       2198           3         1145         2       2000          TA         84.0
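
FireplaceQu is already NaN in the first row; a quick missingness check shows which of the selected columns need the imputers used in the pipeline below:

# Count missing values per selected column
df.isnull().sum().sort_values(ascending=False)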

In [56]:
X = df.drop('SalePrice', axis=1)  # feature matrix (later cells reference X and y)
y = df.SalePrice                  # target

Pipeline

  • Refine
  • Transform

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [73]:
cat_cols = ['FireplaceQu']
num_cols = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt', 'OverallQual', 'LotFrontage']

In [74]:
cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
cat_ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(cat_steps)
cat_transformers = [('cat', cat_pipe, cat_cols)]

In [75]:
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)
num_transformers = [('num', num_pipe, num_cols)]

In [76]:
transformers = [('cat', cat_pipe, cat_cols),
                ('num', num_pipe, num_cols)]
ct = ColumnTransformer(transformers=transformers)
X_encoded = ct.fit_transform(X)
X_encoded.shape


Out[76]:
(1460, 13)
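
The 13 columns are the 7 scaled numeric features plus one indicator column per FireplaceQu category learned by the encoder (its quality codes plus the 'MISSING' constant). A sketch for inspecting the fitted encoder inside the ColumnTransformer:

# Categories learned by the one-hot encoder (available after fit_transform)
ct.named_transformers_['cat'].named_steps['ohe'].categories_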

In [78]:
from sklearn.linear_model import Ridge

In [79]:
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])
ml_pipe.fit(X, y)


Out[79]:
Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('cat', Pipeline(steps=[('si', SimpleImputer(...)),
                                                                          ('ohe', OneHotEncoder(...))]),
                                                  ['FireplaceQu']),
                                                 ('num', Pipeline(steps=[('si', SimpleImputer(...)),
                                                                          ('ss', StandardScaler())]),
                                                  [...])])),
                ('ridge', Ridge(fit_intercept=True, max_iter=None, normalize=False,
                                random_state=None, solver='auto', tol=0.001))])

In [81]:
ml_pipe.score(X, y)


Out[81]:
0.7772097032829757
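
This 0.777 is an R² score computed on the same rows the pipeline was fit on, so it is optimistic. A quick held-out check before the cross-validation below, as a sketch using an arbitrary 80/20 split:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
ml_pipe.fit(X_train, y_train)
ml_pipe.score(X_test, y_test)  # R² on unseen rows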

In [83]:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=123)
cross_val_score(ml_pipe, X, y, cv=kf).mean()


Out[83]:
0.7550365007527828

In [85]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'transform__num__si__strategy': ['mean', 'median'],
    'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000],
    }
gs = GridSearchCV(ml_pipe, param_grid, cv=kf)
gs.fit(X, y)
gs.best_params_


Out[85]:
{'ridge__alpha': 50, 'transform__num__si__strategy': 'median'}
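
Beyond best_params_, the fitted search exposes the best mean cross-validated score and the full results grid, and best_estimator_ is the pipeline refitted with the winning parameters. A short sketch for inspecting them:

gs.best_score_  # mean CV R² for the best parameter combination

# Full grid of results, sorted by rank
pd.DataFrame(gs.cv_results_).sort_values('rank_test_score').head()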
