In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, metrics, linear_model, pipeline, ensemble
import numpy as np

import seaborn as sns

import scipy

np.set_printoptions(suppress=True, precision=4)

%matplotlib inline

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = 10, 6

In [2]:
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2919 non-null object
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            2919 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2919 non-null object
Exterior2nd      2919 non-null object
MasVnrType       2919 non-null object
MasVnrArea       2919 non-null float64
ExterQual        2919 non-null object
ExterCond        2919 non-null object
Foundation       2919 non-null object
BsmtQual         2919 non-null object
BsmtCond         2919 non-null object
BsmtExposure     2919 non-null object
BsmtFinType1     2919 non-null object
BsmtFinSF1       2919 non-null float64
BsmtFinType2     2919 non-null object
BsmtFinSF2       2919 non-null float64
BsmtUnfSF        2919 non-null float64
TotalBsmtSF      2919 non-null float64
Heating          2919 non-null object
HeatingQC        2919 non-null object
CentralAir       2919 non-null object
Electrical       2919 non-null object
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
KitchenQual      2919 non-null object
TotRmsAbvGrd     2919 non-null int64
Functional       2919 non-null object
Fireplaces       2919 non-null int64
FireplaceQu      2919 non-null object
GarageType       2919 non-null object
GarageYrBlt      2919 non-null float64
GarageFinish     2919 non-null object
GarageCars       2919 non-null float64
GarageArea       2919 non-null float64
GarageQual       2919 non-null object
GarageCond       2919 non-null object
PavedDrive       2919 non-null object
WoodDeckSF       2919 non-null int64
OpenPorchSF      2919 non-null int64
EnclosedPorch    2919 non-null int64
3SsnPorch        2919 non-null int64
ScreenPorch      2919 non-null int64
PoolArea         2919 non-null int64
PoolQC           2919 non-null object
Fence            2919 non-null object
MiscFeature      2919 non-null object
MiscVal          2919 non-null int64
MoSold           2919 non-null int64
YrSold           2919 non-null int64
SaleType         2919 non-null object
SaleCondition    2919 non-null object
SalesPrice       1460 non-null float64
dtypes: float64(12), int64(26), object(42)
memory usage: 1.8+ MB

In [3]:
df_train = df[~np.isnan(df.SalesPrice)]
df_test = df[np.isnan(df.SalesPrice)]

In [4]:
df_train.shape, df_test.shape


Out[4]:
((1460, 80), (1459, 80))
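
The combined file stacks the 1460 training rows on top of the 1459 test rows; SalesPrice is null exactly on the test rows, which is all the split above relies on. A quick sanity check (a minimal sketch, not part of the original run):

    assert df_train.SalesPrice.notnull().all()
    assert df_test.SalesPrice.isnull().all()
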

In [5]:
plt.hist(df_train.SalesPrice, bins=50);

[figure: histogram of SalesPrice]

In [6]:
plt.hist(np.log(df_train.SalesPrice), bins=50);

[figure: histogram of log(SalesPrice)]

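The raw prices are heavily right-skewed, while the log-transformed target is roughly symmetric; fitting on log(SalesPrice) also aligns CV mean squared error with the RMSE-of-log-price metric this competition is scored on. A quick skewness check (a minimal sketch, not run in the original):

    print("raw skew:", df_train.SalesPrice.skew())
    print("log skew:", np.log(df_train.SalesPrice).skew())
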
In [7]:
y = np.log(df.SalesPrice)
ids = df.Id
X = df.drop(["Id", "SalesPrice"], axis=1)
X.head()


Out[7]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl ... 0 0 None None None 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl ... 0 0 None None None 0 5 2007 WD Normal
2 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl ... 0 0 None None None 0 9 2008 WD Normal
3 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl ... 0 0 None None None 0 2 2006 WD Abnorml
4 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl ... 0 0 None None None 0 12 2008 WD Normal

5 rows × 78 columns


In [8]:
X_dummy = pd.get_dummies(X, drop_first=True)
X_train = X_dummy[~np.isnan(y)]
X_test = X_dummy[np.isnan(y)]
y_train = y[~np.isnan(y)]

X_train.describe()


Out[8]:
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 ... 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 56.897260 70.176370 10516.828082 6.099315 5.575342 1971.267808 1984.865753 103.117123 443.639726 46.549315 ... 0.003425 0.003425 0.083562 0.002055 0.867808 0.002740 0.008219 0.013699 0.820548 0.085616
std 42.300571 22.433457 9981.264932 1.382997 1.112799 30.202904 20.645407 180.731373 456.098091 161.319273 ... 0.058440 0.058440 0.276824 0.045299 0.338815 0.052289 0.090317 0.116277 0.383862 0.279893
min 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 20.000000 60.000000 7553.500000 5.000000 5.000000 1954.000000 1967.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
50% 50.000000 70.000000 9478.500000 6.000000 5.000000 1973.000000 1994.000000 0.000000 383.500000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
75% 70.000000 80.000000 11601.500000 7.000000 6.000000 2000.000000 2004.000000 164.250000 712.250000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
max 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 1474.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 259 columns
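
Dummy-encoding the combined frame before splitting guarantees that train and test end up with the same 259 columns; encoding them separately can produce mismatched columns whenever a category occurs in only one of the two frames. If they ever had to be encoded separately, an explicit alignment step would be needed, as in this sketch:

    tr = pd.get_dummies(X[~np.isnan(y)], drop_first=True)
    te = pd.get_dummies(X[np.isnan(y)], drop_first=True)
    tr, te = tr.align(te, join="left", axis=1, fill_value=0)
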


In [9]:
%%time 

pipe = pipeline.Pipeline([
    # degree=1 makes the polynomial step a pass-through; raise it to add interaction terms
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Lasso())
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 1, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
best params {'fit__alpha': 0.0042813323987193957} best score 0.0216501437072
CPU times: user 3.61 s, sys: 226 ms, total: 3.83 s
Wall time: 3.83 s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.8s finished
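
The CV error of about 0.0217 corresponds to an RMSE of roughly 0.147 on the log scale. Since L1 regularization zeroes out coefficients, it is worth checking how many of the 259 dummy features the winning Lasso actually keeps; a minimal sketch using the refit pipeline:

    best_lasso = gs.best_estimator_.named_steps["fit"]
    kept = np.sum(best_lasso.coef_ != 0)
    print(kept, "of", best_lasso.coef_.size, "coefficients are nonzero")
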

In [10]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Ridge(random_state = 1))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 2, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
best params {'fit__alpha': 100.0} best score 0.0223362035465
CPU times: user 6.32 s, sys: 570 ms, total: 6.89 s
Wall time: 3.42 s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.4s finished
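
The winning alpha (100.0) sits at the top of the search grid, so extending the grid upward might squeeze out a little more. For leaderboard comparison, the square root of the CV error is the relevant number; a one-line sketch:

    print("CV RMSE (log scale):", np.sqrt(-gs.best_score_))
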

In [11]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.SGDRegressor(random_state=1, max_iter = 10000, tol=1e-6))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(0, 2, 5),
    "fit__loss": ["squared_loss", "huber"],
    # note: l1_ratio only takes effect with penalty="elasticnet"; SGDRegressor defaults to "l2"
    "fit__l1_ratio": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
best params {'fit__alpha': 1.0, 'fit__l1_ratio': 0.10000000000000001, 'fit__loss': 'huber'} best score 0.0256927996113
CPU times: user 23.9 s, sys: 1.14 s, total: 25 s
Wall time: 25 s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   24.8s finished
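
Two caveats here: the winning alpha (1.0) sits on the lower edge of its grid and is far above the Lasso's best alpha of about 0.004, and l1_ratio never took effect because the penalty was left at its default. A sketch of a revised grid for a follow-up search:

    param_grid = {
        "fit__alpha": 10 ** np.linspace(-4, 0, 5),   # shifted down toward the Lasso optimum
        "fit__penalty": ["elasticnet"],              # makes l1_ratio meaningful
        "fit__l1_ratio": np.linspace(0.1, 0.9, 5),
        "fit__loss": ["huber"],                      # the winner of the first round
    }
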

In [12]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", ensemble.GradientBoostingRegressor(random_state=1, learning_rate=0.1, ))
])

param_grid = {
    "fit__learning_rate": [0.1, 0.01],
    # note: alpha only applies to loss="huber"/"quantile"; it is ignored under the default loss
    "fit__alpha": np.linspace(0.001, 0.999, 5),
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   44.4s finished
best params {'fit__alpha': 0.001, 'fit__learning_rate': 0.1} best score 0.017065447917
CPU times: user 45 s, sys: 331 ms, total: 45.3 s
Wall time: 45.4 s
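
Because alpha is ignored under the default loss, all five alpha values tie and GridSearchCV simply reports the first; the real finding is that learning_rate=0.1 beats 0.01 at the default 100 trees. The tie can be confirmed by inspecting the full grid, as in this sketch:

    res = pd.DataFrame(gs.cv_results_)
    print(res[["params", "mean_test_score"]])
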

In [13]:
%%time 

import xgboost as xgb

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=100, 
                     objective='reg:linear', booster='gbtree', random_state=1))
])

param_grid = {
    "fit__reg_alpha": 10 ** np.linspace(-1, 1, 5),
#    "fit__max_depth": 2 * np.arange(1, 10),
#    "fit__reg_lambda": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   34.9s finished
best params {'fit__reg_alpha': 0.31622776601683794} best score 0.018332738302
CPU times: user 36.4 s, sys: 206 ms, total: 36.6 s
Wall time: 36.6 s

In [ ]:
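
A natural use for the empty cell above is producing a submission from the last search; a minimal sketch, not executed here (the competition expects the column name "SalePrice"):

    best_model = gs.best_estimator_   # GridSearchCV refits the winner on all of X_train by default
    preds = np.exp(best_model.predict(X_test))   # invert the log transform
    submission = pd.DataFrame({"Id": ids[np.isnan(y)], "SalePrice": preds})
    submission.to_csv("submission.csv", index=False)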