In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, metrics, linear_model, pipeline, ensemble
import numpy as np

import seaborn as sns

import scipy

np.set_printoptions(suppress=True, precision=4)

%matplotlib inline

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = 10, 6

In [2]:
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2919 non-null object
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            2919 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2919 non-null object
Exterior2nd      2919 non-null object
MasVnrType       2919 non-null object
MasVnrArea       2919 non-null float64
ExterQual        2919 non-null object
ExterCond        2919 non-null object
Foundation       2919 non-null object
BsmtQual         2919 non-null object
BsmtCond         2919 non-null object
BsmtExposure     2919 non-null object
BsmtFinType1     2919 non-null object
BsmtFinSF1       2919 non-null float64
BsmtFinType2     2919 non-null object
BsmtFinSF2       2919 non-null float64
BsmtUnfSF        2919 non-null float64
TotalBsmtSF      2919 non-null float64
Heating          2919 non-null object
HeatingQC        2919 non-null object
CentralAir       2919 non-null object
Electrical       2919 non-null object
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
KitchenQual      2919 non-null object
TotRmsAbvGrd     2919 non-null int64
Functional       2919 non-null object
Fireplaces       2919 non-null int64
FireplaceQu      2919 non-null object
GarageType       2919 non-null object
GarageYrBlt      2919 non-null float64
GarageFinish     2919 non-null object
GarageCars       2919 non-null float64
GarageArea       2919 non-null float64
GarageQual       2919 non-null object
GarageCond       2919 non-null object
PavedDrive       2919 non-null object
WoodDeckSF       2919 non-null int64
OpenPorchSF      2919 non-null int64
EnclosedPorch    2919 non-null int64
3SsnPorch        2919 non-null int64
ScreenPorch      2919 non-null int64
PoolArea         2919 non-null int64
PoolQC           2919 non-null object
Fence            2919 non-null object
MiscFeature      2919 non-null object
MiscVal          2919 non-null int64
MoSold           2919 non-null int64
YrSold           2919 non-null int64
SaleType         2919 non-null object
SaleCondition    2919 non-null object
SalesPrice       1460 non-null float64
dtypes: float64(12), int64(26), object(42)
memory usage: 1.8+ MB

In [3]:
df_train = df[~np.isnan(df.SalesPrice)]
df_test = df[np.isnan(df.SalesPrice)]

In [4]:
df_train.shape, df_test.shape


Out[4]:
((1460, 80), (1459, 80))
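
The combined file stacks the 1460 training rows on top of the 1459 test rows; SalesPrice is null exactly on the test rows, which is all the split above relies on. A quick sanity check (a minimal sketch, not part of the original run):

    assert df_train.SalesPrice.notnull().all()
    assert df_test.SalesPrice.isnull().all()
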

In [5]:
plt.hist(df_train.SalesPrice, bins=50);

[figure: histogram of SalesPrice]

In [6]:
plt.hist(np.log(df_train.SalesPrice), bins=50);

[figure: histogram of log(SalesPrice)]

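The raw prices are heavily right-skewed, while the log-transformed target is roughly symmetric; fitting on log(SalesPrice) also aligns CV mean squared error with the RMSE-of-log-price metric this competition is scored on. A quick skewness check (a minimal sketch, not run in the original):

    print("raw skew:", df_train.SalesPrice.skew())
    print("log skew:", np.log(df_train.SalesPrice).skew())
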
In [7]:
y = np.log(df.SalesPrice)
ids = df.Id
X = df.drop(["Id", "SalesPrice"], axis=1)
X.head()


Out[7]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl ... 0 0 None None None 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl ... 0 0 None None None 0 5 2007 WD Normal
2 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl ... 0 0 None None None 0 9 2008 WD Normal
3 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl ... 0 0 None None None 0 2 2006 WD Abnorml
4 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl ... 0 0 None None None 0 12 2008 WD Normal

5 rows × 78 columns


In [8]:
X_dummy = pd.get_dummies(X, drop_first=True)
X_train = X_dummy[~np.isnan(y)]
X_test = X_dummy[np.isnan(y)]
y_train = y[~np.isnan(y)]

X_train.describe()


Out[8]:
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 ... 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 56.897260 70.176370 10516.828082 6.099315 5.575342 1971.267808 1984.865753 103.117123 443.639726 46.549315 ... 0.003425 0.003425 0.083562 0.002055 0.867808 0.002740 0.008219 0.013699 0.820548 0.085616
std 42.300571 22.433457 9981.264932 1.382997 1.112799 30.202904 20.645407 180.731373 456.098091 161.319273 ... 0.058440 0.058440 0.276824 0.045299 0.338815 0.052289 0.090317 0.116277 0.383862 0.279893
min 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 20.000000 60.000000 7553.500000 5.000000 5.000000 1954.000000 1967.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
50% 50.000000 70.000000 9478.500000 6.000000 5.000000 1973.000000 1994.000000 0.000000 383.500000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
75% 70.000000 80.000000 11601.500000 7.000000 6.000000 2000.000000 2004.000000 164.250000 712.250000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000
max 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 1474.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 259 columns
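
Dummy-encoding the combined frame before splitting guarantees that train and test end up with the same 259 columns; encoding them separately can produce mismatched columns whenever a category occurs in only one of the two frames. If they ever had to be encoded separately, an explicit alignment step would be needed, as in this sketch:

    tr = pd.get_dummies(X[~np.isnan(y)], drop_first=True)
    te = pd.get_dummies(X[np.isnan(y)], drop_first=True)
    tr, te = tr.align(te, join="left", axis=1, fill_value=0)
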


In [9]:
%%time 

pipe = pipeline.Pipeline([
    # degree=1 makes the polynomial step a pass-through; raise it to add interaction terms
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Lasso())
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 1, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
best params {'fit__alpha': 0.0042813323987193957} best score 0.0216501437072
CPU times: user 3.61 s, sys: 226 ms, total: 3.83 s
Wall time: 3.83 s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.8s finished
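
The CV error of about 0.0217 corresponds to an RMSE of roughly 0.147 on the log scale. Since L1 regularization zeroes out coefficients, it is worth checking how many of the 259 dummy features the winning Lasso actually keeps; a minimal sketch using the refit pipeline:

    best_lasso = gs.best_estimator_.named_steps["fit"]
    kept = np.sum(best_lasso.coef_ != 0)
    print(kept, "of", best_lasso.coef_.size, "coefficients are nonzero")
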

In [10]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Ridge(random_state = 1))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 2, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
best params {'fit__alpha': 100.0} best score 0.0223362035465
CPU times: user 6.32 s, sys: 570 ms, total: 6.89 s
Wall time: 3.42 s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.4s finished
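
The winning alpha (100.0) sits at the top of the search grid, so extending the grid upward might squeeze out a little more. For leaderboard comparison, the square root of the CV error is the relevant number; a one-line sketch:

    print("CV RMSE (log scale):", np.sqrt(-gs.best_score_))
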

In [11]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.SGDRegressor(random_state=1, max_iter = 10000, tol=1e-6))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(0, 2, 5),
    "fit__loss": ["squared_loss", "huber"],
    # note: l1_ratio only takes effect with penalty="elasticnet"; SGDRegressor defaults to "l2"
    "fit__l1_ratio": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
best params {'fit__alpha': 1.0, 'fit__l1_ratio': 0.10000000000000001, 'fit__loss': 'huber'} best score 0.0256927996113
CPU times: user 23.9 s, sys: 1.14 s, total: 25 s
Wall time: 25 s
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   24.8s finished
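
Two caveats here: the winning alpha (1.0) sits on the lower edge of its grid and is far above the Lasso's best alpha of about 0.004, and l1_ratio never took effect because the penalty was left at its default. A sketch of a revised grid for a follow-up search:

    param_grid = {
        "fit__alpha": 10 ** np.linspace(-4, 0, 5),   # shifted down toward the Lasso optimum
        "fit__penalty": ["elasticnet"],              # makes l1_ratio meaningful
        "fit__l1_ratio": np.linspace(0.1, 0.9, 5),
        "fit__loss": ["huber"],                      # the winner of the first round
    }
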

In [12]:
%%time 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", ensemble.GradientBoostingRegressor(random_state=1, learning_rate=0.1, ))
])

param_grid = {
    "fit__learning_rate": [0.1, 0.01],
    # note: alpha only applies to loss="huber"/"quantile"; it is ignored under the default loss
    "fit__alpha": np.linspace(0.001, 0.999, 5),
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   44.4s finished
best params {'fit__alpha': 0.001, 'fit__learning_rate': 0.1} best score 0.017065447917
CPU times: user 45 s, sys: 331 ms, total: 45.3 s
Wall time: 45.4 s
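
Because alpha is ignored under the default loss, all five alpha values tie and GridSearchCV simply reports the first; the real finding is that learning_rate=0.1 beats 0.01 at the default 100 trees. The tie can be confirmed by inspecting the full grid, as in this sketch:

    res = pd.DataFrame(gs.cv_results_)
    print(res[["params", "mean_test_score"]])
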

In [13]:
%%time 

import xgboost as xgb

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=100, 
                     objective='reg:linear', booster='gbtree', random_state=1))
])

param_grid = {
    "fit__reg_alpha": 10 ** np.linspace(-1, 1, 5),
#    "fit__max_depth": 2 * np.arange(1, 10),
#    "fit__reg_lambda": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                scoring="neg_mean_squared_error", param_grid=param_grid)

gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best score", -gs.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   34.9s finished
best params {'fit__reg_alpha': 0.31622776601683794} best score 0.018332738302
CPU times: user 36.4 s, sys: 206 ms, total: 36.6 s
Wall time: 36.6 s

In [ ]:
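
A natural use for the empty cell above is producing a submission from the last search; a minimal sketch, not executed here (the competition expects the column name "SalePrice"):

    best_model = gs.best_estimator_   # GridSearchCV refits the winner on all of X_train by default
    preds = np.exp(best_model.predict(X_test))   # invert the log transform
    submission = pd.DataFrame({"Id": ids[np.isnan(y)], "SalePrice": preds})
    submission.to_csv("submission.csv", index=False)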