In [7]:
import pandas as pd

# Show all columns when displaying wide DataFrames.
pd.options.display.max_columns = 1000

# Data: https://raw.githubusercontent.com/abulbasar/data/master/kaggle-houseprice/data_combined_cleaned.csv
In [2]:
# https://github.com/abulbasar/machine-learning/blob/master/Scikit%20-%2020%20Kaggle%20House%20Data%20Preprocessing.ipynb
In [3]:
# Local copy of the combined Kaggle house-price data linked above.
df = pd.read_csv("/data/kaggle/data_combined_cleaned.csv")
In [8]:
df.head()
Out[8]:
In [10]:
df.info()
In [11]:
# Id is just a row identifier with no predictive value.
del df["Id"]
In [53]:
# Keep only rows with a known target (the combined file includes rows without SalesPrice).
df = df[~df.SalesPrice.isna()]
In [54]:
df.info()
In [55]:
df.head()
Out[55]:
In [56]:
# One-hot encode the categorical columns. Encoding before the train/test split
# keeps the dummy columns consistent across both sets.
df_dummy = pd.get_dummies(df)
In [57]:
target = "SalesPrice"
In [58]:
import numpy as np
import matplotlib.pyplot as plt
In [59]:
# Model the log of the price: the raw target is right-skewed (see the histograms below).
y = np.log(df[target])
In [60]:
# Feature matrix: everything except the target.
X = df_dummy.drop(columns=target)
In [61]:
# Distribution of log(SalesPrice): roughly symmetric.
y.plot.hist(bins=35)
Out[61]:
In [62]:
# Distribution of the raw SalesPrice: right-skewed, which motivates the log transform.
df[target].plot.hist(bins=35)
Out[62]:
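Because the model is trained on log(SalesPrice), every prediction and error below lives on the log scale. A minimal sketch of mapping predictions back to dollars; the y_pred_log values here are made up for illustration:

In [ ]:
# Hypothetical log-scale predictions, as any model fit on y = np.log(price) would produce.
y_pred_log = np.array([11.8, 12.0, 12.3])
np.exp(y_pred_log)  # invert the log transform to get prices in dollars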
In [63]:
from sklearn import ensemble, linear_model, metrics, model_selection, preprocessing
In [64]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=1)
In [98]:
# Fit the scaler on the training set only, then apply the same parameters to the
# test set to avoid leaking test-set statistics into training.
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
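The scale-then-model pattern can also be wrapped in a scikit-learn Pipeline, which applies the scaler inside fit/predict and makes it impossible to accidentally fit it on test data. A minimal sketch, equivalent to the manual steps here:

In [ ]:
from sklearn.pipeline import make_pipeline

lasso_pipe = make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.Lasso(alpha=0.05, random_state=1),
)
lasso_pipe.fit(X_train, y_train)  # the scaler is fit on X_train only
lasso_pipe.predict(X_test)        # X_test is scaled automatically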
In [99]:
# L1-regularized linear regression; alpha controls how aggressively
# coefficients are shrunk toward (and to) zero.
lr = linear_model.Lasso(alpha=0.05, random_state=1)
lr.fit(X_train_std, y_train)
Out[99]:
In [100]:
y_train_pred = lr.predict(X_train_std)
In [101]:
# Training RMSE, on the log scale.
rmse = metrics.mean_squared_error(y_train, y_train_pred) ** 0.5
rmse
Out[101]:
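One way to read a log-scale RMSE: an error of r in log units corresponds to a typical multiplicative error of about e^r. A quick illustration with a made-up value:

In [ ]:
r = 0.1            # hypothetical log-scale RMSE
np.exp(r) - 1      # ~0.105: predictions are typically off by about 10.5%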
In [102]:
y_test_pred = lr.predict(X_test_std)
In [103]:
# Test-set RMSE, also on the log scale.
rmse = metrics.mean_squared_error(y_test, y_test_pred) ** 0.5
rmse
Out[103]:
In [92]:
# R^2 of the Lasso model on train and test.
metrics.r2_score(y_train, y_train_pred), metrics.r2_score(y_test, y_test_pred)
Out[92]:
In [104]:
# Residuals of the first model on the training set.
y_train_error = y_train - y_train_pred
In [105]:
# Fit a second model on those residuals: one manual boosting step.
lr2 = linear_model.Lasso(alpha=0.05, random_state=1)
lr2.fit(X_train_std, y_train_error)
Out[105]:
In [109]:
# Training RMSE of the base model alone...
metrics.mean_squared_error(y_train, lr.predict(X_train_std)) ** .5
Out[109]:
In [110]:
# ...versus the base model plus the residual model's correction.
metrics.mean_squared_error(y_train, lr.predict(X_train_std) + lr2.predict(X_train_std)) ** .5
Out[110]:
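Repeating this fit-on-residuals step, with each new model correcting the errors of the running sum, is exactly the additive scheme that gradient boosting automates. A sketch of that loop, reusing the variables above; the stage count and the 0.5 shrinkage factor are arbitrary choices:

In [ ]:
pred = np.zeros(len(y_train))
stages = []
for _ in range(5):
    # Each stage models what the ensemble so far still gets wrong.
    stage = linear_model.Lasso(alpha=0.05, random_state=1)
    stage.fit(X_train_std, y_train - pred)
    pred += 0.5 * stage.predict(X_train_std)  # 0.5 acts as a learning rate
    stages.append(stage)
metrics.mean_squared_error(y_train, pred) ** 0.5

With a linear base learner the later stages add little, since the first model already captures most of the linear signal; this is why boosting implementations such as the ones below use trees as the base learner.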
In [122]:
# Gradient boosting: 10 depth-6 trees, each fit to the residuals of the
# ensemble built so far (squared-error loss).
est = ensemble.GradientBoostingRegressor(max_depth=6, n_estimators=10)
est.fit(X_train_std, y_train)
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))
In [123]:
import xgboost as xgb
In [181]:
est_xgb = xgb.XGBRegressor(
    booster="gblinear",            # linear booster: the tree options below are ignored
    max_depth=3,                   # tree-only parameter (takes effect with booster="gbtree")
    n_estimators=200,
    learning_rate=0.1,
    objective="reg:squarederror",
    colsample_bytree=0.5,          # tree-only parameter, ignored by gblinear
    alpha=0.5,                     # L1 regularization
    reg_lambda=0.3,                # L2 regularization
)
est_xgb.fit(X_train_std, y_train)
Out[181]:
In [182]:
y_train_pred = est_xgb.predict(X_train_std)
y_test_pred = est_xgb.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred) ** 0.5)
In [190]:
# The grid is a Cartesian product, so the max_depth values are also swept
# (pointlessly) for booster="gblinear", which ignores them.
param_grid = {
    "booster": ["gblinear", "gbtree"],
    "max_depth": np.arange(2, 10),
    #"learning_rate": np.linspace(0.1, 0.9, 10),
    #"colsample_bytree": np.linspace(0.3, 0.7, 10)
}
# GridSearchCV scores regressors with R^2 by default; pass
# scoring="neg_root_mean_squared_error" to select on the RMSE reported above.
gsearch = model_selection.GridSearchCV(estimator=est_xgb, cv=5, param_grid=param_grid, verbose=1, n_jobs=8)
gsearch.fit(X_train_std, y_train)
Out[190]:
In [191]:
gsearch.best_params_
Out[191]:
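Beyond best_params_, the full cross-validation table is available in cv_results_. A quick way to compare every candidate tried, assuming the fitted gsearch above:

In [ ]:
# Mean CV score (R^2 under the default scoring) for each parameter combination.
pd.DataFrame(gsearch.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values("rank_test_score").head(10)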
In [192]:
# GridSearchCV refits the best parameter combination on the full training set;
# gsearch.predict delegates to that refit estimator.
y_train_pred = gsearch.predict(X_train_std)
y_test_pred = gsearch.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))