In [7]:
import pandas as pd

pd.options.display.max_columns = 1000


# Data: https://raw.githubusercontent.com/abulbasar/data/master/kaggle-houseprice/data_combined_cleaned.csv

In [2]:
# https://github.com/abulbasar/machine-learning/blob/master/Scikit%20-%2020%20Kaggle%20House%20Data%20Preprocessing.ipynb

In [3]:
df = pd.read_csv("/data/kaggle/data_combined_cleaned.csv")

In [8]:
df.head()


Out[8]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalesPrice
0 1 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706.0 Unf 0.0 150.0 856.0 GasA Ex Y SBrkr 856 854 0 1710 1.0 0.0 2 1 3 1 Gd 8 Typ 0 None Attchd 2003.0 RFn 2.0 548.0 TA TA Y 0 61 0 0 0 0 None None None 0 2 2008 WD Normal 208500.0
1 2 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978.0 Unf 0.0 284.0 1262.0 GasA Ex Y SBrkr 1262 0 0 1262 0.0 1.0 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2.0 460.0 TA TA Y 298 0 0 0 0 0 None None None 0 5 2007 WD Normal 181500.0
2 3 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486.0 Unf 0.0 434.0 920.0 GasA Ex Y SBrkr 920 866 0 1786 1.0 0.0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2.0 608.0 TA TA Y 0 42 0 0 0 0 None None None 0 9 2008 WD Normal 223500.0
3 4 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216.0 Unf 0.0 540.0 756.0 GasA Gd Y SBrkr 961 756 0 1717 1.0 0.0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3.0 642.0 TA TA Y 0 35 272 0 0 0 None None None 0 2 2006 WD Abnorml 140000.0
4 5 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655.0 Unf 0.0 490.0 1145.0 GasA Ex Y SBrkr 1145 1053 0 2198 1.0 0.0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3.0 836.0 TA TA Y 192 84 0 0 0 0 None None None 0 12 2008 WD Normal 250000.0

In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2919 non-null   object 
 3   LotFrontage    2919 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          2919 non-null   object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18  YearBuilt      2919 non-null   int64  
 19  YearRemodAdd   2919 non-null   int64  
 20  RoofStyle      2919 non-null   object 
 21  RoofMatl       2919 non-null   object 
 22  Exterior1st    2919 non-null   object 
 23  Exterior2nd    2919 non-null   object 
 24  MasVnrType     2919 non-null   object 
 25  MasVnrArea     2919 non-null   float64
 26  ExterQual      2919 non-null   object 
 27  ExterCond      2919 non-null   object 
 28  Foundation     2919 non-null   object 
 29  BsmtQual       2919 non-null   object 
 30  BsmtCond       2919 non-null   object 
 31  BsmtExposure   2919 non-null   object 
 32  BsmtFinType1   2919 non-null   object 
 33  BsmtFinSF1     2919 non-null   float64
 34  BsmtFinType2   2919 non-null   object 
 35  BsmtFinSF2     2919 non-null   float64
 36  BsmtUnfSF      2919 non-null   float64
 37  TotalBsmtSF    2919 non-null   float64
 38  Heating        2919 non-null   object 
 39  HeatingQC      2919 non-null   object 
 40  CentralAir     2919 non-null   object 
 41  Electrical     2919 non-null   object 
 42  1stFlrSF       2919 non-null   int64  
 43  2ndFlrSF       2919 non-null   int64  
 44  LowQualFinSF   2919 non-null   int64  
 45  GrLivArea      2919 non-null   int64  
 46  BsmtFullBath   2919 non-null   float64
 47  BsmtHalfBath   2919 non-null   float64
 48  FullBath       2919 non-null   int64  
 49  HalfBath       2919 non-null   int64  
 50  BedroomAbvGr   2919 non-null   int64  
 51  KitchenAbvGr   2919 non-null   int64  
 52  KitchenQual    2919 non-null   object 
 53  TotRmsAbvGrd   2919 non-null   int64  
 54  Functional     2919 non-null   object 
 55  Fireplaces     2919 non-null   int64  
 56  FireplaceQu    2919 non-null   object 
 57  GarageType     2919 non-null   object 
 58  GarageYrBlt    2919 non-null   float64
 59  GarageFinish   2919 non-null   object 
 60  GarageCars     2919 non-null   float64
 61  GarageArea     2919 non-null   float64
 62  GarageQual     2919 non-null   object 
 63  GarageCond     2919 non-null   object 
 64  PavedDrive     2919 non-null   object 
 65  WoodDeckSF     2919 non-null   int64  
 66  OpenPorchSF    2919 non-null   int64  
 67  EnclosedPorch  2919 non-null   int64  
 68  3SsnPorch      2919 non-null   int64  
 69  ScreenPorch    2919 non-null   int64  
 70  PoolArea       2919 non-null   int64  
 71  PoolQC         2919 non-null   object 
 72  Fence          2919 non-null   object 
 73  MiscFeature    2919 non-null   object 
 74  MiscVal        2919 non-null   int64  
 75  MoSold         2919 non-null   int64  
 76  YrSold         2919 non-null   int64  
 77  SaleType       2919 non-null   object 
 78  SaleCondition  2919 non-null   object 
 79  SalesPrice     1460 non-null   float64
dtypes: float64(12), int64(26), object(42)
memory usage: 1.8+ MB

In [11]:
del df["Id"]

In [53]:
df = df[~df.SalesPrice.isna()]

In [54]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemodAdd   1460 non-null   int64  
 19  RoofStyle      1460 non-null   object 
 20  RoofMatl       1460 non-null   object 
 21  Exterior1st    1460 non-null   object 
 22  Exterior2nd    1460 non-null   object 
 23  MasVnrType     1460 non-null   object 
 24  MasVnrArea     1460 non-null   float64
 25  ExterQual      1460 non-null   object 
 26  ExterCond      1460 non-null   object 
 27  Foundation     1460 non-null   object 
 28  BsmtQual       1460 non-null   object 
 29  BsmtCond       1460 non-null   object 
 30  BsmtExposure   1460 non-null   object 
 31  BsmtFinType1   1460 non-null   object 
 32  BsmtFinSF1     1460 non-null   float64
 33  BsmtFinType2   1460 non-null   object 
 34  BsmtFinSF2     1460 non-null   float64
 35  BsmtUnfSF      1460 non-null   float64
 36  TotalBsmtSF    1460 non-null   float64
 37  Heating        1460 non-null   object 
 38  HeatingQC      1460 non-null   object 
 39  CentralAir     1460 non-null   object 
 40  Electrical     1460 non-null   object 
 41  1stFlrSF       1460 non-null   int64  
 42  2ndFlrSF       1460 non-null   int64  
 43  LowQualFinSF   1460 non-null   int64  
 44  GrLivArea      1460 non-null   int64  
 45  BsmtFullBath   1460 non-null   float64
 46  BsmtHalfBath   1460 non-null   float64
 47  FullBath       1460 non-null   int64  
 48  HalfBath       1460 non-null   int64  
 49  BedroomAbvGr   1460 non-null   int64  
 50  KitchenAbvGr   1460 non-null   int64  
 51  KitchenQual    1460 non-null   object 
 52  TotRmsAbvGrd   1460 non-null   int64  
 53  Functional     1460 non-null   object 
 54  Fireplaces     1460 non-null   int64  
 55  FireplaceQu    1460 non-null   object 
 56  GarageType     1460 non-null   object 
 57  GarageYrBlt    1460 non-null   float64
 58  GarageFinish   1460 non-null   object 
 59  GarageCars     1460 non-null   float64
 60  GarageArea     1460 non-null   float64
 61  GarageQual     1460 non-null   object 
 62  GarageCond     1460 non-null   object 
 63  PavedDrive     1460 non-null   object 
 64  WoodDeckSF     1460 non-null   int64  
 65  OpenPorchSF    1460 non-null   int64  
 66  EnclosedPorch  1460 non-null   int64  
 67  3SsnPorch      1460 non-null   int64  
 68  ScreenPorch    1460 non-null   int64  
 69  PoolArea       1460 non-null   int64  
 70  PoolQC         1460 non-null   object 
 71  Fence          1460 non-null   object 
 72  MiscFeature    1460 non-null   object 
 73  MiscVal        1460 non-null   int64  
 74  MoSold         1460 non-null   int64  
 75  YrSold         1460 non-null   int64  
 76  SaleType       1460 non-null   object 
 77  SaleCondition  1460 non-null   object 
 78  SalesPrice     1460 non-null   float64
dtypes: float64(12), int64(25), object(42)
memory usage: 912.5+ KB

In [55]:
df.head()


Out[55]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalesPrice
0 60 RL 65.0 8450 Pave None Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706.0 Unf 0.0 150.0 856.0 GasA Ex Y SBrkr 856 854 0 1710 1.0 0.0 2 1 3 1 Gd 8 Typ 0 None Attchd 2003.0 RFn 2.0 548.0 TA TA Y 0 61 0 0 0 0 None None None 0 2 2008 WD Normal 208500.0
1 20 RL 80.0 9600 Pave None Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978.0 Unf 0.0 284.0 1262.0 GasA Ex Y SBrkr 1262 0 0 1262 0.0 1.0 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2.0 460.0 TA TA Y 298 0 0 0 0 0 None None None 0 5 2007 WD Normal 181500.0
2 60 RL 68.0 11250 Pave None IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486.0 Unf 0.0 434.0 920.0 GasA Ex Y SBrkr 920 866 0 1786 1.0 0.0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2.0 608.0 TA TA Y 0 42 0 0 0 0 None None None 0 9 2008 WD Normal 223500.0
3 70 RL 60.0 9550 Pave None IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216.0 Unf 0.0 540.0 756.0 GasA Gd Y SBrkr 961 756 0 1717 1.0 0.0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3.0 642.0 TA TA Y 0 35 272 0 0 0 None None None 0 2 2006 WD Abnorml 140000.0
4 60 RL 84.0 14260 Pave None IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655.0 Unf 0.0 490.0 1145.0 GasA Ex Y SBrkr 1145 1053 0 2198 1.0 0.0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3.0 836.0 TA TA Y 192 84 0 0 0 0 None None None 0 12 2008 WD Normal 250000.0

In [56]:
df_dummy = pd.get_dummies(df)

In [57]:
target = "SalesPrice"

In [58]:
import numpy as np
import matplotlib.pyplot as plt

In [59]:
y = np.log(df[target])

In [60]:
X = df_dummy.drop(columns=target)

In [61]:
y.plot.hist(bins = 35)


Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x11952f990>

In [62]:
df[target].plot.hist(bins = 35)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a25237a10>

In [63]:
from sklearn import *

In [64]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3, random_state = 1)

In [98]:
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [99]:
lr = linear_model.Lasso(alpha=0.05, random_state=1)
lr.fit(X_train_std, y_train)


Out[99]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1,
      selection='cyclic', tol=0.0001, warm_start=False)

In [100]:
y_train_pred = lr.predict(X_train_std)

In [101]:
rmse = metrics.mean_squared_error(y_train, y_train_pred) ** 0.5
rmse


Out[101]:
0.17846084266441412

In [102]:
y_test_pred = lr.predict(X_test_std)

In [103]:
rmse = metrics.mean_squared_error(y_test, y_test_pred) ** 0.5
rmse


Out[103]:
0.198150257844202

In [92]:
metrics.r2_score(y_train, y_train_pred), metrics.r2_score(y_test, y_test_pred)


Out[92]:
(0.7826767930566376, 0.7922449325754148)

In [104]:
y_train_error = y_train - y_train_pred

In [105]:
lr2 = linear_model.Lasso(alpha=0.05, random_state=1)
lr2.fit(X_train_std, y_train_error)


Out[105]:
Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1,
      selection='cyclic', tol=0.0001, warm_start=False)

In [109]:
metrics.mean_squared_error(y_train, lr.predict(X_train_std)) ** .5


Out[109]:
0.17846084266441412

In [110]:
metrics.mean_squared_error(y_train, lr.predict(X_train_std) + lr2.predict(X_train_std)) ** .5


Out[110]:
0.17845893538984411

In [122]:
est = ensemble.GradientBoostingRegressor(max_depth=6, n_estimators=10, )
est.fit(X_train_std, y_train)
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)


train rmse:  0.16015444122722466
train r2:  0.9082817626656599
test rmse:  0.24020018690351005
test r2:  0.833494150500505

In [123]:
import xgboost as xgb

In [181]:
est_xgb = xgb.XGBRegressor(
                            booster= "gblinear",
                            max_depth=3, 
                           n_estimators=200,
                           learning_rate=0.1,
                           objective="reg:squarederror", 
                           colsample_bytree = 0.5,
                            alpha = 0.5,
                            reg_lambda = 0.3
                          )
est_xgb.fit(X_train_std, y_train)


Out[181]:
XGBRegressor(alpha=0.5, base_score=0.5, booster='gblinear', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=0.3, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [182]:
y_train_pred = est_xgb.predict(X_train_std)
y_test_pred = est_xgb.predict(X_test_std)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)


train rmse:  0.09590374316262254
train r2:  0.9681109362863142
test rmse:  0.1394151314931741
test r2:  0.9471827914333071

In [190]:
param_grid = {
    "booster": ["gblinear", "gbtree"],
    "max_depth": np.arange(2, 10),
    #"learning_rate": np.linspace(0.1, 0.9, 10),
    #"colsample_bytree": np.linspace(0.3, 0.7, 10)
}

gsearch = model_selection.GridSearchCV(estimator= est_xgb, cv = 5, param_grid = param_grid,  verbose=1, n_jobs = 8)
gsearch.fit(X_train_std, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:   30.5s finished
Out[190]:
GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(alpha=0.5, base_score=0.5,
                                    booster='gblinear', colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=0.5,
                                    gamma=0, importance_type='gain',
                                    learning_rate=0.1, max_delta_step=0,
                                    max_depth=3, min_child_weight=1,
                                    missing=None, n_estimators=200, n_jobs=1,
                                    nthread=None, objective='reg:squarederror',
                                    random_state=0, reg_alpha=0, reg_lambda=0.3,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=8,
             param_grid={'booster': ['gblinear', 'gbtree'],
                         'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [191]:
gsearch.best_params_


Out[191]:
{'booster': 'gbtree', 'max_depth': 3}

In [192]:
y_train_pred = gsearch.predict(X_train_std)
y_test_pred = gsearch.predict(X_test_std)
print("train rmse: ",  metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ",  metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ",  metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ",  metrics.r2_score(y_test, y_test_pred) ** 0.5)


train rmse:  0.059831818542215316
train r2:  0.9877105772531495
test rmse:  0.1370707566541226
test r2:  0.9489915645502398

In [ ]: