In [7]:
import pandas as pd

# Show all columns when displaying wide DataFrames.
pd.options.display.max_columns = 1000

# Data: https://raw.githubusercontent.com/abulbasar/data/master/kaggle-houseprice/data_combined_cleaned.csv
In [2]:
# https://github.com/abulbasar/machine-learning/blob/master/Scikit%20-%2020%20Kaggle%20House%20Data%20Preprocessing.ipynb
In [3]:
# Local copy of the combined Kaggle house-price data linked above.
df = pd.read_csv("/data/kaggle/data_combined_cleaned.csv")
In [8]:
df.head()
Out[8]:
In [10]:
df.info()
In [11]:
# Id is just a row identifier with no predictive value.
del df["Id"]
In [53]:
# Keep only rows with a known target (the combined file includes rows without SalesPrice).
df = df[~df.SalesPrice.isna()]
In [54]:
df.info()
In [55]:
df.head()
Out[55]:
In [56]:
# One-hot encode the categorical columns. Encoding before the train/test split
# keeps the dummy columns consistent across both sets.
df_dummy = pd.get_dummies(df)
In [57]:
target = "SalesPrice"
In [58]:
import numpy as np
import matplotlib.pyplot as plt
In [59]:
# Model the log of the price: the raw target is right-skewed (see the histograms below).
y = np.log(df[target])
In [60]:
# Feature matrix: everything except the target.
X = df_dummy.drop(columns=target)
In [61]:
# Distribution of log(SalesPrice): roughly symmetric.
y.plot.hist(bins=35)
Out[61]:
In [62]:
# Distribution of the raw SalesPrice: right-skewed, which motivates the log transform.
df[target].plot.hist(bins=35)
Out[62]:
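Because the model is trained on log(SalesPrice), every prediction and error below lives on the log scale. A minimal sketch of mapping predictions back to dollars; the y_pred_log values here are made up for illustration:

In [ ]:
# Hypothetical log-scale predictions, as any model fit on y = np.log(price) would produce.
y_pred_log = np.array([11.8, 12.0, 12.3])
np.exp(y_pred_log)  # invert the log transform to get prices in dollars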
In [63]:
from sklearn import ensemble, linear_model, metrics, model_selection, preprocessing
In [64]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=1)
In [98]:
# Fit the scaler on the training set only, then apply the same parameters to the
# test set to avoid leaking test-set statistics into training.
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
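The scale-then-model pattern can also be wrapped in a scikit-learn Pipeline, which applies the scaler inside fit/predict and makes it impossible to accidentally fit it on test data. A minimal sketch, equivalent to the manual steps here:

In [ ]:
from sklearn.pipeline import make_pipeline

lasso_pipe = make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.Lasso(alpha=0.05, random_state=1),
)
lasso_pipe.fit(X_train, y_train)  # the scaler is fit on X_train only
lasso_pipe.predict(X_test)        # X_test is scaled automatically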
In [99]:
# L1-regularized linear regression; alpha controls how aggressively
# coefficients are shrunk toward (and to) zero.
lr = linear_model.Lasso(alpha=0.05, random_state=1)
lr.fit(X_train_std, y_train)
Out[99]:
In [100]:
y_train_pred = lr.predict(X_train_std)
In [101]:
# Training RMSE, on the log scale.
rmse = metrics.mean_squared_error(y_train, y_train_pred) ** 0.5
rmse
Out[101]:
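One way to read a log-scale RMSE: an error of r in log units corresponds to a typical multiplicative error of about e^r. A quick illustration with a made-up value:

In [ ]:
r = 0.1            # hypothetical log-scale RMSE
np.exp(r) - 1      # ~0.105: predictions are typically off by about 10.5%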
In [102]:
y_test_pred = lr.predict(X_test_std)
In [103]:
# Test-set RMSE, also on the log scale.
rmse = metrics.mean_squared_error(y_test, y_test_pred) ** 0.5
rmse
Out[103]:
In [92]:
# R^2 of the Lasso model on train and test.
metrics.r2_score(y_train, y_train_pred), metrics.r2_score(y_test, y_test_pred)
Out[92]:
In [104]:
# Residuals of the first model on the training set.
y_train_error = y_train - y_train_pred
In [105]:
# Fit a second model on those residuals: one manual boosting step.
lr2 = linear_model.Lasso(alpha=0.05, random_state=1)
lr2.fit(X_train_std, y_train_error)
Out[105]:
In [109]:
# Training RMSE of the base model alone...
metrics.mean_squared_error(y_train, lr.predict(X_train_std)) ** .5
Out[109]:
In [110]:
# ...versus the base model plus the residual model's correction.
metrics.mean_squared_error(y_train, lr.predict(X_train_std) + lr2.predict(X_train_std)) ** .5
Out[110]:
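Repeating this fit-on-residuals step, with each new model correcting the errors of the running sum, is exactly the additive scheme that gradient boosting automates. A sketch of that loop, reusing the variables above; the stage count and the 0.5 shrinkage factor are arbitrary choices:

In [ ]:
pred = np.zeros(len(y_train))
stages = []
for _ in range(5):
    # Each stage models what the ensemble so far still gets wrong.
    stage = linear_model.Lasso(alpha=0.05, random_state=1)
    stage.fit(X_train_std, y_train - pred)
    pred += 0.5 * stage.predict(X_train_std)  # 0.5 acts as a learning rate
    stages.append(stage)
metrics.mean_squared_error(y_train, pred) ** 0.5

With a linear base learner the later stages add little, since the first model already captures most of the linear signal; this is why boosting implementations such as the ones below use trees as the base learner.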
In [122]:
# Gradient boosting: 10 depth-6 trees, each fit to the residuals of the
# ensemble built so far (squared-error loss).
est = ensemble.GradientBoostingRegressor(max_depth=6, n_estimators=10)
est.fit(X_train_std, y_train)
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))
In [123]:
import xgboost as xgb
In [181]:
est_xgb = xgb.XGBRegressor(
    booster="gblinear",            # linear booster: the tree options below are ignored
    max_depth=3,                   # tree-only parameter (takes effect with booster="gbtree")
    n_estimators=200,
    learning_rate=0.1,
    objective="reg:squarederror",
    colsample_bytree=0.5,          # tree-only parameter, ignored by gblinear
    alpha=0.5,                     # L1 regularization
    reg_lambda=0.3,                # L2 regularization
)
est_xgb.fit(X_train_std, y_train)
Out[181]:
In [182]:
y_train_pred = est_xgb.predict(X_train_std)
y_test_pred = est_xgb.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred) ** 0.5)
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred) ** 0.5)
In [190]:
# The grid is a Cartesian product, so the max_depth values are also swept
# (pointlessly) for booster="gblinear", which ignores them.
param_grid = {
    "booster": ["gblinear", "gbtree"],
    "max_depth": np.arange(2, 10),
    #"learning_rate": np.linspace(0.1, 0.9, 10),
    #"colsample_bytree": np.linspace(0.3, 0.7, 10)
}
# GridSearchCV scores regressors with R^2 by default; pass
# scoring="neg_root_mean_squared_error" to select on the RMSE reported above.
gsearch = model_selection.GridSearchCV(estimator=est_xgb, cv=5, param_grid=param_grid, verbose=1, n_jobs=8)
gsearch.fit(X_train_std, y_train)
Out[190]:
In [191]:
gsearch.best_params_
Out[191]:
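Beyond best_params_, the full cross-validation table is available in cv_results_. A quick way to compare every candidate tried, assuming the fitted gsearch above:

In [ ]:
# Mean CV score (R^2 under the default scoring) for each parameter combination.
pd.DataFrame(gsearch.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values("rank_test_score").head(10)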
In [192]:
# GridSearchCV refits the best parameter combination on the full training set;
# gsearch.predict delegates to that refit estimator.
y_train_pred = gsearch.predict(X_train_std)
y_test_pred = gsearch.predict(X_test_std)
print("train rmse: ", metrics.mean_squared_error(y_train, y_train_pred) ** 0.5)
print("train r2: ", metrics.r2_score(y_train, y_train_pred))
print("test rmse: ", metrics.mean_squared_error(y_test, y_test_pred) ** 0.5)
print("test r2: ", metrics.r2_score(y_test, y_test_pred))