In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, metrics, linear_model, pipeline, ensemble
import numpy as np
import seaborn as sns
import scipy
np.set_printoptions(suppress=True, precision=4)
%matplotlib inline
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = 10, 6
In [2]:
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")
df.info()
In [3]:
# Rows with a known SalesPrice form the training set; the rest are the Kaggle test set.
df_train = df[~np.isnan(df.SalesPrice)]
df_test = df[np.isnan(df.SalesPrice)]
In [4]:
df_train.shape, df_test.shape
Out[4]:
In [5]:
# The raw prices are strongly right-skewed.
plt.hist(df_train.SalesPrice, bins=50);
In [6]:
# A log transform makes the target roughly symmetric, which suits squared-error models.
plt.hist(np.log(df_train.SalesPrice), bins=50);
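The improvement can be quantified with a quick skewness check (a sketch using scipy.stats; values near 0 indicate symmetry):
In [ ]:
from scipy import stats
# Skewness of the target before and after the log transform.
stats.skew(df_train.SalesPrice), stats.skew(np.log(df_train.SalesPrice))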
In [7]:
# Model log(SalesPrice); test rows stay NaN and are split off after encoding.
y = np.log(df.SalesPrice)
ids = df.Id
X = df.drop(columns=["Id", "SalesPrice"])
X.head()
Out[7]:
In [8]:
# One-hot encode on the combined frame so train and test share identical columns,
# then split on whether the target is known.
X_dummy = pd.get_dummies(X, drop_first=True)
X_train = X_dummy[~np.isnan(y)]
X_test = X_dummy[np.isnan(y)]
y_train = y[~np.isnan(y)]
X_train.describe()
Out[8]:
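A quick sanity check that the encoded matrix is fully numeric and the split shapes line up (a sketch):
In [ ]:
# No missing values should remain after cleaning and dummy encoding.
X_train.isna().sum().sum(), X_train.shape, X_test.shape, y_train.shape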
In [9]:
%%time
pipe = pipeline.Pipeline([
    # degree=1 is an identity transform, kept as a placeholder for higher-degree experiments
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Lasso())
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 1, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                                  scoring="neg_mean_squared_error", param_grid=param_grid)
gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best CV MSE", -gs.best_score_)
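To see how regularization strength affects the cross-validated error, the grid-search results can be plotted directly. A quick sketch, assuming `gs` and `param_grid` from the Lasso cell above are still in scope:
In [ ]:
# Sketch: Lasso validation curve. For a single-parameter grid, the rows of
# cv_results_ follow the order of the alpha values in param_grid.
alphas = param_grid["fit__alpha"]
cv_mse = -gs.cv_results_["mean_test_score"]
plt.semilogx(alphas, cv_mse, marker="o")
plt.xlabel("alpha")
plt.ylabel("CV mean squared error");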
In [10]:
%%time
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("fit", linear_model.Ridge(random_state=1))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(-3, 2, 20)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                                  scoring="neg_mean_squared_error", param_grid=param_grid)
gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best CV MSE", -gs.best_score_)
In [11]:
%%time
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    # penalty="elasticnet" is required for the l1_ratio grid below to have any effect
    ("fit", linear_model.SGDRegressor(random_state=1, max_iter=10000, tol=1e-6,
                                      penalty="elasticnet"))
])
param_grid = {
    "fit__alpha": 10 ** np.linspace(0, 2, 5),
    # "squared_loss" was renamed "squared_error" in scikit-learn 1.0
    "fit__loss": ["squared_error", "huber"],
    "fit__l1_ratio": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                                  scoring="neg_mean_squared_error", param_grid=param_grid)
gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best CV MSE", -gs.best_score_)
In [12]:
%%time
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    # alpha is only used by the huber/quantile losses, so set loss="huber" to make
    # the alpha grid below meaningful; learning_rate comes from the grid
    ("fit", ensemble.GradientBoostingRegressor(random_state=1, loss="huber"))
])
param_grid = {
    "fit__learning_rate": [0.1, 0.01],
    "fit__alpha": np.linspace(0.001, 0.999, 5),
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                                  scoring="neg_mean_squared_error", param_grid=param_grid)
gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best CV MSE", -gs.best_score_)
In [13]:
%%time
import xgboost as xgb
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    # objective "reg:squarederror" replaced the deprecated "reg:linear"
    ("fit", xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=100,
                             objective="reg:squarederror", booster="gbtree",
                             random_state=1))
])
param_grid = {
    "fit__reg_alpha": 10 ** np.linspace(-1, 1, 5),
    # "fit__max_depth": 2 * np.arange(1, 10),
    # "fit__reg_lambda": np.linspace(0.1, 0.9, 5)
}
gs = model_selection.GridSearchCV(cv=5, estimator=pipe, verbose=True,
                                  scoring="neg_mean_squared_error", param_grid=param_grid)
gs.fit(X_train, y_train)
print("best params", gs.best_params_, "best CV MSE", -gs.best_score_)
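Finally, the best estimator can score the held-out test rows and produce a submission file. A minimal sketch, assuming the XGBoost search above is the chosen model; Kaggle's House Prices submission format expects `Id` and `SalePrice` columns:
In [ ]:
# Predictions were made on log(SalesPrice), so undo the log transform.
pred = np.exp(gs.best_estimator_.predict(X_test))
submission = pd.DataFrame({"Id": ids[np.isnan(y)].values, "SalePrice": pred})
submission.to_csv("submission.csv", index=False)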