Sale price prediction using an artificial neural network in Keras
Supervised Learning. Regression
Source: Ames Housing dataset (Kaggle website).
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper
import keras
helper.info_gpu()
sns.set_palette("GnBu_d")
#helper.reproducible(seed=0) # Setup reproducible results from run to run using Keras
%matplotlib inline
%load_ext autoreload
%autoreload 2
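`helper` is a local module with convenience functions used throughout the notebook; its internals are not shown here. As a rough idea, `helper.info_gpu()` presumably reports whether a GPU is visible to the backend. A minimal sketch, assuming a TensorFlow 1.x backend (hypothetical, not the actual helper code):

import tensorflow as tf
print('GPU available: {}'.format(tf.test.is_gpu_available()))  # TF 1.x-style check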
In [2]:
data_path = 'data/house_prices_data.csv'
target = ['SalePrice']
df_original = pd.read_csv(data_path)
In [3]:
helper.info_data(df_original, target)
In [4]:
df_original.head(3)
Out[4]:
In [5]:
high_missing = helper.missing(df_original, limit=0.4, plot=True)
high_missing
Out[5]:
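`helper.missing(df, limit=0.4)` appears to return the columns whose fraction of missing values exceeds the given limit. A rough pandas equivalent (a sketch; the actual helper may differ, e.g. in how it plots):

missing_ratio = df_original.isnull().mean().sort_values(ascending=False)
high_missing_sketch = missing_ratio[missing_ratio > 0.4].index.tolist()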
In [6]:
df = df_original.copy() # modified dataset
# remove non-significant and high-missing features
droplist = ['Id'] + high_missing
assert len(set(droplist).intersection(set(target))) == 0, 'Targets cannot be dropped'
df.drop(droplist, axis='columns', inplace=True)
In [7]:
numerical = list(df.select_dtypes(include=[np.number]))
df = helper.classify_data(df, target, numerical=numerical)
helper.get_types(df)
Out[7]:
In [8]:
df, dict_categories = helper.remove_categories(df, target, ratio=0.01)
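With `ratio=0.01`, `helper.remove_categories` presumably drops categorical values that appear in less than 1% of the rows (the kept categories are returned in `dict_categories`). A hypothetical sketch of that idea, assuming the categorical columns were cast to the pandas 'category' dtype by `helper.classify_data`:

df_sketch = df.copy()
for col in df_sketch.select_dtypes(include='category'):
    freq = df_sketch[col].value_counts(normalize=True)
    rare = freq[freq < 0.01].index
    df_sketch[col] = df_sketch[col].where(~df_sketch[col].isin(rare))  # rare values become NaN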
In [9]:
helper.fill_simple(df, target, inplace=True)
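`helper.fill_simple` presumably imputes the remaining missing values with simple statistics. A plausible sketch (assumption: median for numerical columns, most frequent value for categorical ones):

df_sketch = df.copy()
for col in df_sketch.columns.drop(target):
    if pd.api.types.is_numeric_dtype(df_sketch[col]):
        df_sketch[col] = df_sketch[col].fillna(df_sketch[col].median())
    else:
        df_sketch[col] = df_sketch[col].fillna(df_sketch[col].mode()[0])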
In [10]:
g = sns.PairGrid(
    df, y_vars=["SalePrice"], x_vars=["LotArea", "YearBuilt"], size=5, hue='OverallQual')
g.map(plt.scatter).add_legend()
g.axes[0, 0].set_xlim(0, 20000)
plt.ylim(df['SalePrice'].min(), 600000)
Out[10]:
Lower sale prices are usually found in houses of very low overall quality, with less dependence on their size and year of construction. These three features alone are insufficient to make a good price prediction.
In [11]:
helper.show_categorical(df, sharey=True)
In [12]:
helper.show_target_vs_categorical(df, target)
In [13]:
helper.show_numerical(df, kde=True)
In [14]:
helper.show_target_vs_numerical(df, target, point_size=20,
                                jitter=0.2,
                                fit_reg=False)
In [15]:
helper.correlation(df, target)
In [16]:
droplist = [] # features to drop
# For the model 'data' instead of 'df'
data = df.copy()
data.drop(droplist, axis='columns', inplace=True)
data.head(3)
Out[16]:
In [17]:
data, scale_param = helper.scale(data)
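`helper.scale` returns the scaled data together with `scale_param`, which is used later to map predictions back to the original units. Judging from that later use (`pred * scale_param[t][1] + scale_param[t][0]`), it likely stores `(mean, std)` per column; a hypothetical standardization sketch:

data_sketch = df.copy()
scale_param_sketch = {}
for col in data_sketch.select_dtypes(include=[np.number]):
    mean, std = data_sketch[col].mean(), data_sketch[col].std()
    scale_param_sketch[col] = (mean, std)
    data_sketch[col] = (data_sketch[col] - mean) / std  # z-score standardization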
In [18]:
data, dict_dummies = helper.replace_by_dummies(data, target)
model_features = [f for f in data if f not in target] # sorted neural network inputs
data.head(3)
Out[18]:
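`helper.replace_by_dummies` presumably one-hot encodes the categorical features (excluding the target) and records the resulting column names in `dict_dummies`. A rough pandas equivalent, shown here only as an illustration of the pre-encoding step:

features = data.drop(columns=target)
data_sketch = pd.get_dummies(features)  # one-hot encode the non-numerical columns
data_sketch[target] = data[target].values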
In [19]:
test_size = 0.2
val_size = 0.1
random_state = 9
x_train, y_train, x_val, y_val, x_test, y_test = helper.train_val_test_split(
    data, target, test_size=test_size, val_size=val_size, random_state=random_state)
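`helper.train_val_test_split` presumably performs two consecutive random splits and separates the inputs from the target. A hypothetical sketch using scikit-learn (split fractions interpreted relative to the whole dataset, which may differ from the helper's convention):

from sklearn.model_selection import train_test_split
train_val, test = train_test_split(data, test_size=test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_size / (1 - test_size),
                              random_state=random_state)
x_tr, y_tr = train.drop(columns=target).values, train[target].values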
One-hot encoding of the output is not needed for regression.
In [20]:
model_path = os.path.join("models", "house_prices.h5")
weights = keras.initializers.TruncatedNormal(stddev=0.0001)
opt = keras.optimizers.Adam(lr=0.00005)
model = None
model = helper.build_nn_reg(
    x_train.shape[1],
    y_train.shape[1],
    hidden_layers=1,
    input_nodes=x_train.shape[1] // 2,
    dropout=0.2,
    kernel_initializer=weights,
    bias_initializer=weights,
    optimizer=opt,
    summary=True)
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0)]
helper.train_nn(
    model,
    x_train,
    y_train,
    validation_data=[x_val, y_val],
    path=model_path,
    epochs=500,
    batch_size=16,
    callbacks=callbacks)
from sklearn.metrics import r2_score
ypred_train = model.predict(x_train)
ypred_val = model.predict(x_val)
print('Training R2-score: \t{:.3f}'.format(r2_score(y_train, ypred_train)))
print('Validation R2-score: \t{:.3f}'.format(r2_score(y_val, ypred_val)))
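The R² score compares the residual sum of squares with the total variance of the targets: R² = 1 - SS_res / SS_tot. A manual check against sklearn's `r2_score` on the validation set:

y_val_arr, pred_arr = np.asarray(y_val), np.asarray(ypred_val)
ss_res = np.sum((y_val_arr - pred_arr) ** 2)
ss_tot = np.sum((y_val_arr - y_val_arr.mean()) ** 2)
print('Manual validation R2-score: {:.3f}'.format(1 - ss_res / ss_tot))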
In [21]:
# restore training set
x_train = np.vstack((x_train, x_val))
y_train = np.vstack((y_train, y_val))
In [22]:
from sklearn.model_selection import KFold

def cv_train_nn(x_train, y_train, n_splits):
    """ Create and train models for cross validation. Return best model """
    skf = KFold(n_splits=n_splits, shuffle=True)
    score = []
    best_model = None
    best_loss = float('inf')
    print('Training {} models for Cross Validation ...'.format(n_splits))
    for train, val in skf.split(x_train[:, 0], y_train[:, 0]):
        model = None
        model = helper.build_nn_reg(
            x_train.shape[1],
            y_train.shape[1],
            hidden_layers=1,
            input_nodes=x_train.shape[1] // 2,
            dropout=0.2,
            kernel_initializer=weights,
            bias_initializer=weights,
            optimizer=opt,
            summary=False)
        history = helper.train_nn(
            model,
            x_train[train],
            y_train[train],
            show=False,
            validation_data=(x_train[val], y_train[val]),
            epochs=500,
            batch_size=16,
            callbacks=callbacks)
        val_loss = history.history['val_loss'][-1]
        score.append(val_loss)
        if val_loss < best_loss:  # save best model (fold) for evaluation and predictions
            best_model = model
            best_loss = val_loss
    print('\nCross Validation loss: {:.3f}'.format(np.mean(score)))
    return best_model

model = cv_train_nn(x_train, y_train, 10)
In [23]:
y_pred_test = model.predict(x_test, verbose=0)
helper.regression_scores(y_test, y_pred_test, return_dataframe=True, index="DNN")
Out[23]:
In [24]:
def predict_nn(model, x_test, target):
    """ Return a dataframe with actual and predicted targets in original scale """
    for t in target:
        pred = model.predict(x_test, verbose=0)
        restore_pred = pred * scale_param[t][1] + scale_param[t][0]
        restore_pred = restore_pred.round()
        restore_y = y_test * scale_param[t][1] + scale_param[t][0]
        restore_y = restore_y.round()
        pred_label = 'Predicted_' + t
        error_label = t + ' error (%)'
        pred_df = pd.DataFrame({t: np.squeeze(restore_y), pred_label: np.squeeze(restore_pred)})
        pred_df[error_label] = ((pred_df[pred_label] - pred_df[t]) * 100 / pred_df[t]).round(1)
        print(t, ". Prediction error:")
        print("Mean: \t {:.2f}%".format(pred_df[error_label].mean()))
        print("Stddev: {:.2f}%".format(pred_df[error_label].std()))
        sns.distplot(pred_df[error_label])
    return pred_df

pred_df = predict_nn(model, x_test, target)
In [25]:
pred_df.head(10)
Out[25]:
The error of the predicted sale prices can be modeled by a normal distribution, almost centered at zero and with a standard deviation below 12%. Thus, roughly 95% of the houses are predicted within a price error of about 24% (two standard deviations) of the actual price.
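A quick empirical check of this two-sigma argument on the test predictions (the error column name follows the one built in `predict_nn`):

error_label = target[0] + ' error (%)'
within_2sigma = (pred_df[error_label].abs() < 24).mean()
print('Share of test houses with |error| < 24%: {:.1%}'.format(within_2sigma))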
Note: there is some data leakage, since the removal of low-frequency categorical values and the scaling of numerical features were fitted on the whole dataset before the train/validation/test split.
In [26]:
helper.ml_regression(x_train, y_train[:, 0], x_test, y_test[:, 0])
Out[26]:
In [27]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(
    n_jobs=-1, n_estimators=100, random_state=9).fit(x_train, np.ravel(y_train))
y_pred = random_forest.predict(x_test)
helper.regression_scores(y_test, y_pred, return_dataframe=True, index="Random Forest")
Out[27]:
In [28]:
results = helper.feature_importances(model_features, random_forest)
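`helper.feature_importances` presumably ranks the model inputs by the impurity-based importances of the fitted forest; the same information is available directly from scikit-learn:

importances = pd.Series(random_forest.feature_importances_, index=model_features)
print(importances.sort_values(ascending=False).head(10))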