Predicting the number of tickets requested by different clients
Supervised Learning: Regression
Data taken from Udacity's Problem Solving with Advanced Analytics course
Here a neural network is applied to a simple problem that is usually solved with linear models
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper
import keras
helper.info_gpu()
helper.reproducible(seed=9)  # set up reproducible results from run to run using Keras
%matplotlib inline
In [2]:
data_path = 'data/simple_tickets_data.csv'
target = ['Average Number of Tickets']
df = pd.read_csv(data_path)
print("rows: {} \ncolumns: {} \ntarget: {}".format(*df.shape, target))
In [3]:
df.head()
Out[3]:
In [4]:
df.describe(percentiles=[0.5])
Out[4]:
In [5]:
helper.missing(df);
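helper.missing is a small project-specific utility; if it is not available, the same check can be done with plain pandas (a minimal sketch, not the helper's actual code):

# Count missing values per column with plain pandas
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])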
In [6]:
df = df.drop('Client ID', axis='columns')
In [7]:
numerical = ['Number of Employees', 'Value of Contract', 'Average Number of Tickets']
df = helper.classify_data(df, target, numerical)
pd.DataFrame(dict(df.dtypes), index=["Type"])[df.columns].head() # show data types
Out[7]:
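helper.classify_data is also project-specific; judging by its arguments it appears to enforce column dtypes. A rough, hypothetical sketch of that idea (not the helper's actual implementation):

# Hypothetical sketch: cast the declared numerical columns to float,
# everything else to the pandas 'category' dtype
def classify_data_sketch(df, numerical):
    out = df.copy()
    for col in out.columns:
        if col in numerical:
            out[col] = out[col].astype(float)
        else:
            out[col] = out[col].astype('category')
    return out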
In [8]:
helper.show_categorical(df)
In [9]:
helper.show_target_vs_categorical(df, target)
In [10]:
helper.show_numerical(df, kde=True)
In [11]:
helper.show_target_vs_numerical(df, target, point_size=20)
In [12]:
g = sns.PairGrid(df, y_vars=target, x_vars=['Number of Employees', 'Value of Contract'],
size=7, hue='Industry', aspect=1.5)
g.map(sns.regplot).add_legend();
#sns.pairplot(df, hue = 'Industry', vars=['Number of Employees', 'Value of Contract'] +
# targets, size = 4)
These figures suggest that a simple linear model could already make accurate predictions.
In [13]:
helper.show_correlation(df, target, figsize=(7,4))
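helper.show_correlation is not listed in this notebook; roughly the same picture can be produced with plain pandas and seaborn (a sketch, not the helper's code):

# Correlation of the numerical features with the target, shown as a heatmap
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(7, 4))
sns.heatmap(numeric_df.corr()[target].T, annot=True, cmap='coolwarm');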
In [14]:
droplist = [] # features to drop
# from here on, the model works on 'data', a copy of 'df'
data = df.copy()
data.drop(droplist, axis='columns', inplace=True)
data.head(3)
Out[14]:
In [15]:
data, scale_param = helper.scale(data)
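helper.scale is not shown here either. Judging by how scale_param is used later (predictions are restored with pred * scale_param[t][1] + scale_param[t][0]), it standardizes each numerical column and stores a (mean, std) pair per column. A minimal sketch under that assumption:

# Sketch (assumption): standardize numerical columns and remember (mean, std) per column
def scale_sketch(df, columns):
    out, params = df.copy(), {}
    for col in columns:
        mean, std = out[col].mean(), out[col].std()
        out[col] = (out[col] - mean) / std
        params[col] = (mean, std)
    return out, params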
In [16]:
data, dict_dummies = helper.replace_by_dummies(data, target)
model_features = [f for f in data if f not in target]  # neural network input features (all columns except the target)
data.head(3)
Out[16]:
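helper.replace_by_dummies presumably wraps pd.get_dummies, one-hot encoding every categorical column while leaving the target untouched. A plain-pandas sketch of that idea (applied here to the original df for illustration):

# Sketch (assumption): one-hot encode all non-numerical columns except the target
categorical_cols = [c for c in df.select_dtypes(exclude='number').columns if c not in target]
encoded_sketch = pd.get_dummies(df, columns=categorical_cols)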
In [17]:
test_size = 0.2
random_state = 0
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=test_size, random_state=random_state)
# Separate the data into features and target (x=features, y=target)
x_train, y_train = train.drop(target, axis=1).values, train[target].values
x_test, y_test = test.drop(target, axis=1).values, test[target].values
One-hot encoding of the output is not needed for regression
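For comparison only: in a classification notebook the integer labels would be one-hot encoded, for example with keras.utils.to_categorical; here the target stays a single continuous column. A self-contained illustration of what that encoding looks like:

from keras.utils import to_categorical

# Illustration only: three class labels (0, 2, 1) turned into one-hot rows
print(to_categorical([0, 2, 1], num_classes=3))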
In [18]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("test size \t X:{} \t Y:{} ".format(x_test.shape, y_test.shape))
In [19]:
from keras.models import Sequential
from keras.layers import Dense, Dropout


def build_nn(input_size, output_size, summary=False):
    """Build a one-hidden-layer regression network."""
    input_nodes = input_size
    weights = keras.initializers.RandomNormal(stddev=0.001)

    model = Sequential()
    # hidden layer: as many tanh units as input features
    model.add(
        Dense(
            input_nodes,
            input_dim=input_size,
            activation='tanh',
            kernel_initializer=weights,
            bias_initializer=weights))
    # single linear output unit for regression
    model.add(Dense(1, activation=None, kernel_initializer=weights, bias_initializer=weights))

    model.compile(loss='mean_squared_error', optimizer='adam')

    if summary:
        model.summary()

    return model
In [20]:
from time import time

model_path = os.path.join("models", "simple_tickets.h5")


def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
    """
    Train the neural network model. If validation_data is provided, it is used to
    monitor the validation loss during training.
    """
    if show:
        print('Training ....')

    # callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0)]

    t0 = time()
    history = model.fit(
        x_train,
        y_train,
        epochs=30,
        batch_size=16,
        validation_split=0,
        validation_data=validation_data,
        callbacks=None,
        verbose=0)

    if show:
        print("time: \t {:.1f} s".format(time() - t0))
        helper.show_training(history)

    if path:
        model.save(path)
        print("\nModel saved at", path)

    return history


model = None
model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
train_nn(model, x_train, y_train, validation_data=None, path=model_path);

from sklearn.metrics import r2_score

ypred_train = model.predict(x_train)
# ypred_val = model.predict(x_val)

print('Training R2-score: \t{:.3f}'.format(r2_score(y_train, ypred_train)))
# print('Validation R2-score: \t{:.3f}'.format(r2_score(y_val, ypred_val)))
In [21]:
# model = keras.models.load_model(model_path)
# print("Model loaded:", model_path)


def evaluate_nn(model, x_test, y_test):
    score = model.evaluate(x_test, y_test, verbose=0)
    print("\nTest loss:\t\t{:.4f}".format(score))

    ypred_test = model.predict(x_test)
    print('\nTest R2-score: \t\t{:.3f}'.format(r2_score(y_test, ypred_test)))


evaluate_nn(model, x_test, y_test)
In [22]:
def predict_nn(model, x_test, target):
    """Return a dataframe with actual and predicted targets in the original scale."""
    for t in target:
        pred = model.predict(x_test, verbose=0)

        # restore predictions and actual values to the original scale
        restore_pred = pred * scale_param[t][1] + scale_param[t][0]
        restore_pred = restore_pred.round()
        restore_y = y_test * scale_param[t][1] + scale_param[t][0]
        restore_y = restore_y.round()

        pred_label = 'Predicted_' + t
        error_label = t + ' error (%)'

        pred_df = pd.DataFrame({
            t: np.squeeze(restore_y),
            pred_label: np.squeeze(restore_pred)
        })
        pred_df[error_label] = ((pred_df[pred_label] - pred_df[t]) * 100 / pred_df[t]).round(1)

        print(t, ". Prediction error:")
        print("Mean: \t {:.2f}%".format(pred_df[error_label].mean()))
        print("Stddev: {:.2f}%".format(pred_df[error_label].std()))

        sns.distplot(pred_df[error_label])
        plt.xlim(-600, 600)

    return pred_df


pred_df = predict_nn(model, x_test, target)
In [23]:
pred_df.head()
Out[23]:
The prediction error (%) can be especially high when the number of tickets is low. The absolute error could be a better indicator here.
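As suggested above, an absolute-error summary can be read directly from the pred_df returned by predict_nn (a quick sketch reusing the column names built in that function):

# Absolute error in number of tickets, easier to interpret when the target is small
t = target[0]
abs_error = (pred_df['Predicted_' + t] - pred_df[t]).abs()
print("Mean absolute error: \t{:.2f} tickets".format(abs_error.mean()))
print("Median absolute error: \t{:.2f} tickets".format(abs_error.median()))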
In [24]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(x_train, y_train)
pred = reg.predict(x_test)

# restore predictions and actual values to the original scale, as in predict_nn
t = target[0]
restore_pred = pred * scale_param[t][1] + scale_param[t][0]
restore_pred = restore_pred.round()
restore_y = y_test * scale_param[t][1] + scale_param[t][0]
restore_y = restore_y.round()

pred_label = 'Predicted_' + t
error_label = t + ' error (%)'

pred_df = pd.DataFrame({
    t: np.squeeze(restore_y),
    pred_label: np.squeeze(restore_pred)
})
pred_df[error_label] = ((pred_df[pred_label] - pred_df[t]) * 100 / pred_df[t]).round(1)

print(t, ". Prediction error:")
print("Mean: \t {:.2f}%".format(pred_df[error_label].mean()))
print("Stddev: {:.2f}%".format(pred_df[error_label].std()))

sns.distplot(pred_df[error_label])
plt.xlim(-600, 600)
Out[24]:
The mean and standard deviation of the error are higher with the linear model.
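To put the comparison on the same footing as the network's test score, the linear baseline's R2 can be checked as well (a quick sketch reusing reg's predictions from the cell above):

# Test R2-score of the linear regression baseline (same scaled targets as the network)
print('Linear regression test R2-score: \t{:.3f}'.format(r2_score(y_test, pred)))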
In [25]:
helper.ml_regression(x_train, y_train[:,0], x_test, y_test[:,0])
Out[25]: