In [63]:
import numpy as np
import pandas as pd
%matplotlib inline
from cycler import cycler
import matplotlib.style
import matplotlib as mpl
mpl.style.use('seaborn-white')
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.cluster import KMeans
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, RandomTreesEmbedding, GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from yellowbrick.features import Rank1D
from yellowbrick.features import Rank2D
from yellowbrick.model_selection import LearningCurve
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ClassificationReport
from yellowbrick.features.importances import FeatureImportances
In [49]:
import urllib.request
print('Beginning file download...')
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00445/Absenteeism_at_work_AAA.zip'
urllib.request.urlretrieve(url, 'C:\\Users\\Yara\\Downloads\\Absenteeism_at_work.zip')
Out[49]:
In [50]:
import zipfile
# Extract the CSV with a context manager so the archive is closed automatically
with zipfile.ZipFile('C:\\Users\\Yara\\Downloads\\Absenteeism_at_work.zip') as archive:
    archive.extract('Absenteeism_at_work.csv', 'C:\\Users\\Yara\\Downloads')
In [64]:
dataset = pd.read_csv('C:\\Users\\Yara\\Downloads\\Absenteeism_at_work.csv', delimiter=';')
In [65]:
dataset.ID.count()
Out[65]:
In [66]:
dataset.head()
Out[66]:
In [67]:
dataset = dataset.drop(['ID'], axis=1)
In [68]:
dataset.columns
Out[68]:
In [69]:
features = ['Reason for absence', 'Month of absence', 'Day of the week',
'Seasons', 'Transportation expense', 'Distance from Residence to Work',
'Service time', 'Age', 'Work load Average/day ', 'Hit target',
'Disciplinary failure', 'Education', 'Son', 'Social drinker',
'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index']
target = ['Absenteeism time in hours']
In [70]:
# Getting basic statistical information for the target
print(dataset.loc[:, 'Absenteeism time in hours'].mean())
print(dataset.loc[:, 'Absenteeism time in hours'].min())
print(dataset.loc[:, 'Absenteeism time in hours'].max())
In [71]:
dataset['Absenteeism time in hours'] = np.where(dataset['Absenteeism time in hours'] < 6, 1, dataset['Absenteeism time in hours'])
dataset['Absenteeism time in hours'] = np.where(dataset['Absenteeism time in hours'].between(6, 30), 2, dataset['Absenteeism time in hours'])
dataset['Absenteeism time in hours'] = np.where(dataset['Absenteeism time in hours'] > 30, 3, dataset['Absenteeism time in hours'])
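The three np.where calls above bin the target into Low (< 6 hours) = 1, Medium (6-30) = 2, and High (> 30) = 3. As a hedged aside, pd.cut can express the same binning in a single call if applied to the raw hours before the cell above runs; the bin edges below mirror the thresholds used here:
# Sketch only: equivalent binning of the raw hours with pd.cut
# ((-inf, 5] -> 1, (5, 30] -> 2, (30, inf) -> 3 for integer hour counts).
binned = pd.cut(dataset['Absenteeism time in hours'],
                bins=[-np.inf, 5, 30, np.inf], labels=[1, 2, 3])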
In [72]:
# The 'Reason for absence' feature, to my mind, needs to be dropped because it is highly correlated with the target:
# the more serious the reason, the longer an employee will be absent from work. Keeping this feature would leak excessive information to the model.
X = dataset.drop(['Reason for absence', 'Absenteeism time in hours'], axis=1)
y = dataset.loc[:, 'Absenteeism time in hours']
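As a quick sanity check on that leakage claim (an addition here, not in the original notebook), grouping absence hours by reason code shows how strongly the reason alone tracks the target; note it would need to run before the binning cell above, while the hours are still raw:
# Hedged check (assumes the target has not yet been binned into 1/2/3):
# mean absence hours per reason code; a large spread across codes is the
# leakage described in the comment above.
print(dataset.groupby('Reason for absence')['Absenteeism time in hours'].mean())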
In [73]:
# Updating features to remove 'Reason for absence'
features = ['Month of absence', 'Day of the week',
'Seasons', 'Transportation expense', 'Distance from Residence to Work',
'Service time', 'Age', 'Work load Average/day ', 'Hit target',
'Disciplinary failure', 'Education', 'Son', 'Social drinker',
'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index']
In [74]:
# Setting up some visual preferences
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
In [75]:
# Calculating population breakdown by target category
Target = y.value_counts().sort_index()  # sort by class label so the bars line up with the Low/Medium/High tick labels below
print(color.BOLD, 'Low:', color.END, Target[1])
print(color.BOLD, 'Medium:', color.END, Target[2])
print(color.BOLD, 'High:', color.END, Target[3])
my_colors = ["red", "gold", "limegreen"]
a4_dims = (5, 5)
fig, ax = plt.subplots(figsize=a4_dims)
Target.plot(kind='bar', title='Population Breakdown by Target Category', color=my_colors, ax=ax)
ax.set_xticklabels(['Low', 'Medium', 'High'], rotation = 45)
ax.grid(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# There is an obvious class imbalance here, therefore, we can expect the model to have difficulties learning the pattern for Medium and High categories.
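To put a number on that imbalance (a small addition, using plain pandas), each class's share of the population can be printed alongside the chart:
# Fraction of observations per class; the heavy skew toward class 1 (Low)
# is what the bar chart above shows.
print(y.value_counts(normalize=True))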
In [76]:
# Creating a 1D visualizer with the Shapiro feature ranking algorithm
fig, ax = plt.subplots(figsize=(10, 7))
visualizer = Rank1D(features=features, ax=ax, algorithm='shapiro')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()
In [77]:
figsize=(10, 7)
fig, ax = plt.subplots(figsize=figsize)
visualizer = Rank2D(features=features, ax=ax, algorithm='covariance', colormap='summer')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()
In [78]:
# Instantiate the visualizer with the Pearson ranking algorithm
figsize=(10, 7)
fig, ax = plt.subplots(figsize=figsize)
visualizer = Rank2D(features=features, ax=ax, algorithm='pearson', colormap='winter')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()
In [79]:
# Visualizing Ridge, Lasso and ElasticNet feature selection models side by side for comparison
# Ridge
# Create a new figure
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['red'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(311)
labels = features
viz = FeatureImportances(Ridge(alpha=0.1), ax=ax, labels=labels, relative=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Fit and display
viz.fit(X, y)
viz.poof()
# ElasticNet
# Create a new figure
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['salmon'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(312)
labels = features
viz = FeatureImportances(ElasticNet(alpha=0.01), ax=ax, labels=labels, relative=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Fit and display
viz.fit(X, y)
viz.poof()
# Lasso
# Create a new figure
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['purple'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(313)
labels = features
viz = FeatureImportances(Lasso(alpha=0.01), ax=ax, labels=labels, relative=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Fit and display
viz.fit(X, y)
viz.poof()
Having analyzed the output of all of the visualizations used (the Shapiro algorithm, Pearson correlation ranking, covariance ranking, Lasso, Ridge, and ElasticNet), we can select the features with meaningful coefficient values (positive or negative). These are the features to keep in the model; everything else is dropped below.
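As a hedged aside, SelectFromModel (imported at the top but otherwise unused here) can automate the same kind of cut; the estimator and threshold below are illustrative assumptions, not the values behind the manual selection in the next cell:
# Sketch: keep features whose Lasso coefficients exceed a (hypothetical)
# threshold, instead of reading them off the plots above.
selector = SelectFromModel(Lasso(alpha=0.01), threshold=1e-3)
selector.fit(X, y)
print(X.columns[selector.get_support()])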
In [24]:
# Dropping features from X based on the feature importance visualizations above
X = X.drop(['Transportation expense', 'Age', 'Service time', 'Hit target', 'Education', 'Work load Average/day '], axis=1)
In [25]:
X.head()
Out[25]:
In [26]:
# Encoding some categorical features
X = pd.get_dummies(data=X, columns=['Month of absence', 'Day of the week', 'Seasons'])
In [27]:
X.head()
Out[27]:
In [28]:
print(X.columns)
In [86]:
# Perform 80/20 training/test split
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20, random_state=42)
In [29]:
# Creating a function to visualize estimators
def visual_model_selection(X, y, estimator):
    visualizer = ClassificationReport(estimator, classes=['Low', 'Medium', 'High'], cmap='PRGn')
    visualizer.fit(X, y)
    visualizer.score(X, y)
    visualizer.poof()
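Note that this helper fits and scores on the same data, so the reports below reflect in-sample performance. A hedged variant (an addition here, not in the original) that honors the 80/20 split from cell In [86] would look like this:
# Hypothetical helper: fit on the training fold, score on the held-out fold,
# so the report reflects generalization rather than memorization.
def visual_model_selection_split(X_train, y_train, X_test, y_test, estimator):
    visualizer = ClassificationReport(estimator, classes=['Low', 'Medium', 'High'], cmap='PRGn')
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()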
In [88]:
visual_model_selection(X, y, BaggingClassifier())
In [81]:
visual_model_selection(X, y, LogisticRegression(class_weight='balanced'))
In [82]:
visual_model_selection(X, y, KNeighborsClassifier())
In [83]:
visual_model_selection(X, y, RandomForestClassifier(class_weight='balanced'))
In [84]:
visual_model_selection(X, y, ExtraTreesClassifier(class_weight='balanced'))
For the purposes of this exercise we will consider the f1 score when estimating each model's performance and making a selection. Yellowbrick's Classification Report visualizer makes it clear that the ensemble classifiers performed best. We need to pay special attention to the f1 scores for the underrepresented classes, "Medium" and "High", since they contain significantly fewer instances than the "Low" class. A high f1 score across all three classes therefore indicates very strong performance from the following models: Bagging Classifier, Random Forest Classifier, and Extra Trees Classifier. We will also use the Class Prediction Error visualizer on these models to confirm their strong performance.
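Since the selection criterion is the f1 score, the per-class numbers can also be pulled out programmatically (f1_score is already imported); this sketch mirrors the in-sample fit/score used by visual_model_selection above:
# Illustrative only: one f1 score per class (Low, Medium, High) for a
# single candidate model, computed on the same data it was fit on.
model = BaggingClassifier().fit(X, y)
print(f1_score(y, model.predict(X), average=None))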
In [89]:
# Visualizing class prediction error for the Bagging Classifier model
classes = ['Low', 'Medium', 'High']
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['turquoise', 'cyan', 'teal', 'coral', 'blue', 'lime', 'lavender', 'lightblue', 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(311)
visualizer = ClassPredictionError(BaggingClassifier(), classes=classes, ax=ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
# Visualizing class prediction error for the Random Forest Classifier model
classes = ['Low', 'Medium', 'High']
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['coral', 'tan', 'darkred'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(312)
visualizer = ClassPredictionError(RandomForestClassifier(class_weight='balanced'), classes=classes, ax=ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
# Visualizing class prediction error for the Extra Trees Classifier model
classes = ['Low', 'Medium', 'High']
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['limegreen', 'yellow', 'orange'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(313)
visualizer = ClassPredictionError(ExtraTreesClassifier(class_weight='balanced'), classes=classes, ax=ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
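For a non-graphical cross-check (an addition here, using the confusion_matrix import from the top), the raw count matrix for the extra-trees model on the held-out fold complements the charts above:
# Hedged cross-check: rows are true classes (Low, Medium, High), columns
# are predictions; the off-diagonal counts are the errors drawn above.
et = ExtraTreesClassifier(class_weight='balanced').fit(X_train, y_train)
print(confusion_matrix(y_test, et.predict(X_test)))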
Now we can conclude that the Extra Trees Classifier seems to perform best, as it had no instances of the "High" class reported under the "Low" class. Let's proceed with hyperparameter tuning for that model!
In [113]:
# Performing Hyperparameter tuning
# Validation Curve
mpl.rcParams['axes.prop_cycle'] = cycler('color', ['purple', 'darkblue'])
fig = plt.gcf()
fig.set_size_inches(10,10)
ax = plt.subplot(411)
viz = ValidationCurve(ExtraTreesClassifier(class_weight='balanced'), ax=ax, param_name="max_depth", param_range=np.arange(1, 11), cv=3, scoring="accuracy")
# Fit and poof the visualizer
viz.fit(X, y)
viz.poof()
We can observe in the chart above that even though the training score keeps rising continuously, the cross-validation score drops off at max_depth=8. Therefore, we will choose that value of the parameter for our selected model.
In [116]:
visual_model_selection(X, y, ExtraTreesClassifier(class_weight='balanced', max_depth=8))
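As one last hedged check (not part of the original analysis), LearningCurve, imported at the top but otherwise unused, can show whether the tuned model would benefit from more training data; the cv and scoring settings below are assumptions:
# Sketch: training vs. cross-validation score as the training set grows.
fig, ax = plt.subplots(figsize=(10, 7))
viz = LearningCurve(ExtraTreesClassifier(class_weight='balanced', max_depth=8),
                    ax=ax, cv=3, scoring='f1_weighted')
viz.fit(X, y)
viz.poof()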
Visualization techniques prove to be a useful part of the machine learning toolkit, and yellowbrick provides a wide selection of visualizers to meet the needs of every step and stage of the data science pipeline. From feature analysis and selection to model selection and optimization, yellowbrick's visualizers make it easy to decide which features to keep in a model, which model performs best, and how to tune a model's hyperparameters. Moreover, visualizing algorithmic output makes it easier to present insights to audiences and stakeholders, and contributes to the interpretability of machine learning results.