Predicting Employee Turnover

Data downloaded from https://www.kaggle.com/ludobenistant/hr-analytics-1.


In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

Data Preprocessing


In [59]:
df = pd.read_csv('HR_comma_sep.csv')

In [60]:
print('\033[1m\033[94mData Types:\n{}'.format('-' * 11))
print('\033[30m{}\n'.format(df.dtypes))
print('\033[1m\033[94mSum of null values in each column:\n{}'.format('-' * 34))
print('\033[30m{}'.format(df.isnull().sum()))


Data Types:
-----------
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
sales                     object
salary                    object
dtype: object

Sum of null values in each column:
----------------------------------
satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64

In [61]:
df.head()


Out[61]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  left  promotion_last_5years  sales  salary
0                0.38             0.53               2                   157                   3              0     1                      0  sales     low
1                0.80             0.86               5                   262                   6              0     1                      0  sales  medium
2                0.11             0.88               7                   272                   4              0     1                      0  sales  medium
3                0.72             0.87               5                   223                   5              0     1                      0  sales     low
4                0.37             0.52               2                   159                   3              0     1                      0  sales     low

In [62]:
df = df.rename(columns={'sales': 'department'})

salary_map = {'low': 0, 'medium': 1, 'high': 2}
df['salary'] = df['salary'].map(salary_map)

df = pd.get_dummies(df, columns=['department'], drop_first=True)

In [63]:
df.head()


Out[63]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years salary department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical
0 0.38 0.53 2 157 3 0 1 0 0 0 0 0 0 0 0 1 0 0
1 0.80 0.86 5 262 6 0 1 0 1 0 0 0 0 0 0 1 0 0
2 0.11 0.88 7 272 4 0 1 0 1 0 0 0 0 0 0 1 0 0
3 0.72 0.87 5 223 5 0 1 0 0 0 0 0 0 0 0 1 0 0
4 0.37 0.52 2 159 3 0 1 0 0 0 0 0 0 0 0 1 0 0
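
As an extra sanity check (not in the original notebook), one can confirm that the salary mapping left no unmapped values behind and list the dummy columns that get_dummies created:

In [ ]:
# 'salary' should contain no NaNs if every value was covered by salary_map
print(df['salary'].isnull().sum())
# Dummy columns created for 'department' (drop_first=True drops the first category)
print(df.filter(like='department_').columns.tolist())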

Modeling


In [64]:
pos = df[df['left'] == 1].shape[0]
neg = df[df['left'] == 0].shape[0]

In [65]:
print('Positive examples = {}'.format(pos))
print('Negative examples = {}'.format(neg))
print('Proportion of positive to negative examples = {:.2f}%'.format((pos / neg) * 100))


Positive examples = 3571
Negative examples = 11428
Proportion of positive to negative examples = 31.25%

In [66]:
sns.countplot(df['left'])
plt.xticks((0, 1), ["Didn't leave", 'Left'])
plt.xlabel('Left')
plt.ylabel('Count')
plt.title('Class counts')


Out[66]:
Text(0.5,1,'Class counts')

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Convert the dataframe into numpy arrays and split them into
# train and test sets (80/20)
X = df.loc[:, df.columns != 'left'].values
y = df.loc[:, df.columns == 'left'].values.flatten()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Upsample minority class
X_train_u, y_train_u = resample(X_train[y_train == 1],
                                y_train[y_train == 1],
                                replace=True,
                                n_samples=X_train[y_train == 0].shape[0],
                                random_state=1)
X_train_u = np.concatenate((X_train[y_train == 0], X_train_u))
y_train_u = np.concatenate((y_train[y_train == 0], y_train_u))
# Downsample majority class (without replacement)
X_train_d, y_train_d = resample(X_train[y_train == 0],
                                y_train[y_train == 0],
                                replace=False,
                                n_samples=X_train[y_train == 1].shape[0],
                                random_state=1)
X_train_d = np.concatenate((X_train[y_train == 1], X_train_d))
y_train_d = np.concatenate((y_train[y_train == 1], y_train_d))
print("Original shape:", X_train.shape, y_train.shape)
print("Upsampled shape:", X_train_u.shape, y_train_u.shape)
print("Downsampled shape:", X_train_d.shape, y_train_d.shape)


Original shape: (11999, 17) (11999,)
Upsampled shape: (18284, 17) (18284,)
Downsampled shape: (5714, 17) (5714,)
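
As an optional sanity check (not part of the original notebook), numpy's bincount confirms that both resampled training sets are now balanced:

In [ ]:
# Both resampled label arrays should contain the two classes in equal counts
print('Upsampled class counts:  ', np.bincount(y_train_u))
print('Downsampled class counts:', np.bincount(y_train_d))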

In [70]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Fit PCA on the standardized training data
pca = PCA(n_components=None, svd_solver="full")
pca.fit(StandardScaler().fit_transform(X_train))

cum_var_exp = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(12, 6))
plt.bar(range(1, 18), pca.explained_variance_ratio_, align="center",
        color='red', label="Individual explained variance")
plt.step(range(1, 18), cum_var_exp, where="mid", label="Cumulative explained variance")
plt.xticks(range(1, 18))
plt.legend(loc="best")
plt.xlabel("Principal component index", {"fontsize": 14})
plt.ylabel("Explained variance ratio", {"fontsize": 14})
plt.title("PCA on training data", {"fontsize": 16})


Out[70]:
Text(0.5,1,'PCA on training data')
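
The cumulative curve can also be read off programmatically. The snippet below is an optional addition (not in the original notebook) that finds the smallest number of components reaching a 90% threshold; the 90% cutoff is an arbitrary choice for illustration.

In [ ]:
# Smallest number of components whose cumulative explained variance reaches 90%
n_components_90 = int(np.argmax(cum_var_exp >= 0.90)) + 1
print('Components needed for >= 90% explained variance:', n_components_90)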

Random Forest


In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Build and tune a random forest classifier on each version of the training data
methods_data = {"Original": (X_train, y_train),
                "Upsampled": (X_train_u, y_train_u),
                "Downsampled": (X_train_d, y_train_d)}
for method in methods_data.keys():
    pip_rf = make_pipeline(StandardScaler(),
                           RandomForestClassifier(n_estimators=500,
                                                  class_weight="balanced",
                                                  random_state=123))
    
    hyperparam_grid = {
        "randomforestclassifier__n_estimators": [10, 50, 100, 500],
        "randomforestclassifier__max_features": ["sqrt", "log2", 0.4, 0.5],
        "randomforestclassifier__min_samples_leaf": [1, 3, 5],
        "randomforestclassifier__criterion": ["gini", "entropy"]}
    
    gs_rf = GridSearchCV(pip_rf,
                         hyperparam_grid,
                         scoring="f1",
                         cv=10,
                         n_jobs=-1)
    
    gs_rf.fit(methods_data[method][0], methods_data[method][1])
    
    print("\033[1m" + "\033[0m" + "The best hyperparameters for {} data:".format(method))
    for hyperparam in gs_rf.best_params_.keys():
        print(hyperparam[hyperparam.find("__") + 2:], ": ", gs_rf.best_params_[hyperparam])
        
    print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_rf.best_score_) * 100))



In [ ]: