The data can be downloaded from https://www.kaggle.com/ludobenistant/hr-analytics-1.
In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [59]:
df = pd.read_csv('HR_comma_sep.csv')
In [60]:
print('\033[1m\033[94mData Types:\n{}'.format('-' * 11))
print('\033[30m{}\n'.format(df.dtypes))
print('\033[1m\033[94mSum of null values in each column:\n{}'.format('-' * 34))
print('\033[30m{}'.format(df.isnull().sum()))
In [61]:
df.head()
Out[61]:
In [62]:
df = df.rename(columns={'sales': 'department'})
salary_map = {'low': 0, 'medium': 1, 'high': 2}
df['salary'] = df['salary'].map(salary_map)
df = pd.get_dummies(df, columns=['department'], drop_first=True)
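The drop_first=True flag drops one dummy level per category to avoid perfect collinearity among the dummies. As a quick sanity check of the encoding (a minimal sketch; pandas names the new columns department_<value> by default):
print([col for col in df.columns if col.startswith('department_')])
print(df['salary'].unique())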
In [63]:
df.head()
Out[63]:
In [64]:
pos = df[df['left'] == 1].shape[0]
neg = df[df['left'] == 0].shape[0]
In [65]:
print('Positive examples = {}'.format(pos))
print('Negative examples = {}'.format(neg))
print('Proportion of positive to negative examples = {:.2f}%'.format((pos / neg) * 100))
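The same imbalance can be read directly off the dataframe; a minimal equivalent check with pandas:
print(df['left'].value_counts())
print(df['left'].value_counts(normalize=True))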
In [66]:
sns.countplot(x='left', data=df)
plt.xticks((0, 1), ["Didn't leave", 'Left'])
plt.xlabel('Left')
plt.ylabel('Count')
plt.title('Class counts')
Out[66]:
In [67]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Convert dataframe into numpy objects and split them into
# train and test sets: 80/20
X = df.loc[:, df.columns != 'left'].values
y = df.loc[:, df.columns == 'left'].values.flatten()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)
# Upsample minority class
X_train_u, y_train_u = resample(X_train[y_train == 1],
                                y_train[y_train == 1],
                                replace=True,
                                n_samples=X_train[y_train == 0].shape[0],
                                random_state=1)
X_train_u = np.concatenate((X_train[y_train == 0], X_train_u))
y_train_u = np.concatenate((y_train[y_train == 0], y_train_u))
# Downsample majority class (without replacement, so no duplicated rows)
X_train_d, y_train_d = resample(X_train[y_train == 0],
                                y_train[y_train == 0],
                                replace=False,
                                n_samples=X_train[y_train == 1].shape[0],
                                random_state=1)
X_train_d = np.concatenate((X_train[y_train == 1], X_train_d))
y_train_d = np.concatenate((y_train[y_train == 1], y_train_d))
print("Original shape:", X_train.shape, y_train.shape)
print("Upsampled shape:", X_train_u.shape, y_train_u.shape)
print("Downsampled shape:", X_train_d.shape, y_train_d.shape)
In [70]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Build PCA on the standardized training data
pca = PCA(n_components=None, svd_solver="full")
pca.fit(StandardScaler().fit_transform(X_train))
cum_var_exp = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(12, 6))
plt.bar(range(1, 18), pca.explained_variance_ratio_, align="center",
        color='red', label="Individual explained variance")
plt.step(range(1, 18), cum_var_exp, where="mid",
         label="Cumulative explained variance")
plt.xticks(range(1, 18))
plt.legend(loc="best")
plt.xlabel("Principal component index", {"fontsize": 14})
plt.ylabel("Explained variance ratio", {"fontsize": 14})
plt.title("PCA on training data", {"fontsize": 16})
Out[70]:
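If the goal were dimensionality reduction rather than inspection, the cumulative curve computed above gives the smallest number of components reaching a target variance threshold (a sketch; the 90% cutoff is an illustrative choice, not part of this analysis):
n_components_90 = np.argmax(cum_var_exp >= 0.90) + 1
print('Components needed for 90% explained variance:', n_components_90)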
In [71]:
# Build random forest classifiers on each version of the training data
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
methods_data = {"Original": (X_train, y_train),
"Upsampled": (X_train_u, y_train_u),
"Downsampled": (X_train_d, y_train_d)}
for method in methods_data.keys():
    pip_rf = make_pipeline(StandardScaler(),
                           RandomForestClassifier(n_estimators=500,
                                                  class_weight="balanced",
                                                  random_state=123))
    hyperparam_grid = {
        "randomforestclassifier__n_estimators": [10, 50, 100, 500],
        "randomforestclassifier__max_features": ["sqrt", "log2", 0.4, 0.5],
        "randomforestclassifier__min_samples_leaf": [1, 3, 5],
        "randomforestclassifier__criterion": ["gini", "entropy"]}
    gs_rf = GridSearchCV(pip_rf,
                         hyperparam_grid,
                         scoring="f1",
                         cv=10,
                         n_jobs=-1)
    gs_rf.fit(methods_data[method][0], methods_data[method][1])
    print("\033[1m" + "The best hyperparameters for {} data:".format(method))
    for hyperparam in gs_rf.best_params_.keys():
        print(hyperparam[hyperparam.find("__") + 2:], ": ",
              gs_rf.best_params_[hyperparam])
    print("\033[1m" + "\033[94m" + "Best 10-fold CV f1-score: {:.2f}%.".format(gs_rf.best_score_ * 100))