In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import classification_report
In [2]:
data = pd.read_csv('/file.csv')
In [3]:
data.info()
No missing data, all variables are numeric except employee id.
In [4]:
data.head()
Out[4]:
In [5]:
data.fired.unique()
Out[5]:
In [6]:
print('There are {:.2f}% zero values in "fired" column.'.format((1 - sum(data.fired) / 2100) * 100))
The dataset has imbalanced classes: 95.52% of all rows have "0" value in the column "fired". This means that predicting "1" is more difficult that "0". And model accuracy of 95.52% could mean that model dumbly predicts "0" in all cases. To increase predictive capacity of the model the following methods are commonly used:
Random Forest is a great model to use in case of imbalanced data.
In [7]:
X_train = data.drop(['fired', 'employee_id'], axis=1)
Y_train = data.fired
In [8]:
#Evaluating feature importance.
clf = RandomForestClassifier(n_estimators=200)
clf = clf.fit(X_train, Y_train)
indices = np.argsort(clf.feature_importances_)[::-1]
print('Feature ranking:')
for f in range(X_train.shape[1]):
print('%d. feature %d %s (%f)' % (f + 1, indices[f], X_train.columns[indices[f]],
clf.feature_importances_[indices[f]]))
Only several features have high importance. But I can't just throw away other features due to class imbalance, as less important factors could be importand for predicting "1".
In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, Y_train, test_size=0.20, stratify = Y_train)
I used GridSearchCV to tune model's parameters. Also I use CalibratedClassifierCV to improve probability prediction.
In [10]:
clf = RandomForestClassifier(n_estimators=150, n_jobs=-1, criterion = 'gini', max_features = 'sqrt',
min_samples_split=7, min_weight_fraction_leaf=0.0,
max_leaf_nodes=40, max_depth=10)
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
y_val = calibrated_clf.predict_proba(Xtest)
Let's see several metrics to measure model's performance.
At first simple accuracy.
In [11]:
print("Accuracy {0}%".format(round(sum(pd.DataFrame(y_val).idxmax(axis=1).values == ytest)/len(ytest)*100, 4)))
The accuracy is quite good, but this metric doesn't show how accurate are predictions for each class.
In [12]:
print(classification_report(ytest, pd.DataFrame(y_val).idxmax(axis=1).values, target_names=['0', '1'], digits=4))
Now we can cee that the models give good predictions for both classes.
But the result may be improves if the threshold of choosing the class is changed.
In [13]:
y_threshold = np.zeros(ytest.shape).astype(int)
for i in range(len(y_val)):
if y_val[i][1] > 0.1:
y_threshold[i] = 1
print(classification_report(ytest, y_threshold, target_names=['0', '1'], digits=4))
Model's quality has improved. But it is better to automate the search of the optimal threshold.
In [14]:
def optimal_threshold(do):
threshold = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
f1 = []
for j in range(len(threshold)):
y_threshold = np.zeros(ytest.shape).astype(int)
for i in range(len(y_val)):
if y_val[i][1] > threshold[j]:
y_threshold[i] = 1
f1.append(classification_report(ytest, y_threshold, target_names=['0', '1'], digits=4).split()[19])
if do == 'print':
print('Maximum value of F1-score is {0} with threshold {1}.'.format(max(f1), threshold[f1.index(max(f1))]))
elif do == 'calc':
return max(f1)
In [15]:
optimal_threshold('print')
So, the model is quite accurate. But there is one more challenge: the amount of observations isn't very hish, so data splitting has a serious influence on the predictions. And in some cases optimal threshold may be 0.5. So it is better to use closs-validation.
In the code below I split data into train and test 10 times, each time model is fitted on train data, predicts values for test data and the best f1 score is calculate. After ten iterations mean value of F1-score and standard deviation is shown.
In [16]:
j = 0
score = []
while j < 10:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, Y_train, test_size=0.20, stratify = Y_train)
calibrated_clf.fit(Xtrain, ytrain)
y_val = calibrated_clf.predict_proba(Xtest)
y_ = np.zeros(ytest.shape).astype(int)
score_max = optimal_threshold('calc')
score.append(float(score_max))
j = j + 1
print('Average value of F1-score is {0} with standard deviation of {1}'.format(round(np.mean(score), 4),
round(np.std(score), 4)))