Classification model

Here we use machine learning techniques to build and validate a model that predicts the probability of a relatively rare event (an imbalanced-classes problem).


In [ ]:
import sys
sys.path.append('../')

# import generic packages
import numpy as np
import pandas as pd
# pd.options.display.max_columns = None
# pd.options.display.max_colwidth = 100
from IPython.display import display

# visualization packages
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
sns.set(style="white")
%matplotlib inline

# module loading settings
%load_ext autoreload
%autoreload 2

In [ ]:
# load the data into a data frame
df = pd.read_csv('')

# extract and remove timestamps from the data frame
timestamps = df['timestamp']
df.drop('timestamp', axis=1, inplace=True)

# determine the (high-cardinality) categorical feature columns
high_capacity = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("high capacity categorical feature columns:")
print(high_capacity)

# print some info
print("{:d} observations".format(len(df)))
df.head()

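To make the class imbalance mentioned in the introduction concrete, we can take a quick look at the target distribution (a minimal sketch; it assumes the binary target column is named 'target', as used further down in this notebook).


In [ ]:
# class balance of the binary target column (assumed to be named 'target')
class_counts = df['target'].value_counts()
print(class_counts)
print("event rate: {:.4f}".format(class_counts.get(1, 0) / float(len(df))))
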
Model specification

Here we specify the model setup: the model type, and how it should be fitted, optimized, and validated.


In [ ]:
model_type = 'rf'  # the classification algorithm
tune_model = False  # optimize hyperparameters

cross_val_method = 'temporal'  # cross-validation routine

cost_fp = 1000  # cost of a false positive, preferably in euros!
benefit_tp = 3000  # benefit of a true positive (i.e. cost of a missed event)
class_weights = {0: cost_fp, 1: benefit_tp}  # class weights: errors on the event class weigh 3x heavier

Cross-validation procedure

To validate whether the model makes sensible predictions, we need to perform cross-validation. The exact procedure is specified below. Random cross-validation (setting aside a random sample for testing) is fast, but temporal cross-validation (setting aside a time period for testing) gives the most realistic results, because it most closely resembles how the model would be used in practice. The resulting temporal folds are inspected right after the model pipeline is assembled below.


In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

#source: https://github.com/BigDataRepublic/bdr-analytics-py
#! pip install -e git+ssh://git@github.com/BigDataRepublic/bdr-analytics.git#egg=bdranalytics-0.1
from bdranalytics.pipeline.encoders import WeightOfEvidenceEncoder
from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow

from sklearn.metrics import average_precision_score, make_scorer, roc_auc_score

if cross_val_method == 'random':

    # split train data into stratified random folds
    cv_dev = StratifiedShuffleSplit(test_size=0.1, train_size=0.1, n_splits=5, random_state=1)

    cv_test = StratifiedShuffleSplit(test_size=0.33, n_splits=1, random_state=2)

elif cross_val_method == 'temporal':

    train_size = pd.Timedelta(days=365 * 4)

    # create a cross-validation routine for parameter tuning
    cv_dev = IntervalGrowingWindow(timestamps=timestamps,
                                   test_start_date=pd.Timestamp('2015-01-01'),
                                   test_end_date=pd.Timestamp('2015-12-31'),
                                   test_size=pd.Timedelta(days=30),
                                   train_size=train_size)

    # create a cross-validation routine for model evaluation
    cv_test = IntervalGrowingWindow(timestamps=timestamps,
                                    test_start_date=pd.Timestamp('2016-01-01'),
                                    test_end_date=pd.Timestamp('2016-08-31'),
                                    test_size=pd.Timedelta(days=2 * 30),
                                    train_size=train_size)

# number of parallel jobs for cross-validation
n_jobs = 1

# two functions for advanced performance evaluation metrics
def roc_auc(y_true, y_pred):
    return roc_auc_score(pd.get_dummies(y_true), y_pred)

roc_auc_scorer = make_scorer(roc_auc, needs_proba=True)

def pr_auc(y_true, y_pred):
    return average_precision_score(pd.get_dummies(y_true), y_pred, average="micro")

pr_auc_scorer = make_scorer(pr_auc, needs_proba=True)

In [ ]:
from sklearn.preprocessing import StandardScaler, Imputer

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# convert the data frame to bare X and y variables for the model pipeline
y_col = 'target'
X = df.copy().drop(y_col, axis=1)
y = np.array(df[y_col])
n_features = X.shape[1]

# define preprocessing steps
preproc_steps = [('woe', WeightOfEvidenceEncoder(cols=high_capacity)),
                 ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),
                 ('standardizer', StandardScaler(with_mean=True, with_std=True))]

# specification of different model types and their defaults
model_steps_dict = {'lr': [('lr', LogisticRegression(C=0.001, penalty='l2', tol=0.01,
                                                     class_weight=class_weights))],
                   'rf': [('rf', RandomForestClassifier(n_estimators=400, max_features='auto',
                                                       class_weight=class_weights))],
                   'gbc': [('gbc', GradientBoostingClassifier(n_estimators=400, max_depth=3))],
                   'xgb': [('xgb', XGBClassifier(scale_pos_weight=class_weights[1],
                                                 n_estimators=100, max_depth=4))],
                   'dummy': [('dummy', DummyClassifier(strategy='prior'))]
                   }

# specification of the different model hyperparameters and tuning space
model_params_grid = {'lr': {'lr__C': [1e-4, 1e-3, 1e-2, 1e-1]},
                     'rf': {'rf__max_features': [3, int(np.sqrt(n_features)), n_features],
                            'rf__n_estimators': [10, 100, 1000]},
                     'gbc': {'gbc__n_estimators': [100, 200]},
                     'xgb': {'xgb__max_depth': [3,6,9],
                             'xgb__reg_alpha': [0,5,15],
                             'xgb__reg_lambda': [0,5,15],
                             'xgb__gamma' : [0,10,50,100]},
                     'dummy': {}}

# store the model step
model_steps = model_steps_dict[model_type]

# combine everything in one pipeline
estimator = Pipeline(steps=(preproc_steps + model_steps))
print(estimator)

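Before tuning, we can inspect the temporal cross-validation folds defined earlier to verify the growing-window setup (a quick sketch; it relies on cv_test.split yielding (train, test) index pairs, exactly as it is used in the validation loop further below).


In [ ]:
# print the time range covered by each training window and test fold
if cross_val_method == 'temporal':
    for i_fold, (train_index, test_index) in enumerate(cv_test.split(X, y)):
        print("fold {:d}: train {} to {} | test {} to {}".format(
            i_fold,
            timestamps.iloc[train_index].min(), timestamps.iloc[train_index].max(),
            timestamps.iloc[test_index].min(), timestamps.iloc[test_index].max()))
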
Model parameter tuning

If desired, we can optimize the model hyperparameters to get the best possible model.


In [ ]:
# procedure depends on cross-validation type
if cross_val_method == 'random':
    train_index = next(cv_test.split(X, y))[0]
    X_dev = X.iloc[train_index, :]
    y_dev = y[train_index]
elif cross_val_method == 'temporal':
    X_dev = X
    y_dev = y

# gradient boosting has no class_weight parameter, so pass per-sample weights instead
if model_type == 'gbc':
    sample_weights = np.array([class_weights[label] for label in y_dev])
    fit_params = {'gbc__sample_weight': sample_weights}
else:
    fit_params = {}

# tune model with a parameter grid search if desired
if tune_model:
    
    grid_search = GridSearchCV(estimator, cv=cv_dev, n_jobs=n_jobs, refit=False,
                             param_grid=model_params_grid[model_type],
                             scoring=pr_auc_scorer, fit_params=fit_params)

    grid_search.fit(X_dev, y_dev)
    
    # show grid search results
    display(pd.DataFrame(grid_search.cv_results_))
    
    # set best parameters for estimator
    estimator.set_params(**grid_search.best_params_)
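
When the full grid is large (for example the 'xgb' grid above has over a hundred combinations), an exhaustive search becomes expensive. As an optional alternative, sketched here under the assumption that the same pipeline and scorer are reused, scikit-learn's RandomizedSearchCV samples a fixed number of parameter combinations from the same grid; n_iter=20 is an arbitrary choice.


In [ ]:
from sklearn.model_selection import RandomizedSearchCV

# optional: randomized search over the same parameter space (sketch, xgb only)
if tune_model and model_type == 'xgb':
    random_search = RandomizedSearchCV(estimator, cv=cv_dev, n_jobs=n_jobs, refit=False,
                                       param_distributions=model_params_grid[model_type],
                                       n_iter=20, scoring=pr_auc_scorer, random_state=1)
    random_search.fit(X_dev, y_dev)

    # show the sampled results
    display(pd.DataFrame(random_search.cv_results_))

    # set best parameters for the estimator
    estimator.set_params(**random_search.best_params_)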

Model validation

The final evaluation on the hold-out test set.


In [ ]:
y_pred_proba = []  # initialize empty predictions array
y_true = []  # initialize empty ground-truth array

# loop over the test folds
for i_fold, (train_index, test_index) in enumerate(cv_test.split(X, y)):

    print("validation fold {:d}".format(i_fold))

    X_train = X.iloc[train_index, :]
    y_train = y[train_index]

    X_test = X.iloc[test_index, :]
    y_test = y[test_index]

    if model_type == 'gbc':
        sample_weights = np.array([class_weights[label] for label in y_train])
        fit_params = {'gbc__sample_weight': sample_weights}
    else:
        fit_params = {}

    # fit the model on this fold's training window
    estimator.fit(X_train, y_train, **fit_params)

    # probability outputs for class 1
    y_pred_proba.append(estimator.predict_proba(X_test)[:, 1])
    
    # store the true y labels for each fold
    y_true.append(np.array(y_test))

# postprocess the results
y_true = np.concatenate(y_true)
y_pred_proba = np.concatenate(y_pred_proba) 
y_pred_bin = (y_pred_proba > 0.5) * 1.

# print some stats
n_samples_test = len(y_true)
n_pos_test = sum(y_true)
n_neg_test = n_samples_test - n_pos_test
print("events: {}".format(n_pos_test))
print("p_no_event: {:.4f}".format(n_neg_test / float(n_samples_test)))
print("test accuracy: {:.4f}".format((np.equal(y_pred_bin, y_true) * 1.).mean()))

Receiver-operating characteristic

The curve is constructed by applying various thresholds to the model output.
Y-axis: proportion of events correctly identified (hit rate).
X-axis: proportion of false positives, which usually translates into wasted resources.
The dotted line corresponds to guessing (no model); a blue line above the dotted line means there is information in the features.

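In terms of confusion-matrix counts, the two axes are:

TPR = TP / (TP + FN)  (true positive rate, hit rate)
FPR = FP / (FP + TN)  (false positive rate)
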

In [ ]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)
     
# plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver-operating characteristic')
plt.legend(loc="lower right")
plt.show()

Costs and benefits

ROC optimization with a cost matrix. The critical information is the cost of a false positive (FP) and the cost of a false negative, i.e. the benefit of a true positive (TP). These numbers are also used to train the model via class_weights.

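Concretely, the expected net benefit at a decision threshold t is evaluated on the test set as (this is exactly what the benefit function below computes):

net_benefit(t) = TPR(t) * n_pos_test * benefit_tp - FPR(t) * n_neg_test * cost_fp
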

In [ ]:
def benefit(tpr, fpr):

    n_tp = tpr * n_pos_test  # number of true positives (benefits)
    n_fp = fpr * n_neg_test  # number of false positives (extra costs)
    
    fp_costs = n_fp * cost_fp
    tp_benefits =  n_tp * benefit_tp
    
    return tp_benefits - fp_costs

benefits = np.zeros_like(thresholds)
for i, _ in enumerate(thresholds):
    benefits[i] = benefit(tpr[i], fpr[i])

i_max = np.argmax(benefits)
print ("max benefits: {:.0f}k euros, tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}"
       .format(benefits[i_max]/ 1e3, benefits[i_max]/ 1e3 / 8, tpr[i_max], fpr[i_max], thresholds[i_max]))

plt.plot(thresholds, benefits)
plt.xlim([0,1])
plt.ylim([0,np.max(benefits)])
plt.show()

In [ ]:
# recalibrate threshold based on benefits (optional, should still be around 0.5)
y_pred_bin = (y_pred_proba > thresholds[i_max]) * 1.

Precision-recall curve

Another way to look at the same predictions. Note that a model which performs well in PR space necessarily also dominates in ROC space; the opposite is not the case! The curve is constructed by applying various thresholds to the model output.
Y-axis: proportion of true events among all predicted positives (precision)
X-axis: proportion of events correctly identified (recall, hit rate)

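For reference, in terms of confusion-matrix counts:

precision = TP / (TP + FP)
recall = TP / (TP + FN)
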

In [ ]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba, pos_label=1)

average_precision = average_precision_score(y_true, y_pred_proba, average="micro")

baseline = n_pos_test / float(n_samples_test)  # event rate in the test set

# plot PR curve
plt.figure()
plt.plot(recall, precision, label="PR curve (area = {:.2f})".format(average_precision))
plt.plot([0, 1], [baseline, baseline], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve')
plt.legend(loc="lower right")
plt.show()

if model_type == 'dummy':
    print('DummyClassifier only has endpoints in the PR curve')

Classification report


In [ ]:
from sklearn.metrics import classification_report

target_names = ['no event','event']

print(classification_report(y_true, y_pred_bin, target_names=target_names, digits=3))

Confusion matrix


In [ ]:
from sklearn.metrics import confusion_matrix

confusion = pd.DataFrame(confusion_matrix(y_true, y_pred_bin), index=target_names, columns=target_names)
sns.heatmap(confusion, annot=True, fmt="d")
plt.xlabel('predicted label')
plt.ylabel('true label')

Accuracies at different classifier thresholds


In [ ]:
from sklearn.metrics import accuracy_score

thresholds = np.arange(0, 100, 1) / 100.
acc = [accuracy_score(y_true, (y_pred_proba > thresh).astype(int)) for thresh in thresholds]

# distribution of the accuracies obtained over all thresholds
plt.hist(acc, bins=20);

Thresholds versus accuracy


In [ ]:
plt.plot(thresholds, acc);

Feature importance

Note that these models are optimized to make accurate predictions, and not to make solid statistical inferences.


In [ ]:
feature_labels = [col for col in df.columns.values if col != y_col]

if model_type == 'lr':
    weights = estimator._final_estimator.coef_[0]
elif model_type in ['rf', 'gbc', 'xgb']:
    weights = estimator._final_estimator.feature_importances_
elif model_type == 'dummy':
    print('DummyClassifier does not have weights')
    weights = np.zeros(len(feature_labels))
    
feature_weights = pd.Series(weights, index=feature_labels)
feature_weights.plot.barh(title='Feature importance', fontsize=8, figsize=(12,30), grid=True);

In [ ]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence

if model_type == 'gbc':
    preproc_pipe = Pipeline(steps=preproc_steps)
    X_transformed = preproc_pipe.fit_transform(X_dev, y_dev)

    plot_partial_dependence(estimator._final_estimator, X_transformed,
                            features=range(n_features), feature_names=feature_labels,
                            figsize=(12,180), n_cols=4, percentiles=(0.2,0.8));
else:
    print "No partial dependence plots available for this model type."
