In [ ]:
import sys
sys.path.append('../')
# import generic packages
import numpy as np
import pandas as pd
# pd.options.display.max_columns = None
# pd.options.display.max_colwidth = 100
from IPython.display import display
# visualization packages
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
sns.set(style="white")
%matplotlib inline
# module loading settings
%load_ext autoreload
%autoreload 2
In [ ]:
# load to data frame
df = pd.read_csv('')
# extract and remove timestamps from data frame
timestamps = pd.to_datetime(df['timestamp'])
df.drop('timestamp', axis=1, inplace=True)
# determine the high-cardinality categorical feature columns (everything non-numeric)
high_capacity = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("high capacity categorical feature columns:")
print(high_capacity)
# print some info
print("{:d} observations".format(len(df)))
df.head()
In [ ]:
model_type = 'rf' # the classification algorithm
tune_model = False # optimize hyperparameters
cross_val_method = 'temporal' # cross-validation routine
cost_fp = 1000 # cost of a false positive, preferably in euros!
benefit_tp = 3000 # benefit of a true positive
class_weights = {0: cost_fp, 1: benefit_tp} # class weights: misclassifying class 0 costs cost_fp, missing class 1 forfeits benefit_tp
To validate whether the model makes sensible predictions, we need to perform cross-validation. The exact procedure is specified below. Random cross-validation (setting aside a random sample for testing) is fast, but temporal cross-validation (setting aside a time period for testing) gives the most realistic results because it most closely resembles how the model would be used in practice.
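As a minimal sketch (assuming the timestamps Series defined above has been parsed to datetimes), the difference between the two routines can be illustrated with plain boolean masks: a random split ignores time entirely, while a temporal split trains on the past and tests on a later period.
In [ ]:
# illustrative sketch only: the actual folds are produced by the cross-validation objects below
rng = np.random.RandomState(0)
# random split: any observation can end up in the test set, regardless of when it occurred
random_test_mask = rng.rand(len(timestamps)) < 0.33
# temporal split: train strictly on the past, test on a later period
temporal_test_mask = timestamps >= pd.Timestamp(year=2016, month=1, day=1)
print("random test fraction:   {:.2f}".format(random_test_mask.mean()))
print("temporal test fraction: {:.2f}".format(temporal_test_mask.mean()))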
In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
#source: https://github.com/BigDataRepublic/bdr-analytics-py
#! pip install -e git+ssh://git@github.com/BigDataRepublic/bdr-analytics.git#egg=bdranalytics-0.1
from bdranalytics.pipeline.encoders import WeightOfEvidenceEncoder
from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow
from sklearn.metrics import average_precision_score, make_scorer, roc_auc_score
if cross_val_method == 'random':
    # split train data into stratified random folds
    cv_dev = StratifiedShuffleSplit(test_size=0.1, train_size=0.1, n_splits=5, random_state=1)
    cv_test = StratifiedShuffleSplit(test_size=0.33, n_splits=1, random_state=2)
elif cross_val_method == 'temporal':
    train_size = pd.Timedelta(days=365 * 4)
    # create a cross-validation routine for parameter tuning
    cv_dev = IntervalGrowingWindow(timestamps=timestamps,
                                   test_start_date=pd.Timestamp(year=2015, month=1, day=1),
                                   test_end_date=pd.Timestamp(year=2015, month=12, day=31),
                                   test_size=pd.Timedelta(days=30),
                                   train_size=train_size)
    # create a cross-validation routine for model evaluation
    cv_test = IntervalGrowingWindow(timestamps=timestamps,
                                    test_start_date=pd.Timestamp(year=2016, month=1, day=1),
                                    test_end_date=pd.Timestamp(year=2016, month=8, day=31),
                                    test_size=pd.Timedelta(days=2*30),
                                    train_size=train_size)
# number of parallel jobs for cross-validation
n_jobs = 1
# two functions for advanced performance evaluation metrics
def roc_auc(y_true, y_pred):
    return roc_auc_score(pd.get_dummies(y_true), y_pred)

roc_auc_scorer = make_scorer(roc_auc, needs_proba=True)

def pr_auc(y_true, y_pred):
    return average_precision_score(pd.get_dummies(y_true), y_pred, average="micro")

pr_auc_scorer = make_scorer(pr_auc, needs_proba=True)
In [ ]:
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
# convert the data frame into bare X and y variables for the model pipeline
y_col = 'target'
X = df.copy().drop(y_col, axis=1)
y = np.array(df[y_col])
n_features = X.shape[1]
# define preprocessing steps
preproc_steps = [('woe', WeightOfEvidenceEncoder(cols=high_capacity)),
                 ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),
                 ('standardizer', StandardScaler(with_mean=True, with_std=True))]
# specification of different model types and their defaults
model_steps_dict = {'lr': [('lr', LogisticRegression(C=0.001, penalty='l2', tol=0.01,
                                                     class_weight=class_weights))],
                    'rf': [('rf', RandomForestClassifier(n_estimators=400, max_features='auto',
                                                         class_weight=class_weights))],
                    'gbc': [('gbc', GradientBoostingClassifier(n_estimators=400, max_depth=3))],
                    'xgb': [('xgb', XGBClassifier(scale_pos_weight=class_weights[1],
                                                  n_estimators=100, max_depth=4))],
                    'dummy': [('dummy', DummyClassifier(strategy='prior'))]
                    }
# specification of the different model hyperparameters and tuning space
model_params_grid = {'lr': {'lr__C': [1e-4, 1e-3, 1e-2, 1e-1]},
                     'rf': {'rf__max_features': [3, n_features, int(np.sqrt(n_features))],
                            'rf__n_estimators': [10, 100, 1000]},
                     'gbc': {'gbc__n_estimators': [100, 200]},
                     'xgb': {'xgb__max_depth': [3, 6, 9],
                             'xgb__reg_alpha': [0, 5, 15],
                             'xgb__reg_lambda': [0, 5, 15],
                             'xgb__gamma': [0, 10, 50, 100]},
                     'dummy': {}}
# store the model steps for the chosen algorithm
model_steps = model_steps_dict[model_type]
# combine everything into one pipeline
estimator = Pipeline(steps=(preproc_steps + model_steps))
print(estimator)
In [ ]:
# procedure depends on cross-validation type
if cross_val_method == 'random':
    train_index = next(cv_test.split(X, y))[0]
    X_dev = X.iloc[train_index, :]
    y_dev = y[train_index]
elif cross_val_method == 'temporal':
    X_dev = X
    y_dev = y
# setting to include class weights in the gradient boosting model
if model_type == 'gbc':
    sample_weights = np.array([class_weights[label] for label in y_dev])
    fit_params = {'gbc__sample_weight': sample_weights}
else:
    fit_params = {}
# tune model with a parameter grid search if desired
if tune_model:
    grid_search = GridSearchCV(estimator, cv=cv_dev, n_jobs=n_jobs, refit=False,
                               param_grid=model_params_grid[model_type],
                               scoring=pr_auc_scorer, fit_params=fit_params)
    grid_search.fit(X_dev, y_dev)
    # show grid search results
    display(pd.DataFrame(grid_search.cv_results_))
    # set best parameters for estimator
    estimator.set_params(**grid_search.best_params_)
In [ ]:
y_pred_proba = []  # initialize empty predictions array
y_true = []  # initialize empty ground-truth array
# loop over the test folds
for i_fold, (train_index, test_index) in enumerate(cv_test.split(X, y)):
    print("validation fold {:d}".format(i_fold))
    X_train = X.iloc[train_index, :]
    y_train = y[train_index]
    X_test = X.iloc[test_index, :]
    y_test = y[test_index]
    if model_type == 'gbc':
        sample_weights = np.array([class_weights[label] for label in y_train])
        fit_params = {'gbc__sample_weight': sample_weights}
    else:
        fit_params = {}
    # fit the model
    estimator.fit(X_train, y_train, **fit_params)
    # store the probability outputs for class 1
    y_pred_proba.append(estimator.predict_proba(X_test)[:, 1])
    # store the true y labels for each fold
    y_true.append(np.array(y_test))
# postprocess the results
y_true = np.concatenate(y_true)
y_pred_proba = np.concatenate(y_pred_proba)
y_pred_bin = (y_pred_proba > 0.5) * 1.
# print some stats
n_samples_test = len(y_true)
n_pos_test = sum(y_true)
n_neg_test = n_samples_test - n_pos_test
print("events: {}".format(n_pos_test))
print("p_no_event: {}".format(float(n_neg_test) / n_samples_test))
print("test accuracy: {}".format((np.equal(y_pred_bin, y_true) * 1.).mean()))
The curve is constructed by applying various thresholds to the model output.
Y-axis: proportion of events correctly identified (hit rate).
X-axis: proportion of false positives, which usually translates into wasted resources.
The dotted line corresponds to guessing (no model); a blue line above the dotted line means there is information in the features.
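To make this construction concrete, the sketch below computes a single (false positive rate, true positive rate) point by hand for one threshold; the roc_curve call in the next cell does this for every threshold at once.
In [ ]:
# illustrative sketch only: one point of the ROC curve, computed manually
example_threshold = 0.5
example_pred = (y_pred_proba > example_threshold).astype(int)
n_true_pos = np.sum((example_pred == 1) & (y_true == 1))
n_false_pos = np.sum((example_pred == 1) & (y_true == 0))
n_false_neg = np.sum((example_pred == 0) & (y_true == 1))
n_true_neg = np.sum((example_pred == 0) & (y_true == 0))
print("TPR (hit rate) at threshold {:.2f}: {:.3f}".format(example_threshold, n_true_pos / float(n_true_pos + n_false_neg)))
print("FPR at threshold {:.2f}: {:.3f}".format(example_threshold, n_false_pos / float(n_false_pos + n_true_neg)))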
In [ ]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)
# plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver-operating characteristic')
plt.legend(loc="lower right")
plt.show()
In [ ]:
def benefit(tpr, fpr):
    n_tp = tpr * n_pos_test  # number of true positives (benefits)
    n_fp = fpr * n_neg_test  # number of false positives (extra costs)
    fp_costs = n_fp * cost_fp
    tp_benefits = n_tp * benefit_tp
    return tp_benefits - fp_costs

benefits = np.zeros_like(thresholds)
for i, _ in enumerate(thresholds):
    benefits[i] = benefit(tpr[i], fpr[i])
i_max = np.argmax(benefits)
print ("max benefits: {:.0f}k euros, tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}"
.format(benefits[i_max]/ 1e3, benefits[i_max]/ 1e3 / 8, tpr[i_max], fpr[i_max], thresholds[i_max]))
plt.plot(thresholds, benefits)
plt.xlim([0,1])
plt.ylim([0,np.max(benefits)])
plt.show()
In [ ]:
# recalibrate threshold based on benefits (optional, should still be around 0.5)
y_pred_bin = (y_pred_proba > thresholds[i_max]) * 1.
Another way to look at the model's performance. Note that a model that performs well in PR space will also do well in ROC space, but the reverse does not necessarily hold, especially when classes are imbalanced. The curve is constructed by applying various thresholds to the model output.
Y-axis: proportion of events among all positive predictions (precision).
X-axis: proportion of events correctly identified (recall, hit rate).
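Analogous to the ROC curve, each point on this curve corresponds to one threshold. The sketch below computes precision and recall by hand for a single threshold and compares the precision against the no-skill baseline (the event prevalence), which is what the dashed line in the plot below shows.
In [ ]:
# illustrative sketch only: one point of the precision-recall curve, computed manually
example_threshold = 0.5
example_pred = (y_pred_proba > example_threshold).astype(int)
n_true_pos = np.sum((example_pred == 1) & (y_true == 1))
n_false_pos = np.sum((example_pred == 1) & (y_true == 0))
n_false_neg = np.sum((example_pred == 0) & (y_true == 1))
print("precision at threshold {:.2f}: {:.3f} (no-skill baseline: {:.3f})".format(
    example_threshold, n_true_pos / float(n_true_pos + n_false_pos), float(n_pos_test) / n_samples_test))
print("recall at threshold {:.2f}: {:.3f}".format(example_threshold, n_true_pos / float(n_true_pos + n_false_neg)))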
In [ ]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba, pos_label=1)
average_precision = average_precision_score(y_true, y_pred_proba, average="micro")
baseline = float(n_pos_test) / n_samples_test
# plot PR curve
plt.figure()
plt.plot(recall, precision, label="PR curve (area = {:.2f})".format(average_precision))
plt.plot([0, 1], [baseline, baseline], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve')
plt.legend(loc="lower right")
plt.show()
if model_type == 'dummy':
    print('DummyClassifier only has endpoints in the PR curve')
In [ ]:
from sklearn.metrics import classification_report
target_names = ['no event','event']
print(classification_report(y_true, y_pred_bin, target_names=target_names, digits=3))
In [ ]:
from sklearn.metrics import confusion_matrix
confusion = pd.DataFrame(confusion_matrix(y_true, y_pred_bin), index=target_names, columns=target_names)
sns.heatmap(confusion, annot=True, fmt="d")
plt.xlabel('predicted label')
plt.ylabel('true label')
In [ ]:
from sklearn.metrics import accuracy_score
thresholds = np.arange(0, 100, 1) / 100.
acc = [accuracy_score(y_true, (y_pred_proba > thresh).astype(int)) for thresh in thresholds]
plt.hist(acc, bins=20);
In [ ]:
plt.plot(thresholds, acc);
In [ ]:
feature_labels = [col for col in df.columns.values if y_col not in col]
if model_type == 'lr':
    weights = estimator._final_estimator.coef_[0]
elif model_type in ['rf', 'gbc', 'xgb']:
    weights = estimator._final_estimator.feature_importances_
elif model_type == 'dummy':
    print('DummyClassifier does not have weights')
    weights = np.zeros(len(feature_labels))
feature_weights = pd.Series(weights, index=feature_labels)
feature_weights.plot.barh(title='Feature importance', fontsize=8, figsize=(12,30), grid=True);
In [ ]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
if model_type == 'gbc':
    preproc_pipe = Pipeline(steps=preproc_steps)
    X_transformed = preproc_pipe.fit_transform(X_dev, y_dev)
    plot_partial_dependence(estimator._final_estimator, X_transformed,
                            features=range(n_features), feature_names=feature_labels,
                            figsize=(12, 180), n_cols=4, percentiles=(0.2, 0.8));
else:
    print("No partial dependence plots available for this model type.")
In [ ]: