A notebook walking through the steps from this post:
http://blog.insightdatalabs.com/visualizing-classifier-thresholds/
In [46]:
#Imports
import bisect
import pandas as pd
import numpy as np
%pylab inline
import seaborn as sns
sns.set_style('darkgrid')
from scipy.stats import mstats
In [47]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelEncoder
In [4]:
# Retrieve Data Set
# Read the churn data straight from its URL into a DataFrame.
churn_data_url = 'http://www.dataminingconsultant.com/data/churn.txt'
df = pd.read_csv(churn_data_url)
In [5]:
# Some Preprocessing
# Normalise the column names: lower-case, spaces become underscores,
# and question marks / apostrophes are removed.
cleaned_columns = []
for raw_name in df.columns:
    tidy_name = raw_name.lower().replace(' ', '_')
    for unwanted in ('?', "'"):
        tidy_name = tidy_name.replace(unwanted, '')
    cleaned_columns.append(tidy_name)
df.columns = cleaned_columns
In [7]:
# Replace the 'state' column with the integer codes assigned by a LabelEncoder.
# The fitted encoder stays available as `state_encoder`.
state_encoder = LabelEncoder()
df['state'] = state_encoder.fit_transform(df['state'])
In [8]:
# Remove the 'phone' column from df in place, so it is not part of the
# feature set built from df's columns further down.
# NOTE(review): presumably dropped because it is an identifier rather than a
# predictive feature - confirm.
del df['phone']
In [14]:
# Recode the flag columns as integers: 'no' / 'False.' -> 0, 'yes' / 'True.' -> 1.
# Series.map turns any value missing from the mapping into NaN.
binary_columns = ['intl_plan', 'vmail_plan', 'churn']
flag_codes = {'no': 0, 'False.': 0, 'yes': 1, 'True.': 1}
for col in binary_columns:
    df[col] = df[col].map(flag_codes)
df.head()
Out[14]:
In [15]:
# Build the classifier and get the predictions
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so a fresh kernel reproduces the same model.
clf = RandomForestClassifier(n_estimators=50, oob_score=True, random_state=0)
# Despite the name this is a fraction, not a percentage: it is passed to
# train_test_split as test_size, where a float is the share of rows held out.
test_size_percent = 0.1
In [16]:
# Features are every column except the target; the target is the 'churn' column.
feature_columns = [name for name in df.columns if name != 'churn']
signals = df[feature_columns]
labels = df['churn']
In [17]:
# Hold out a test_size_percent share of the rows, fit on the rest, and score
# the held-out rows. random_state pins the shuffle so the same split (and
# therefore the same curves below) is produced on every run.
train_signals, test_signals, train_labels, test_labels = train_test_split(
    signals, labels, test_size=test_size_percent, random_state=0)
clf.fit(train_signals, train_labels)
# Second column of predict_proba: probability of the second class in
# clf.classes_, i.e. churn == 1 when the labels are coded 0/1.
predictions = clf.predict_proba(test_signals)[:, 1]
In [19]:
# Now let's see what the performance of this model is as a function of the threshold.
# precision_recall_curve returns precision and recall one element longer than
# thresholds, so a final threshold of 1 is added to line the three arrays up.
precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
thresholds = np.concatenate([thresholds, [1]])
In [21]:
# Queue rate at a threshold: the share of test predictions at or above it.
queue_rate = [(predictions >= cutoff).mean() for cutoff in thresholds]
In [27]:
# Plot precision, recall and queue rate against the decision threshold,
# using the first three colours of the current seaborn palette.
palette = sns.color_palette()
curves = (precision, recall, queue_rate)
for position, curve in enumerate(curves):
    plt.plot(thresholds, curve, color=palette[position])
leg = plt.legend(['precision', 'recall', 'queue_rate'], frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Out[27]:
In [28]:
# Fresh, unfitted forest for the repeated-trials experiment below.
# random_state pins the forest's internal randomness so refits are repeatable.
clf = RandomForestClassifier(n_estimators=50, oob_score=True, random_state=0)
In [29]:
# Settings for the repeated-split experiment.
# Share of rows held out in each split (a fraction, despite the name).
test_size_percent = 0.1
# Number of random train/test splits to run.
n_trials = 50
In [31]:
# Target is the 'churn' column; features are all the remaining columns.
labels = df['churn']
signals = df.loc[:, df.columns != 'churn']
In [32]:
# Accumulator for the per-trial curves; each trial appends one dict to it.
plot_data = list()
In [39]:
# Repeat the split / fit / score experiment n_trials times and keep the
# threshold curves of every trial.
# plot_data is rebuilt from scratch here so that re-running this cell cannot
# pile extra trials on top of the ones from a previous run.
plot_data = []
for trial in range(n_trials):
    # Seeding each split with its trial number keeps the splits different
    # from one trial to the next but identical from one run to the next.
    train_signals, test_signals, train_labels, test_labels = train_test_split(
        signals, labels, test_size=test_size_percent, random_state=trial)
    clf.fit(train_signals, train_labels)
    # Second column of predict_proba: probability of the second class in clf.classes_.
    predictions = clf.predict_proba(test_signals)[:, 1]
    precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
    # precision/recall are one element longer than thresholds; pad with 1 to align.
    thresholds = np.append(thresholds, 1)
    # Share of test predictions at or above each threshold.
    queue_rate = [(predictions >= threshold).mean() for threshold in thresholds]
    plot_data.append({
        'thresholds': thresholds,
        'precision': precision,
        'recall': recall,
        'queue_rate': queue_rate
    })
In [40]:
# Overlay the curves of every trial; alpha=0.5 lets overlapping lines show through.
palette = sns.color_palette()
metric_names = ('precision', 'recall', 'queue_rate')
for p in plot_data:
    for position, metric in enumerate(metric_names):
        plt.plot(p['thresholds'], p[metric], color=palette[position], alpha=0.5)
# Only three labels are given, so the legend describes the first three lines drawn.
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Out[40]:
In [43]:
# Let's plot the median curves, along with the central interval between the
# 10th and 90th percentiles (an 80% interval) for each threshold:
uniform_thresholds = np.linspace(0, 1, 101)
uniform_precision_plots = []
uniform_recall_plots = []
uniform_queue_rate_plots = []
for p in plot_data:
    # Each trial has its own threshold values, so every curve is resampled onto
    # the shared grid: for each grid point take the first curve point whose
    # threshold is >= that grid point (bisect assumes the thresholds are sorted).
    positions = [bisect.bisect_left(p['thresholds'], ut) for ut in uniform_thresholds]
    uniform_precision_plots.append([p['precision'][i] for i in positions])
    uniform_recall_plots.append([p['recall'][i] for i in positions])
    uniform_queue_rate_plots.append([p['queue_rate'][i] for i in positions])

# Quantiles are taken across trials (axis=0), separately at every grid point.
quantiles = [0.1, 0.5, 0.9]
lower_precision, median_precision, upper_precision = mstats.mquantiles(
    uniform_precision_plots, quantiles, axis=0)
lower_recall, median_recall, upper_recall = mstats.mquantiles(
    uniform_recall_plots, quantiles, axis=0)
lower_queue_rate, median_queue_rate, upper_queue_rate = mstats.mquantiles(
    uniform_queue_rate_plots, quantiles, axis=0)

# Median lines are drawn first, then the shaded bands; the three legend labels
# below are matched to the first three artists in that order.
for median_curve in (median_precision, median_recall, median_queue_rate):
    plt.plot(uniform_thresholds, median_curve)
palette = sns.color_palette()
bands = (
    (upper_precision, lower_precision),
    (upper_recall, lower_recall),
    (upper_queue_rate, lower_queue_rate),
)
for position, (upper, lower) in enumerate(bands):
    plt.fill_between(uniform_thresholds, upper, lower, alpha=0.5, linewidth=0, color=palette[position])
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Out[43]:
In [44]:
# Inputs for the payout calculation.
n = 10000              # number of cases the payout is scaled to
success_payoff = 100   # payoff for a successfully handled case
case_cost = 20         # cost of working one case
# Shared threshold grid (0.00, 0.01, ..., 1.00) and the per-trial payout accumulator.
uniform_thresholds = np.linspace(0, 1, num=101)
uniform_payout_plots = []
In [49]:
# Payout per trial on the shared threshold grid.
# The accumulator is rebuilt here so re-running this cell does not append a
# second copy of every trial and skew the quantiles.
uniform_payout_plots = []
for p in plot_data:
    uniform_payout = []
    for ut in uniform_thresholds:
        # First curve point whose threshold is >= ut (bisect assumes sorted thresholds).
        index = bisect.bisect_left(p['thresholds'], ut)
        # Trial-local names: the original reused `precision` / `queue_rate`,
        # silently replacing the arrays of those names with scalars.
        trial_precision = p['precision'][index]
        trial_queue_rate = p['queue_rate'][index]
        # n * queue_rate cases are worked; each costs case_cost and pays
        # success_payoff with probability equal to the precision.
        # (Previously the payoff was hard-coded as 100, ignoring success_payoff.)
        payout = n * trial_queue_rate * (trial_precision * success_payoff - case_cost)
        uniform_payout.append(payout)
    uniform_payout_plots.append(uniform_payout)

# Quantiles across trials (axis=0) at every grid point: 10th, 50th, 90th percentile.
quantiles = [0.1, 0.5, 0.9]
lower_payout, median_payout, upper_payout = mstats.mquantiles(uniform_payout_plots, quantiles, axis=0)

payout_color = sns.color_palette()[4]
median_line, = plt.plot(uniform_thresholds, median_payout, color=payout_color)
plt.fill_between(uniform_thresholds, upper_payout, lower_payout, alpha=0.5, linewidth=0, color=payout_color)
# Threshold at which the median payout peaks.
max_ap = uniform_thresholds[np.argmax(median_payout)]
argmax_marker = plt.vlines([max_ap], -100000, 150000, linestyles='--')
plt.ylim(-100000, 150000)
# Pass the handles explicitly: with labels alone the second label would be
# attached to the shaded band instead of the dashed argmax line.
leg = plt.legend(
    (median_line, argmax_marker),
    ('payout ($)', 'median argmax = {:.2f}'.format(max_ap)),
    frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('$')
plt.title("Payout as a Function of Threshold")
Out[49]:
In [ ]: