A notebook walking through the steps from this post:

http://blog.insightdatalabs.com/visualizing-classifier-thresholds/



In [46]:

    
#Imports
import bisect
import pandas as pd
import numpy as np

%pylab inline
import seaborn as sns
sns.set_style('darkgrid')

from scipy.stats import mstats









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy



In [47]:

    
# train_test_split lives in sklearn.model_selection (scikit-learn >= 0.18);
# the old sklearn.cross_validation module was removed in 0.20. Fall back to the
# old location only when running against a legacy installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelEncoder



In [4]:

    
# Retrieve Data Set
# Read the churn data straight from its public URL into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='http://www.dataminingconsultant.com/data/churn.txt',
)



In [5]:

    
# Some Preprocessing
# Normalise the headers: lower-case them, turn spaces into underscores, and
# strip question marks and apostrophes.
normalized_names = []
for raw_name in df.columns:
    name = raw_name.lower()
    for old, new in ((' ', '_'), ('?', ''), ("'", "")):
        name = name.replace(old, new)
    normalized_names.append(name)
df.columns = normalized_names



In [7]:

    
# Replace the `state` column with integer labels; the fitted encoder is kept
# in `state_encoder`.
state_encoder = LabelEncoder()
df['state'] = state_encoder.fit_transform(df['state'])



In [8]:

    
# Remove the `phone` column. The membership check makes the cell safe to
# re-run: without it a second execution raises KeyError.
if 'phone' in df.columns:
    del df['phone']



In [14]:

    
# Recode the yes/no and True./False. flag columns as 0/1.
binary_columns = ['intl_plan', 'vmail_plan', 'churn']
binary_codes = {
    'no': 0,
    'False.': 0,
    'yes': 1,
    'True.': 1,
}
for col in binary_columns:
    # Series.map turns every value missing from the dict into NaN, so only map
    # a column that still contains raw string codes. This keeps a re-run of the
    # cell a no-op instead of wiping the already-encoded 0/1 values.
    if df[col].isin(list(binary_codes)).any():
        df[col] = df[col].map(binary_codes)

df.head()









    Out[14]:






  
    
      
      state
      account_length
      area_code
      intl_plan
      vmail_plan
      vmail_message
      day_mins
      day_calls
      day_charge
      eve_mins
      eve_calls
      eve_charge
      night_mins
      night_calls
      night_charge
      intl_mins
      intl_calls
      intl_charge
      custserv_calls
      churn
    
  
  
    
      0
      16
      128
      415
      0
      1
      25
      265.1
      110
      45.07
      197.4
      99
      16.78
      244.7
      91
      11.01
      10.0
      3
      2.70
      1
      0
    
    
      1
      35
      107
      415
      0
      1
      26
      161.6
      123
      27.47
      195.5
      103
      16.62
      254.4
      103
      11.45
      13.7
      3
      3.70
      1
      0
    
    
      2
      31
      137
      415
      0
      0
      0
      243.4
      114
      41.38
      121.2
      110
      10.30
      162.6
      104
      7.32
      12.2
      5
      3.29
      0
      0
    
    
      3
      35
      84
      408
      1
      0
      0
      299.4
      71
      50.90
      61.9
      88
      5.26
      196.9
      89
      8.86
      6.6
      7
      1.78
      2
      0
    
    
      4
      36
      75
      415
      1
      0
      0
      166.7
      113
      28.34
      148.3
      122
      12.61
      186.9
      121
      8.41
      10.1
      3
      2.73
      3
      0



In [15]:

    
# Build the classifier and get the predictions
# Despite its name, test_size_percent is a fraction: it is passed as
# `test_size` to train_test_split.
test_size_percent = 0.1
clf = RandomForestClassifier(
    oob_score=True,
    n_estimators=50,
)



In [16]:

    
# Target is the `churn` column; the features are every other column.
labels = df['churn']
signals = df.drop('churn', axis=1)



In [17]:

    
# Hold out a random test split, fit the forest on the rest, and keep column 1
# of predict_proba (presumably the churn=1 class) for each held-out row.
train_signals, test_signals, train_labels, test_labels = train_test_split(
    signals,
    labels,
    test_size=test_size_percent,
)
clf.fit(train_signals, train_labels)
predictions = clf.predict_proba(test_signals)[:, 1]



In [19]:

    
# Now let's see what the performance of this model is as a function of the threshold.
precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
# Pad thresholds with a trailing 1 so it has the same length as precision and
# recall when they are plotted against it.
thresholds = np.concatenate((thresholds, [1]))



In [21]:

    
# Queue rate: fraction of held-out predictions at or above each threshold.
queue_rate = [(predictions >= threshold).mean() for threshold in thresholds]



In [27]:

    
# Plot precision, recall and queue rate against the threshold, using the first
# three colours of the current seaborn palette in that order.
palette = sns.color_palette()
for series, colour in zip((precision, recall, queue_rate), palette):
    plt.plot(thresholds, series, color=colour)

legend_labels = ['precision', 'recall', 'queue_rate']
leg = plt.legend(legend_labels, frameon=True)
leg.get_frame().set_edgecolor('k')  # black border around the legend box
plt.xlabel('threshold')
plt.ylabel('%')









    Out[27]:





<matplotlib.text.Text at 0x10b7c0610>



In [28]:

    
# Fresh, unfitted forest for the repeated trials (50 trees, out-of-bag scoring on).
clf = RandomForestClassifier(
    oob_score=True,
    n_estimators=50,
)



In [29]:

    
# Number of repeated split/fit experiments, and the fraction of rows held out
# for testing in each one (passed as `test_size` to train_test_split).
n_trials, test_size_percent = 50, 0.1



In [31]:

    
# Target is the `churn` column; the features are every other column.
labels = df['churn']
signals = df.drop('churn', axis=1)



In [32]:

    
# Accumulator: one dict of curves (thresholds, precision, recall, queue_rate)
# is appended per trial.
plot_data = list()



In [39]:

    
# Repeat the split / fit / score experiment n_trials times and record one set of
# curves per trial.
# The list is (re)created here so that re-running this cell replaces the
# results instead of piling another n_trials entries onto a previous run's.
plot_data = []
for trial in range(n_trials):
    # New random train/test split for every trial; the same `clf` is refitted.
    train_signals, test_signals, train_labels, test_labels = train_test_split(
        signals, labels, test_size=test_size_percent)
    clf.fit(train_signals, train_labels)
    predictions = clf.predict_proba(test_signals)[:, 1]

    precision, recall, thresholds = precision_recall_curve(test_labels, predictions)
    # Trailing 1 keeps thresholds the same length as precision and recall.
    thresholds = np.append(thresholds, 1)

    # Fraction of held-out predictions at or above each threshold.
    queue_rate = [(predictions >= threshold).mean() for threshold in thresholds]

    plot_data.append({
        'thresholds': thresholds,
        'precision': precision,
        'recall': recall,
        'queue_rate': queue_rate,
    })



In [40]:

    
# Overlay every trial's curves, semi-transparent, one palette colour per metric.
palette = sns.color_palette()
metric_names = ('precision', 'recall', 'queue_rate')
for p in plot_data:
    for metric, colour in zip(metric_names, palette):
        plt.plot(p['thresholds'], p[metric], color=colour, alpha=0.5)

leg = plt.legend(metric_names, frameon=True)
leg.get_frame().set_edgecolor('k')  # black border around the legend box
plt.xlabel('threshold')
plt.ylabel('%')









    Out[40]:





<matplotlib.text.Text at 0x10b958e50>



In [43]:

    
# Let's plot the median curves, along with the central interval between the
# 10th and 90th percentiles (an 80% interval) at each threshold.
# Every trial has its own threshold values, so each curve is first resampled
# onto a shared grid of 101 evenly spaced thresholds in [0, 1].
uniform_thresholds = np.linspace(0, 1, 101)

uniform_precision_plots = []
uniform_recall_plots = []
uniform_queue_rate_plots = []

for p in plot_data:
    # For each grid point: position of the first trial threshold >= that point.
    indices = [bisect.bisect_left(p['thresholds'], ut) for ut in uniform_thresholds]
    uniform_precision_plots.append([p['precision'][i] for i in indices])
    uniform_recall_plots.append([p['recall'][i] for i in indices])
    uniform_queue_rate_plots.append([p['queue_rate'][i] for i in indices])

# Quantiles are taken across trials (axis=0), independently per grid threshold.
quantiles = [0.1, 0.5, 0.9]
lower_precision, median_precision, upper_precision = mstats.mquantiles(
    uniform_precision_plots, quantiles, axis=0)
lower_recall, median_recall, upper_recall = mstats.mquantiles(
    uniform_recall_plots, quantiles, axis=0)
lower_queue_rate, median_queue_rate, upper_queue_rate = mstats.mquantiles(
    uniform_queue_rate_plots, quantiles, axis=0)

# Median lines first (default colour cycle), then the shaded bands.
for median_curve in (median_precision, median_recall, median_queue_rate):
    plt.plot(uniform_thresholds, median_curve)

palette = sns.color_palette()
bands = (
    (upper_precision, lower_precision),
    (upper_recall, lower_recall),
    (upper_queue_rate, lower_queue_rate),
)
for (upper, lower), colour in zip(bands, palette):
    plt.fill_between(uniform_thresholds, upper, lower, alpha=0.5, linewidth=0, color=colour)

leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')  # black border around the legend box
plt.xlabel('threshold')
plt.ylabel('%')









    Out[43]:





<matplotlib.text.Text at 0x10bf34910>



In [44]:

    
# Inputs for the payout curve.
# n scales the queue rate in the payout formula (presumably the number of
# cases considered); case_cost is subtracted per queued case; success_payoff
# is presumably the reward for a correctly flagged case.
case_cost = 20
success_payoff = 100
n = 10000

# One list of payouts per trial gets appended here.
uniform_payout_plots = list()

# Shared grid of 101 evenly spaced thresholds covering [0, 1].
uniform_thresholds = np.linspace(start=0, stop=1, num=101)



In [49]:

    
# Payout at each threshold of the uniform grid, for every trial.
# The list is rebuilt here so re-running this cell does not append duplicate
# rows to results left over from an earlier execution.
uniform_payout_plots = []
for p in plot_data:
    uniform_payout = []
    for ut in uniform_thresholds:
        # Position of the first trial threshold >= the grid point.
        index = bisect.bisect_left(p['thresholds'], ut)
        trial_precision = p['precision'][index]
        trial_queue_rate = p['queue_rate'][index]

        # Uses the success_payoff constant rather than a hard-coded 100, so
        # changing the configured payoff actually changes the curve.
        payout = n * trial_queue_rate * (trial_precision * success_payoff - case_cost)
        uniform_payout.append(payout)

    uniform_payout_plots.append(uniform_payout)

# 10th, 50th and 90th percentiles across trials at each grid threshold.
quantiles = [0.1, 0.5, 0.9]
lower_payout, median_payout, upper_payout = mstats.mquantiles(uniform_payout_plots, quantiles, axis=0)

payout_colour = sns.color_palette()[4]
plt.plot(uniform_thresholds, median_payout, color=payout_colour, label='payout ($)')
plt.fill_between(uniform_thresholds, upper_payout, lower_payout, alpha=0.5, linewidth=0, color=payout_colour)

# Grid threshold with the highest median payout, marked by a dashed line.
max_ap = uniform_thresholds[np.argmax(median_payout)]
plt.vlines([max_ap], -100000, 150000, linestyles='--',
           label='median argmax = {:.2f}'.format(max_ap))
plt.ylim(-100000, 150000)

# Labels are attached to the artists themselves: a positional label list would
# pair the second label with the shaded band instead of the dashed line.
leg = plt.legend(frameon=True)
leg.get_frame().set_edgecolor('k')  # black border around the legend box
plt.xlabel('threshold')
plt.ylabel('$')
plt.title("Payout as a Function of Threshold")









    Out[49]:





<matplotlib.text.Text at 0x109ef6210>



In [ ]:

	state	account_length	area_code	intl_plan	vmail_plan	vmail_message	day_mins	day_calls	day_charge	eve_mins	eve_calls	eve_charge	night_mins	night_calls	night_charge	intl_mins	intl_calls	intl_charge	custserv_calls
0	16	128	415	0	1	25	265.1	110	45.07	197.4	99	16.78	244.7	91	11.01	10.0	3	2.70	1
1	35	107	415	0	1	26	161.6	123	27.47	195.5	103	16.62	254.4	103	11.45	13.7	3	3.70	1
2	31	137	415	0	0	0	243.4	114	41.38	121.2	110	10.30	162.6	104	7.32	12.2	5	3.29	0
3	35	84	408	1	0	0	299.4	71	50.90	61.9	88	5.26	196.9	89	8.86	6.6	7	1.78	2
4	36	75	415	1	0	0	166.7	113	28.34	148.3	122	12.61	186.9	121	8.41	10.1	3	2.73	3