Lessons from Binary Classification

Performance Evaluation, Surrogate Losses, and Class Weighting

James Sharpnack
Assistant Professor
UC Davis, Statistics Department
5/5/2017

Talk Outline

Introduction with link prediction in social networks
Evaluating a score with ROC and PR curves
Multiple predictors and hyperplane methods
Surrogate Losses
Class weights



In [689]:

    
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')









    Out[689]:



In [633]:

    
import pandas as pd
import numpy as np
import networkx as nx
import os
from scipy import sparse as sp
from matplotlib import pyplot as plt
from sklearn import metrics as skmetrics
from sklearn.linear_model import LogisticRegression
import itertools
from sklearn.svm import SVC
%matplotlib inline
plt.style.use('ggplot')



In [634]:

    
n = 347
p = .1
testind = np.random.rand(n**2)
testind = testind < p 
testind = testind.reshape((n,n))
test_isp = sp.lil_matrix((n,n))
W_test = sp.lil_matrix((n,n))
W_train = sp.lil_matrix((n,n))



In [688]:

    
# The following imports the facebook graph and splits the edges into a training and a test set

facedir = './'
efnames = [f for f in os.listdir(facedir) if 'edges' in f]
g_train = nx.Graph()
g_test = nx.Graph()
for efname in efnames:
    ELdf = pd.read_csv(facedir+efname,sep=' ',header=None)
    eliter = ELdf.iterrows()
    for e in eliter:
        i,j = tuple(e[1])
        if i < j:
            test_isp[i-1,j-1] = True
            if not testind[i-1,j-1]:
                g_train.add_edge(i-1,j-1)
                W_train[i-1,j-1] = 1
            else:
                g_test.add_edge(i-1,j-1)
                W_test[i-1,j-1] = 1



In [636]:

    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Code from http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')



In [637]:

    
m_test = g_test.number_of_edges()
m_train = g_train.number_of_edges()



In [638]:

    
print m_train, m_test



In [639]:

    
nx.draw(g_train,node_size=2,alpha=.3)

Training Facebook graph with 347 vertices and 2267 edges. The original dataset can be found at the SNAP project website: https://snap.stanford.edu/data/egonets-Facebook.html



In [640]:

    
nx.draw(g_test,node_size=2,alpha=.3)

Testing Facebook graph with 347 vertices and 252 edges. The original dataset can be found at the SNAP project website: https://snap.stanford.edu/data/egonets-Facebook.html

Evaluating a Score with ROC and PR Curves



In [641]:

    
i,j = 0,3
print set(g_train.neighbors(i))
print set(g_train.neighbors(j))
print set(g_train.neighbors(i)) & set(g.neighbors(j))









    



set([279, 321, 132, 72, 298, 235, 47, 52, 53, 118, 87, 345, 125])
set([194, 327, 77, 305, 274, 180, 151, 217])
set([72])



In [642]:

    
P1 = W_train.dot(W_train)



In [643]:

    
Itest, Jtest, _ = sp.find(testind)



In [644]:

    
m = len(Itest)
z = np.array(P1[Itest,Jtest]).reshape(m)
y = W_test[Itest,Jtest].toarray().reshape(m)



In [645]:

    
z.shape, y.shape









    Out[645]:





((11973L,), (11973L,))



In [646]:

    
tau = 1
with plt.style.context(('seaborn-white')):
    plot_confusion_matrix(skmetrics.confusion_matrix(y,z>=tau),[0,1],
                      title="Confusion matrix for tau={}".format(tau))









    



Confusion matrix, without normalization
[[11187   527]
 [   64   195]]



In [647]:

    
tau = 5
with plt.style.context(('seaborn-white')):
    plot_confusion_matrix(skmetrics.confusion_matrix(y,z>=tau),[0,1],
                      title="Confusion matrix for tau={}".format(tau))









    



Confusion matrix, without normalization
[[11666    48]
 [  201    58]]



In [648]:

    
tau = 10
with plt.style.context(('seaborn-white')):
    plot_confusion_matrix(skmetrics.confusion_matrix(y,z>=tau),[0,1],
                      title="Confusion matrix for tau={}".format(tau))









    



Confusion matrix, without normalization
[[11707     7]
 [  244    15]]



In [695]:

    
from IPython.display import IFrame
IFrame("./eval.pdf", width=800, height=600)









    Out[695]:



In [649]:

    
prec, rec, taus = skmetrics.precision_recall_curve(y,z)
prec_rand, rec_rand, _ = skmetrics.precision_recall_curve(y,np.random.rand(m))



In [650]:

    
for p,t in zip(prec, taus):
    print "tau: {}, prec: {}".format(t,p)









    



tau: 0.0, prec: 0.0216320053454
tau: 1.0, prec: 0.270083102493
tau: 2.0, prec: 0.400516795866
tau: 3.0, prec: 0.448559670782
tau: 4.0, prec: 0.5
tau: 5.0, prec: 0.547169811321
tau: 6.0, prec: 0.616438356164
tau: 7.0, prec: 0.642857142857
tau: 8.0, prec: 0.641025641026
tau: 9.0, prec: 0.7
tau: 10.0, prec: 0.681818181818
tau: 11.0, prec: 0.722222222222
tau: 12.0, prec: 0.733333333333
tau: 13.0, prec: 0.666666666667
tau: 15.0, prec: 0.777777777778
tau: 16.0, prec: 0.75
tau: 18.0, prec: 0.714285714286
tau: 19.0, prec: 0.666666666667
tau: 23.0, prec: 0.75
tau: 26.0, prec: 1.0
tau: 27.0, prec: 1.0



In [651]:

    
_ = plt.plot(rec,prec,label='Common neigh.')
_ = plt.plot(rec_rand,prec_rand,label='Random')
_ = plt.title('PR Curve')
_ = plt.xlabel('recall')
_ = plt.ylabel('precision')
_ = plt.legend()



In [652]:

    
fpr, tpr, _ = skmetrics.roc_curve(y,z)
fpr_rand, tpr_rand, _ = skmetrics.roc_curve(y,np.random.rand(m))

_ = plt.plot(fpr,tpr,label='Common neigh.')
_ = plt.plot(fpr_rand,tpr_rand,label='Random')
_ = plt.title('ROC curve')
_ = plt.xlabel('FPR')
_ = plt.ylabel('TPR')
_ = plt.legend()



In [653]:

    
W = W_train.copy()



In [654]:

    
ds = np.array(W.sum(axis=1)).reshape(n)
dinv = ds.copy()
dinv[ds>0] = ds[ds>0]**(-1)



In [655]:

    
X = []
#P = np.diag(dinv).dot(W_train.toarray())
P = W_train.toarray()
Ppow = P.dot(P)
for k in xrange(6):
    Ppow = Ppow.dot(P)
    x = np.array(Ppow[Itest,Jtest]).reshape(m)
    X.append(np.log(x+1.))
X = np.array(X).T



In [656]:

    
y = W_test[Itest,Jtest].toarray().reshape(m)



In [657]:

    
_ = plt.scatter(X[y==1,0],X[y==1,3],c='r',alpha=.4,label='pos')
_ = plt.scatter(X[y==0,0],X[y==0,3],c='b',alpha=.4,label='neg')
_ = plt.xlabel('2 hop paths')
_ = plt.ylabel('5 hop paths')
_ = plt.legend(loc=2)



In [658]:

    
lr = LogisticRegression()
lr.fit(X[:,[0,3]],y)









    Out[658]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [659]:

    
beta1 = lr.coef_[0,0]
beta3 = lr.coef_[0,1]
beta0 = lr.intercept_

T = np.linspace(1,5.5,100)
x3hat = -(beta0 + beta1*T) / beta3



In [660]:

    
_ = plt.scatter(X[y==1,0],X[y==1,3],c='r',alpha=.4,label='pos')
_ = plt.scatter(X[y==0,0],X[y==0,3],c='b',alpha=.4,label='neg')
_ = plt.plot(T,x3hat,c='k')
_ = plt.xlabel('2 hop paths')
_ = plt.ylabel('5 hop paths')
_ = plt.legend(loc=2)



In [661]:

    
lr = LogisticRegression()
lr.fit(X,y)
z_log = lr.predict_proba(X)[:,1]
with plt.style.context(('seaborn-white')):
    plot_confusion_matrix(skmetrics.confusion_matrix(y,z_log>0.5),[0,1])









    



Confusion matrix, without normalization
[[11683    31]
 [  191    68]]



In [662]:

    
prec_log, rec_log, _ = skmetrics.precision_recall_curve(y,z_log)
plt.plot(rec_log,prec_log)
plt.plot(rec,prec)
plt.plot()









    Out[662]:





[]

Finding a hyperplane



In [663]:

    
N = 100
X1 = (np.random.randn(N*2)).reshape((N,2)) + np.array([2,3])
X0 = (np.random.randn(N*2)).reshape((N,2)) + np.array([.5,1.5])
y = np.zeros(N*2)
y[:N]=1
X = np.vstack((X1,X0))



In [664]:

    
_ = plt.scatter(X0[:,0],X0[:,1],c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],c='r',label='pos')
_ = plt.legend(loc=2)



In [665]:

    
lr = LogisticRegression()
lr.fit(X,y)

beta1 = lr.coef_[0,0]
beta2 = lr.coef_[0,1]
beta0 = lr.intercept_

mults=0.8
T = np.linspace(-1,4,100)
x2hat = -(beta0 + beta1*T) / beta2
line1 = -(beta0 + np.random.randn(1)*2 + 
          (beta1 + np.random.randn(1)*mults) *T) / (beta2 + np.random.randn(1)*mults)
line2 = -(beta0 + np.random.randn(1)*2 + 
          (beta1 + np.random.randn(1)*mults) *T) / (beta2 + np.random.randn(1)*mults)
line3 = -(beta0 + np.random.randn(1)*2 + 
          (beta1 + np.random.randn(1)*mults) *T) / (beta2 + np.random.randn(1)*mults)



In [666]:

    
_ = plt.scatter(X0[:,0],X0[:,1],c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],c='r',label='pos')
_ = plt.plot(T,line3,c='k')
_ = plt.plot(T,line1,c='k')
_ = plt.plot(T,line2,c='k')
_ = plt.ylim([-1,7])
_ = plt.legend(loc=2)



In [667]:

    
y_hat = lr.predict(X)
_ = plt.scatter(X0[:,0],X0[:,1],c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.legend(loc=2)



In [668]:

    
y_hat = lr.predict(X)
_ = plt.scatter(X0[y_hat[N:] == 1,0],X0[y_hat[N:] == 1,1],c='b',label='neg')
_ = plt.scatter(X1[y_hat[:N] == 0,0],X1[y_hat[:N] == 0,1],c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.legend(loc=2)

Surrogate Losses



In [696]:

    
from IPython.display import IFrame
IFrame("./loss.pdf", width=800, height=600)









    Out[696]:



In [669]:

    
z_range = np.linspace(-5,5,200)



In [670]:

    
zoloss = z_range < 0
l2loss = (1-z_range)**2.
hingeloss = (1 - z_range) * (z_range < 1)
logisticloss = np.log(1 + np.exp(-z_range))
_ = plt.plot(z_range, logisticloss + 1 - np.log(2.),label='logistic')
_ = plt.plot(z_range, zoloss,label='0-1')
_ = plt.plot(z_range, hingeloss,label='hinge')
_ = plt.plot(z_range, l2loss,label='sq error')
_ = plt.ylim([-.2,5])
_ = plt.title('')
_ = plt.legend()



In [671]:

    
y_hat = lr.predict(X)
z_log = (2.*y - 1.)*lr.decision_function(X)
logisticloss = np.log(1 + np.exp(-z_log))
_ = plt.scatter(X0[:,0],X0[:,1],s=logisticloss[N:]*30.,c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],s=logisticloss[:N]*30.,c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.xlim([-1,3])
_ = plt.ylim([0,4])
_ = plt.legend(loc=2)



In [672]:

    
y_hat = lr.predict(X)
z_log = (2.*y - 1.)*lr.decision_function(X)
hingeloss = (1-z_log)*(z_log < 1)
_ = plt.scatter(X0[:,0],X0[:,1],s=hingeloss[N:]*30.,c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],s=hingeloss[:N]*30.,c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.xlim([-1,3])
_ = plt.ylim([0,4])
_ = plt.legend(loc=2)



In [673]:

    
y_hat = lr.predict(X)
z_log = (2.*y - 1.)*lr.decision_function(X)
l2loss = (1-z_log)**2.
_ = plt.scatter(X0[:,0],X0[:,1],s=l2loss[N:]*10.,c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],s=l2loss[:N]*10.,c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.xlim([-1,3])
_ = plt.ylim([0,4])
_ = plt.legend(loc=2)

Class Weighting



In [674]:

    
alpha = 4.
zolossn = z_range < 0
zolossp = z_range > 0
logisticlossn = np.log(1 + np.exp(-z_range))
logisticlossp = np.log(1 + np.exp(z_range))
_ = plt.plot(z_range, logisticlossn + 1 - np.log(2.),label='logistic y=0')
_ = plt.plot(z_range, alpha*(logisticlossp + 1 - np.log(2.)),label='logistic y=1')
_ = plt.plot(z_range, zolossn,label='0-1 y=0')
_ = plt.plot(z_range, alpha*zolossp,label='0-1 y=1')
_ = plt.ylim([-.2,5])
_ = plt.title('')
_ = plt.legend()



In [675]:

    
y_hat = lr.predict(X)
z_log = (2.*y - 1.)*lr.decision_function(X)
logisticloss = np.log(1 + np.exp(-z_log))
_ = plt.scatter(X0[:,0],X0[:,1],s=alpha*logisticloss[N:]*20.,c='b',label='neg')
_ = plt.scatter(X1[:,0],X1[:,1],s=logisticloss[:N]*20.,c='r',label='pos')
_ = plt.plot(T,x2hat,c='k')
_ = plt.xlim([-1,3])
_ = plt.ylim([0,4])
_ = plt.legend(loc=2)



In [676]:

    
lr = LogisticRegression(class_weight='balanced')
lr.fit(X,y)









    Out[676]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)



In [685]:

    
z_log = lr.predict_proba(X)[:,1]
prec_log, rec_log, _ = skmetrics.precision_recall_curve(y,z_log)
_ = plt.plot(rec_log,prec_log,label='Logistic')
_ = plt.plot(rec,prec,label='Old Score')
_ = plt.xlabel('recall')
_ = plt.ylabel('precision')
_ = plt.legend(loc=3)
_ = plt.plot()