In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=2)

Built-in and custom scoring functions

Using built-in scoring functions


In [ ]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [ ]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)

In [ ]:
print(lr.score(X_test, y_test))

In [ ]:
pred = lr.predict(X_test)

In [ ]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))

Binary confusion matrix (rows: actual class, columns: predicted class):

                     predicted positive      predicted negative
actual positive      True Positive (TP)      False Negative (FN)
actual negative      False Positive (FP)     True Negative (TN)

$$ \text{precision} = \frac{TP}{FP + TP} $$

$$ \text{recall} = \frac{TP}{FN + TP} $$

$$ \text{accuracy} = \frac{TP + TN}{FP + FN + TP + TN} $$

$$ f_1 = 2 \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} $$
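
As a quick check on these formulas, here is a small sketch (assuming the positive class is labeled 1) that recomputes precision, recall, and accuracy from the confusion matrix entries and compares them with scikit-learn's built-in metric functions:

In [ ]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# for binary labels {0, 1}, confusion_matrix returns [[TN, FP], [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

print("precision: %f" % (tp / float(fp + tp)))
print("recall:    %f" % (tp / float(fn + tp)))
print("accuracy:  %f" % ((tp + tn) / float(fp + fn + tp + tn)))

# should agree with the values computed by hand above
print(precision_score(y_test, pred), recall_score(y_test, pred), accuracy_score(y_test, pred))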

In [ ]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [ ]:
from sklearn.metrics import precision_score, f1_score
print("precision: %f  f1_score: %f" % (precision_score(y_test, pred), f1_score(y_test, pred)))

In [ ]:
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

probs = lr.predict_proba(X_test)[:, 1]

print("area under the roc_curve: %f" % roc_auc_score(y_test, probs))
print("average precision: %f" % average_precision_score(y_test, probs))
print("log loss: %f" % log_loss(y_test, probs))

In [ ]:
from sklearn.metrics import get_scorer_names
print(sorted(get_scorer_names()))

In [ ]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(), X, y)

In [ ]:
print("Accuracy scoring: %s" % cross_val_score(LogisticRegression(), X, y, scoring="accuracy"))
print("F1 scoring: %s" % cross_val_score(LogisticRegression(), X, y, scoring="f1"))
print("AUC scoring: %s" % cross_val_score(LogisticRegression(), X, y, scoring="roc_auc"))
print("Log loss scoring: %s" % cross_val_score(LogisticRegression(), X, y, scoring="log_loss"))

In [ ]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': np.logspace(start=-3, stop=3, num=10)}
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring="neg_log_loss")
grid_search.fit(X, y)

In [ ]:
# cross-validation results for each parameter setting
grid_search.cv_results_

In [ ]:
grid_search.best_params_
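
As a rough sketch of how the grid search behaves (assuming the cv_results_ keys 'param_C' and 'mean_test_score'), the mean cross-validated score can be plotted against C to see where the best parameter lies:

In [ ]:
# mean cross-validated neg_log_loss for each value of C, on a log-scaled x axis
results = grid_search.cv_results_
plt.semilogx(np.asarray(results['param_C'], dtype=float), results['mean_test_score'])
plt.xlabel("C")
plt.ylabel("mean cross-validated score (neg_log_loss)")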

Defining your own scoring callable

From scratch


In [ ]:
def my_accuracy_scoring(est, X, y):
    # a custom scoring callable gets the fitted estimator and the validation
    # data, and returns a single float where higher means better
    return np.mean(est.predict(X) == y)

print(cross_val_score(LogisticRegression(), X, y))
print(cross_val_score(LogisticRegression(), X, y, scoring=my_accuracy_scoring))

From a score function


In [ ]:
from sklearn.metrics import fbeta_score
fbeta_score(y_test, pred, beta=10)

In [ ]:
from sklearn.metrics import make_scorer
my_fbeta_scorer = make_scorer(fbeta_score, beta=10)

print(cross_val_score(LogisticRegression(), X, y, scoring=my_fbeta_scorer))

Accessing the estimator


In [ ]:
def my_sparse_scoring(est, X, y):
    # reward accuracy but penalize the fraction of non-zero coefficients,
    # favoring sparser models
    return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

grid = GridSearchCV(LinearSVC(C=.01, dual=False),
                    param_grid={'penalty' : ['l1', 'l2']},
                    scoring=my_sparse_scoring)
grid.fit(X, y)
print(grid.best_params_)