In [ ]:

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np



# Be mindful of default metrics

## Regression

$$R^2(y, \hat{y}) = 1 - \frac{\displaystyle \sum_{i=0}^{n - 1} (y_i - \hat{y}_i)^2}{\displaystyle \sum_{i=0}^{n - 1} (y_i - \bar{y})^2}$$

where $$\bar{y} = \frac{1}{n} \sum_{i=0}^{n - 1} y_i$$



In [ ]:

# Synthetic 1-D regression problem: y = a * x + b plus unit Gaussian noise.
rng = np.random.RandomState(42)
X = rng.uniform(size=(30, 1))
a = rng.normal(scale=10)  # slope
b = rng.normal()          # intercept

y_clean = (X * a).ravel() + b          # scalar multiply, same as np.dot(X, a).ravel()
y = y_clean + rng.normal(size=len(y_clean))
plt.plot(X[:, 0], y, 'x')




In [ ]:

# Fit ordinary least squares on the full data set and overlay the fitted
# line on the scatter plot of the noisy targets.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X, y)
y_pred = lr.predict(X)  # predictions on the training data itself (used by later cells)
plt.plot(X[:, 0], y, 'x')
plt.plot(X[:, 0], y_pred)




In [ ]:

# R^2 is unitless (1.0 is perfect); MSE is in squared target units.
print(f"training set R^2: {r2_score(y, y_pred):f}")  # same as lr.score(X, y)
print(f"training set MSE: {mean_squared_error(y, y_pred):f}")




In [ ]:

# Rescaling the targets by 10x leaves R^2 unchanged but multiplies the MSE
# by 100 -- R^2 is scale-invariant, MSE is not.
y_scaled = 10 * y
y_pred_scaled = 10 * y_pred
print(f"training set R^2: {r2_score(y_scaled, y_pred_scaled):f}")
print(f"training set MSE: {mean_squared_error(y_scaled, y_pred_scaled):f}")




In [ ]:

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; these
# utilities now live in sklearn.model_selection, and LeaveOneOut no longer
# takes the number of samples in its constructor.
from sklearn.model_selection import cross_val_score, LeaveOneOut

# With leave-one-out CV each test fold holds a single sample, so the default
# R^2 scorer is ill-defined per fold -- another "mind the default metric" trap.
cv = LeaveOneOut()
cross_val_score(LinearRegression(), X, y, cv=cv)



## Classification

$$\texttt{accuracy}(y, \hat{y}) = \frac{1}{n} \sum_{i=0}^{n-1} 1(\hat{y}_i = y_i)$$

### Multi-class accuracies



In [ ]:

from sklearn.datasets import make_blobs
# train_test_split moved out of the removed sklearn.cross_validation module.
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

n_classes = 5

X, y = make_blobs(centers=n_classes, n_samples=10000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)
# strategy="stratified" (the old implicit default) predicts labels at random
# with the training class frequencies, i.e. the chance-level baseline of
# roughly 1/n_classes for balanced classes. Newer sklearn defaults to
# "prior", which would not demonstrate chance accuracy.
dummy = DummyClassifier(strategy="stratified", random_state=42).fit(X_train, y_train)
print("Chance accuracy for %d classes: %.1f" % (n_classes, dummy.score(X_test, y_test)))



### Imbalanced classes



In [ ]:

from sklearn.svm import SVC
# BUG FIX: `digits` was never defined anywhere in this notebook -- load it.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
# Binary task with roughly 9:1 class imbalance: "is this digit a 3?"
X, y_is_three = digits.data, digits.target == 3

X_train, X_test, y_train, y_test = train_test_split(X, y_is_three, test_size=.5, random_state=0)

# Explicit strategy="stratified": predict at random with the training class
# frequencies, giving the chance-level baseline for imbalanced data.
dummy = DummyClassifier(strategy="stratified").fit(X_train, y_train)
print("Chance accuracy for 9:1 imbalanced classification: %.2f" % dummy.score(X_test, y_test))




In [ ]:

# "most_frequent" always predicts the majority class, so accuracy equals the
# majority-class share (~0.9 here) -- high accuracy from a useless model.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
accuracy = dummy.score(X_test, y_test)
print("Accuracy for 9:1 imbalanced classification predicting majority: %.2f"
% accuracy)




In [ ]: