In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.datasets import make_blobs, make_circles
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.decomposition import KernelPCA
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Choosing the right metric


In [ ]:
# draw a toy two-class data set with two features and show it,
# colouring every sample by its class label
X, y = make_blobs(n_features=2, centers=2)
plt.scatter(*X.T, c=y)

In [ ]:
# train a logistic regression classifier (default settings) on the blobs;
# the fit call is left as the last expression so the cell shows the estimator
lr = LogisticRegression()
lr.fit(X=X, y=y)

In [ ]:
# plotting decision boundary of `lr`
# 1) bounding box of the data, padded by one unit on every side
h = 0.02  # spacing of the evaluation grid
xmin, ymin = X.min(axis=0) - 1
xmax, ymax = X.max(axis=0) + 1

# 2) classify every node of a dense grid that covers the bounding box
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = lr.predict(grid_points).reshape(xx.shape)

# 3) predicted regions as filled contours, with the samples drawn on top
plt.figure()
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap=plt.cm.Paired)

Accuracy as a metric


In [ ]:
# simulating some overlapping, imbalanced data:
# 900 samples of class 0 around (0, 0) and 100 samples of class 1 around (0.75, 0.75)
# A covariance matrix has to be symmetric positive semi-definite, so the
# variances go on the diagonal (a matrix with only off-diagonal entries has a
# negative eigenvalue and is not a valid covariance).
_x1 = np.random.multivariate_normal(mean=[0, 0], cov=np.array([[0.5, 0], [0, 0.5]]), size=(900,))
_x2 = np.random.multivariate_normal(mean=[0.75, 0.75], cov=np.array([[0.125, 0], [0, 0.125]]), size=(100,))
X = np.r_[_x1, _x2]
# labels follow the stacking order: first the 900 negatives, then the 100 positives
y = np.zeros((X.shape[0],))
y[900:] = 1
# shuffle samples and labels with the same permutation so that a later
# positional train/test split contains both classes
rand_ix = np.arange(X.shape[0])
np.random.shuffle(rand_ix)
X = X[rand_ix, :]
y = y[rand_ix]
plt.scatter(X[:, 0], X[:, 1], c=y)

In [ ]:
# positional hold-out split: the first 900 (already shuffled) samples are
# used for training, the remaining ones for testing
x_train, x_test = X[:900, :], X[900:, :]
y_train, y_test = y[:900], y[900:]

# fit on the training part, then score the predictions for the held-out part
clf = LogisticRegression()
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, prediction)
print("Accuracy: %f" % test_accuracy)

Q: Is this a good score?

Exercise: Compute the accuracy separately for the positive-class samples and for the negative-class samples. (Hint: use clf.predict)


In [ ]:
# enter code here

Q: What went wrong?


In [ ]:
# decision boundary of `clf`
h = 0.02  # spacing of the evaluation grid
# per-feature extremes of the data, widened by one unit in each direction
(xmin, ymin), (xmax, ymax) = X.min(axis=0) - 1, X.max(axis=0) + 1
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
# predict the class of every grid node and fold the result back into grid shape
Z = clf.predict(np.stack([xx.ravel(), yy.ravel()], axis=1)).reshape(xx.shape)
# filled contours show the predicted class regions; the samples are overlaid
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], s=50, c=y, cmap=plt.cm.Paired)

In [ ]:
# score the classifier on the complete data set
prediction = clf.predict(X)
# sklearn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, recall_score would report precision instead of recall
print("Accuracy: %f" % accuracy_score(y, prediction))
print("Recall: %f" % recall_score(y, prediction))
print("F1 score: %f" % f1_score(y, prediction))