In [5]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST Original')
# NOTE: fetch_mldata is deprecated and removed in recent scikit-learn releases;
# there, use fetch_openml('mnist_784', version=1) instead (labels come back as
# strings and the row ordering differs, so hard-coded indices like 36000 below
# may no longer point at a 5).
mnist
Out[5]:
In [6]:
X, y = mnist["data"], mnist["target"]
X.shape
Out[6]:
In [7]:
y.shape
Out[7]:
In [8]:
# Plot one of the entries by reshaping the data into a 28x28 (pixels) array
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = matplotlib.cm.binary, interpolation = "nearest")
plt.axis("off")
plt.show()
In [9]:
y[36000]
Out[9]:
In [10]:
# Create train and test sets from data and shuffle entries
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
import numpy as np
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
In [11]:
# Train a binary classifier for checking '5 or not 5'
# Create train and test sets with binary (1 or 0) vectors:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 42) # fixed seed to make results reproducible
sgd_clf.fit(X_train, y_train_5)
# try to predict if the image above is a five
sgd_clf.predict([some_digit])
Out[11]:
In [12]:
# Custom implementation of cross-validation (p. 83)
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
skfolds = StratifiedKFold(n_splits = 3, random_state = 42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf) # fresh, unfitted copy per fold; make this parameterizable?
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
In [13]:
# Check with pre-existing cross validator
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring = "accuracy")
Out[13]:
In [14]:
# Accuracy may seem good, but compare to this "never 5" classifier: it scores
# ~90% accuracy simply because only about 10% of the images are 5s, so always
# predicting "not 5" is right 90% of the time.
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    def fit(self, X, y = None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype = bool)
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv = 3, scoring = "accuracy")
Out[14]:
In [15]:
# Create a confusion matrix (p. 84)
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
# this creates a matrix:
# [[non-fives correctly predicted as non-fives, non-fives wrongly predicted as fives],
#  [fives wrongly predicted as non-fives,       fives correctly predicted as fives]]
# i.e.:
# [[true negatives,  false positives],
#  [false negatives, true positives]]
Out[15]:
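To make that layout concrete, the four cells can be unpacked directly; a quick sketch reusing y_train_pred from above (ravel() flattens the 2x2 matrix in row-major order):

tn, fp, fn, tp = confusion_matrix(y_train_5, y_train_pred).ravel()
print("TN:", tn, " FP:", fp, " FN:", fn, " TP:", tp)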
In [16]:
# (p.85) Calculate "precision", "recall"
# precision = TP / (TP + FP)
# recall = TP / (TP + FN)
from sklearn.metrics import precision_score, recall_score
print("Precision: ", precision_score(y_train_5, y_train_pred))
print("Recall: ", recall_score(y_train_5, y_train_pred))
# ==> when our classifier predicts a five, it is only correct ~78% of the time (precision)
# ==> it only detects ~77% of the actual fives (recall)
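As a sanity check, the same numbers fall out of the formulas above when computed by hand (using the tn/fp/fn/tp counts unpacked after the confusion matrix):

print("precision by hand:", tp / (tp + fp)) # TP / (TP + FP)
print("recall by hand:   ", tp / (tp + fn)) # TP / (TP + FN)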
In [17]:
# (p. 86) F1-score = harmonic mean of precision and recall
# f1 = 2 / ((1 / precision) + (1 / recall))
# = 2 * ((precision * recall) / (precision + recall))
# = TP / (TP + ((FN + FP) / 2))
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
# Sometimes you want to tune for precision, other times for
# recall. This is called the precision/recall tradeoff.
Out[17]:
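A quick consistency check for the harmonic-mean formula; this should print the same value as f1_score above:

p = precision_score(y_train_5, y_train_pred)
r = recall_score(y_train_5, y_train_pred)
print("f1 by hand:", 2 * p * r / (p + r))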
In [18]:
# (p. 88) how to decide which threshold to use
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3, method = "decision_function")
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
def plot_precision_vs_recall_thresholds(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label = "Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label = "Recall")
    plt.xlabel("Threshold")
    plt.legend(loc = "upper left")
    plt.ylim([0, 1])
plot_precision_vs_recall_thresholds(precisions, recalls, thresholds)
plt.show()
In [19]:
# Other option: plot precision vs. recall
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", label = "Precision vs. Recall")
    plt.ylabel("Precision")
    plt.xlabel("Recall")
    plt.legend(loc = "lower left")
    plt.ylim([0, 1])
    plt.xlim([0, 1])
plot_precision_vs_recall(precisions, recalls)
plt.show()
In [20]:
# (p. 90) aiming for 90% precision:
y_train_pred_90 = (y_scores > 160000) # 160,000 deduced from the first plot
print("Precision: ", precision_score(y_train_5, y_train_pred_90))
print("Recall: ", recall_score(y_train_5, y_train_pred_90))
In [21]:
# (p. 91) The ROC / receiver operating characteristic curve plots the true
# positive rate (recall) against the false positive rate, where
# FPR = FP / (FP + TN) = 1 - specificity
# (specificity = true negative rate: the fraction of actual negatives classified as such)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label = None):
    plt.plot(fpr, tpr, "g-", linewidth = 2, label = label)
    plt.plot([0, 1], [0, 1], 'k--') # diagonal = purely random classifier
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
plot_roc_curve(fpr, tpr)
plt.show()
In [22]:
# (p. 92) ROC AUC = receiver operating characteristic, area under curve
# auc = 1 for perfect classifier
# auc = 0.5 for random classifier
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)
Out[22]:
In [23]:
# (p. 92) train a RandomForestClassifier and compare
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state = 42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv = 3, method = "predict_proba")
y_scores_forest = y_probas_forest[:, 1] # score = probability of the positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc = "lower right")
plt.show()
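To put a number on the comparison, compute the AUC for both sets of scores; since the forest's curve dominates the SGD curve, its AUC should come out higher:

print("SGD AUC:          ", roc_auc_score(y_train_5, y_scores))
print("Random Forest AUC:", roc_auc_score(y_train_5, y_scores_forest))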
In [24]:
# (p. 94) multiclass classification: scikit-learn automatically applies
# one-vs-all (OvA, also called one-vs-rest) when a binary classifier is
# given a multiclass target
sgd_clf.fit(X_train, y_train) # y_train, not y_train_5!
sgd_clf.predict([some_digit])
Out[24]:
In [25]:
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
Out[25]:
In [26]:
np.argmax(some_digit_scores)
Out[26]:
In [27]:
sgd_clf.classes_
Out[27]:
In [28]:
sgd_clf.classes_[5]
Out[28]:
In [29]:
# Manually create an OvO or OvA classifier (p. 95) - OvO here; an OvA sketch follows below
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
Out[29]:
In [30]:
len(ovo_clf.estimators_)
Out[30]:
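For completeness, here is the OvA counterpart; a minimal sketch (OneVsRestClassifier fits one binary classifier per class, so 10 estimators instead of OvO's 10 * 9 / 2 = 45):

from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state = 42))
ovr_clf.fit(X_train, y_train)
print(ovr_clf.predict([some_digit]), len(ovr_clf.estimators_)) # expect the same prediction and 10 estimators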
In [31]:
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
Out[31]:
In [32]:
forest_clf.predict_proba([some_digit])
Out[32]:
In [33]:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
Out[33]:
In [34]:
# Quite good accuracy, but gets even better with scaling (p. 96)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
Out[34]:
In [35]:
# Error Analysis (p. 96)
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
Out[35]:
In [36]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
In [37]:
row_sums = conf_mx.sum(axis=1, keepdims=True) # number of images per actual class
norm_conf_mx = conf_mx / row_sums # error rates instead of absolute counts
np.fill_diagonal(norm_conf_mx, 0) # zero the diagonal so only the errors stand out
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
In [38]:
# (p. 99) plot_digits is not defined in the book, skipping
# (p. 100) multi-label classification
from sklearn.neighbors import KNeighborsClassifier
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
knn_clf.predict([some_digit])
# results in two labels: whether the digit is large (7, 8, or 9) and whether it is odd;
# for [some_digit], which is a 5, this correctly yields [False, True]
Out[38]:
In [39]:
# (p. 100) F1 score across all labels, aggregated (left commented out: slow to run)
# y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
# f1_score(y_multilabel, y_train_knn_pred, average="macro")
In [46]:
# (p. 101) multi-output multi-class classification
# => multiple labels in the output as before, but each label can take more than two values!
# example: removing noise from images (each pixel is one output label and can take any value from 0 to 255)
def plot_digit(digit):
    reshaped = digit.reshape(28, 28)
    plt.imshow(reshaped, cmap = matplotlib.cm.binary, interpolation = "nearest")
    plt.axis("off")
    plt.show()
import numpy as np
# 1) add noise to existing MNIST images
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
# 2) targets are the original, clean images (one output value per pixel)
y_train_mod = X_train
y_test_mod = X_test
some_index = 7500
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)
# plots cleaned up image
In [47]:
plot_digit(X_test_mod[some_index])
# plots non-cleaned up image with noise