In [183]:
# import
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_mldata
from sklearn.linear_model import SGDClassifier
%matplotlib inline
In [75]:
# NOTE: fetch_mldata was removed in later scikit-learn releases;
# fetch_openml('mnist_784') is the modern replacement (its labels come back as strings)
mnist = fetch_mldata('MNIST original')
In [76]:
mnist
Out[76]:
In [77]:
X, y = mnist['data'], mnist['target']
In [78]:
print(X.shape, y.shape)
In [79]:
some_digits = X[36001]
some_digits_img = some_digits.reshape(28, 28)
In [80]:
plt.imshow(some_digits_img, cmap=matplotlib.cm.binary, interpolation='nearest')
plt.axis("off")
plt.show()
In [81]:
### checking out its label
y[36001]
Out[81]:
In [82]:
# the MNIST dataset is already split into a train set (first 60,000) and a test set (last 10,000)
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]
# let's shuffle the training set so the algorithm does not pick up any ordering bias
shuffled_indices = np.random.permutation(60000)
X_train, y_train = X_train[shuffled_indices], y_train[shuffled_indices]
In [83]:
# converting the target into binary labels: is the digit a 5 or not?
y_train_5 = (y_train==5)
y_test_5 = (y_test==5)
A good place to start is with a Stochastic Gradient Descent (SGD) classifier, using Scikit-Learn’s SGDClassifier class. This classifier has the advantage of being capable of handling very large datasets efficiently. This is in part because SGD deals with training instances independently, one at a time (which also makes SGD well suited for online learning).
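As an aside, because SGD handles one instance at a time, it can also be trained incrementally with partial_fit; a minimal sketch (the mini-batch size and loop are illustrative, not part of this notebook):
In [ ]:
# hedged sketch: incremental (online) learning with partial_fit
online_clf = SGDClassifier(random_state=42)
classes = np.array([False, True])  # must be supplied on the first partial_fit call
for start in range(0, len(X_train), 1000):  # feed the training set in mini-batches
    X_batch = X_train[start:start + 1000]
    y_batch = y_train_5[start:start + 1000]
    online_clf.partial_fit(X_batch, y_batch, classes=classes)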
In [84]:
sgd_clf = SGDClassifier(random_state=142)
sgd_clf.fit(X_train, y_train_5)
Out[84]:
In [85]:
sgd_clf.predict([some_digits])
Out[85]:
In [86]:
from sklearn.model_selection import StratifiedShuffleSplit
In [87]:
skfolds = StratifiedShuffleSplit(n_splits=3, random_state=42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # create a fresh classifier for this split
    clone_clf = SGDClassifier(random_state=42)
    # split the data into train and test folds
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_folds = X_train[test_index]
    y_test_folds = y_train_5[test_index]
    # fit on the training fold and score accuracy on the held-out fold
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_folds)
    print(np.mean(y_pred == y_test_folds), end=' ')
In [88]:
from sklearn.model_selection import cross_val_score
In [89]:
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
Out[89]:
As we can see, accuracy is above 90% in every fold. That seems suspiciously good, so let's check it against a baseline estimator.
In [90]:
from sklearn.base import BaseEstimator
In [91]:
class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)
In [92]:
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")
Out[92]:
The baseline classifier also reaches about 90% accuracy, because the data is skewed towards the "not 5" class.
In [93]:
# fraction of training images that are 5s
print(np.mean(y_train_5))
# and the fraction that are not 5s
print(1 - np.mean(y_train_5))
In [94]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
y_pred_5 = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_pred_5)
Out[94]:
In [95]:
from sklearn.metrics import precision_score, recall_score
print("precision score: ", precision_score(y_train_5, y_pred_5))
print("recall score: ", recall_score(y_train_5, y_pred_5))
It is often convenient to combine precision and recall into a single metric called the F1 score, in
particular if you need a simple way to compare two classifiers. The F1 score is the harmonic mean of
precision and recall.
Whereas the regular mean treats all values equally, the harmonic
mean gives much more weight to low values. As a result, the classifier will only get a high F1 score if
both recall and precision are high.
The F1 score favors classifiers that have similar precision and recall. This is not always what you want: in some contexts you mostly care about precision, and in other contexts you really care about recall.
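As a quick sanity check, the harmonic mean can be computed by hand from the precision and recall above and compared with sklearn's f1_score in the next cell (a small sketch reusing the scores already computed):
In [ ]:
# hedged sketch: F1 as the harmonic mean of precision and recall
p = precision_score(y_train_5, y_pred_5)
r = recall_score(y_train_5, y_pred_5)
print("manual F1:", 2 * p * r / (p + r))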
In [96]:
from sklearn.metrics import f1_score
print("f1-score: ", f1_score(y_train_5, y_pred_5))
There is always a trade-off between precision and recall, and it depends on the threshold used to decide whether an instance is positive or negative. Scikit-Learn does not let you set the threshold directly, but it does give you access to the decision scores that it uses to make predictions. Instead of calling the classifier’s predict() method, you can call its decision_function() method, which returns a score for each instance, and then make predictions based on those scores using any threshold you want.
In [97]:
y_score = sgd_clf.decision_function([some_digits])
y_score
Out[97]:
In [98]:
threshold = 0
y_pred = (y_score>threshold)
y_pred
Out[98]:
In [99]:
# let's increase the threshold
threshold = 200000
y_pred = (y_score>threshold)
y_pred
Out[99]:
In [100]:
# as we increase the threshold, recall decreases; let's compute the scores for all instances
print(X_train.shape, y_train_5.shape)
print(y_train_5[:3])
y_pred_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method='decision_function')
# NOTE: recent scikit-learn versions return a 1-D array of scores for binary problems,
# so the column selection is only needed if a 2-D array comes back
if y_pred_scores.ndim == 2:
    y_pred_scores = y_pred_scores[:, 1]
print(y_pred_scores[:3])
y_pred = (y_pred_scores > threshold)
In [101]:
np.mean(y_pred)  # fraction of instances predicted positive at this (very high) threshold
Out[101]:
In [102]:
from sklearn.metrics import precision_recall_curve
In [103]:
print(y_train_5.shape, y_pred_scores.shape)
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_pred_scores)
y_pred_scores[:3]
Out[103]:
In [104]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.figure(figsize=(10, 8))
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([-0.1, 1.1])
    plt.grid(True)
In [105]:
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
In [106]:
def plot_precision_vs_recall(precisions, recalls):
    plt.figure(figsize=(10, 8))
    plt.plot(recalls[:-1], precisions[:-1], "b-", linewidth=2)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.ylim([-0.1, 1.1])
    plt.grid(True, which='major')
In [107]:
plot_precision_vs_recall(precisions, recalls)
plt.show()
You can see that precision really starts to fall sharply around 80% recall. You will probably want to
select a precision/recall tradeoff just before that drop — for example, at around 60% recall. But of
course the choice depends on your project.
Let’s suppose you decide to aim for 90% precision. You look up the first plot (zooming in a bit) and find that you need to use a threshold of about 70,000. To make predictions (on the training set for now), instead of calling the classifier’s predict() method, you can just run this code:
In [108]:
y_train_pred_90 = (y_pred_scores > 70000)
print("Precision: ", precision_score(y_train_5, y_train_pred_90))
print("Recall: ", recall_score(y_train_5, y_train_pred_90))
The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers.
It is very similar to the precision/recall curve, but instead of plotting precision versus recall, the ROC
curve plots the true positive rate (another name for recall) against the false positive rate. The FPR is the
ratio of negative instances that are incorrectly classified as positive. It is equal to one minus the true
negative rate, which is the ratio of negative instances that are correctly classified as negative. The TNR
is also called specificity. Hence the ROC curve plots sensitivity (recall) versus 1 – specificity.
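To make these definitions concrete, the FPR and TNR (specificity) can be read directly off the binary confusion matrix computed earlier (a small sketch reusing confusion_matrix and y_pred_5 from above):
In [ ]:
# hedged sketch: FPR and TNR from the binary confusion matrix [[TN, FP], [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_train_5, y_pred_5).ravel()
print("FPR:", fp / (fp + tn))                 # negatives wrongly flagged as 5s
print("TNR (specificity):", tn / (tn + fp))   # so FPR == 1 - TNR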
In [109]:
from sklearn.metrics import roc_curve
In [110]:
fpr, tpr, thresholds = roc_curve(y_train_5, y_pred_scores)
In [111]:
def plot_roc_curve(fpr, tpr, label=None, figno=1):
    plt.figure(figno, figsize=(10, 8))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.1, 1.1, -0.1, 1.1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.legend(loc='lower right')
In [112]:
plot_roc_curve(fpr, tpr)
plt.show()
Once again there is a tradeoff: the higher the recall (TPR), the more false positives (FPR) the classifier
produces. The dotted line represents the ROC curve of a purely random classifier; a good classifier stays
as far away from that line as possible (toward the top-left corner).
One way to compare classifiers is to measure the area under the curve (AUC). A perfect classifier will
have a ROC AUC equal to 1, whereas a purely random classifier will have a ROC AUC equal to 0.5.
In [113]:
from sklearn.metrics import roc_auc_score
In [114]:
roc_auc_score(y_train_5, y_pred_scores)
Out[114]:
Since the ROC curve is so similar to the precision/recall (or PR) curve, you may wonder how to decide which one to use. As a rule of thumb, you should prefer the PR curve whenever the positive class is rare or when you care more about the false positives than the false negatives, and the ROC curve otherwise.
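For completeness, the PR curve also has a single-number summary: average precision. A small sketch comparing it with the ROC AUC above (average_precision_score is standard scikit-learn, just not used elsewhere in this notebook):
In [ ]:
# hedged sketch: average precision summarizes the PR curve the way AUC summarizes the ROC curve
from sklearn.metrics import average_precision_score
print("ROC AUC:          ", roc_auc_score(y_train_5, y_pred_scores))
print("average precision:", average_precision_score(y_train_5, y_pred_scores))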
Let's train another model.
In [115]:
from sklearn.ensemble import RandomForestClassifier
In [116]:
rf_clf = RandomForestClassifier(random_state=142)
y_pred_prob = cross_val_predict(rf_clf, X_train, y_train_5, method='predict_proba', cv=3)
y_pred_prob[:3]
Out[116]:
In [117]:
# But to plot a ROC curve, you need scores, not probabilities.
# A simple solution is to use the positive class’s probability as the score:
y_forest_scores = y_pred_prob[:,-1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_forest_scores)
In [118]:
plot_roc_curve(fpr_forest, tpr_forest, label='RandomForest')
plot_roc_curve(fpr, tpr, label='SGD')
In [119]:
roc_auc_score(y_train_5, y_forest_scores)
Out[119]:
In [120]:
# training on the full 10-class target; SGDClassifier handles multiclass by running
# a one-versus-all strategy under the hood (one binary classifier per class)
sgdm_clf = SGDClassifier(random_state=142)
sgdm_clf.fit(X_train, y_train)
sgdm_clf.predict([some_digits])
Out[120]:
In [121]:
sgdm_clf.decision_function([some_digits])
Out[121]:
In [122]:
argmax = np.argmax(sgdm_clf.decision_function([some_digits]))
argmax
Out[122]:
In [123]:
sgdm_clf.classes_[argmax]
Out[123]:
If you want to force Scikit-Learn to use one-versus-one or one-versus-all, you can use the OneVsOneClassifier or OneVsRestClassifier classes. Simply create an instance and pass a binary classifier to its constructor.
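For example, a one-versus-rest version could look like the sketch below (this notebook goes on to use OneVsOneClassifier; the OvR variant is shown only for illustration):
In [ ]:
# hedged sketch: forcing a one-versus-rest strategy around SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=142))
ovr_clf.fit(X_train, y_train)
print(len(ovr_clf.estimators_))     # one binary classifier per class (10 for MNIST)
print(ovr_clf.predict([some_digits]))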
In [175]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
In [176]:
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=142))
ovo_clf.fit(X_train, y_train)
Out[176]:
In [177]:
ovo_clf.classes_
Out[177]:
In [180]:
len(ovo_clf.estimators_)  # 45 estimators: one per pair of classes (10 * 9 / 2)
Out[180]:
In [184]:
y_pred_ovo = cross_val_predict(ovo_clf, X_train, y_train, cv=5)
cm=confusion_matrix(y_train, y_pred_ovo)
In [186]:
plt.matshow(cm, cmap=plt.cm.gray)
Out[186]:
In [187]:
# normalizing the confusion matrix
row_sums = cm.sum(axis=1, keepdims=True)
norm_conf_mx = cm / row_sums
In [189]:
np.fill_diagonal(norm_conf_mx, 0)  # zero out the diagonal so only the errors stand out
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
In [ ]:
def plot_digits(instances, images_per_row=10, **options):
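    # NOTE: the original notebook left this helper empty; the body below is a sketch,
    # assuming each instance is a flattened 28x28 MNIST image
    size = 28
    n_rows = (len(instances) - 1) // images_per_row + 1
    grid = np.zeros((n_rows * size, images_per_row * size))
    for i, instance in enumerate(instances):
        r, c = divmod(i, images_per_row)
        grid[r * size:(r + 1) * size, c * size:(c + 1) * size] = instance.reshape(size, size)
    plt.imshow(grid, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")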
In [192]:
cl_a, cl_b = 3, 5
# use the multiclass cross-validated predictions (y_pred_ovo), not the earlier binary y_pred
X_aa = X_train[(y_train == cl_a) & (y_pred_ovo == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_pred_ovo == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_pred_ovo == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_pred_ovo == cl_b)]
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
In [ ]: