In [183]:
# import
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt


from sklearn.datasets import fetch_mldata
from sklearn.linear_model import SGDClassifier

%matplotlib inline

In [75]:
mnist = fetch_mldata('MNIST original')
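If fetch_mldata fails (mldata.org has since been retired), newer scikit-learn versions offer fetch_openml as a way to get the same 70000 × 784 data. A hedged alternative, assuming a version recent enough to ship fetch_openml:


In [ ]:
# alternative loader for newer scikit-learn versions (assumes fetch_openml is available;
# drop as_frame=False on versions that lack that parameter)
from sklearn.datasets import fetch_openml

mnist_alt = fetch_openml('mnist_784', version=1, as_frame=False)
X_alt = mnist_alt['data']                     # shape (70000, 784)
y_alt = mnist_alt['target'].astype(np.uint8)  # labels arrive as strings, so cast them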

In [76]:
mnist


Out[76]:
{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [77]:
X, y = mnist['data'], mnist['target']

In [78]:
print(X.shape, y.shape)


(70000, 784) (70000,)

Display an image


In [79]:
some_digits = X[36001]
some_digits_img = some_digits.reshape(28, 28)

In [80]:
plt.imshow(some_digits_img, cmap=matplotlib.cm.binary, interpolation='nearest')
plt.axis("off")
plt.show()



In [81]:
### checking out its label
y[36001]


Out[81]:
5.0

In [82]:
# MNIST dataset is already split into train (first 60000) and test (last 10000)
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]

# let's shuffle the training set to avoid any ordering bias for the algorithm
shuffled_indices = np.random.permutation(60000)
X_train, y_train = X_train[shuffled_indices], y_train[shuffled_indices]

Let's try to train a BINARY classifier


In [83]:
# converting the target into binary labels: is the digit a 5 or not?
y_train_5 = (y_train==5)
y_test_5 = (y_test==5)

A good place to start is with a Stochastic Gradient Descent (SGD) classifier, using Scikit-Learn’s SGDClassifier class. This classifier has the advantage of being capable of handling very large datasets efficiently. This is in part because SGD deals with training instances independently, one at a time (which also makes SGD well suited for online learning)
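To make the online-learning point concrete, here is a minimal sketch (not part of the original notebook) that feeds the same binary task to an SGDClassifier one mini-batch at a time via partial_fit; the batch size of 1000 is arbitrary:


In [ ]:
# illustrative online training: update the model incrementally with partial_fit;
# the first call needs the full list of classes
online_clf = SGDClassifier(random_state=142)
batch_size = 1000
for start in range(0, len(X_train), batch_size):
    X_batch = X_train[start:start + batch_size]
    y_batch = y_train_5[start:start + batch_size]
    online_clf.partial_fit(X_batch, y_batch, classes=[False, True])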


In [84]:
sgd_clf = SGDClassifier(random_state=142)
sgd_clf.fit(X_train, y_train_5)


c:\tools\anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
Out[84]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=142, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [85]:
sgd_clf.predict([some_digits])


Out[85]:
array([False], dtype=bool)

Measuring the Accuracy

  1. Using a manual stratified split

In [86]:
from sklearn.model_selection import StratifiedShuffleSplit

In [87]:
skfolds = StratifiedShuffleSplit(n_splits=3, random_state=42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # create a fresh classifier for each split
    clone_clf = SGDClassifier(random_state=42)
    # split the data into train and test folds (class ratios are preserved by stratification)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_folds = X_train[test_index]
    y_test_folds = y_train_5[test_index]
    
    # fit on the training fold and report accuracy on the test fold
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_folds)
    
    print(np.mean(y_pred==y_test_folds), end='   ')


c:\tools\anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
0.966666666667   0.936666666667   0.898   
  2. Now using cross_val_score

In [88]:
from sklearn.model_selection import cross_val_score

In [89]:
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")


Out[89]:
array([ 0.9599 ,  0.96685,  0.94215])

As we can see, accuracy is above 90% in all cases. That seems suspiciously good, so let's compare it against a dumb baseline estimator.


In [90]:
from sklearn.base import BaseEstimator

In [91]:
class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

In [92]:
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")


Out[92]:
array([ 0.9098 ,  0.91115,  0.908  ])

The baseline classifier gets about 90% accuracy because the data is skewed towards the "not 5" class.


In [93]:
# fraction of training images that are a 5 (the positive class)
print(np.mean(y_train_5))  # and the fraction that are not a 5
print(1-np.mean(y_train_5))


0.09035
0.90965

Confusion Matrix


In [94]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_pred_5 = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_pred_5)


Out[94]:
array([[53824,   755],
       [ 1867,  3554]], dtype=int64)

Precision and Recall

$ precision = \frac{TP}{TP+FP} $

$ recall = \frac{TP}{TP+FN} $


In [95]:
from sklearn.metrics import precision_score, recall_score

print("precision score: ", precision_score(y_train_5, y_pred_5))
print("recall score: ", recall_score(y_train_5, y_pred_5))


precision score:  0.824785333024
recall score:  0.655598598045
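As a sanity check, the same numbers follow directly from the confusion matrix in Out[94], whose layout here is [[TN, FP], [FN, TP]] (rows are the actual classes, columns the predicted ones):


In [ ]:
# recompute precision and recall by hand from Out[94]: TN=53824, FP=755, FN=1867, TP=3554
TP, FP, FN = 3554, 755, 1867
print("precision by hand: ", TP / (TP + FP))  # ~0.8248, matches precision_score above
print("recall by hand:    ", TP / (TP + FN))  # ~0.6556, matches recall_score above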

F1 Score

It is often convenient to combine precision and recall into a single metric called the F1 score, in particular if you need a simple way to compare two classifiers. The F1 score is the harmonic mean of precision and recall.
Whereas the regular mean treats all values equally, the harmonic mean gives much more weight to low values. As a result, the classifier will only get a high F1 score if both recall and precision are high.

$ F_1 = \frac{2}{\frac{1}{precision} + \frac{1}{recall}} = \frac{2 \cdot precision \cdot recall}{precision + recall} $

The F1 score favors classifiers that have similar precision and recall. This is not always what you want: in some contexts you mostly care about precision, and in other contexts you really care about recall.
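To see how the harmonic mean punishes a low value, compare it with the ordinary mean for two illustrative precision/recall pairs (the first is roughly the pair obtained above):


In [ ]:
# harmonic mean vs arithmetic mean: the harmonic mean is dragged toward the lower value
for p, r in [(0.825, 0.656), (1.0, 0.1)]:
    print("p =", p, " r =", r,
          " arithmetic:", round((p + r) / 2, 3),
          " harmonic:", round(2 * p * r / (p + r), 3))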


In [96]:
from sklearn.metrics import f1_score
print("f1-score: ", f1_score(y_train_5, y_pred_5))


f1-score:  0.730524152107

There is always a trade-off between precision and recall, and it depends on the decision threshold used to decide whether an instance is positive or negative. Scikit-Learn does not let you set the threshold directly, but it does give you access to the decision scores that it uses to make predictions. Instead of calling the classifier’s predict() method, you can call its decision_function() method, which returns a score for each instance, and then make predictions based on those scores using any threshold you want.


In [97]:
y_score = sgd_clf.decision_function([some_digits])
y_score


Out[97]:
array([-122672.54777732])

In [98]:
threshold = 0 
y_pred = (y_score>threshold)
y_pred


Out[98]:
array([False], dtype=bool)

In [99]:
# let's increase the threshold 
threshold = 200000
y_pred = (y_score>threshold)
y_pred


Out[99]:
array([False], dtype=bool)

In [100]:
# as we increase the threshold, recall decreases; let's compute decision scores for all instances
print(X_train.shape, y_train_5.shape)
print(y_train_5[:3])
y_pred_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method='decision_function')
# workaround for a known Scikit-Learn 0.19.0 issue where cross_val_predict with
# method='decision_function' returned a 2D array in the binary case:
# keep only the second column (not needed on later versions)
y_pred_scores=y_pred_scores[:,1]
print(y_pred_scores[:3])
y_pred = (y_pred_scores>threshold)


(60000, 784) (60000,)
[False False False]
[-557284.18127093 -249161.50404368 -468887.12456914]

In [101]:
np.mean(y_pred)  # fraction of instances predicted as a 5 at this high threshold (not accuracy)


Out[101]:
0.029333333333333333
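To make the trade-off concrete, these hard-threshold predictions can be scored directly; with the threshold pushed up to 200,000 you would expect precision to rise while recall collapses (scores not recorded here):


In [ ]:
# precision and recall of the threshold-200000 predictions computed above
print("precision @ 200000: ", precision_score(y_train_5, y_pred))
print("recall    @ 200000: ", recall_score(y_train_5, y_pred))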

Precision Recall Curve


In [102]:
from sklearn.metrics import precision_recall_curve

In [103]:
print(y_train_5.shape, y_pred_scores.shape)
precisions, recalls, thresholds  = precision_recall_curve(y_train_5, y_pred_scores)
y_pred_scores[:3]


(60000,) (60000,)
Out[103]:
array([-557284.18127093, -249161.50404368, -468887.12456914])

In [104]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.figure(figsize=(10,8))
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([-0.1, 1.1])
    plt.grid(True)

In [105]:
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()



In [106]:
def plot_precision_vs_recall(precisions, recalls):
    plt.figure(figsize=(10,8))
    plt.plot(recalls[:-1], precisions[:-1], "b-", linewidth=2)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.ylim([-0.1, 1.1])
    plt.grid(True, which='major')

In [107]:
plot_precision_vs_recall(precisions, recalls)
plt.show()



You can see that precision really starts to fall sharply around 80% recall. You will probably want to select a precision/recall trade-off just before that drop, for example at around 60% recall, but of course the choice depends on your project.
Let’s suppose you decide to aim for 90% precision. You look up the first plot (zooming in a bit) and find that you need to use a threshold of about 70,000. To make predictions (on the training set for now), instead of calling the classifier’s predict() method, you can just run this code:


In [108]:
y_train_pred_90 = (y_pred_scores > 70000)
print("Precision: ", precision_score(y_train_5, y_train_pred_90))
print("Recall: ", recall_score(y_train_5, y_train_pred_90))


Precision:  0.900936273029
Recall:  0.550267478325
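Rather than eyeballing the plot for the 70,000 figure, the threshold for a target precision can also be looked up programmatically in the arrays returned by precision_recall_curve; a small sketch:


In [ ]:
# lowest threshold whose precision reaches 90%
# (precisions/recalls have one more entry than thresholds, hence the [:-1])
idx_90 = np.argmax(precisions[:-1] >= 0.90)
threshold_90 = thresholds[idx_90]
print("threshold for ~90% precision:", threshold_90)
y_pred_90 = (y_pred_scores >= threshold_90)
print("precision:", precision_score(y_train_5, y_pred_90))
print("recall:   ", recall_score(y_train_5, y_pred_90))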

The ROC Curve

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. It is very similar to the precision/recall curve, but instead of plotting precision versus recall, the ROC curve plots the true positive rate (another name for recall) against the false positive rate. The FPR is the ratio of negative instances that are incorrectly classified as positive. It is equal to one minus the true negative rate, which is the ratio of negative instances that are correctly classified as negative. The TNR is also called specificity. Hence the ROC curve plots sensitivity (recall) versus 1 – specificity.
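These definitions are easy to verify against the binary confusion matrix from Out[94] ([[TN, FP], [FN, TP]]); a quick illustrative computation:


In [ ]:
# TPR (sensitivity/recall), FPR and TNR (specificity) from Out[94]
TN, FP, FN, TP = 53824, 755, 1867, 3554
print("TPR =", TP / (TP + FN))          # ~0.656, same as recall
print("FPR =", FP / (FP + TN))          # ~0.014
print("TNR =", TN / (TN + FP), "so FPR = 1 - TNR")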


In [109]:
from sklearn.metrics import roc_curve

In [110]:
fpr, tpr, thresholds = roc_curve(y_train_5, y_pred_scores)

In [111]:
def plot_roc_curve(fpr, tpr, label=None, figno=1):
    plt.figure(figno, figsize=(10, 8))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal = purely random classifier
    plt.axis([-0.1, 1.1, -0.1, 1.1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    if label is not None:  # only draw a legend when a label was supplied
        plt.legend(loc='lower right')

In [112]:
plot_roc_curve(fpr, tpr)
plt.show()



Once again there is a tradeoff: the higher the recall (TPR), the more false positives (FPR) the classifier produces. The dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).
One way to compare classifiers is to measure the area under the curve (AUC). A perfect classifier will have a ROC AUC equal to 1, whereas a purely random classifier will have a ROC AUC equal to 0.5


In [113]:
from sklearn.metrics import roc_auc_score

In [114]:
roc_auc_score(y_train_5, y_pred_scores)


Out[114]:
0.95015253499562624

Since the ROC curve is so similar to the precision/recall (or PR) curve, you may wonder how to decide which one to use. As a rule of thumb, you should prefer the PR curve whenever the positive class is rare or when you care more about the false positives than the false negatives, and the ROC curve otherwise.

Let's train another model, a Random Forest, and compare its ROC curve with the SGD classifier's


In [115]:
from sklearn.ensemble import RandomForestClassifier

In [116]:
rf_clf = RandomForestClassifier(random_state=142)

y_pred_prob = cross_val_predict(rf_clf, X_train, y_train_5, method='predict_proba', cv=3)
y_pred_prob[:3]


Out[116]:
array([[ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 0.9,  0.1]])

In [117]:
# But to plot a ROC curve, you need scores, not probabilities.
# A simple solution is to use the positive class’s probability as the score:
y_forest_scores = y_pred_prob[:,-1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_forest_scores)

In [118]:
plot_roc_curve(fpr_forest, tpr_forest, label='RandomForest')
plot_roc_curve(fpr, tpr, label='SGD')  # same figure (figno=1), so both curves end up on one plot
plt.show()



In [119]:
roc_auc_score(y_train_5, y_forest_scores)


Out[119]:
0.99187025528091965

Multiclass Classification


In [120]:
sgdm_clf = SGDClassifier(random_state=142)
sgdm_clf.fit(X_train, y_train)
sgdm_clf.predict([some_digits])


c:\tools\anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
Out[120]:
array([ 3.])

In [121]:
sgdm_clf.decision_function([some_digits])


Out[121]:
array([[-192567.03179159, -361789.31555958, -502598.00495265,
          58562.9492373 , -274042.82402126, -122672.54777732,
        -613677.64452781, -587907.08421512, -129597.52546502,
        -290215.66494478]])

In [122]:
argmax = np.argmax(sgdm_clf.decision_function([some_digits]))
argmax


Out[122]:
3

In [123]:
sgdm_clf.classes_[argmax]


Out[123]:
3.0

If you want to force Scikit-Learn to use one-versus-one or one-versus-rest, you can use the OneVsOneClassifier or OneVsRestClassifier classes. Simply create an instance and pass a binary classifier to its constructor.


In [175]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

In [176]:
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=142))
ovo_clf.fit(X_train, y_train)


C:\tools\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
Out[176]:
OneVsOneClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=142, shuffle=True,
       tol=None, verbose=0, warm_start=False),
          n_jobs=1)

In [177]:
ovo_clf.classes_


Out[177]:
array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])

In [180]:
len(ovo_clf.estimators_)  # 45 estimators: one binary classifier per pair of classes, N*(N-1)/2 = 45 for N = 10


Out[180]:
45
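For comparison, OneVsRestClassifier (imported above but not used so far) trains one binary "this digit vs everything else" classifier per class, so it builds 10 estimators instead of 45; a minimal sketch:


In [ ]:
# one-versus-rest: 10 binary classifiers, one per class
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=142))
ovr_clf.fit(X_train, y_train)
print(len(ovr_clf.estimators_))      # 10
print(ovr_clf.predict([some_digits]))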

In [184]:
y_pred_ovo = cross_val_predict(ovo_clf, X_train, y_train, cv=5)
cm=confusion_matrix(y_train, y_pred_ovo)

In [186]:
plt.matshow(cm, cmap=plt.cm.gray)


Out[186]:
<matplotlib.image.AxesImage at 0x201d9e12e48>

In [187]:
# normalize each row by the number of images in that (actual) class,
# so we compare error rates rather than absolute error counts
row_sums = cm.sum(axis=1, keepdims=True)
norm_conf_mx = cm / row_sums

In [189]:
np.fill_diagonal(norm_conf_mx, 0)  # zero out the diagonal to keep only the errors
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()



In [ ]:
# plot_digits() was left unfinished here (this cell was never run), which is why
# the next cell fails with a NameError.
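One way to fill it in: a minimal plot_digits sketch that tiles 28×28 grayscale rows into a grid (adapted from the helper commonly used with this dataset; not necessarily what the author intended):


In [ ]:
def plot_digits(instances, images_per_row=10, **options):
    # reshape each 784-vector to 28x28 and tile them into a single image grid
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))  # pad the last row with blanks
    for row in range(n_rows):
        rimages = images[row * images_per_row: (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")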

In [192]:
# compare actual 3s and 5s against the multiclass predictions computed above
# (y_pred_ovo; the original cell mistakenly reused the binary y_pred)
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_pred_ovo == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_pred_ovo == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_pred_ovo == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_pred_ovo == cl_b)]
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-192-decfe64b29d4> in <module>()
      5 X_bb    =       X_train[(y_train        ==      cl_b)   &       (y_pred ==      cl_b)]
      6 plt.figure(figsize=(8,8))
----> 7 plt.subplot(221);       plot_digits(X_aa[:25],  images_per_row=5)
      8 plt.subplot(222);       plot_digits(X_ab[:25],  images_per_row=5)
      9 plt.subplot(223);       plot_digits(X_ba[:25],  images_per_row=5)

NameError: name 'plot_digits' is not defined

In [ ]: