In [1]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')
mnist


Out[1]:
{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}
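
Note that fetch_mldata depends on the now-defunct mldata.org and was removed from scikit-learn in later releases. On a recent scikit-learn (0.20 or newer), a roughly equivalent way to load MNIST is fetch_openml; this is a sketch under that assumption. OpenML returns the labels as strings, and its row ordering differs from the label-sorted mldata dump, so an index such as X[36000] will generally not point at the same digit.

In [ ]:
# Sketch for scikit-learn >= 0.20: load MNIST from OpenML instead of mldata.org.
# as_frame=False (available in newer releases) keeps the data as NumPy arrays.
from sklearn.datasets import fetch_openml
import numpy as np

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist['data'], mnist['target'].astype(np.uint8)  # labels arrive as strings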

In [3]:
X, y = mnist['data'], mnist['target']
X.shape


Out[3]:
(70000, 784)
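
Each of the 70,000 rows is one image: 28 × 28 pixels flattened into 784 features, each an intensity from 0 (white) to 255 (black). The labels line up one per image:

In [ ]:
y.shape   # (70000,): one label per row of X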

In [7]:
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]                          # grab one instance from the dataset
some_digit_image = some_digit.reshape(28, 28)  # reshape the 784-vector back into a 28x28 image

plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis('off')
plt.show()
y[36000]


Out[7]:
5.0

In [10]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]


import numpy as np

shuffle_index = np.random.permutation(60000)                       # random reordering of the 60,000 training instances
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]  # shuffle the training set only
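
Shuffling the training set matters because the mldata dump is sorted by label (see the target array above), and cross-validation folds should not each contain a single digit. The permutation above is not seeded; if you want it reproducible, a small sketch (not part of the original run) is to seed NumPy first:

In [ ]:
np.random.seed(42)                           # hypothetical seed for a reproducible shuffle
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]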

In [11]:
y_train_5 = (y_train == 5)   # True for all 5s, False for every other digit
y_test_5 = (y_test == 5)
y_train_5


Out[11]:
array([False, False, False, ..., False, False, False], dtype=bool)

In [13]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])


/home/gleb/workspace/handson-machine-learning/venv/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
Out[13]:
array([ True], dtype=bool)
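
The FutureWarning is only about changing defaults: in scikit-learn 0.19, SGDClassifier trains for max_iter=5 epochs with tol=None when neither is set, and from 0.21 the defaults become max_iter=1000 and tol=1e-3. Passing them explicitly pins the 0.19 behaviour and silences the warning (a sketch, not executed above):

In [ ]:
# Explicit max_iter/tol reproduce the old defaults mentioned in the warning.
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])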

In [20]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)   # note: random_state is only used when shuffle=True

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)                 # fresh, untrained copy of the classifier
    X_train_folds = X_train[train_index]       # two folds for training
    y_train_folds = y_train_5[train_index]

    X_test_fold = X_train[test_index]          # one held-out fold for evaluation
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))             # accuracy on this fold


0.959
0.964
0.95725

In [24]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=4, scoring="accuracy")


Out[24]:
array([ 0.89027398,  0.9516    ,  0.946     ,  0.93432896])

In [25]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Dumb baseline that always predicts "not a 5"."""
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)   # one False per instance

In [26]:
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")


Out[26]:
array([ 0.90785,  0.91265,  0.90845])
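
This dumb baseline scores over 90% simply because only about 10% of the images are 5s, so always guessing "not 5" is right roughly nine times out of ten. That is why accuracy is a poor metric on skewed datasets, and why the 93-95% cross-validation scores for SGDClassifier above are less impressive than they look. A quick check of the class balance (not shown in the original run):

In [ ]:
y_train_5.mean()   # fraction of 5s in the training set, roughly 0.09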

In [27]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

In [28]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)


Out[28]:
array([[53459,  1120],
       [ 1275,  4146]])
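
Each row of the confusion matrix is an actual class and each column a predicted class: the first row counts the non-5s (53,459 correctly classified, 1,120 wrongly flagged as 5s), the second row counts the 5s (1,275 missed, 4,146 caught). The precision and recall computed in the next two cells follow directly from these counts; here is the arithmetic spelled out (a sketch using the numbers above):

In [ ]:
# Precision = TP / (TP + FP), recall = TP / (TP + FN), from the matrix above.
tn, fp, fn, tp = 53459, 1120, 1275, 4146
print(tp / (tp + fp))   # precision, roughly 0.787
print(tp / (tp + fn))   # recall, roughly 0.765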

In [29]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train_5, y_train_pred)


Out[29]:
0.7873148499810102

In [30]:
recall_score(y_train_5, y_train_pred)


Out[30]:
0.76480354178195908
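
Precision and recall are often combined into a single number, the F1 score, their harmonic mean: F1 = 2 × precision × recall / (precision + recall), which works out to roughly 0.776 for the values above. scikit-learn computes it directly (a sketch, not executed in the original run):

In [ ]:
from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)   # harmonic mean of precision and recall, about 0.776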
