In [14]:
from sklearn.datasets import fetch_mldata
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [4]:
# load data
mnist = fetch_mldata('MNIST original')

In [48]:
mnist


Out[48]:
{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([0., 0., 0., ..., 9., 9., 9.])}

In [49]:
dir(mnist)


Out[49]:
['COL_NAMES', 'DESCR', 'data', 'target']

In [50]:
X, y = mnist['data'], mnist['target']

In [51]:
print(X.shape)
print(y.shape)


(70000, 784)
(70000,)

In [52]:
# assign a random image
random_data = np.random.randint(0,X.shape[0],1)
some_digit = X[random_data]
some_digit_img = some_digit.reshape(28,28)
plt.imshow(some_digit_img)
print("Actual label:", y[random_data])


Actual label: [7.]

In [53]:
# create test
# MNIST is already split into training and testing data

X_train, X_test, y_train, y_test = X[:60000], X[60000:],y[:60000], y[60000:]

In [54]:
# randomly shuffle the indices to prevent cross-validation from failing, and to prevent
# unexpected behavior from having too many digits of same type in a row

reshuffle_idx = np.random.permutation(60000)
X_train, y_train = X[reshuffle_idx], y[reshuffle_idx]

In [55]:
# implement SGD classifier
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=123)
sgd_clf.fit(X_train, y_train)


Out[55]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=123, shuffle=True,
       verbose=0, warm_start=False)

In [56]:
# look at reapeated KFold cv
from sklearn.model_selection import cross_val_score

val_scores = []
for i in range(3):
    val_scores.append(cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring="accuracy"))
    print("Iteration:", i+1)
print(val_scores)


Iteration: 1
Iteration: 2
Iteration: 3
[array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783]), array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783]), array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783])]

In [57]:
np.mean(val_scores)


Out[57]:
0.8600692242225849

In [58]:
# look at confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
y_train_pred


Out[58]:
array([7., 0., 2., ..., 1., 5., 5.])

In [59]:
confusion_mat = confusion_matrix(y_train, y_train_pred)

In [60]:
# visualise
import seaborn as sns
sns.heatmap(confusion_mat, cmap="plasma")


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f8ce668>

In [61]:
# get the wrongly classified images
idx_list = []
for ind, el in enumerate(y_train):
    if y_train_pred[ind] != el:
        idx_list.append(ind)

In [74]:
# randomly pick one and show it
idx_pos = np.random.randint(0,len(idx_list)-1)
wrong_image = X_train[idx_list[idx_pos]].reshape(28,28)
plt.imshow(wrong_image)
print("Classified as %d but was actually %d" % (y_train_pred[idx_list[idx_pos]], y_train[idx_list[idx_pos]]))


Classified as 7 but was actually 9

In [83]:
pd.Series(y_train[idx_list]).value_counts()


Out[83]:
9.0    1917
5.0    1277
8.0    1239
2.0     892
4.0     786
3.0     609
7.0     519
6.0     379
1.0     283
0.0     258
dtype: int64

In [ ]: