notebook.community

Edit and run



In [14]:

    
from sklearn.datasets import fetch_mldata
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline



In [4]:

    
# load data
mnist = fetch_mldata('MNIST original')



In [48]:

    
mnist









    Out[48]:





{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([0., 0., 0., ..., 9., 9., 9.])}



In [49]:

    
dir(mnist)









    Out[49]:





['COL_NAMES', 'DESCR', 'data', 'target']



In [50]:

    
X, y = mnist['data'], mnist['target']



In [51]:

    
print(X.shape)
print(y.shape)









    



(70000, 784)
(70000,)



In [52]:

    
# assign a random image
random_data = np.random.randint(0,X.shape[0],1)
some_digit = X[random_data]
some_digit_img = some_digit.reshape(28,28)
plt.imshow(some_digit_img)
print("Actual label:", y[random_data])









    



Actual label: [7.]



In [53]:

    
# create test
# MNIST is already split into training and testing data

X_train, X_test, y_train, y_test = X[:60000], X[60000:],y[:60000], y[60000:]



In [54]:

    
# randomly shuffle the indices to prevent cross-validation from failing, and to prevent
# unexpected behavior from having too many digits of same type in a row

reshuffle_idx = np.random.permutation(60000)
X_train, y_train = X[reshuffle_idx], y[reshuffle_idx]



In [55]:

    
# implement SGD classifier
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=123)
sgd_clf.fit(X_train, y_train)









    Out[55]:





SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=123, shuffle=True,
       verbose=0, warm_start=False)



In [56]:

    
# look at reapeated KFold cv
from sklearn.model_selection import cross_val_score

val_scores = []
for i in range(3):
    val_scores.append(cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring="accuracy"))
    print("Iteration:", i+1)
print(val_scores)









    



Iteration: 1
Iteration: 2
Iteration: 3
[array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783]), array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783]), array([0.85464535, 0.84924205, 0.862023  , 0.8605    , 0.86366667,
       0.85030838, 0.8468078 , 0.85914319, 0.88260797, 0.87174783])]



In [57]:

    
np.mean(val_scores)









    Out[57]:





0.8600692242225849



In [58]:

    
# look at confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
y_train_pred









    Out[58]:





array([7., 0., 2., ..., 1., 5., 5.])



In [59]:

    
confusion_mat = confusion_matrix(y_train, y_train_pred)



In [60]:

    
# visualise
import seaborn as sns
sns.heatmap(confusion_mat, cmap="plasma")









    Out[60]:





<matplotlib.axes._subplots.AxesSubplot at 0x11f8ce668>



In [61]:

    
# get the wrongly classified images
idx_list = []
for ind, el in enumerate(y_train):
    if y_train_pred[ind] != el:
        idx_list.append(ind)



In [74]:

    
# randomly pick one and show it
idx_pos = np.random.randint(0,len(idx_list)-1)
wrong_image = X_train[idx_list[idx_pos]].reshape(28,28)
plt.imshow(wrong_image)
print("Classified as %d but was actually %d" % (y_train_pred[idx_list[idx_pos]], y_train[idx_list[idx_pos]]))









    



Classified as 7 but was actually 9



In [83]:

    
pd.Series(y_train[idx_list]).value_counts()









    Out[83]:





9.0    1917
5.0    1277
8.0    1239
2.0     892
4.0     786
3.0     609
7.0     519
6.0     379
1.0     283
0.0     258
dtype: int64



In [ ]: