In [14]:
from sklearn.datasets import fetch_mldata
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [4]:
# load data
# NOTE(review): fetch_mldata has been removed from newer scikit-learn releases
# (fetch_openml is its documented successor) — confirm the installed version
# still provides it before relying on this cell.
mnist = fetch_mldata('MNIST original')
In [48]:
# Display the loaded dataset object to inspect what it contains.
mnist
Out[48]:
In [49]:
# List the attribute names available on the dataset object.
dir(mnist)
Out[49]:
In [50]:
# Pull the feature matrix and the label vector out of the dataset object.
X = mnist['data']
y = mnist['target']
In [51]:
# Show the shape of the features and of the labels, one per line.
print(X.shape, y.shape, sep="\n")
In [52]:
# Draw one row index at random and render that sample as a 28x28 image.
random_data = np.random.randint(low=0, high=len(X), size=1)  # length-1 index array
some_digit = X[random_data]
some_digit_img = some_digit.reshape(28, 28)
plt.imshow(some_digit_img)
print("Actual label:", y[random_data])
In [53]:
# Create the train/test partitions.
# MNIST is already split: the first 60000 rows are used for training and the
# remaining rows for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
In [54]:
# Randomly shuffle the training rows to prevent cross-validation from failing,
# and to prevent unexpected behavior from having too many digits of the same
# type in a row.
# The permutation is drawn from a dedicated, seeded generator so the shuffled
# order (and every fold-dependent score computed from it) is repeatable across
# kernel restarts, without touching NumPy's global random state.
shuffle_rng = np.random.RandomState(123)
reshuffle_idx = shuffle_rng.permutation(60000)
# Index the original arrays with indices 0..59999 (the training slice) so that
# re-running this cell always yields the same result.
X_train, y_train = X[reshuffle_idx], y[reshuffle_idx]
In [55]:
# Fit a linear classifier trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=123)  # fixed seed for repeatable training
sgd_clf.fit(X=X_train, y=y_train)
Out[55]:
In [56]:
# Look at repeated stratified k-fold cross-validation.
# An integer `cv` yields the same unshuffled folds on every call and the
# classifier has a fixed random_state, so calling cross_val_score several times
# with cv=10 would just reproduce identical scores. Each repeat therefore gets
# its own shuffled StratifiedKFold with a distinct seed.
from sklearn.model_selection import StratifiedKFold, cross_val_score

val_scores = []
for i in range(3):
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    val_scores.append(
        cross_val_score(sgd_clf, X_train, y_train, cv=folds, scoring="accuracy")
    )
    print("Iteration:", i+1)
# One array of 10 fold accuracies per repeat.
print(val_scores)
In [57]:
# Mean of all collected cross-validation scores (every fold of every iteration).
np.mean(val_scores)
Out[57]:
In [58]:
# Get cross-validated predictions for the training set; these feed the
# confusion matrix.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train, cv=3)
y_train_pred
Out[58]:
In [59]:
# Tabulate actual labels against the cross-validated predictions.
confusion_mat = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
In [60]:
# Visualise the confusion matrix as a heatmap.
import seaborn as sns

sns.heatmap(data=confusion_mat, cmap="plasma")
Out[60]:
In [61]:
# Collect the positions of the wrongly classified images, i.e. where the
# cross-validated prediction differs from the actual label.
idx_list = [
    pos
    for pos, (predicted, actual) in enumerate(zip(y_train_pred, y_train))
    if predicted != actual
]
In [74]:
# Randomly pick one misclassified sample and show it.
# np.random.randint excludes its upper bound, so len(idx_list) is passed as-is:
# every entry (including the last) is eligible, and a single-element list works.
idx_pos = np.random.randint(0, len(idx_list))
wrong_idx = idx_list[idx_pos]  # position of the chosen sample in the training set
wrong_image = X_train[wrong_idx].reshape(28,28)
plt.imshow(wrong_image)
print("Classified as %d but was actually %d" % (y_train_pred[wrong_idx], y_train[wrong_idx]))
In [83]:
# Count the misclassified samples per actual label (most frequent first).
pd.Series(y_train[idx_list]).value_counts()
Out[83]:
In [ ]: