In [1]:
from PIL import Image
import numpy as np

In [2]:
import os
from urllib.request import urlretrieve

dataset = 'mnist.pkl.gz'

def reporthook(block_count, block_size, total_size):
    # urlretrieve calls this with the number of blocks transferred so far,
    # the block size in bytes, and the total file size in bytes.
    print("\rdownloading: %5.1f%%" % (block_count * block_size * 100.0 / total_size), end="")

if not os.path.isfile(dataset):
    origin = "https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz"
    print('Downloading data from %s' % origin)
    urlretrieve(origin, dataset, reporthook=reporthook)

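A partially downloaded archive will fail at the unpickling step below. If that is a concern, a minimal integrity check such as the following sketch can be run first; the `looks_valid` helper and the re-download logic are assumptions for illustration, not part of the original notebook.

import gzip

MNIST_URL = "https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz"

def looks_valid(path):
    # Read through the whole gzip stream; a truncated download raises an error here.
    try:
        with gzip.open(path, 'rb') as f:
            while f.read(1 << 20):
                pass
        return True
    except (OSError, EOFError):
        return False

if not looks_valid(dataset):
    os.remove(dataset)                                      # discard the partial file
    urlretrieve(MNIST_URL, dataset, reporthook=reporthook)
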
In [3]:
import gzip
import pickle
with gzip.open(dataset, 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

In [4]:
print("train_set", train_set[0].shape, train_set[1].shape)
print("valid_set", valid_set[0].shape, valid_set[1].shape)
print("test_set", test_set[0].shape, test_set[1].shape)


train_set (50000, 784) (50000,)
valid_set (10000, 784) (10000,)
test_set (10000, 784) (10000,)

In [5]:
# First 20 labels (Y) of the training data
train_set[1][:20]


Out[5]:
array([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9])

In [6]:
# First 20 training images (X), rescaled to 8-bit pixel values
int_X = (train_set[0][:20]*255).clip(0, 255).astype('uint8')
# Tile the 20 digits side by side into a single 28 x 560 strip
int_X_reshape = int_X.reshape(-1, 28, 28).swapaxes(0, 1).reshape(28, -1)
Image.fromarray(int_X_reshape)


Out[6]:
(PIL image: the 20 training digits rendered as one 28 x 560 strip)
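The reshape/swapaxes trick above lays the digits out as one horizontal strip. The same idea extends to a rectangular grid; `tile_digits` below is a hypothetical helper for illustration, not something the notebook defines.

def tile_digits(flat_images, rows, cols):
    """Arrange rows*cols flattened 28x28 images into one (rows*28, cols*28) array."""
    imgs = (flat_images[:rows * cols] * 255).clip(0, 255).astype('uint8')
    grid = imgs.reshape(rows, cols, 28, 28)      # (row, col, y, x)
    grid = grid.swapaxes(1, 2)                   # (row, y, col, x)
    return grid.reshape(rows * 28, cols * 28)

Image.fromarray(tile_digits(train_set[0], 4, 5))   # 4 x 5 grid of the first 20 digits
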
In [7]:
# Normalization: scale each image vector to unit L2 length,
# so a dot product between two images equals their cosine similarity
train_X = train_set[0]
train_X = train_X / np.linalg.norm(train_X, axis=1, keepdims=True)
test_X = test_set[0]
test_X = test_X / np.linalg.norm(test_X, axis=1, keepdims=True)

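Because every row now has unit norm, a dot product of two rows equals their cosine similarity. As a sanity check on a small slice (not part of the original notebook), the result can be compared against scikit-learn's cosine_similarity:

from sklearn.metrics.pairwise import cosine_similarity

# Dot products of unit vectors vs. cosine similarity of the raw data:
# the two should agree up to floating-point error.
manual = test_X[:5] @ train_X[:5].T
reference = cosine_similarity(test_set[0][:5], train_set[0][:5])
print(np.allclose(manual, reference))   # expected: True
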
In [8]:
# Matrix multiplication: cosine similarity of every test image
# to every training image, giving a 10000 x 50000 matrix
A = test_X @ train_X.T

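A holds 10000 x 50000 double-precision values, roughly 4 GB. If that does not fit in memory, the nearest-neighbour search can be done test-chunk by test-chunk; a sketch of that idea (the chunk size is arbitrary):

def nearest_indices(test_X, train_X, chunk=1000):
    # Process the test set in chunks so only a (chunk x 50000) block
    # of similarities lives in memory at any time.
    out = np.empty(len(test_X), dtype=np.int64)
    for start in range(0, len(test_X), chunk):
        block = test_X[start:start + chunk] @ train_X.T
        out[start:start + chunk] = block.argmax(axis=1)
    return out
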
In [9]:
A.argmax(axis=1)


Out[9]:
array([44566, 28882, 15224, ...,  3261,  1311, 22424])

In [10]:
# 1-nearest-neighbour prediction: take the label of the most similar training image
y_predict = train_set[1][A.argmax(axis=1)]

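The same classifier can be expressed with scikit-learn's KNeighborsClassifier using the cosine metric. This is an equivalent formulation shown for comparison, not what the notebook actually runs (brute-force prediction over the full training set is slow):

from sklearn.neighbors import KNeighborsClassifier

# 1-NN with cosine distance is the same as picking the training image with the
# highest cosine similarity, as done above with the matrix product.
knn = KNeighborsClassifier(n_neighbors=1, metric='cosine', algorithm='brute')
knn.fit(train_set[0], train_set[1])
# y_predict_sklearn = knn.predict(test_set[0])   # slow, but should match y_predict
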
In [11]:
# First 20 test images (X), rescaled and tiled into one 28 x 560 strip
int_X = (test_set[0][:20]*255).clip(0, 255).astype('uint8')
int_X_reshape = int_X.reshape(-1, 28, 28).swapaxes(0, 1).reshape(28, -1)
Image.fromarray(int_X_reshape)
Image.fromarray(int_X_reshape)


Out[11]:
(PIL image: the 20 test digits rendered as one 28 x 560 strip)
In [13]:
# First 20 predicted labels (Y)
y_predict[:20]


Out[13]:
array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4])

In [14]:
# First 20 true labels (Y) of the test data
test_set[1][:20]


Out[14]:
array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4])

In [15]:
# Accuracy on the test set
(y_predict == test_set[1]).mean()


Out[15]:
0.9708

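To see where the 1-NN classifier fails, a confusion matrix can be computed. This is an extra diagnostic, not part of the original notebook:

from sklearn.metrics import confusion_matrix

# Rows are true digits, columns are predicted digits; off-diagonal
# entries show which digit pairs get confused most often.
cm = confusion_matrix(test_set[1], y_predict)
print(cm)
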
In [16]:
# Reduce the dimensionality from 784 to 60 with PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=60)
train_X = pca.fit_transform(train_set[0])
test_X = pca.transform(test_set[0])

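The choice of 60 components trades speed against retained information. After fitting, the fraction of variance kept can be inspected (a quick check, not in the original notebook):

# Fraction of the total variance retained by the 60 principal components.
print(pca.explained_variance_ratio_.sum())
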
In [17]:
# Re-normalize rows to unit length in the reduced 60-dimensional space
train_X /= np.linalg.norm(train_X, axis=1, keepdims=True)
test_X /= np.linalg.norm(test_X, axis=1, keepdims=True)

In [18]:
# Matrix multiplication: cosine similarities in the PCA space
A = test_X @ train_X.T

In [19]:
# 1-nearest-neighbour prediction in the PCA space
y_predict = train_set[1][A.argmax(axis=1)]

In [20]:
# Accuracy with 60-dimensional PCA features
(y_predict == test_set[1]).mean()


Out[20]:
0.97030000000000005

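Accuracy is essentially unchanged (0.9703 vs 0.9708) while each comparison now uses 60 numbers instead of 784. To see how accuracy depends on the number of components, a simple sweep might look like the following sketch (the component counts are chosen arbitrarily):

for k in (20, 40, 60, 100):
    pca_k = PCA(n_components=k)
    tr = pca_k.fit_transform(train_set[0])
    te = pca_k.transform(test_set[0])
    # Unit-normalize so the matrix product gives cosine similarities
    tr /= np.linalg.norm(tr, axis=1, keepdims=True)
    te /= np.linalg.norm(te, axis=1, keepdims=True)
    pred = train_set[1][(te @ tr.T).argmax(axis=1)]
    print(k, (pred == test_set[1]).mean())
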
In [ ]: