In [1]:
# %pylab populates the notebook namespace with numpy and matplotlib
# (np, plt, plus star-imports such as exp/dot/log/imshow used below).
# NOTE(review): %pylab is deprecated in modern IPython — explicit
# `import numpy as np` + `%matplotlib inline` is preferred, but the
# cells below rely on the star-imported names, so it is kept here.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import os
import sys
import urllib

dataset = 'mnist.pkl.gz'

def reporthook(blocknum, blocksize, totalsize):
    """Progress callback for urlretrieve.

    blocknum:  number of blocks transferred so far
    blocksize: size of each block in bytes
    totalsize: total size of the file in bytes
    Rewrites a single console line with the download percentage.
    """
    # Cap at 100%: the final block usually overshoots the reported total.
    pct = min(blocknum * blocksize * 100.0 / totalsize, 100.0)
    sys.stdout.write("\rdownloading: %5.1f%%" % pct)
    sys.stdout.flush()

if not os.path.isfile(dataset):
    origin = "https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz"
    print('Downloading data from %s' % origin)
    # NOTE: Python 2 API; under Python 3 this is urllib.request.urlretrieve.
    urllib.urlretrieve(origin, dataset, reporthook=reporthook)


Downloading data from https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz
downloading: 100.0%

In [3]:
import gzip
import pickle

# The archive contains three (images, labels) pairs:
# 50k training, 10k validation, 10k test examples.
# NOTE(review): pickle.load runs arbitrary code from the file — acceptable
# here only because the archive comes from a known source.
with gzip.open(dataset, 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f)

In [4]:
# Each split is (images, labels): images are flattened 28x28 float rows.
print("train_set %s %s" % (train_set[0].shape, train_set[1].shape))
print("valid_set %s %s" % (valid_set[0].shape, valid_set[1].shape))
print("test_set %s %s" % (test_set[0].shape, test_set[1].shape))


train_set (50000L, 784L) (50000L,)
valid_set (10000L, 784L) (10000L,)
test_set (10000L, 784L) (10000L,)

In [5]:
# Display the first training image; each row of train_set[0] is a flattened 28x28 image.
plt.imshow(train_set[0][0].reshape(28, 28), cmap="gray")


Out[5]:
<matplotlib.image.AxesImage at 0x5dc7c18>

In [6]:
def show(x, i=[0]):
    """Render a flattened 28x28 MNIST image in its own figure.

    The mutable default `i` is used deliberately as a persistent call
    counter so that every call opens a new figure number.
    """
    plt.figure(i[0])
    plt.imshow(x.reshape(28, 28), cmap="gray")
    i[0] += 1

# Labels and images of the first five training examples.
for k in range(5):
    print(train_set[1][k])
    show(train_set[0][k])


5
0
4
1
9

In [7]:
# Random initial parameters for softmax regression:
# W: (784, 10) weight matrix, b: (10,) bias, both uniform in [-1, 1].
W = np.random.uniform(-1.0, 1.0, size=(784, 10))
b = np.random.uniform(-1.0, 1.0, size=10)

In [8]:
# Take the first training example (flattened image x, integer label y)
# as a running single-sample illustration.
x, y = train_set[0][0], train_set[1][0]

In [9]:
# Unnormalised softmax scores for this example: exp(x·W + b).
Pr = np.exp(np.dot(x, W) + b)
Pr.shape


Out[9]:
(10L,)

In [10]:
# Normalise the exponentiated scores into softmax probabilities (sum to 1).
Pr = Pr / Pr.sum()
print(Pr)


[  2.75874373e-03   5.86601964e-04   6.84430338e-01   2.50472684e-05
   2.37169047e-07   1.85013911e-03   1.18603684e-06   1.78285187e-05
   1.15474516e-01   1.94855361e-01]

In [11]:
# Cross-entropy loss for the example: minus the log-probability of the true class.
loss = -np.log(Pr[y])
loss


Out[11]:
6.2924944465420536

In [12]:
# dL/db for softmax + cross-entropy is Pr - onehot(y):
# copy the probabilities, then subtract 1 at the true label.
gradb = np.copy(Pr)
gradb[y] -= 1
print(gradb)


[  2.75874373e-03   5.86601964e-04   6.84430338e-01   2.50472684e-05
   2.37169047e-07  -9.98149861e-01   1.18603684e-06   1.78285187e-05
   1.15474516e-01   1.94855361e-01]

In [13]:
gradb = Pr.copy()
gradb[y] -= 1
print gradb


[  2.75874373e-03   5.86601964e-04   6.84430338e-01   2.50472684e-05
   2.37169047e-07  -9.98149861e-01   1.18603684e-06   1.78285187e-05
   1.15474516e-01   1.94855361e-01]

In [14]:
# dL/dW for softmax + cross-entropy is outer(x, Pr - onehot(y)):
# compute outer(x, Pr), then subtract x from the true-label column.
print("%s %s %s" % (Pr.shape, x.shape, W.shape))
gradW = np.outer(x, Pr)
gradW[:, y] -= x


(10L,) (784L,) (784L, 10L)

In [15]:
# One gradient-descent update on this single example (learning rate 0.1).
W -= 0.1 * gradW
b -= 0.1 * gradb

In [16]:
# Re-evaluate the softmax probabilities and loss on the same example
# after the update; the loss should drop sharply from its previous value.
Pr = np.exp(np.dot(x, W) + b)
Pr = Pr / Pr.sum()
loss = -np.log(Pr[y])
loss


Out[16]:
0.0044462033731483561

In [17]:
# Plain per-example SGD on softmax regression.
W = np.random.uniform(low=-1, high=1, size=(28*28, 10))
b = np.random.uniform(low=-1, high=1, size=10)
score = 0                         # exponentially-decayed running accuracy
n_train = train_set[0].shape[0]   # derive dataset size instead of hard-coding 50000
N = n_train * 20                  # total update steps (20 passes over the data)
d = 0.001                         # decay rate of the running accuracy
learning_rate = 1e-2
for i in xrange(N):
    if i % n_train == 0:
        # report the running accuracy once per pass
        print("%d %5.3f%%" % (i, score * 100))
    x = train_set[0][i % n_train]
    y = train_set[1][i % n_train]
    # forward pass: softmax probabilities for this example
    Pr = np.exp(np.dot(x, W) + b)
    Pr = Pr / Pr.sum()
    loss = -np.log(Pr[y])   # not used for the update; kept for inspection
    # update the decayed running accuracy
    score *= (1 - d)
    if Pr.argmax() == y:
        score += d
    # gradients of the cross-entropy loss: Pr - onehot(y) for b,
    # outer(x, Pr - onehot(y)) for W
    gradb = Pr.copy()
    gradb[y] -= 1
    gradW = np.outer(x, Pr)
    gradW[:, y] -= x
    W -= learning_rate * gradW
    b -= learning_rate * gradb


0 0.000%
50000 87.524%
100000 88.880%
150000 89.807%
200000 90.120%
250000 90.543%
300000 90.693%
350000 90.968%
400000 91.308%
450000 91.262%
500000 91.401%
550000 91.547%
600000 91.668%
650000 91.675%
700000 91.819%
750000 91.732%
800000 91.805%
850000 91.806%
900000 91.821%
950000 91.899%

In [18]:
def compute_Pr(x):
    """Softmax class probabilities for a batch of flattened images.

    x: array of shape (n, 784); a single (784,) vector also works now
    that the reduction uses axis=-1. Uses the global parameters W and b.
    Returns probabilities of the same leading shape with trailing dim 10,
    each distribution summing to 1.
    """
    z = np.dot(x, W) + b
    # Subtract the per-row max before exponentiating so large logits
    # cannot overflow exp(); this leaves the softmax value unchanged.
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def compute_accuracy(Pr, y):
    """Fraction of rows of Pr whose argmax matches the label vector y."""
    return np.mean(Pr.argmax(axis=-1) == y)

In [19]:
# Minibatch SGD on softmax regression with periodic held-out evaluation.
W = np.random.uniform(low=-1, high=1, size=(28*28, 10))
b = np.random.uniform(low=-1, high=1, size=10)
N = 50000 * 100
batch_size = 500
learning_rate = .7
# NOTE: renamed the outer loop variable to `step` — the original reused `i`
# for the list comprehension and inner loop, which (in Python 2) leak and
# clobber the name.  Also dropped the unused leftover `score = 0`.
for step in xrange(0, N, batch_size):
    if step % 100000 == 0:
        # periodically report accuracy on the held-out sets
        x, y = test_set[0], test_set[1]
        test_score = compute_accuracy(compute_Pr(x), y) * 100
        x, y = valid_set[0], valid_set[1]
        valid_score = compute_accuracy(compute_Pr(x), y) * 100
        print("%d %5.2f%% %5.2f%%" % (step, test_score, valid_score))
    # sample a random minibatch of training examples
    rndidx = np.random.choice(train_set[0].shape[0], batch_size, replace=False)
    x, y = train_set[0][rndidx], train_set[1][rndidx]
    # softmax probabilities for the whole batch at once
    Pr = compute_Pr(x)
    # one-hot encode the labels; used by both gradients
    onehot = np.zeros_like(Pr)
    onehot[np.arange(batch_size), y] = 1
    # average gradients over the batch: dL/db = mean(Pr - onehot),
    # dL/dW = x^T (Pr - onehot) / batch  (vectorized form of the
    # original per-example subtraction loop)
    gradb = (Pr - onehot).mean(axis=0)
    gradW = np.dot(x.T, Pr - onehot) / batch_size
    # update the parameters
    W -= learning_rate * gradW
    b -= learning_rate * gradb


0 12.88% 12.78%
100000 86.91% 86.90%
200000 89.04% 89.06%
300000 89.79% 89.98%
400000 90.17% 90.43%
500000 90.47% 90.61%
600000 90.74% 90.93%
700000 90.87% 91.06%
800000 91.19% 91.26%
900000 91.01% 91.44%
1000000 91.13% 91.59%
1100000 91.37% 91.54%
1200000 91.35% 91.54%
1300000 91.41% 91.73%
1400000 91.41% 91.64%
1500000 91.51% 91.86%
1600000 91.48% 91.91%
1700000 91.63% 91.93%
1800000 91.44% 92.06%
1900000 91.57% 92.12%
2000000 91.75% 92.13%
2100000 91.92% 92.16%
2200000 91.84% 92.31%
2300000 91.71% 92.28%
2400000 91.59% 92.14%
2500000 91.92% 92.21%
2600000 91.90% 92.47%
2700000 92.05% 92.41%
2800000 91.79% 92.32%
2900000 91.99% 92.39%
3000000 91.98% 92.34%
3100000 91.81% 92.49%
3200000 91.98% 92.49%
3300000 91.98% 92.56%
3400000 92.00% 92.41%
3500000 91.86% 92.43%
3600000 91.99% 92.64%
3700000 92.13% 92.63%
3800000 92.14% 92.60%
3900000 91.98% 92.67%
4000000 92.14% 92.63%
4100000 92.25% 92.70%
4200000 92.02% 92.47%
4300000 92.09% 92.60%
4400000 92.18% 92.54%
4500000 92.16% 92.50%
4600000 92.08% 92.52%
4700000 92.08% 92.51%
4800000 92.07% 92.63%
4900000 92.19% 92.64%

In [20]:
# Final accuracy on the test, validation, and training splits.
x, y = test_set[0], test_set[1]
Pr = compute_Pr(x)
test_score = compute_accuracy(Pr, y) * 100
x, y = valid_set[0], valid_set[1]
Pr = compute_Pr(x)
valid_score = compute_accuracy(Pr, y) * 100
print("test accuracy %5.2f%% valid accuracy %5.2f%%" % (test_score, valid_score))
x, y = train_set[0], train_set[1]
Pr = compute_Pr(x)
train_score = compute_accuracy(Pr, y) * 100
print("train accuracy %5.2f%%" % train_score)


test accuracy 92.10% valid accuracy 92.71%
train accuracy 93.14%

In [21]:
# Predictions vs. ground truth for the first ten test images.
x, y = test_set[0][:10], test_set[1][:10]
Pr = compute_Pr(x)
print(Pr.argmax(axis=1))
print(y)
for k in range(10):
    show(x[k])


[7 2 1 0 4 1 4 9 6 9]
[7 2 1 0 4 1 4 9 5 9]

In [22]:
# Show the misclassified images among the first 100 test examples,
# printing "predicted actual" for each mistake.
x, y = test_set[0][:100], test_set[1][:100]
Pr = compute_Pr(x)
y2 = Pr.argmax(axis=1)
for k in range(100):
    if y2[k] != y[k]:
        print("%d %d" % (y2[k], y[k]))
        show(x[k])


6 5
6 4
2 3
3 6

In [ ]: