In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
from sklearn.preprocessing import OneHotEncoder
In [2]:
data = loadmat('../data/andrew_ml_ex33507/ex3data1.mat')
data
Out[2]:
In [3]:
X = data['X']
y = data['y']
X.shape, y.shape#看下维度
Out[3]:
In [4]:
# 目前考虑输入是图片的像素值,20*20像素的图片有400个输入层单元,不包括需要额外添加的加上常数项。 材料已经提供了训练好的神经网络的参数,有25个单元和10个输出单元(10个输出)
weight = loadmat("../data/andrew_ml_ex33507/ex3weights.mat")
theta1, theta2 = weight['Theta1'], weight['Theta2']
theta1.shape, theta2.shape
Out[4]:
In [6]:
sample_idx = np.random.choice(np.arange(data['X'].shape[0]), 100)
sample_images = data['X'][sample_idx, :]
#展示二进制图
fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(8, 8))
for r in range(5):
for c in range(5):
ax_array[r, c].matshow(np.array(sample_images[5 * r + c].reshape((20, 20))).T,cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
按照默认 我们设计一个输入层,一个隐藏层,一个输出层
前向传播和代价函数
在逻辑回归中,我们只有一个输出变量,又称标量(scalar),也只有一个因变量$y$,但是在神经网络中,我们可以有很多输出变量,我们的$h_\theta(x)$是一个维度为$K$的向量,并且我们训练集中的因变量也是同样维度的一个向量,因此我们的代价函数会比逻辑回归更加复杂一些,为:$\newcommand{\subk}[1]{ #1_k }$ $$h_\theta\left(x\right)\in \mathbb{R}^{K}$$ $${\left({h_\theta}\left(x\right)\right)}_{i}={i}^{th} \text{output}$$
$J(\Theta) = -\frac{1}{m} \left[ \sum\limits_{i=1}^{m} \sum\limits_{k=1}^{k} {y_k}^{(i)} \log \subk{(h_\Theta(x^{(i)}))} + \left( 1 - y_k^{(i)} \right) \log \left( 1- \subk{\left( h_\Theta \left( x^{(i)} \right) \right)} \right) \right] + \frac{\lambda}{2m} \sum\limits_{l=1}^{L-1} \sum\limits_{i=1}^{s_l} \sum\limits_{j=1}^{s_{l+1}} \left( \Theta_{ji}^{(l)} \right)^2$
In [7]:
def sigmoid(z):
return 1 / (1 + np.exp(-z))
In [62]:
#2st 上面传播规律,定义第一层,并计算第二层(隐藏层)的值,并添加额外值
def forward_propagate(X,theta1,theta2):
m= X.shape[0]
a1 = np.insert(X,0, values=np.ones(m), axis=1)
Z2 = a1*theta1.T
a2= np.insert(sigmoid(Z2),0, values=np.ones(m), axis=1)
Z3= a2*theta2.T
h= sigmoid(Z3)
return a1,Z2,a2,Z3,h
In [14]:
# 代价函数(不带规则化项(也叫权重衰减项) Y=R(5000*10) ,这里直接使用二维矩阵,代替循环累加
def cost(X,Y,theta1,theta2):
m = X.shape[0]
X = np.matrix(X)
Y = np.matrix(Y)
h=forward_propagate(X,theta1,theta2)
# multiply 矩阵size相同对应相乘
first = np.multiply(Y,np.log(h))
second = np.multiply((1-Y),np.log((1-h)))
J= np.sum(first+second)
J = (-1/m)*J
return J
In [11]:
# 对y标签进行编码 一开始我们得到的y是维500*1 的向量,但我们要把他编码成的矩阵。 比如说,原始y0=2,那么转化后的Y对应行就是[0,1,0...0],原始转化后的Y对应行就是[0,0...0,1]
# Scikitlearn有一个内置的编码函数,我们可以使用这个。
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_onehot.shape
Out[11]:
In [12]:
y[0], y_onehot[0,:] # y0是数字0
Out[12]:
In [ ]:
# 初始化设置
input_size = 400
num_labels = 10
In [15]:
cost(X, y_onehot,theta1, theta2)
Out[15]:
In [76]:
# 加入正则项
def cost_reg(X,Y,theta1,theta2,learning_rate):
m = X.shape[0]
X = np.matrix(X)
Y = np.matrix(Y)
_,_,_,_,h=forward_propagate(X,theta1,theta2)
first = np.multiply(Y,np.log(h))
second = np.multiply((1-Y),np.log((1-h)))
J= np.sum(first+second)
# 计算正则时,第一项时不参与计算
J = (-1/m)*J + (float(learning_rate) / (2 * m))*(np.sum(np.power(theta1[:,1:],2))+np.sum(np.power(theta2[:,1:],2)))
return J
In [165]:
# theta1.shape,theta2.shape
cost_reg(X, y_onehot,theta1, theta2,1)
Out[165]:
In [24]:
# 计算sigmoid函数的导数
def sigmoid_gradient(z):
return np.multiply(sigmoid(z) ,(1-sigmoid(z)))
# 检查
sigmoid_gradient(0)
Out[24]:
In [117]:
# 初始化设置
input_size = 400 #输入单元数量
hidden_size = 25 # y隐藏单元数量
num_labels = 10 # 输出单元数
epsilon = 0.001
theta01=np.random.rand(hidden_size,input_size+1) * 2*epsilon - epsilon# +1是添加偏置单元
theta02 =np.random.rand(num_labels,hidden_size+1)* 2*epsilon - epsilon
theta01.shape,theta02.shape
Out[117]:
步骤:
In [216]:
# 分别得出
def forward_propagateNEW(X,thetalist):
m= X.shape[0]
a = np.insert(X,0, values=np.ones(m), axis=1)
alist=[a]
zlist=[]
for i in range(len(thetalist)):
theta= thetalist[i]
z = a * theta
# a= np.insert(sigmoid(z),0, values=np.ones(m), axis=1)
a=sigmoid(z)
if(i<len(thetalist)-1):
a= np.insert(a,0, values=np.ones(m), axis=1)
zlist.append(z)
alist.append(a)
return zlist,alist
In [333]:
# Δ 用delta1 和delta2 替代
def backpropRegSelf(input_size, hidden_size, num_labels, X, y, learning_rate,L=3): # 随机化后的 这里为3层
m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)
#初始化参数
theta1 = (np.random.random((input_size+1,hidden_size))- 0.5)* 0.24
theta2 = (np.random.random((hidden_size+1,num_labels))- 0.5)* 0.24
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y) # 格式化y
# 前向计算 每层值
theta = [theta1, theta2]
zlist,alist = forward_propagateNEW(X, theta)# 返回 a1 z2 a2 。。。
# 初始化Deta
Delta=[]
for th in theta:
Delta.append(np.zeros(th.shape))
for i in range(m):
# 以计算a z
for l in range(L,1,-1): # 3,2 表示层数,最后一层已经算出来,单独列放
#最后一层
if l==L:
delta=alist[-1][i,:]-y_onehot[i,:] # 最后一层得δ
Delta[l-2] = Delta[l-2] + alist[l-2][i,:].T * delta
else:
zl = zlist[l-2][i,:]
zl = np.insert(zl, 0, values=np.ones(1)) # (1, 26) 怎加偏执项
# d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t)) # (1, 26)
# delta1 = delta1 + (d2t[:,1:]).T * a1t
delta = np.multiply(delta*theta[l-1].T, sigmoid_gradient(zl)) #
# 因为数组从零开始,且 Delta 为 1 2 层开始 delta 从2 层开始 # (25, 401)# (10, 26)
Delta[l-2] = Delta[l-2] + alist[l-2][i,:].T * delta[:,1:]
# add the gradient regularization term
gradAll = None
for j in range(len(Delta)):
Delta[j][:,1:] = Delta[j][:,1:]/m + (theta[j][:,1:] * learning_rate) / m
if gradAll is None:
gradAll = np.ravel(Delta[j])
else:
tmp=np.ravel(Delta[j])
gradAll = np.concatenate([gradAll,tmp])
# Delta[:,:,1:] = Delta[:,:,1:] + (theta[:,:,1:] * learning_rate) / m
return gradAll
In [334]:
grad2= backpropRegSelf(input_size, hidden_size, num_labels, X, y, 1)
print(grad2.shape)
In [97]:
def backpropReg(params, input_size, hidden_size, num_labels, X, y, learning_rate):
m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)
# reshape the parameter array into parameter matrices for each layer
theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
# run the feed-forward pass
a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
# initializations
J = 0
delta1 = np.zeros(theta1.shape) # (25, 401)
delta2 = np.zeros(theta2.shape) # (10, 26)
# compute the cost
for i in range(m):
first_term = np.multiply(-y[i,:], np.log(h[i,:]))
second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
J += np.sum(first_term - second_term)
J = J / m
# add the cost regularization term
J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
# perform backpropagation
for t in range(m):
a1t = a1[t,:] # (1, 401)
z2t = z2[t,:] # (1, 25)
a2t = a2[t,:] # (1, 26)
ht = h[t,:] # (1, 10)
yt = y[t,:] # (1, 10)
d3t = ht - yt # (1, 10)
z2t = np.insert(z2t, 0, values=np.ones(1)) # (1, 26)
d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t)) # (1, 26)
delta1 = delta1 + (d2t[:,1:]).T * a1t
delta2 = delta2 + d3t.T * a2t
delta1 = delta1 / m
delta2 = delta2 / m
# add the gradient regularization term
delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m
delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m
# unravel the gradient matrices into a single array
grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
return J, grad
In [289]:
# np.random.random(size) 返回size大小的0-1随机浮点数
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.24
j,grad = backpropReg(params, input_size, hidden_size, num_labels, X, y, 1)
print(j,grad.shape)
# j2,grad2= backpropRegSelf(input_size, hidden_size, num_labels, X, y, 1)
# print(j2,grad2[0:10])
In [175]:
# #J θ
# input_size = 400 #输入单元数量
# hidden_size = 25 # y隐藏单元数量
# num_labels = 10 # 输出单元数
def jcost(X, y,input_size, hidden_size, output_size,theta):
m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)
theta1 = np.reshape(theta[0:hidden_size*(input_size+1)],(hidden_size,input_size+1))#(25,401)
theta2 = np.reshape(theta[hidden_size*(input_size+1):],(output_size,hidden_size+1))#(10.26)
_,_,_,_,h=forward_propagate(X,theta1,theta2)
# multiply 矩阵size相同对应相乘
first = np.multiply(y,np.log(h))
second = np.multiply((1-y),np.log((1-h)))
J= np.sum(first+second)
J = (-1/m)*J
return J
In [176]:
def check(X,y,theta1,theta2,eps):
theta = np.concatenate((np.ravel(theta1), np.ravel(theta2)))
gradapprox=np.zeros(len(theta))
for i in range(len(theta)):
thetaplus = theta
thetaplus[i] = thetaplus[i] + eps
thetaminus = theta
thetaminus[i] = thetaminus[i] - eps
gradapprox[i] = (jcost(X,y,input_size,hidden_size,num_labels,thetaplus) - jcost(X,y,input_size,hidden_size,num_labels,thetaminus)) / (2 * epsilon)
return gradapprox
In [335]:
# theta01.shape , theta02.shape
# 计算很慢
gradapprox = check(X,y_onehot,theta1,theta2,0.001)
numerator = np.linalg.norm(grad2-gradapprox, ord=2) # Step 1'
denominator = np.linalg.norm(grad2, ord=2) + np.linalg.norm(gradapprox, ord=2) # Step 2'
difference = numerator / denominator
print(difference)
In [99]:
# 使用工具库计算参数最优解
from scipy.optimize import minimize
# opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
fmin = minimize(fun=backpropReg, x0=(params), args=(input_size, hidden_size, num_labels, X, y_onehot, learning_rate),
method='TNC', jac=True, options={'maxiter': 250})
fmin
Out[99]:
In [189]:
X = np.matrix(X)
thetafinal1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
thetafinal2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
In [343]:
print(thetafinal1[0,1],grad2[1])
In [190]:
# 计算使用优化后的θ得出的预测
a1, z2, a2, z3, h = forward_propagate(X, thetafinal1, thetafinal2 )
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred
Out[190]:
In [191]:
# 最后,我们可以计算准确度,看看我们训练完毕的神经网络效果怎么样。
# 预测值与实际值比较
from sklearn.metrics import classification_report#这个包是评价报告
print(classification_report(y, y_pred))
In [103]:
hidden_layer = thetafinal1[:, 1:]
hidden_layer.shape
Out[103]:
In [104]:
fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(12, 12))
for r in range(5):
for c in range(5):
ax_array[r, c].matshow(np.array(hidden_layer[5 * r + c].reshape((20, 20))),cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
In [ ]: