In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Font settings so that non-ASCII labels and the minus sign render correctly
plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
In [32]:
# Create a synthetic dataset
# Generate a batch of reasonably separated points and plot them for a first look
data = pd.DataFrame({
    'Exam1': np.concatenate((np.random.uniform(30, 60, 20), np.random.uniform(60, 80, 20))),
    'Exam2': np.concatenate((np.random.uniform(30, 60, 20), np.random.uniform(60, 80, 20))),
    'result': np.concatenate((np.zeros(20), np.ones(20)))})
# Plot the points by class
positive = data[data['result'].isin([1])]
negative = data[data['result'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam1'], positive['Exam2'], s=50, c='b', marker='o', label='Passed')
ax.scatter(negative['Exam1'], negative['Exam2'], s=50, c='r', marker='x', label='Failed')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
In [9]:
# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
Cost function: $J\left( \theta \right)=-\frac{1}{m}\sum\limits_{i=1}^{m}{[{{y}^{(i)}}\log \left( {h_\theta}\left( {{x}^{(i)}} \right) \right)+\left( 1-{{y}^{(i)}} \right)\log \left( 1-{h_\theta}\left( {{x}^{(i)}} \right) \right)]}$
Gradient and its derivation (the first line rewrites the cost using ${h_\theta}(x)=\frac{1}{1+{{e}^{-{\theta^T}x}}}$):

$\frac{\partial }{\partial {\theta_{j}}}J\left( \theta \right)=\frac{\partial }{\partial {\theta_{j}}}[-\frac{1}{m}\sum\limits_{i=1}^{m}{[-{{y}^{(i)}}\log \left( 1+{{e}^{-{\theta^{T}}{{x}^{(i)}}}} \right)-\left( 1-{{y}^{(i)}} \right)\log \left( 1+{{e}^{{\theta^{T}}{{x}^{(i)}}}} \right)]}]$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{[-{{y}^{(i)}}\frac{-x_{j}^{(i)}{{e}^{-{\theta^{T}}{{x}^{(i)}}}}}{1+{{e}^{-{\theta^{T}}{{x}^{(i)}}}}}-\left( 1-{{y}^{(i)}} \right)\frac{x_j^{(i)}{{e}^{{\theta^T}{{x}^{(i)}}}}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}}]}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{[{{y}^{(i)}}\frac{x_j^{(i)}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}}-\left( 1-{{y}^{(i)}} \right)\frac{x_j^{(i)}{{e}^{{\theta^T}{{x}^{(i)}}}}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}}]}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{\frac{{{y}^{(i)}}x_j^{(i)}-x_j^{(i)}{{e}^{{\theta^T}{{x}^{(i)}}}}+{{y}^{(i)}}x_j^{(i)}{{e}^{{\theta^T}{{x}^{(i)}}}}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}}}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{\frac{{{y}^{(i)}}\left( 1+{{e}^{{\theta^T}{{x}^{(i)}}}} \right)-{{e}^{{\theta^T}{{x}^{(i)}}}}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}}x_j^{(i)}}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{({{y}^{(i)}}-\frac{{{e}^{{\theta^T}{{x}^{(i)}}}}}{1+{{e}^{{\theta^T}{{x}^{(i)}}}}})x_j^{(i)}}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{({{y}^{(i)}}-\frac{1}{1+{{e}^{-{\theta^T}{{x}^{(i)}}}}})x_j^{(i)}}$

$=-\frac{1}{m}\sum\limits_{i=1}^{m}{[{{y}^{(i)}}-{h_\theta}\left( {{x}^{(i)}} \right)]x_j^{(i)}}$

$=\frac{1}{m}\sum\limits_{i=1}^{m}{[{h_\theta}\left( {{x}^{(i)}} \right)-{{y}^{(i)}}]x_j^{(i)}}$
In [88]:
# Cost function (the gradient-descent functions follow in the next cells)
def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return -np.sum(first + second) / len(X)
Use ${\theta_j}:={\theta_j}-\alpha \frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}}){{x}_{j}}^{(i)}}$ to update all values of $\theta$ simultaneously.
In [110]:
# Gradient descent, first version: user-supplied learning rate and iteration count
def gradientSelf(theta, X, y, alpha, num):
    # theta is expected to be a 1 x n np.matrix; it is updated in place and returned
    X = np.matrix(X)
    y = np.matrix(y)
    tc = int(theta.ravel().shape[1])          # number of parameters
    for i in range(num):
        dist = sigmoid(X * theta.T) - y       # h_theta(x) - y for all samples
        for j in range(tc):
            term = np.multiply(dist, X[:, j])
            theta[0, j] = theta[0, j] - (alpha / len(X)) * np.sum(term)
    return theta
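The double loop above is easy to follow but slow in pure Python. As a point of comparison, the same batch update can also be written in fully vectorized form; this is a minimal alternative sketch (the name `gradientDescentVec` is not used anywhere else in this notebook).

# Fully vectorized batch gradient descent: theta := theta - (alpha/m) * X.T (h - y)
def gradientDescentVec(theta, X, y, alpha, num):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    for _ in range(num):
        grad = (X.T * (sigmoid(X * theta.T) - y)).T / len(X)   # 1 x n gradient
        theta = theta - alpha * grad
    return theta

# Should converge to roughly the same theta as gradientSelf, e.g. (using the X, y built a few cells below):
# gradientDescentVec(np.zeros(3), X, y, 0.05, 100000)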
In [93]:
## Plain gradient function (no learning rate and no theta update; equivalent to a single
## pass that just returns the gradient), intended to be passed to a third-party optimizer
def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    tc = int(theta.ravel().shape[1])
    grad = np.zeros(tc)
    dist = sigmoid(X * theta.T) - y
    for j in range(tc):
        term = np.multiply(dist, X[:, j])
        grad[j] = np.sum(term) / len(X)
    return grad
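As a sanity check on the derivation above, the analytic gradient returned by `gradient` can be compared against a central finite-difference approximation of `cost` on a tiny random problem (a minimal sketch; the variable names here are arbitrary).

# Numerical gradient check: d/d(theta_j) cost ~= (cost(theta + eps*e_j) - cost(theta - eps*e_j)) / (2*eps)
np.random.seed(0)
Xc = np.hstack([np.ones((5, 1)), np.random.randn(5, 2)])   # 5 samples: intercept + 2 features
yc = np.array([[0.0], [1.0], [1.0], [0.0], [1.0]])
th = np.random.randn(1, 3)

eps = 1e-6
numeric = np.zeros(3)
for j in range(3):
    d = np.zeros((1, 3))
    d[0, j] = eps
    numeric[j] = (cost(th + d, Xc, yc) - cost(th - d, Xc, yc)) / (2 * eps)

print(np.allclose(numeric, gradient(th, Xc, yc), atol=1e-6))   # expected: True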
In [37]:
# Run on the synthetic data
# Add a constant column of ones (intercept term)
data.insert(0, 'Ones', 1)
# Initialize X, y, theta
cols = data.shape[1]
X = data.iloc[:,0:cols-1]
y = data.iloc[:,cols-1:cols]
# Convert X and y to NumPy arrays
X = np.array(X.values)
y = np.array(y.values)
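Before running gradient descent, a quick sanity check on `cost`: with $\theta = 0$ every prediction equals $0.5$, so the cost should be $\ln 2 \approx 0.693$ regardless of the labels.

# With theta = 0, sigmoid(X*theta.T) is 0.5 everywhere, so the cost is ln(2) ~= 0.693
print(cost(np.zeros(3), X, y))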
In [118]:
theta = np.matrix(np.zeros(3))
selfpredict=gradientSelf(theta,X,y,0.05,100000)
print(selfpredict)
In [119]:
# Plot the decision boundary. With two features: sigmoid(z) = 1/2 => z = 0 => x*θ.T = 0 => x2 = (-θ0 - θ1*x1)/θ2
# thetaR = np.array([-130.02753949, 0.767743 , 1.38019249])
thetaR = np.array([-133.85163041 , 1.1383581 , 1.09589532])
prob = sigmoid(X @ thetaR)   # fitted probabilities (not used for the plot below)
fig, ax = plt.subplots(figsize=(10,6))
plotting_x1 = np.linspace(20,100, 100)
plotting_h1 = ( - thetaR[0] - thetaR[1] * plotting_x1) / thetaR[2]
ax.plot(plotting_x1, plotting_h1, 'y', label='Prediction')
ax.scatter(positive['Exam1'], positive['Exam2'], s=50, c='b', marker='o', label='Passed')
ax.scatter(negative['Exam1'], negative['Exam2'], s=50, c='r', marker='x', label='Failed')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
In [89]:
# Solve for the parameters with a third-party optimizer
import scipy.optimize as opt
theta = np.matrix(np.zeros(3))
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
# Plug the result back into the plot above
Out[89]:
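`opt.fmin_tnc` is the older SciPy interface; the same parameters can also be obtained through `scipy.optimize.minimize` with the TNC method. A minimal equivalent sketch (the variable `res` is just for illustration):

# Same optimization through the newer minimize() interface
res = opt.minimize(fun=cost, x0=np.zeros(3), args=(X, y), method='TNC', jac=gradient)
print(res.x)   # fitted theta, comparable to result[0] above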
In [122]:
# Prediction function
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    return [1 if x >= 0.5 else 0 for x in probability]
# Measure prediction accuracy
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
accuracy = (sum(map(int, correct)) / len(correct))
print ('accuracy = {0}%'.format(accuracy*100))
In [128]:
data_init = pd.read_csv('../data/weekRegData.txt', header=None, names=['x1', 'x2', 'Accepted'])
data_init.head()
Out[128]:
In [129]:
positive2 = data_init[data_init['Accepted'].isin([1])]
negative2 = data_init[data_init['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['x1'], positive2['x2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['x1'], negative2['x2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('x1')
ax.set_ylabel('x2')
plt.show()
This dataset cannot be split into its two classes with a straight line as before, and logistic regression on the raw features only produces a linear decision boundary, so it cannot be applied to this dataset directly. Viewed as a polynomial fit, the higher the degree in x, the better the training fit, but the worse the model's predictive ability may become.
For linear regression we previously derived two learning algorithms: one based on gradient descent and one based on the normal equation.
The cost function for regularized linear regression is:
$J\left( \theta \right)=\frac{1}{2m}\left[ \sum\limits_{i=1}^{m}{{{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})}^{2}}}+\lambda \sum\limits_{j=1}^{n}{\theta _{j}^{2}} \right]$
To minimize this cost function with gradient descent, note that we do not regularize $\theta_0$, so the update splits into two cases:
$Repeat$ $until$ $convergence$ {
${\theta_0}:={\theta_0}-a\frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{0}^{(i)}}$
${\theta_j}:={\theta_j}-a[\frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)}}+\frac{\lambda }{m}{\theta_j}]$
$for$ $j=1,2,...n$
}
Rearranging the update for $j=1,2,...,n$ in the algorithm above gives:
${\theta_j}:={\theta_j}(1-a\frac{\lambda }{m})-a\frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)}}$
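The cells below apply the same regularization idea to logistic regression rather than linear regression. For reference, the regularized logistic regression cost that `costReg` implements (with $\theta_0$ excluded from the penalty) is:

$J\left( \theta \right)=-\frac{1}{m}\sum\limits_{i=1}^{m}{[{{y}^{(i)}}\log \left( {h_\theta}\left( {{x}^{(i)}} \right) \right)+\left( 1-{{y}^{(i)}} \right)\log \left( 1-{h_\theta}\left( {{x}^{(i)}} \right) \right)]}+\frac{\lambda }{2m}\sum\limits_{j=1}^{n}{\theta _{j}^{2}}$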
In [196]:
# Regularized cost function (the parameter named learningRate here is actually the
# regularization strength lambda)
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:, 1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
In [135]:
# A better way to use this dataset is to create additional features from each example.
# Here we add powers of x1 and x2 up to degree 6.
degree = 6
data2 = data_init   # note: data2 is the same DataFrame object as data_init, not a copy
x1 = data2['x1']
x2 = data2['x2']
data2.insert(0, 'Ones', 1)
for i in range(2, degree + 1):   # x1 and x2 themselves are already columns, so start at power 2
    tmp = i * 2
    data2.insert(tmp - 1, 'x1' + str(i), np.power(x1, i))
    data2.insert(tmp, 'x2' + str(i), np.power(x2, i))
data2.head()
Out[135]:
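The cell above only adds pure powers $x_1^i$ and $x_2^i$. A fuller degree-6 mapping would also include cross terms such as $x_1 x_2$ or $x_1^2 x_2^3$; as an illustrative sketch (scikit-learn is not used anywhere else in this notebook), such a mapping could be generated with `sklearn.preprocessing.PolynomialFeatures`:

# Hypothetical alternative feature mapping with cross terms included
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=6, include_bias=True)
X_full = poly.fit_transform(data_init[['x1', 'x2']].values)
print(X_full.shape)   # (m, 28): every monomial x1^a * x2^b with a + b <= 6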
In [148]:
# Regularized gradient descent (self-implemented; learningRate is lambda, alpha is the step size)
def gradientRegSelf(theta, X, y, alpha, num, learningRate):
    X = np.matrix(X)
    y = np.matrix(y)
    tc = int(theta.ravel().shape[1])
    for i in range(num):
        dist = sigmoid(X * theta.T) - y
        for j in range(tc):
            term = np.multiply(dist, X[:, j])
            if j == 0:
                # theta_0 is not regularized
                theta[0, j] = theta[0, j] - (alpha / len(X)) * np.sum(term)
            else:
                theta[0, j] = theta[0, j] * (1 - alpha * learningRate / len(X)) - (alpha / len(X)) * np.sum(term)
    return theta
In [186]:
# Regularized gradient, for the third-party optimizer (returns the gradient only)
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:, i])
    return grad
In [149]:
cols = data2.shape[1]
X1 = data2.iloc[:,0:cols-1]
y1 = data2.iloc[:,cols-1:cols]
# Convert X and y to NumPy arrays
X1 = np.array(X1.values)
y1 = np.array(y1.values)
theta = np.matrix(np.zeros(13))
# Run the self-implemented regularized gradient descent
selfpredict=gradientRegSelf(theta,X1,y1,0.05,100000,40)
print(selfpredict)
In [217]:
# Solve for the parameters with the optimizer library
import scipy.optimize as opt
theta = np.matrix(np.zeros(13))
learnRate=1
result2 = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg, args=(X1, y1,learnRate))
result2
Out[217]:
In [195]:
# Predict and measure accuracy
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X1)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y1)]
accuracy = (sum(map(int, correct)) / len(correct))
print ('accuracy = {0}%'.format(accuracy*100))
In [219]:
# Draw the decision boundary: compute z = X*θ.T
def funcZ(theta, x1, x2):
    temp = theta[0]
    for i in range(1, degree + 1):
        temp += np.multiply(np.power(x1, i), theta[2 * i - 1]) + np.multiply(np.power(x2, i), theta[2 * i])
    return temp
# Boundary
# Rather than solving the polynomial equation x0 + a*x1^2 + b*x2^2 + c*x1^3 + ... = 0 analytically,
# use the fact that the boundary is X*θ.T = z with z = 0: sample a large grid of points and keep
# the (x1, x2) pairs whose z value is approximately 0
def find_decision_boundary(theta):
    # Choose a suitable range, sample many points, then filter
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)
    cordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*cordinates)
    h_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    h_val['hval'] = funcZ(theta, h_val['x1'], h_val['x2'])
    decision = h_val[np.abs(h_val['hval']) < 10**-3]
    return decision.x1, decision.x2
In [222]:
# The regularization strength is 1 here; adjust it and compare, e.g. learnRate=0 (overfitting) and learnRate=100 (underfitting)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['x1'], positive2['x2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['x1'], negative2['x2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result2[0])
plt.scatter(x, y, c='y', s=10, label='Prediction')
# ax.plot(x, y, 'y', label='Prediction')
ax.legend()
plt.show()
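Scattering the near-zero grid points works, but the boundary can also be traced directly by letting matplotlib draw the zero level set of $z$ with a contour plot. A minimal alternative sketch reusing `funcZ` and the fitted parameters in `result2[0]`:

# Alternative: draw the z = 0 level set with a contour plot
u = np.linspace(-1, 1.5, 200)
v = np.linspace(-1, 1.5, 200)
U, V = np.meshgrid(u, v)
Z = funcZ(result2[0], U.ravel(), V.ravel()).reshape(U.shape)

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['x1'], positive2['x2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['x1'], negative2['x2'], s=50, c='r', marker='x', label='Rejected')
ax.contour(U, V, Z, levels=[0], colors='y')   # decision boundary
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
ax.legend()
plt.show()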
In [ ]: