In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from numpy.linalg import inv, qr, svd, solve, lstsq
import seaborn as sns
%matplotlib inline



In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()


Out[3]:
x1 x2 x3 y
0 0.511942 0.412118 0.540302 3.135134
1 -0.385495 -0.756802 0.753902 1.189656
2 -0.016638 0.989358 0.540302 2.404029
3 0.786973 0.141120 1.000000 6.359970
4 0.395976 0.656987 0.540302 2.882558

In [4]:
df.describe()


Out[4]:
x1 x2 x3 y
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.030942 0.196471 0.059228 2.073943
std 1.024607 0.656093 0.722516 3.700031
min -3.026895 -0.958924 -0.989992 -8.108243
25% -0.646609 -0.279415 -0.653644 -0.418985
50% 0.018040 0.412118 0.283662 2.094814
75% 0.678942 0.841471 0.753902 4.745579
max 3.862828 0.989358 1.000000 12.421685

Linear regression

Solving directly with the closed-form solution


In [5]:
df['x4'] = 1

In [6]:
X = df.iloc[:, [0, 1, 2, 4]].values  # x1, x2, x3 and the constant column x4

In [7]:
y = df.y.values

$y = Xw$
$w = (X^T X)^{-1} X^T y$


In [8]:
inv_XX_T = inv(X.T.dot(X))

In [9]:
w = inv_XX_T.dot(X.T).dot(df.y.values)

In [10]:
w


Out[10]:
array([ 2.97396653, -0.54139002,  0.97132913,  2.03076198])

Results

w1 = 2.97396653
w2 = -0.54139002
w3 = 0.97132913
b = 2.03076198


In [11]:
qr(inv_XX_T)


Out[11]:
(array([[-0.99932291, -0.02064871, -0.0030824 ,  0.03029612],
        [ 0.02599347, -0.98046617,  0.03167325,  0.19237264],
        [ 0.00526495, -0.02072455, -0.99808801,  0.05799222],
        [ 0.02550186,  0.1944999 ,  0.05298706,  0.9791383 ]]),
 array([[ -9.54420192e-04,   7.38324873e-05,   1.32685044e-05,
           3.97413182e-05],
        [  0.00000000e+00,  -2.37176114e-03,  -1.12197714e-04,
           6.67128272e-04],
        [  0.00000000e+00,   0.00000000e+00,  -1.91987064e-03,
           1.66696794e-04],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           9.79138303e-04]]))
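
Note that the factorization above is of the inverse Gram matrix. For least squares, QR is more commonly applied to X itself, which avoids forming X^T X explicitly. A minimal sketch using the qr and solve imported at the top (w_qr is an illustrative name, not part of the original notebook):

Q, R = qr(X)                  # reduced QR: Q is 1000x4, R is 4x4 upper triangular
w_qr = solve(R, Q.T.dot(y))   # back-substitute R w = Q^T y
print w_qr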

In [12]:
X.shape


Out[12]:
(1000, 4)

In [312]:
#solve(X, y)  # solve() only works for a square coefficient matrix
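
Since solve() needs a square coefficient matrix, two alternatives that avoid the explicit inverse: solve the 4x4 normal equations, or let lstsq (imported at the top) handle the rectangular 1000x4 system directly. A minimal sketch (w_ne, w_ls and the other names are illustrative):

w_ne = solve(X.T.dot(X), X.T.dot(y))   # normal equations: the 4x4 system is square
w_ls, res, rank, sv = lstsq(X, y)      # least-squares solution of the rectangular system
print w_ne
print w_ls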

Solving with gradient descent

  • Choose the scaling of the objective function sensibly: multiply it by a suitable constant up front.
  • Check that the gradient computation is correct (a numerical check is sketched after the grad_f cell below).

In [118]:
def f(w,X,y):
    # objective: mean squared error over the 1000 samples, divided by 2
    return ((X.dot(w)-y)**2/(2*1000)).sum()

In [119]:
def grad_f(w,X,y):
    # gradient of f with respect to w, averaged over the 1000 samples
    return (X.dot(w) - y).dot(X)/1000
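
A quick way to verify the gradient, as noted above: compare grad_f with a central-difference approximation of f at a random point. A minimal sketch (numerical_grad, w_test and the step h are illustrative names, not cells from the original notebook):

def numerical_grad(w, X, y, h=1e-6):
    # central-difference approximation of the gradient of f
    g = np.zeros_like(w)
    for i in range(len(w)):
        e = np.zeros_like(w)
        e[i] = h
        g[i] = (f(w + e, X, y) - f(w - e, X, y)) / (2 * h)
    return g

w_test = np.random.randn(4)
print abs(numerical_grad(w_test, X, y) - grad_f(w_test, X, y)).max()  # should be very small if grad_f is correct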

In [120]:
w0 = np.array([100.0,100.0,100.0,100.0])

In [121]:
epsilon = 1e-10

In [122]:
alpha = 0.1

In [123]:
check_condition = 1

In [243]:
while  check_condition > epsilon:
    w0 += -alpha*grad_f(w0,X,y)
    check_condition = abs(grad_f(w0,X,y)).sum()
print w0


[ 2.97396671 -0.5414066   0.97132728  2.03076759]

Solving with stochastic gradient descent

  • Stochastic gradient descent
  • A fixed step size is used
    • Starting from 0.1, the required precision could never be reached
  • So a condition was added to shrink the step size (for comparison, a 1/t decay schedule is sketched after the result below).

In [411]:
def cost_function(w,X,y):
    # per-sample squared error (not averaged)
    return (X.dot(w)-y)**2/2

In [412]:
def grad_cost_f(w,X,y):
    # gradient of the per-sample cost with respect to w
    return (np.dot(X, w) - y)*X

In [462]:
w0 = np.array([1.0, 1.0, 1.0, 1.0])

In [477]:
epsilon = 1e-3

In [478]:
alpha = 0.01

In [479]:
# Generate a random index ordering, used to sample the data points in random order.
random_index = np.arange(1000)
np.random.shuffle(random_index)

In [480]:
cost_value = np.inf  # initialize the objective value

In [481]:
while abs(grad_f(w0,X,y)).sum() > epsilon:
    for i in range(1000):
        w0 += -alpha*grad_cost_f(w0,X[random_index[i]],y[random_index[i]])
    
    # Check how the objective is changing; if the improvement falls below the threshold,
    # shrink the step size and continue with the smaller alpha.
    difference = cost_value - f(w0, X, y) 
    if difference < 1e-10:
        alpha *= 0.9
    cost_value = f(w0, X, y)
print w0


[ 2.97376767 -0.54075842  0.97217986  2.03067711]
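
For reference, a common alternative to the conditional shrinking above is a preset 1/t decay of the step size per epoch. A minimal sketch reusing grad_cost_f, X, y and random_index from the cells above (w_decay, alpha0 and n_epochs are illustrative values, not the method used in this notebook):

w_decay = np.array([1.0, 1.0, 1.0, 1.0])
alpha0 = 0.01
n_epochs = 50
for t in range(n_epochs):
    alpha_t = alpha0 / (1.0 + t)       # step size shrinks every epoch
    for i in random_index:
        w_decay += -alpha_t * grad_cost_f(w_decay, X[i], y[i])
print w_decay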
