In [1]:
# Start the interactive plotting environment
%pylab inline
import numpy as np
import matplotlib.pyplot as plt


Populating the interactive namespace from numpy and matplotlib

In [2]:
# load the file and return the features and the label (the last column)
def loadDataSet(fileName):      
    numFeat = len(open(fileName).readline().split('\t')) - 1 
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr =[]
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
The matrix X maps to y. x: feature, y: label.
$$ y = x^T w $$
Given x, once we find w we can predict y, so each method below is a way of finding w and building the model. The ordinary least squares solution is
$$ w = (X^T X)^{-1} X^T y $$
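
As a minimal sketch (not part of the original notebook), the same closed-form solution can be written with plain numpy arrays; np.linalg.solve solves the normal equations without forming an explicit inverse. The toy X and y below are made-up values:

import numpy as np
X = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 2.5]])   # toy design matrix (hypothetical values)
y = np.array([1.0, 2.1, 2.9])                        # toy labels (hypothetical values)
w = np.linalg.solve(X.T.dot(X), X.T.dot(y))          # solves (X^T X) w = X^T y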

In [3]:
# OLS: ordinary least squares
# idea: the w that minimizes the squared error gives the closest fit to the observed labels
def OLS(xArr,yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T*xMat
    # some matrices are singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws

In [4]:
fileName = '/Users/wy/Desktop/ex0.txt'
dataMat,labelMat = loadDataSet(fileName)

In [5]:
dataMat[1:3]


Out[5]:
[[1.0, 0.42781], [1.0, 0.995731]]

In [6]:
labelMat[1:3]


Out[6]:
[3.816464, 4.550095]

In [7]:
# solve for w
OLS_w = OLS(dataMat,labelMat)

In [8]:
OLS_w


Out[8]:
matrix([[ 3.00774324],
        [ 1.69532264]])

In [9]:
# dataMat is a list; convert it to a matrix for the computation
xMat = np.mat(dataMat)
yMat = np.mat(labelMat)
yHat = xMat*OLS_w

In [10]:
# yHat.flatten().A[0] flattens the yHat matrix into a 1-D array
# correlation coefficient
corrcoef(yHat.flatten().A[0],yMat.flatten().A[0])
# 0.98: the correlation between the model's predictions and the original y is quite high


Out[10]:
array([[ 1.        ,  0.98647356],
       [ 0.98647356,  1.        ]])

In [11]:
# The most common problem with plain linear regression is underfitting, which leaves some bias,
# so we weight the training points near each query point.
# Each point's weight comes from a Gaussian kernel.
# Instead of fitting one global line, every query point gets its own weighted fit,
# a bit like building the model out of many short line segments.
# Locally Weighted Linear Regression
def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):                     
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr,xArr,yArr,k=1.0):  #loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    return yHat
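
In matrix form, what lwlr() computes for a query point x is the weighted least-squares solution

$$ \hat{w}(x) = (X^T W X)^{-1} X^T W y, \qquad W_{jj} = \exp\left( \frac{-\lVert x - x^{(j)} \rVert^2}{2k^2} \right) $$

where W is the diagonal weight matrix built in the inner loop, so k controls how quickly a training point's weight decays with its distance from the query point.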

In [12]:
# sort by x (and reorder y to match), ascending
srtInd = xMat[:,1].argsort(0)
xSort = xMat[srtInd][:,0,:]

# # xMat[:,1]: drop the first column, which is all 1s (the bias term)
# # xMat[:,1] = matrix([[ 0.067732],[ 0.42781 ], ...])
# # xMat[:,1].flatten() = matrix([[ 0.067732, 0.42781, ... ]])
# # xMat[:,1].flatten().A[0] = array([ 0.067732, 0.42781, ... ])

fig = plt.figure()
ax = fig.add_subplot(111)
# underfitting
ax.plot(xSort[:,1].flatten().A[0],sort(yHat[srtInd].flatten().A[0]))
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0],c='red')

# overfitting
yHat2 = lwlrTest(dataMat,dataMat,labelMat,0.003)
ax.plot(xSort[:,1].flatten().A[0],yHat2[srtInd],c='white')
# better fit 
yHat3 = lwlrTest(dataMat,dataMat,labelMat,0.01)
ax.plot(xSort[:,1].flatten().A[0],yHat3[srtInd],c='green')
# white line: k=0.003, overfitting. It tracks the current data closely, so it looks accurate here
#   but may predict poorly on future data.
# blue line: the ordinary least-squares fit (lwlr with k=1.0 looks essentially the same), underfitting:
#   a single straight line that cannot follow local structure.
# The curve should follow the trend of the data without clinging to every point, so k has to be tuned.

# In Locally Weighted Linear Regression, k sets how wide a neighborhood around the query point matters.
# A smaller k means a narrower neighborhood and a finer fit; a larger k means a wider one,
# so a k that is too large underfits and a k that is too small overfits.
# The drawback is cost: every prediction has to scan the entire dataset.


Out[12]:
[<matplotlib.lines.Line2D at 0x1062a3f50>]
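
As a rough sketch (not in the original notebook), one way to pick k is to hold out part of the data and compare the squared prediction error; the split point and the k values below are arbitrary choices:

split = int(len(dataMat) * 0.9)                     # hold out the last 10% (arbitrary split)
trainX, trainY = dataMat[:split], labelMat[:split]
testX,  testY  = dataMat[split:], labelMat[split:]
for k in (1.0, 0.1, 0.01):                          # arbitrary candidate kernel widths
    pred = lwlrTest(testX, trainX, trainY, k)
    print(k, ((np.array(testY) - pred) ** 2).sum())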

In [13]:
# ridge regression

Case 1: when the number of features exceeds n (the number of samples), X^T X is singular and its inverse does not exist.

Ridge regression is introduced to handle this.

Originally

$$ w = (X^T X)^{-1} X^T y $$

Adding a lambda term makes the matrix invertible and shrinks the weights:

$$ w = (X^T X + \lambda I)^{-1} X^T y $$
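
A minimal sketch (not from the book) of why the lambda term helps: with more features than samples, X^T X is singular, but X^T X + lambda*I is not.

X = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])                      # 2 samples, 3 features (hypothetical values)
print(np.linalg.det(X.T.dot(X)))                     # ~0: singular
lam = 0.1
print(np.linalg.det(X.T.dot(X) + lam * np.eye(3)))   # nonzero: invertible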

In [14]:
def ridgeRegres(xMat,yMat,lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam
    if linalg.det(denom) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T*yMat)
    return ws
# lambda is swept with exp(i-10) for i = 0..29, from very small to very large
def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #to eliminate X0 take mean off of Y
    #regularize X's
    xMeans = mean(xMat,0)   #calc mean then subtract it off
    xVar = var(xMat,0)      #calc variance of Xi then divide by it
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat

In [15]:
# load a new dataset
fileName = '/Users/wy/Desktop/abalone.txt'
dataMat,labelMat = loadDataSet(fileName)

In [16]:
dataMat[1:3]


Out[16]:
[[1.0, 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07],
 [-1.0, 0.53, 0.42, 0.135, 0.677, 0.2565, 0.1415, 0.21]]

In [17]:
labelMat[1:3]


Out[17]:
[7.0, 9.0]

In [18]:
# sweeping lambda from very small to very large gives 30 candidate w vectors; pick the one with the smallest error as the model
ridgeTest(dataMat,labelMat)[1:3]


Out[18]:
array([[ 0.04304419, -0.02274164,  0.13214088,  0.02075182,  2.22403626,
        -0.99895275, -0.11725417,  0.16622934],
       [ 0.04304419, -0.02274164,  0.13214088,  0.02075182,  2.22403305,
        -0.99895211, -0.117254  ,  0.16622966]])

In [19]:
ridgeTest(dataMat,labelMat)[28:30]


Out[19]:
array([[ -8.34109967e-06,   9.05722756e-04,   1.13283336e-03,
          2.59497753e-03,   2.15072069e-04,   3.65888384e-04,
          8.94835365e-04,   8.86543150e-04],
       [ -3.13618246e-06,   3.43488557e-04,   4.29265642e-04,
          9.86279863e-04,   8.16188652e-05,   1.39858822e-04,
          3.40121256e-04,   3.34847052e-04]])
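
As a small sketch (not in the original notebook), plotting the 30 weight vectors gives the usual ridge trace; the coefficients shrink toward zero as lambda grows, which the two outputs above already show numerically.

ridgeWeights = ridgeTest(dataMat, labelMat)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)            # one curve per feature weight; the x-axis is i, where lambda = exp(i - 10)
plt.show()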

To remove bias in the data and reduce the influence of extreme values, the data must first be normalized.

z-score standardization (zero-mean normalization)

Also called standard-deviation standardization: the processed data has mean 0 and standard deviation 1. The formula is
$$ \dot{x} = \frac{x - \mu}{\sigma} $$

where μ is the mean of all the samples and σ is their standard deviation.

Case 2: how to reduce the amount of computation, using another greedy algorithm (forward stagewise regression, implemented below).

Start with all the weights at zero. At each step, increase or decrease each weight by one step size, compute the squared error, keep the change that lowers the error, and repeat until the weights converge.


In [20]:
# zero-mean normalization; note this code divides by the variance rather than the standard deviation
def regularize(xMat):#regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat,0)   #calc mean then subtract it off
    # the variance is the average of the squared deviations from the mean, i.e., var = mean(abs(x - x.mean())**2)
    inVar = var(inMat,0)      #calc variance of Xi then divide by it
    inMat = (inMat - inMeans)/inVar
    inMat = np.nan_to_num(inMat)
    return inMat
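
Note that regularize() divides by the variance, following the book's code, while the z-score formula above divides by the standard deviation. A minimal sketch of the conventional version for comparison (zscore is a hypothetical helper, not used later):

def zscore(xMat):
    # conventional z-score: subtract the column mean, divide by the column standard deviation
    inMat = xMat.copy()
    inMat = (inMat - mean(inMat, 0)) / std(inMat, 0)
    return np.nan_to_num(inMat)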

In [21]:
# residual sum of squared errors
def rssError(yArr,yHatArr):
    return ((yArr-yHatArr)**2).sum()

In [22]:
# Forward stagewise linear regression
def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean     #can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n=shape(xMat)
    returnMat = zeros((numIt,n)) 
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        # print(ws.T) here would show the path the weights take
        # print(ws.T)
        # inf: positive infinity
        lowestError = inf
        # run every feature
        for j in range(n):
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                rssE = rssError(yMat.A,yTest.A)
                
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat

In [23]:
# step size 0.001, 5000 iterations
stageWise_w = stageWise(dataMat,labelMat,0.001,5000)
# the returned matrix below traces the path of the weights
# converged result: w = 0.044 -0.011  0.12   0.022  2.023 -0.963 -0.105  0.187

In [24]:
stageWise_w


Out[24]:
array([[ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   , ...,  0.   ,  0.   ,  0.   ],
       ..., 
       [ 0.043, -0.011,  0.12 , ..., -0.963, -0.105,  0.187],
       [ 0.044, -0.011,  0.12 , ..., -0.963, -0.105,  0.187],
       [ 0.043, -0.011,  0.12 , ..., -0.963, -0.105,  0.187]])
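
A small sketch (not in the original notebook): plotting the returned matrix shows the path each weight takes over the 5000 iterations; the curves level off as the weights converge.

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(stageWise_w)             # one curve per feature weight; the x-axis is the iteration number
plt.show()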

In [25]:
def OLS(xArr,yArr):
    # the inputs here are already normalized numpy matrices (prepared in the next cell),
    # so the earlier list-to-matrix conversion is no longer needed
    xTx = xArr.T*xArr
    # some matrices are singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xArr.T*yArr)
    return ws

In [26]:
xMat = mat(dataMat)
yMat = mat(labelMat).T
xMat = regularize(xMat)
yM = mean(yMat,0)
yMat = yMat - yM
OLS_w = OLS(xMat,yMat)

In [27]:
OLS_w


Out[27]:
matrix([[ 0.0430442 ],
        [-0.02274163],
        [ 0.13214087],
        [ 0.02075182],
        [ 2.22403814],
        [-0.99895312],
        [-0.11725427],
        [ 0.16622915]])

In [28]:
# OLS_w       0.0430442 -0.02274163 0.13214087 0.02075182 2.22403814 -0.99895312 -0.11725427 0.16622915 
# stageWise_w 0.044     -0.011      0.12       0.022      2.023      -0.963      -0.105      0.187
# the two methods give nearly the same result
yHat_OLS_w = xMat*OLS_w
# stageWise_w[-1] is the converged w
yHat_stageWise_w = xMat*mat(stageWise_w[-1]).T

In [29]:
# comparing the predicted y with the original y via the correlation coefficient: OLS and stageWise differ very little
corrcoef(yHat_OLS_w.flatten().A[0],yMat.flatten().A[0])


Out[29]:
array([[ 1.        ,  0.72656103],
       [ 0.72656103,  1.        ]])

In [30]:
corrcoef(yHat_stageWise_w.flatten().A[0],yMat.flatten().A[0])


Out[30]:
array([[ 1.        ,  0.72642036],
       [ 0.72642036,  1.        ]])

In [31]:
# cross-validation: randomly take 90% of the data as training data and 10% as test data, run the algorithm (ridge regression),
# repeat 10 times, and keep the w with the smallest average error
def crossValidation(xArr,yArr,numVal=10):
    # random 90%->train 10%->test
    m = len(yArr)                           
    indexList = list(range(m))
    errorMat = zeros((numVal,30))#create error mat 30columns numVal rows
    for i in range(numVal):
        trainX=[]; trainY=[]
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):#create training set based on first 90% of values in indexList
            if j < m*0.9: 
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)    #get 30 weight vectors from ridge
        for k in range(30):#loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX=mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)#test ridge results and store
            errorMat[i,k]=rssError(yEst.T.A,array(testY))
            #print errorMat[i,k]
    meanErrors = mean(errorMat,0)#calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]
    return bestWeights

In [32]:
# ordinary least squares, forward stagewise linear regression, and ridge regression (with crossValidation):
# all three approaches give very similar predictions
crossValidation_w = crossValidation(dataMat,labelMat)
yHat_crossValidation_w = xMat*crossValidation_w.T

In [33]:
corrcoef(yHat_crossValidation_w.flatten().A[0],yMat.flatten().A[0])


Out[33]:
array([[ 1.        ,  0.72651684],
       [ 0.72651684,  1.        ]])

In [34]:
corrcoef(yHat_OLS_w.flatten().A[0],yMat.flatten().A[0])


Out[34]:
array([[ 1.        ,  0.72656103],
       [ 0.72656103,  1.        ]])

In [35]:
corrcoef(yHat_stageWise_w.flatten().A[0],yMat.flatten().A[0])


Out[35]:
array([[ 1.        ,  0.72642036],
       [ 0.72642036,  1.        ]])

In [36]:
# Reference: Machine Learning in Action