In [1]:
# Start the interactive plotting environment
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load a file and return the features plus the label (last column)
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
In [3]:
# OLS: ordinary least squares
# Idea: solve for the w that minimizes the squared error between predictions and targets
def OLS(xArr,yArr):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T*xMat
    # some matrices are singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws
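As a quick sanity check of the closed-form solution w = (XᵀX)⁻¹Xᵀy, here is a minimal sketch on made-up data (xToy/yToy are illustrative names, not part of the original notebook), cross-checked against NumPy's np.linalg.lstsq:
xToy = [[1.0, 0.1], [1.0, 0.5], [1.0, 0.9]]   # first column is the constant term
yToy = [1.2, 2.0, 2.8]                        # generated from y = 1.0 + 2.0*x
print(OLS(xToy, yToy))                        # normal-equation solution, approx [[1.0], [2.0]]
print(np.linalg.lstsq(np.array(xToy), np.array(yToy), rcond=None)[0])   # should agree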
In [4]:
fileName = '/Users/wy/Desktop/ex0.txt'
dataMat,labelMat = loadDataSet(fileName)
In [5]:
dataMat[1:3]
Out[5]:
In [6]:
labelMat[1:3]
Out[6]:
In [7]:
# Solve for w
OLS_w = OLS(dataMat,labelMat)
In [8]:
OLS_w
Out[8]:
In [9]:
# dataMat is a list; convert it to a matrix for the computation
xMat = np.mat(dataMat)
yMat = np.mat(labelMat)
yHat = xMat*OLS_w
In [10]:
# yHat is a matrix; yHat.flatten().A[0] converts it to a 1-D array
# correlation coefficient
corrcoef(yHat.flatten().A[0],yMat.flatten().A[0])
# ~0.98: the correlation between the model's predictions and the original y is quite high
Out[10]:
In [11]:
# The most common problem with plain linear regression is underfitting (it carries some bias),
# so we weight the points in the neighborhood of each query point.
# Each point gets a weight from a Gaussian kernel; the idea is that instead of one global line,
# the model is built from many short, locally fitted lines.
# Locally Weighted Linear Regression
def lwlr(testPoint,xArr,yArr,k=1.0):
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):
        diffMat = testPoint - xMat[j,:]
        weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws

def lwlrTest(testArr,xArr,yArr,k=1.0):   # loops over all the data points and applies lwlr to each one
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    return yHat
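The kernel above assigns point j the weight exp(-(testPoint - x_j)(testPoint - x_j)ᵀ / (2k²)), so points far from the query point barely influence the local fit. A minimal usage sketch (assuming ex0.txt is already loaded into dataMat/labelMat as above):
print(lwlr(dataMat[0], dataMat, labelMat, k=0.01))   # locally weighted prediction at the first point
print(labelMat[0])                                   # true label for comparison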
In [12]:
# sort x (and reorder y accordingly) from smallest to largest
srtInd = xMat[:,1].argsort(0)
xSort = xMat[srtInd][:,0,:]
# # xMat[:,1] skips column 0, which is the constant 1s term
# # xMat[:,1] = matrix([[ 0.067732],[ 0.42781 ]])
# # xMat[:,1].flatten() = matrix([[ 0.067732, 0.42781 ]])
# # xMat[:,1].flatten().A[0] = array([ 0.067732, 0.42781 ])
fig = plt.figure()
ax = fig.add_subplot(111)
# underfitting
ax.plot(xSort[:,1].flatten().A[0],sort(yHat[srtInd].flatten().A[0]))
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0],c='red')
# overfitting
yHat2 = lwlrTest(dataMat,dataMat,labelMat,0.003)
ax.plot(xSort[:,1].flatten().A[0],yHat2[srtInd],c='white')
# better fit
yHat3 = lwlrTest(dataMat,dataMat,labelMat,0.01)
ax.plot(xSort[:,1].flatten().A[0],yHat3[srtInd],c='green')
# White line: k=0.003 overfits; it hugs the current data so closely that predictions on new data may be poor
# Blue line: the plain straight-line fit (k=1.0) underfits; a single line cannot follow the data
# The line should follow the overall trend of the data without fitting it too tightly; k is the knob to tune
# In Locally Weighted Linear Regression, k controls how wide a neighborhood around each point matters
# A smaller k narrows the Gaussian kernel so only very close points matter; a larger k widens it toward weighting all points equally
# Hence k too large underfits and k too small overfits
# The drawback is cost: every prediction has to scan the entire dataset
Out[12]:
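To make the trade-off concrete, a hedged sketch (not in the original notebook) comparing the training-set squared error for the three kernel widths; the smallest k gives the lowest training error precisely because it overfits:
for k in (1.0, 0.01, 0.003):
    yH = lwlrTest(dataMat, dataMat, labelMat, k)
    print(k, ((np.array(labelMat) - yH) ** 2).sum())   # smaller k -> smaller training error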
In [13]:
# Ridge regression
In [14]:
def ridgeRegres(xMat,yMat,lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx + eye(shape(xMat)[1])*lam
    if linalg.det(denom) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T*yMat)
    return ws

# lambda is swept from very small to very large via exp(i-10)
def ridgeTest(xArr,yArr):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean        # to eliminate X0, take the mean off of Y
    # regularize X's
    xMeans = mean(xMat,0)      # calc mean then subtract it off
    xVar = var(xMat,0)         # calc variance of Xi then divide by it
    xMat = (xMat - xMeans)/xVar
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat
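Ridge adds lambda*I to XᵀX before inverting, i.e. w = (XᵀX + lambda*I)⁻¹Xᵀy, which keeps the matrix invertible and shrinks the weights as lambda grows. A minimal sketch on made-up matrices (xR/yR are illustrative names, not from the original notebook):
xR = mat([[1.0, 0.1], [1.0, 0.5], [1.0, 0.9]])
yR = mat([1.2, 2.0, 2.8]).T
print(ridgeRegres(xR, yR, lam=0.01))   # close to the plain OLS solution
print(ridgeRegres(xR, yR, lam=10.0))   # heavier shrinkage pulls the weights toward zero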
In [15]:
# 載入新的數據
fileName = '/Users/wy/Desktop/abalone.txt'
dataMat,labelMat = loadDataSet(fileName)
In [16]:
dataMat[1:3]
Out[16]:
In [17]:
labelMat[1:3]
Out[17]:
In [18]:
# Sweeping lambda from very small to very large yields 30 candidate w vectors; pick the one with the smallest error as the best model
ridgeTest(dataMat,labelMat)[1:3]
Out[18]:
In [19]:
ridgeTest(dataMat,labelMat)[28:30]
Out[19]:
In [20]:
# zero-mean normalization: subtract the column mean, then divide by the column variance
# (note: as in the book's code, this divides by the variance, not the standard deviation)
def regularize(xMat):   # regularize by columns
    inMat = xMat.copy()
    inMeans = mean(inMat,0)   # calc mean then subtract it off
    # The variance is the average of the squared deviations from the mean, i.e., var = mean(abs(x - x.mean())**2).
    inVar = var(inMat,0)      # calc variance of Xi then divide by it
    inMat = (inMat - inMeans)/inVar
    inMat = np.nan_to_num(inMat)
    return inMat
In [21]:
# residual sum of squares (squared error)
def rssError(yArr,yHatArr):
    return ((yArr-yHatArr)**2).sum()
In [22]:
# Forward stagewise linear regression
def stageWise(xArr,yArr,eps=0.01,numIt=100):
    xMat = mat(xArr); yMat=mat(yArr).T
    yMean = mean(yMat,0)
    yMat = yMat - yMean      # can also regularize ys but will get smaller coef
    xMat = regularize(xMat)
    m,n=shape(xMat)
    returnMat = zeros((numIt,n))
    ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        # print(ws.T) here to show the path the weights take
        lowestError = inf    # inf: positive infinity
        # try every feature
        for j in range(n):
            # nudge the weight a small step in each direction and keep the move that lowers the error
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                rssE = rssError(yMat.A,yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat
In [23]:
# step size 0.001, 5000 iterations
stageWise_w = stageWise(dataMat,labelMat,0.001,5000)
# each row of the result is one step of the walk
# converged w = 0.044 -0.011 0.12 0.022 2.023 -0.963 -0.105 0.187
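An optional sketch (not in the original notebook): plotting the returned matrix shows each coefficient walking toward its converged value over the 5000 iterations.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(stageWise_w)   # one curve per feature weight
plt.show()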
In [24]:
stageWise_w
Out[24]:
In [25]:
# The data is normalized and converted to matrices outside, so this version of OLS takes matrices directly;
# the list-to-matrix conversion from the earlier version is dropped, everything else is unchanged
def OLS(xMat,yMat):
    xTx = xMat.T*xMat
    # some matrices are singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws
In [26]:
xMat = mat(dataMat)
yMat = mat(labelMat).T
xMat = regularize(xMat)
yM = mean(yMat,0)
yMat = yMat - yM
OLS_w = OLS(xMat,yMat)
In [27]:
OLS_w
Out[27]:
In [28]:
# OLS_w       0.0430442 -0.02274163 0.13214087 0.02075182 2.22403814 -0.99895312 -0.11725427 0.16622915
# stageWise_w 0.044 -0.011 0.12 0.022 2.023 -0.963 -0.105 0.187
# the two results are very close
yHat_OLS_w = xMat*OLS_w
# stageWise_w[-1] is the converged w
yHat_stageWise_w = xMat*mat(stageWise_w[-1]).T
In [29]:
# Correlation between the predicted y and the original y: OLS and stageWise differ very little
corrcoef(yHat_OLS_w.flatten().A[0],yMat.flatten().A[0])
Out[29]:
In [30]:
corrcoef(yHat_stageWise_w.flatten().A[0],yMat.flatten().A[0])
Out[30]:
In [31]:
# Cross-validation: randomly take 90% of the data as training and 10% as test,
# run the algorithm (ridge regression), repeat 10 times, and keep the w with the smallest mean error
def crossValidation(xArr,yArr,numVal=10):
    # random 90% -> train, 10% -> test
    m = len(yArr)
    indexList = list(range(m))
    errorMat = zeros((numVal,30))   # create error mat: numVal rows, 30 columns
    for i in range(numVal):
        trainX=[]; trainY=[]
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):   # create training set based on first 90% of values in indexList
            if j < m*0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX,trainY)   # get 30 weight vectors from ridge
        for k in range(30):   # loop over all of the ridge estimates
            matTestX = mat(testX); matTrainX=mat(trainX)
            meanTrain = mean(matTrainX,0)
            varTrain = var(matTrainX,0)
            matTestX = (matTestX-meanTrain)/varTrain   # regularize test with training params
            yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)   # test ridge results and store
            errorMat[i,k]=rssError(yEst.T.A,array(testY))
            # print(errorMat[i,k])
    meanErrors = mean(errorMat,0)   # calc avg performance of the different ridge weight vectors
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors==minMean)]
    return bestWeights
In [32]:
# ordinary least squares, forward stagewise linear regression, and ridge regression (with cross-validation)
# all three give very similar predictions
crossValidation_w = crossValidation(dataMat,labelMat)
yHat_crossValidation_w = xMat*crossValidation_w.T
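The weights returned by crossValidation were learned on normalized data; the book maps them back to the scale of the raw features before comparing with plain OLS. A hedged sketch of that step (xMatRaw/yMatRaw/unReg are my own names):
xMatRaw = mat(dataMat); yMatRaw = mat(labelMat).T
meanX = mean(xMatRaw,0); varX = var(xMatRaw,0)
unReg = crossValidation_w / varX                          # undo the division by the variance
const = -1*sum(multiply(meanX,unReg)) + mean(yMatRaw)     # recover the constant term
print(unReg, const)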
In [33]:
corrcoef(yHat_crossValidation_w.flatten().A[0],yMat.flatten().A[0])
Out[33]:
In [34]:
corrcoef(yHat_OLS_w.flatten().A[0],yMat.flatten().A[0])
Out[34]:
In [35]:
corrcoef(yHat_stageWise_w.flatten().A[0],yMat.flatten().A[0])
Out[35]:
In [36]:
# Reference: Machine Learning in Action