Chapter 06
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline
In [106]:
x = np.random.normal(0,1,100)
epsilon = np.random.normal(0,1,100)
In [107]:
beta_0 = 3
beta_1 = 2
beta_2 = -3
beta_3 = 0.3
y = beta_0 + beta_1 * x + beta_2 * x**2 + beta_3 * x**3 + epsilon
In [86]:
data = pd.DataFrame({
'y' : y,
'x1' : x,
'x2' : x**2,
'x3' : x**3,
'x4' : x**4,
'x5' : x**5,
'x6' : x**6,
'x7' : x**7,
'x8' : x**8,
'x9' : x**9,
'x10' : x**10
})
data.head()
Out[86]:
In [91]:
from statsmodels.formula.api import ols
# fit each model and read off the BIC and adjusted R-squared; statsmodels
# exposes both directly on the fitted result, so no summary-text parsing is needed
def ols_indexes(formula):
    model = ols(formula, data=data).fit()
    return model.bic, model.rsquared_adj
bics = []
adj_r_squares = []
formulas = ['y ~ x1', 'y ~ x1+x2', 'y ~ x1+x2+x3', 'y ~ x1+x2+x3+x4',
            'y ~ x1+x2+x3+x4+x5', 'y ~ x1+x2+x3+x4+x5+x6',
            'y ~ x1+x2+x3+x4+x5+x6+x7', 'y ~ x1+x2+x3+x4+x5+x6+x7+x8',
            'y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9',
            'y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10']
for formula in formulas:
    model_bic, model_adj_r2 = ols_indexes(formula)
    bics.append(model_bic)
    adj_r_squares.append(model_adj_r2)
x_coord = np.arange(1, 11)  # separate name so the data vector x is not clobbered
plt.subplot(1, 2, 1)
plt.plot(x_coord, bics)
plt.scatter([x_coord[2]], [bics[2]], marker='*', c='r')
plt.subplot(1, 2, 2)
plt.plot(x_coord, adj_r_squares)
plt.scatter([x_coord[2]], [adj_r_squares[2]], marker='*', c='r')
plt.show()
From the two plots above (BIC on the left, adjusted $R^2$ on the right), we conclude that the cubic model $y=\beta_0 + \beta_1 x + \beta_2 x^2 + \beta_3 x^3$ fits the response best.
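BIC and adjusted $R^2$ are in-sample criteria. As a sanity check, here is a minimal cross-validation sketch (assuming scikit-learn's `cross_val_score` and the `data` frame built above); it should point to the same three-predictor model.
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

cols = ['x%d' % i for i in range(1, 11)]
cv_mse = []
for k in range(1, 11):
    # 10-fold CV estimate of test MSE for the model using the first k powers
    scores = cross_val_score(LinearRegression(), data[cols[:k]], data['y'],
                             scoring='neg_mean_squared_error', cv=10)
    cv_mse.append(-scores.mean())
print(np.argmin(cv_mse) + 1)  # expected to agree with BIC/adjusted R^2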
In [94]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
X = data[['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']]
y = data['y']
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X,y)
selector.support_
Out[94]:
From the RFE selector we see that the first three predictors are chosen as the best predictors, which agrees with part (c) above.
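Besides the boolean mask in `support_`, RFE exposes a `ranking_` attribute that records the order of elimination; a short sketch to inspect it:
In [ ]:
for name, rank in zip(X.columns, selector.ranking_):
    print(name, rank)  # rank 1 = selected; larger ranks were eliminated earlier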
In [99]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
X = data[['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']]
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=0)
lambdas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.8, 1]
mse_scores = []
for lam in lambdas:
    clf = linear_model.Lasso(alpha=lam)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    residuals = y_test - pred
    mse_scores.append(np.sum(residuals**2) / residuals.shape[0])
x_coord = range(1, 11,1)
plt.plot(x_coord, mse_scores)
plt.xticks(x_coord, lambdas)
plt.xlabel(r'$\lambda$')
plt.ylabel('MSE')
plt.show()
In [102]:
# alpha=0 reduces the lasso to an ordinary least-squares fit; scikit-learn
# recommends plain LinearRegression for that case
clf = linear_model.Lasso(alpha=0)
clf.fit(X_train, y_train)
print(clf.coef_)
In [103]:
print(clf.intercept_)
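To see which of the ten powers survive shrinkage as $\lambda$ varies, a sketch of the full coefficient path (assuming scikit-learn's `lasso_path`):
In [ ]:
from sklearn.linear_model import lasso_path

alphas_path, coefs, _ = lasso_path(X_train, y_train)
for coef in coefs:  # one curve per predictor
    plt.plot(-np.log10(alphas_path), coef)
plt.xlabel(r'$-\log_{10}(\lambda)$')
plt.ylabel('coefficient')
plt.show()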
In [108]:
beta_7 = 8
y = beta_0 + beta_7 * x**7 + epsilon
In [113]:
data = pd.DataFrame({
'y':y,
'x1' : x,
'x2' : x**2,
'x3' : x**3,
'x4' : x**4,
'x5' : x**5,
'x6' : x**6,
'x7' : x**7,
'x8' : x**8,
'x9' : x**9,
'x10' : x**10
})
X = data[['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']]
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
lambdas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2,0.4,0.5,0.8,1]
mse_scores = []
for lam in lambdas:
    clf = linear_model.Lasso(alpha=lam)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    residuals = y_test - pred
    mse_scores.append(np.sum(residuals**2) / residuals.shape[0])
x_coord = range(1, 11,1)
plt.plot(x_coord, mse_scores)
plt.xticks(x_coord, lambdas)
plt.xlabel(r'$\lambda$')
plt.ylabel('MSE')
plt.show()
In [114]:
clf = linear_model.Lasso(alpha=0.2)
clf.fit(X_train, y_train)
print(clf.coef_)
print(clf.intercept_)
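Instead of reading $\lambda$ off the plot, cross-validation can choose it; a minimal sketch with scikit-learn's `LassoCV`:
In [ ]:
from sklearn.linear_model import LassoCV

cv_clf = LassoCV(cv=5).fit(X_train, y_train)
print(cv_clf.alpha_)  # lambda chosen by 5-fold CV
print(cv_clf.coef_)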
In [115]:
college_file_path = '../data/College.csv'
colleges = pd.read_csv(college_file_path, index_col=0)
colleges.head()
Out[115]:
In [118]:
from sklearn.model_selection import train_test_split
colleges=colleges.replace(['Yes','No'],[1,-1])
colleges.head()
Out[118]:
In [119]:
colleges.columns
Out[119]:
In [121]:
X = colleges[['Private', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
'Grad.Rate']].values
y = colleges['Apps'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=0)
In [122]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
residual = pred - y_test
print(np.sum(residual**2)/residual.shape[0])
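The hand-rolled test MSE above can be cross-checked against `sklearn.metrics.mean_squared_error`:
In [ ]:
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_test, pred))  # should match the value above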
In [131]:
from sklearn import linear_model
alphas = [1, 5, 10,50,100,500,1000,5000,10000,50000]
mses = []
for alpha in alphas:
    reg = linear_model.Ridge(alpha=alpha)
    reg.fit(X_train, y_train)
    residual = reg.predict(X_test) - y_test
    mses.append(np.sum(residual * residual) / residual.shape[0])
x = range(1,11,1)
plt.plot(x, mses)
plt.xticks(x,alphas)
plt.show()
In [132]:
reg = linear_model.Ridge(alpha=5000)
reg.fit(X_train, y_train)
print(reg.coef_, reg.intercept_)
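As an alternative to the manual grid search, `RidgeCV` evaluates the same alpha grid with built-in cross-validation (a sketch, reusing the `alphas` list above):
In [ ]:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=alphas).fit(X_train, y_train)
print(ridge_cv.alpha_)  # alpha selected by cross-validation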
In [133]:
alphas = [1, 5, 10,50,100,500,1000,5000,10000,50000]
mses = []
for alpha in alphas:
    reg = linear_model.Lasso(alpha=alpha)
    reg.fit(X_train, y_train)
    residual = reg.predict(X_test) - y_test
    mses.append(np.sum(residual * residual) / residual.shape[0])
x = range(1,11,1)
plt.plot(x, mses)
plt.xticks(x,alphas)
plt.show()
In [134]:
reg = linear_model.Lasso(alpha=500)
reg.fit(X_train, y_train)
print(reg.coef_,reg.intercept_)
In [136]:
from sklearn.decomposition import PCA
Ms = range(1,18)
mses = []
for m in Ms:
    pca = PCA(n_components=m)
    pca.fit(X_train)
    X_train_trans = pca.transform(X_train)
    lr = LinearRegression()
    lr.fit(X_train_trans, y_train)
    X_test_trans = pca.transform(X_test)
    residual = lr.predict(X_test_trans) - y_test
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(Ms, mses)
plt.show()
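Since the College predictors are on very different scales, the leading principal components can be dominated by the high-variance columns; `explained_variance_ratio_` shows how much each component captures (a quick check, using five components for illustration):
In [ ]:
pca = PCA(n_components=5)
pca.fit(X_train)
print(pca.explained_variance_ratio_)           # per-component share of variance
print(pca.explained_variance_ratio_.cumsum())  # cumulative share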
In [137]:
from sklearn.cross_decomposition import PLSRegression
Ms = range(1,18)
mses = []
for m in Ms:
    pls = PLSRegression(n_components=m)
    pls.fit(X_train, y_train)
    # predict returns a 2-D (n, 1) array; flatten it before subtracting,
    # otherwise broadcasting against the 1-D y_test produces an (n, n) matrix
    residual = pls.predict(X_test).ravel() - y_test
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(Ms, mses)
plt.show()
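To read the best number of PLS components off the curve numerically:
In [ ]:
best = int(np.argmin(mses))
print(Ms[best], mses[best])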
In [152]:
X = np.random.randint(1,high=200, size=(1000,20))
beta = np.random.randn(20,1)
beta[2] = 0.0
beta[3] = 0.0
beta[8] = 0.0
beta[18] = 0.0
beta[9] = 0.0
epsilon = np.random.normal(0,1,1000).reshape((1000,1))
y = X.dot(beta)+epsilon
In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
In [164]:
from sklearn.feature_selection import SelectKBest, f_regression
ms = range(1,21)
mses = []
for m in ms:
    selector = SelectKBest(f_regression, k=m)
    selector.fit(X_train, y_train)
    X_new = selector.transform(X_train)
    lr = LinearRegression()
    lr.fit(X_new, y_train)
    # training MSE: this can only decrease as more predictors are added
    residual = lr.predict(X_new) - y_train
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(ms, mses)
plt.show()
In [165]:
ms = range(1,21)
mses = []
for m in ms:
    selector = SelectKBest(f_regression, k=m)
    selector.fit(X_train, y_train)
    X_new = selector.transform(X_train)
    lr = LinearRegression()
    lr.fit(X_new, y_train)
    # test MSE: unlike the training curve above, this has a minimum
    X_test_new = selector.transform(X_test)
    residual = lr.predict(X_test_new) - y_test
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(ms, mses)
plt.show()
In [166]:
print(mses)
When $m=18$ the model attains its lowest test MSE, 1.0532445.
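As a check on the selected model, one can refit at $m=18$ and compare the estimated coefficients against the true `beta`, five of whose entries were set to zero above:
In [ ]:
selector = SelectKBest(f_regression, k=18)
selector.fit(X_train, y_train)
lr = LinearRegression().fit(selector.transform(X_train), y_train)
print(lr.coef_.ravel())                      # estimated coefficients
print(beta[selector.get_support()].ravel())  # true coefficients of the kept columns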
In [2]:
boston_file_path = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_path, index_col=0)
bostons.head()
Out[2]:
In [3]:
bostons.columns
Out[3]:
In [4]:
X = bostons[['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
'ptratio', 'black', 'lstat', 'medv']].values
y = bostons['crim'].values
In [8]:
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.linear_model import LinearRegression
# approximate best-subset selection by ranking predictors with univariate F-tests
ms = range(1, 14)
mses = []
for m in ms:
    selector = SelectKBest(f_regression, k=m)
    selector.fit(X, y)
    X_new = selector.transform(X)
    lr = LinearRegression()
    lr.fit(X_new, y)
    residual = lr.predict(X_new) - y
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(ms, mses)
plt.show()
In [173]:
# Lasso
lambdas = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2,0.4,0.5,0.8,1]
mse_scores = []
for lam in lambdas:
    clf = linear_model.Lasso(alpha=lam)
    clf.fit(X, y)
    pred = clf.predict(X)
    residuals = y - pred
    mse_scores.append(np.sum(residuals**2) / residuals.shape[0])
x_coord = range(1, 12,1)
plt.plot(x_coord, mse_scores)
plt.xticks(x_coord, lambdas)
plt.xlabel(r'$\lambda$')
plt.ylabel('MSE')
plt.show()
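Note these are training MSEs, which always favour the smallest $\lambda$; a sketch with `LassoCV` picks $\lambda$ by cross-validation instead:
In [ ]:
from sklearn.linear_model import LassoCV

boston_lasso = LassoCV(cv=5).fit(X, y)
print(boston_lasso.alpha_)
print(boston_lasso.coef_)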
In [180]:
# Ridge
alphas = [0.000001,1, 5, 10,50,100,500,1000,5000,10000,50000]
mses = []
for alpha in alphas:
    reg = linear_model.Ridge(alpha=alpha)
    reg.fit(X, y)
    residual = reg.predict(X) - y
    mses.append(np.sum(residual * residual) / residual.shape[0])
x = range(1,12,1)
plt.plot(x, mses)
plt.xticks(x,alphas)
plt.show()
In [181]:
# PCR
from sklearn.decomposition import PCA
Ms = range(1,14)
mses = []
for m in Ms:
    pca = PCA(n_components=m)
    pca.fit(X)
    X_trans = pca.transform(X)  # no train/test split here, so one transform suffices
    lr = LinearRegression()
    lr.fit(X_trans, y)
    residual = lr.predict(X_trans) - y
    mses.append(np.sum(residual * residual) / residual.shape[0])
plt.plot(Ms, mses)
plt.show()
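All three Boston curves above measure training error, which cannot expose overfitting. A minimal cross-validated comparison sketch (the alpha and component values are illustrative assumptions, not tuned choices):
In [ ]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

candidates = [('ols', LinearRegression()),
              ('ridge', linear_model.Ridge(alpha=1.0)),
              ('lasso', linear_model.Lasso(alpha=0.05)),
              ('pcr', make_pipeline(PCA(n_components=8), LinearRegression()))]
for name, est in candidates:
    scores = cross_val_score(est, X, y, scoring='neg_mean_squared_error', cv=5)
    print(name, -scores.mean())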