In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import KFold
%matplotlib inline
In [96]:
# Load the Hitters dataset and drop the rows that contain missing values
# (Salary is NaN for some players in the raw file).
hitters_df = pd.read_csv('data/Hitters.csv').dropna()
hitters_df.head()
Out[96]:
In [97]:
# Encode the non-numeric (categorical) columns as integer codes.
for cat_col in ('League', 'Division', 'NewLeague'):
    hitters_df[cat_col] = pd.factorize(hitters_df[cat_col])[0]
hitters_df.head()
Out[97]:
In [49]:
hitters_df.shape
Out[49]:
In [31]:
# Baseline: ordinary least squares on every available predictor.
# (collist, X, y and reg are reused by the cells below.)
collist = [col for col in hitters_df.columns if col != 'Salary']
X = hitters_df[collist]
y = hitters_df['Salary']
reg = LinearRegression()
reg.fit(X, y)
# Training-set RMSE — an optimistic estimate, not a test error.
np.sqrt(mean_squared_error(y, reg.predict(X)))
Out[31]:
In [7]:
# For each k, keep the k best predictors by univariate F-test
# (f_regression) and record the training RMSE of OLS fit on them.
mses = []
nfeatures = range(1, len(collist))
print(nfeatures)
for nfeature in nfeatures:
    # SelectKBest(...).fit returns a fitted *selector*, not a data matrix;
    # get_support() gives a boolean mask over the original columns.
    selector = SelectKBest(f_regression, k=nfeature).fit(X, y)
    mask = selector.get_support()
    feats = [col for col, keep in zip(collist, mask) if keep]
    X_r = hitters_df[feats]
    reg = LinearRegression()
    reg.fit(X_r, y)
    mses.append(np.sqrt(mean_squared_error(y, reg.predict(X_r))))
plt.plot(nfeatures, mses)
plt.xlabel('k')
plt.ylabel('RMSE')
Out[7]:
In [46]:
# Cross-validated RMSE for each feature-subset size k.
# NOTE(review): SelectKBest is fit on the FULL data before splitting,
# which leaks information into the folds — confirm this is intended.
cv_errors = []
kfold = KFold(len(hitters_df), n_folds=10)
for nfeature in nfeatures:  # nfeatures comes from the previous cell
    X_new = SelectKBest(f_regression, k=nfeature).fit(X, y)
    selected = X_new.get_support()
    feats = [col for (col, sel) in zip(collist, selected) if sel]
    X_r = hitters_df[feats].values
    y = hitters_df['Salary'].values  # rebinds y as an ndarray for fancy indexing
    rmses = []
    for train, test in kfold:
        Xtrain, ytrain, Xtest, ytest = X_r[train], y[train], X_r[test], y[test]
        # BUG FIX: the original assigned the fresh model to `ret` but then
        # fit/predicted with `reg`, a stale model leaked from an earlier cell.
        reg = LinearRegression()
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ypred, ytest)))
    cv_errors.append(np.mean(rmses))
plt.plot(nfeatures, cv_errors)
plt.xlabel('k')
plt.ylabel('RMSE')
Out[46]:
In [9]:
def cross_validate(X, y, nfolds, reg_name):
    """Mean k-fold RMSE of a linear model on (X, y).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    nfolds : int
        Number of cross-validation folds.
    reg_name : str
        'ridge' or 'lasso' (case-insensitive) selects the regularized
        model; any other value falls back to plain LinearRegression.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    rmses = []
    kfold = KFold(X.shape[0], n_folds=nfolds)
    # BUG FIX: the original compared reg_name against the capitalized
    # string 'Lasso', so the caller passing 'lasso' silently got a plain
    # LinearRegression.  Compare case-insensitively instead (backward
    # compatible with both spellings).
    name = reg_name.lower()
    for train, test in kfold:
        Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
        if name == 'ridge':
            reg = Ridge()
        elif name == 'lasso':
            reg = Lasso()
        else:
            reg = LinearRegression()
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ytest, ypred)))
    return np.mean(rmses)
# Compare baseline OLS, ridge and lasso with 10-fold cross-validation.
collist = [col for col in hitters_df.columns if col != 'Salary']
X = hitters_df[collist].values
y = hitters_df['Salary'].values
rmse_baseline, rmse_ridge, rmse_lasso = (
    cross_validate(X, y, 10, model) for model in ('baseline', 'ridge', 'lasso')
)
(rmse_baseline, rmse_ridge, rmse_lasso)
Out[9]:
In [24]:
## Find a good lasso penalty: 10-fold CV error over a grid of alphas.
new_cv_errors = []
alphas = [0.1 * step for step in range(1, 200, 20)]
kfold = KFold(X.shape[0], n_folds=10)
for alpha in alphas:
    fold_rmses = []
    for train, test in kfold:
        Xtr, ytr = X[train], y[train]
        Xte, yte = X[test], y[test]
        model = Lasso(alpha=alpha)
        model.fit(Xtr, ytr)
        fold_rmses.append(np.sqrt(mean_squared_error(yte, model.predict(Xte))))
    new_cv_errors.append(np.mean(fold_rmses))
plt.plot(alphas, new_cv_errors)
plt.xlabel('alpha')
plt.ylabel('RMSE')
Out[24]:
In [25]:
#sklearn's built-in cross-validation (LassoCV tunes alpha internally)
from sklearn import linear_model
In [28]:
# LassoCV cross-validates over the same alpha grid and keeps the best one.
clf = linear_model.LassoCV(alphas=alphas)
clf.fit(X,y)
Out[28]:
In [29]:
clf.alpha_
Out[29]:
In [84]:
# Rebuild the raw feature matrix and target for the PCA experiments.
collist = [c for c in hitters_df.columns if c != 'Salary']
X = hitters_df[collist].values
y = hitters_df['Salary'].values
In [59]:
X.shape,y.shape
Out[59]:
In [52]:
from sklearn.preprocessing import normalize
In [53]:
# Normalization — mind X's dtype: cast to float32 before calling normalize.
# axis=1 scales each ROW (sample) to unit L2 norm.
# NOTE(review): row-wise normalization is unusual before PCA — column-wise
# standardization is the common choice; confirm this is intended.
X_norm = normalize(X.astype(np.float32),axis=1,norm='l2')
In [54]:
#step1: run principal component analysis (PCA)
from sklearn.decomposition import PCA
In [69]:
# Project the row-normalized features onto the top 5 principal components.
pca = PCA(n_components=5)
pca.fit(X_norm)
X_pca = pca.transform(X_norm)
X_pca
Out[69]:
In [90]:
#step2:linear regression
reg = LinearRegression()
reg.fit(X_pca,y)
ypred = reg.predict(X_pca)
np.mean(np.sqrt(mean_squared_error(ypred,y)))
Out[90]:
In [88]:
### Cross-validate PCR over the number of retained components.
cv_errors = []
n_components = range(1, 19)
kfold = KFold(X.shape[0], n_folds=10)  # renamed from misspelled `k_flod`
for n_component in n_components:
    # NOTE(review): PCA is fit on the full X_norm before splitting, which
    # leaks fold information — confirm this is intended.
    pca = PCA(n_components=n_component)
    X_pca = pca.fit_transform(X_norm)
    rmses = []
    for train, test in kfold:
        Xtrain, ytrain, Xtest, ytest = X_pca[train], y[train], X_pca[test], y[test]
        reg = LinearRegression().fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ypred, ytest)))
    cv_errors.append(np.mean(rmses))
plt.plot(n_components, cv_errors)
# FIX: axis label typo — the x axis is the number of components.
plt.xlabel('n_components')
plt.ylabel('cv_errors')
Out[88]:
In [106]:
from sklearn.cross_decomposition import PLSRegression
In [125]:
# Rebuild the feature matrix and target for the PLS experiment
# (same construction as the PCA section above).
collist = [c for c in hitters_df.columns if c != 'Salary']
X = hitters_df[collist].values
y = hitters_df['Salary'].values
In [126]:
# Normalization — mind X's dtype: cast to float64 this time; each row is
# scaled to unit L2 norm (axis=1), as in the PCA section.
X_norm = normalize(X.astype(np.float64),axis=1,norm='l2')
In [128]:
### Cross-validate PLS regression over the number of components.
cv_errors = []
n_components = range(1, 19)
kfold = KFold(X.shape[0], n_folds=10)  # renamed from misspelled `k_flod`
for n_component in n_components:
    pls1 = PLSRegression(n_components=n_component)
    rmses = []
    for train, test in kfold:
        # BUG FIX: the original indexed X_pca — the PCA-transformed matrix
        # leaked from an earlier cell — so PLS was never evaluated on the
        # normalized feature matrix this section just prepared.  Use X_norm.
        Xtrain, ytrain, Xtest, ytest = X_norm[train], y[train], X_norm[test], y[test]
        pls1.fit(Xtrain, ytrain)
        ypred = pls1.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ypred, ytest)))
    cv_errors.append(np.mean(rmses))
plt.plot(n_components, cv_errors)
# FIX: axis label typo — the x axis is the number of components.
plt.xlabel('n_components')
plt.ylabel('cv_errors')
Out[128]:
In [114]:
Out[114]:
In [ ]: