In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
In [3]:
%matplotlib inline
In [4]:
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
In [5]:
df = pd.read_csv(filename, names=names, sep=r'\s+')
In [12]:
df.shape
In [17]:
pd.set_option('display.precision', 1)
df.describe()
In [7]:
df.dtypes
In [12]:
pd.set_option('display.width', 120)
df.tail(30)
In [15]:
df.isnull().sum()
In [19]:
## correlation
pd.set_option('display.precision', 2)
df.corr(method='pearson')
In [22]:
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()
In [23]:
df.plot(kind='density', subplots=True, layout=(4,4), sharex=False, fontsize=1)
pyplot.show()
In [6]:
df.plot(kind='box', subplots=True, layout=(4,4), sharex=False, sharey=False, fontsize=8, figsize=(12,12))
pyplot.show()
In [10]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr(), vmin=-1, vmax=1)
fig.colorbar(cax)
# place one tick per attribute so the labels line up with the matrix cells
ticks = np.arange(0, 14, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()
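A pairwise scatter matrix complements the correlation heatmap by showing the shape of each bivariate relationship; this is a minimal sketch using pandas' built-in scatter_matrix (the figure size is an arbitrary choice).
In [ ]:
from pandas.plotting import scatter_matrix
# scatter plots for every attribute pair, histograms on the diagonal
scatter_matrix(df, figsize=(12, 12))
pyplot.show()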
In [13]:
y = df[names[-1]]
X = df[names[:-1]]
print(y.shape)
print(X.shape)
In [19]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
In [16]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=7)
In [18]:
num_folds = 10
seed = 7
# scikit-learn maximizes scores, so MSE is negated: values closer to zero are better
scoring = 'neg_mean_squared_error'
In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
In [21]:
models = {}
models['LR'] = LinearRegression()
models['LASSO'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()
In [31]:
results = {}
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring=scoring)
    results[key] = cv_result
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
In [37]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(list(results.values()))
ax.set_xticklabels(results.keys())
pyplot.show()
In [39]:
from sklearn.preprocessing import StandardScaler
In [41]:
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])
results = {}
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key], X_train, y_train, cv=kfold, scoring=scoring)
    results[key] = cv_result
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
In [42]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(list(results.values()))
ax.set_xticklabels(results.keys())
pyplot.show()
In [43]:
s_k = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
param_grid = {'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=s_k, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
In [53]:
res = zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['param_KNN__n_neighbors'])
for a, b in res:
    print(a, b)
In [56]:
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(KNeighborsRegressor(n_neighbors=3)))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
results = {}
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, y_train, cv=kfold, scoring=scoring)
    results[key] = cv_result
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
In [57]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(list(results.values()))
ax.set_xticklabels(results.keys())
pyplot.show()
In [58]:
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
param_grid = {'GBR__n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=s_gbr, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
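The score for each n_estimators setting can be inspected the same way as for KNN; a short sketch:
In [ ]:
for mean, param in zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['param_GBR__n_estimators']):
    print(mean, param)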
In [ ]:
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
s_gbr.set_params(GBR__n_estimators=500).fit(X=X_train, y=y_train)
predictions = s_gbr.predict(X_test)
print(mean_squared_error(y_test, predictions))
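As a final sanity check (an illustrative addition, not part of the original run): taking the square root puts the error back on the scale of MEDV, and plotting predictions against actual values shows how tight the fit is.
In [ ]:
# root mean squared error, in the same units as MEDV
print(np.sqrt(mean_squared_error(y_test, predictions)))
# predicted vs. actual; points near the diagonal indicate good predictions
pyplot.scatter(y_test, predictions)
pyplot.xlabel('actual MEDV')
pyplot.ylabel('predicted MEDV')
pyplot.show()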