In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot

In [3]:
%matplotlib inline

In [4]:
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',  'RAD', 'TAX', 'PRTATIO', 'B', 'LSTAT', 'MEDV']

In [5]:
df = pd.read_csv(filename, names=names, sep='\s+')

In [12]:
df.shape


Out[12]:
(506, 14)

In [17]:
pd.set_option('precision', 1)
df.describe()


Out[17]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PRTATIO B LSTAT MEDV
count 5.1e+02 506.0 506.0 5.1e+02 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0
mean 3.6e+00 11.4 11.1 6.9e-02 0.6 6.3 68.6 3.8 9.5 408.2 18.5 356.7 12.7 22.5
std 8.6e+00 23.3 6.9 2.5e-01 0.1 0.7 28.1 2.1 8.7 168.5 2.2 91.3 7.1 9.2
min 6.3e-03 0.0 0.5 0.0e+00 0.4 3.6 2.9 1.1 1.0 187.0 12.6 0.3 1.7 5.0
25% 8.2e-02 0.0 5.2 0.0e+00 0.4 5.9 45.0 2.1 4.0 279.0 17.4 375.4 6.9 17.0
50% 2.6e-01 0.0 9.7 0.0e+00 0.5 6.2 77.5 3.2 5.0 330.0 19.1 391.4 11.4 21.2
75% 3.7e+00 12.5 18.1 0.0e+00 0.6 6.6 94.1 5.2 24.0 666.0 20.2 396.2 17.0 25.0
max 8.9e+01 100.0 27.7 1.0e+00 0.9 8.8 100.0 12.1 24.0 711.0 22.0 396.9 38.0 50.0

In [7]:
df.dtypes


Out[7]:
CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PRTATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

In [12]:
pd.set_option('display.width', 120)
df.tail(30)


Out[12]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PRTATIO B LSTAT MEDV
476 4.87141 0.0 18.10 0 0.614 6.484 93.6 2.3053 24 666.0 20.2 396.21 18.68 16.7
477 15.02340 0.0 18.10 0 0.614 5.304 97.3 2.1007 24 666.0 20.2 349.48 24.91 12.0
478 10.23300 0.0 18.10 0 0.614 6.185 96.7 2.1705 24 666.0 20.2 379.70 18.03 14.6
479 14.33370 0.0 18.10 0 0.614 6.229 88.0 1.9512 24 666.0 20.2 383.32 13.11 21.4
480 5.82401 0.0 18.10 0 0.532 6.242 64.7 3.4242 24 666.0 20.2 396.90 10.74 23.0
481 5.70818 0.0 18.10 0 0.532 6.750 74.9 3.3317 24 666.0 20.2 393.07 7.74 23.7
482 5.73116 0.0 18.10 0 0.532 7.061 77.0 3.4106 24 666.0 20.2 395.28 7.01 25.0
483 2.81838 0.0 18.10 0 0.532 5.762 40.3 4.0983 24 666.0 20.2 392.92 10.42 21.8
484 2.37857 0.0 18.10 0 0.583 5.871 41.9 3.7240 24 666.0 20.2 370.73 13.34 20.6
485 3.67367 0.0 18.10 0 0.583 6.312 51.9 3.9917 24 666.0 20.2 388.62 10.58 21.2
486 5.69175 0.0 18.10 0 0.583 6.114 79.8 3.5459 24 666.0 20.2 392.68 14.98 19.1
487 4.83567 0.0 18.10 0 0.583 5.905 53.2 3.1523 24 666.0 20.2 388.22 11.45 20.6
488 0.15086 0.0 27.74 0 0.609 5.454 92.7 1.8209 4 711.0 20.1 395.09 18.06 15.2
489 0.18337 0.0 27.74 0 0.609 5.414 98.3 1.7554 4 711.0 20.1 344.05 23.97 7.0
490 0.20746 0.0 27.74 0 0.609 5.093 98.0 1.8226 4 711.0 20.1 318.43 29.68 8.1
491 0.10574 0.0 27.74 0 0.609 5.983 98.8 1.8681 4 711.0 20.1 390.11 18.07 13.6
492 0.11132 0.0 27.74 0 0.609 5.983 83.5 2.1099 4 711.0 20.1 396.90 13.35 20.1
493 0.17331 0.0 9.69 0 0.585 5.707 54.0 2.3817 6 391.0 19.2 396.90 12.01 21.8
494 0.27957 0.0 9.69 0 0.585 5.926 42.6 2.3817 6 391.0 19.2 396.90 13.59 24.5
495 0.17899 0.0 9.69 0 0.585 5.670 28.8 2.7986 6 391.0 19.2 393.29 17.60 23.1
496 0.28960 0.0 9.69 0 0.585 5.390 72.9 2.7986 6 391.0 19.2 396.90 21.14 19.7
497 0.26838 0.0 9.69 0 0.585 5.794 70.6 2.8927 6 391.0 19.2 396.90 14.10 18.3
498 0.23912 0.0 9.69 0 0.585 6.019 65.3 2.4091 6 391.0 19.2 396.90 12.92 21.2
499 0.17783 0.0 9.69 0 0.585 5.569 73.5 2.3999 6 391.0 19.2 395.77 15.10 17.5
500 0.22438 0.0 9.69 0 0.585 6.027 79.7 2.4982 6 391.0 19.2 396.90 14.33 16.8
501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 396.90 7.88 11.9

In [15]:
df.isnull().sum()


Out[15]:
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PRTATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [19]:
## correlation
pd.set_option('precision', 2)
df.corr(method = 'pearson')


Out[19]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PRTATIO B LSTAT MEDV
CRIM 1.00 -0.20 0.41 -5.59e-02 0.42 -0.22 0.35 -0.38 6.26e-01 0.58 0.29 -0.39 0.46 -0.39
ZN -0.20 1.00 -0.53 -4.27e-02 -0.52 0.31 -0.57 0.66 -3.12e-01 -0.31 -0.39 0.18 -0.41 0.36
INDUS 0.41 -0.53 1.00 6.29e-02 0.76 -0.39 0.64 -0.71 5.95e-01 0.72 0.38 -0.36 0.60 -0.48
CHAS -0.06 -0.04 0.06 1.00e+00 0.09 0.09 0.09 -0.10 -7.37e-03 -0.04 -0.12 0.05 -0.05 0.18
NOX 0.42 -0.52 0.76 9.12e-02 1.00 -0.30 0.73 -0.77 6.11e-01 0.67 0.19 -0.38 0.59 -0.43
RM -0.22 0.31 -0.39 9.13e-02 -0.30 1.00 -0.24 0.21 -2.10e-01 -0.29 -0.36 0.13 -0.61 0.70
AGE 0.35 -0.57 0.64 8.65e-02 0.73 -0.24 1.00 -0.75 4.56e-01 0.51 0.26 -0.27 0.60 -0.38
DIS -0.38 0.66 -0.71 -9.92e-02 -0.77 0.21 -0.75 1.00 -4.95e-01 -0.53 -0.23 0.29 -0.50 0.25
RAD 0.63 -0.31 0.60 -7.37e-03 0.61 -0.21 0.46 -0.49 1.00e+00 0.91 0.46 -0.44 0.49 -0.38
TAX 0.58 -0.31 0.72 -3.56e-02 0.67 -0.29 0.51 -0.53 9.10e-01 1.00 0.46 -0.44 0.54 -0.47
PRTATIO 0.29 -0.39 0.38 -1.22e-01 0.19 -0.36 0.26 -0.23 4.65e-01 0.46 1.00 -0.18 0.37 -0.51
B -0.39 0.18 -0.36 4.88e-02 -0.38 0.13 -0.27 0.29 -4.44e-01 -0.44 -0.18 1.00 -0.37 0.33
LSTAT 0.46 -0.41 0.60 -5.39e-02 0.59 -0.61 0.60 -0.50 4.89e-01 0.54 0.37 -0.37 1.00 -0.74
MEDV -0.39 0.36 -0.48 1.75e-01 -0.43 0.70 -0.38 0.25 -3.82e-01 -0.47 -0.51 0.33 -0.74 1.00

In [22]:
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()



In [23]:
df.plot(kind='density', subplots=True, layout=(4,4), sharex=False, fontsize=1)
pyplot.show()



In [6]:
df.plot(kind='box', subplots=True, layout=(4,4), sharex=False,  sharey=False, fontsize=8, figsize=(12,12))
pyplot.show()



In [10]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr())
fig.colorbar(cax)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()



In [13]:
y = df[names[-1]]
X = df[names[:-1]]
print y.shape
print X.shape


(506,)
(506, 13)

In [19]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [16]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=7)

In [18]:
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

In [20]:
from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Lasso

from sklearn.linear_model import ElasticNet

from sklearn.tree import DecisionTreeRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.svm import SVR

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error

In [21]:
models = {}

models['LR'] = LinearRegression()

models['LASSO'] = Lasso()

models['EN'] = ElasticNet()

models['KNN']  = KNeighborsRegressor()

models['CART'] = DecisionTreeRegressor()

models['SVM'] = SVR()

In [31]:
results = {}

for key in models:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))


KNN: -41.896488 (13.901688)
SVM: -85.518342 (31.994798)
CART: -27.830154 (11.994918)
EN: -27.502259 (12.305022)
LR: -21.379856 (9.414264)
LASSO: -26.423561 (11.651110)

In [37]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [39]:
from sklearn.preprocessing import StandardScaler

In [41]:
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])

pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])

pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])

pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])

pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])

pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])

results = {}

for key in pipelines:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(pipelines[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))


ScalerCART: -24.623580 (13.503302)
ScalerSVM: -29.633086 (17.009186)
ScalerKNN: -20.107620 (12.376949)
ScalerLASSO: -26.607314 (8.978761)
ScalerEN: -27.932372 (10.587490)
ScalerLR: -21.379856 (9.414264)

In [42]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [43]:
s_k = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])

param_grid = {'KNN__n_neighbors':  [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}

kfold = KFold(n_splits=num_folds, random_state=seed)

grid = GridSearchCV(estimator=s_k, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)

print('最优:%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))

In [53]:
res= zip(grid_result.cv_results_['mean_train_score'], grid_result.cv_results_['param_KNN__n_neighbors'])
for a,b in res:
    print a, b


0.0 1
-8.72121935976 3
-12.7354948482 5
-15.7113632473 7
-16.7747066501 9
-17.8059637546 11
-18.5498163019 13
-19.2884405979 15
-20.2389037149 17
-21.2455064052 19
-22.0803859781 21

In [54]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error

In [56]:
ensembles = {}

ensembles['ScaledAB'] = Pipeline([('Scaler',StandardScaler()), ('AB', AdaBoostRegressor())])

ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(base_estimator= KNeighborsRegressor(n_neighbors=3)))])

ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])

ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])

ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])

ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('RBR', GradientBoostingRegressor())])



results = {}

for key in ensembles:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(ensembles[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))


ScaledAB-LR: -23.780579 (9.029614)
ScaledAB: -15.283989 (6.713950)
ScaledAB-KNN: -16.446338 (10.290478)
ScaledETR: -10.296758 (5.517486)
ScaledRFR: -13.708249 (6.847397)
ScaledGBR: -10.379358 (4.633832)

In [57]:
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [58]:
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

param_grid = {'GBR__n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}

kfold = KFold(n_splits=num_folds, random_state=seed)

grid = GridSearchCV(estimator=s_gbr, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)

print('最优:%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))


最优:-9.28708798401 使用{'GBR__n_estimators': 500}

In [ ]:
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
s_gbr.set_params(GBR__n_estimators=500).fit(X=X_train, y=y_train)
predictions = s_gbr.predict(X_test)

print mean_squared_error(y_test, )