notebook.community

Edit and run



In [2]:

    
import numpy as np
import pandas as pd
from matplotlib import pyplot



In [3]:

    
%matplotlib inline



In [4]:

    
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',  'RAD', 'TAX', 'PRTATIO', 'B', 'LSTAT', 'MEDV']



In [5]:

    
df = pd.read_csv(filename, names=names, sep='\s+')



In [12]:

    
df.shape









    Out[12]:





(506, 14)



In [17]:

    
pd.set_option('precision', 1)
df.describe()



In [7]:

    
df.dtypes









    Out[7]:





CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PRTATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object



In [12]:

    
pd.set_option('display.width', 120)
df.tail(30)



In [15]:

    
df.isnull().sum()









    Out[15]:





CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PRTATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64



In [19]:

    
## correlation
pd.set_option('precision', 2)
df.corr(method = 'pearson')



In [22]:

    
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()



In [23]:

    
df.plot(kind='density', subplots=True, layout=(4,4), sharex=False, fontsize=1)
pyplot.show()



In [6]:

    
df.plot(kind='box', subplots=True, layout=(4,4), sharex=False,  sharey=False, fontsize=8, figsize=(12,12))
pyplot.show()



In [10]:

    
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr())
fig.colorbar(cax)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()



In [13]:

    
y = df[names[-1]]
X = df[names[:-1]]
print y.shape
print X.shape









    



(506,)
(506, 13)



In [19]:

    
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV



In [16]:

    
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=7)



In [18]:

    
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'



In [20]:

    
from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Lasso

from sklearn.linear_model import ElasticNet

from sklearn.tree import DecisionTreeRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.svm import SVR

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error



In [21]:

    
models = {}

models['LR'] = LinearRegression()

models['LASSO'] = Lasso()

models['EN'] = ElasticNet()

models['KNN']  = KNeighborsRegressor()

models['CART'] = DecisionTreeRegressor()

models['SVM'] = SVR()



In [31]:

    
results = {}

for key in models:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))









    



KNN: -41.896488 (13.901688)
SVM: -85.518342 (31.994798)
CART: -27.830154 (11.994918)
EN: -27.502259 (12.305022)
LR: -21.379856 (9.414264)
LASSO: -26.423561 (11.651110)



In [37]:

    
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [39]:

    
from sklearn.preprocessing import StandardScaler



In [41]:

    
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])

pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])

pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])

pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])

pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])

pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])

results = {}

for key in pipelines:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(pipelines[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))









    



ScalerCART: -24.623580 (13.503302)
ScalerSVM: -29.633086 (17.009186)
ScalerKNN: -20.107620 (12.376949)
ScalerLASSO: -26.607314 (8.978761)
ScalerEN: -27.932372 (10.587490)
ScalerLR: -21.379856 (9.414264)



In [42]:

    
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [43]:

    
s_k = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])

param_grid = {'KNN__n_neighbors':  [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}

kfold = KFold(n_splits=num_folds, random_state=seed)

grid = GridSearchCV(estimator=s_k, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)

print('最优：%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))



In [53]:

    
res= zip(grid_result.cv_results_['mean_train_score'], grid_result.cv_results_['param_KNN__n_neighbors'])
for a,b in res:
    print a, b









    



0.0 1
-8.72121935976 3
-12.7354948482 5
-15.7113632473 7
-16.7747066501 9
-17.8059637546 11
-18.5498163019 13
-19.2884405979 15
-20.2389037149 17
-21.2455064052 19
-22.0803859781 21



In [54]:

    
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error



In [56]:

    
ensembles = {}

ensembles['ScaledAB'] = Pipeline([('Scaler',StandardScaler()), ('AB', AdaBoostRegressor())])

ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(base_estimator= KNeighborsRegressor(n_neighbors=3)))])

ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])

ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])

ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])

ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('RBR', GradientBoostingRegressor())])



results = {}

for key in ensembles:

    kfold = KFold(n_splits=num_folds, random_state=seed)

    cv_result = cross_val_score(ensembles[key], X_train, y_train, cv=kfold, scoring=scoring)

    results[key] = cv_result

    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))









    



ScaledAB-LR: -23.780579 (9.029614)
ScaledAB: -15.283989 (6.713950)
ScaledAB-KNN: -16.446338 (10.290478)
ScaledETR: -10.296758 (5.517486)
ScaledRFR: -13.708249 (6.847397)
ScaledGBR: -10.379358 (4.633832)



In [57]:

    
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.boxplot(results.values())
ax.set_xticklabels(results.keys())
pyplot.show()



In [58]:

    
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

param_grid = {'GBR__n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}

kfold = KFold(n_splits=num_folds, random_state=seed)

grid = GridSearchCV(estimator=s_gbr, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=y_train)

print('最优：%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))









    



最优：-9.28708798401 使用{'GBR__n_estimators': 500}



In [ ]:

    
s_gbr = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
s_gbr.set_params(GBR__n_estimators=500).fit(X=X_train, y=y_train)
predictions = s_gbr.predict(X_test)

print mean_squared_error(y_test, )

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PRTATIO	B	LSTAT	MEDV
count	5.1e+02	506.0	506.0	5.1e+02	506.0	506.0	506.0	506.0	506.0	506.0	506.0	506.0	506.0	506.0
mean	3.6e+00	11.4	11.1	6.9e-02	0.6	6.3	68.6	3.8	9.5	408.2	18.5	356.7	12.7	22.5
std	8.6e+00	23.3	6.9	2.5e-01	0.1	0.7	28.1	2.1	8.7	168.5	2.2	91.3	7.1	9.2
min	6.3e-03	0.0	0.5	0.0e+00	0.4	3.6	2.9	1.1	1.0	187.0	12.6	0.3	1.7	5.0
25%	8.2e-02	0.0	5.2	0.0e+00	0.4	5.9	45.0	2.1	4.0	279.0	17.4	375.4	6.9	17.0
50%	2.6e-01	0.0	9.7	0.0e+00	0.5	6.2	77.5	3.2	5.0	330.0	19.1	391.4	11.4	21.2
75%	3.7e+00	12.5	18.1	0.0e+00	0.6	6.6	94.1	5.2	24.0	666.0	20.2	396.2	17.0	25.0
max	8.9e+01	100.0	27.7	1.0e+00	0.9	8.8	100.0	12.1	24.0	711.0	22.0	396.9	38.0	50.0

	CRIM	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PRTATIO	B	LSTAT	MEDV
476	4.87141	18.10	0.614	6.484	93.6	2.3053	24	666.0	20.2	396.21	18.68	16.7
477	15.02340	18.10	0.614	5.304	97.3	2.1007	24	666.0	20.2	349.48	24.91	12.0
478	10.23300	18.10	0.614	6.185	96.7	2.1705	24	666.0	20.2	379.70	18.03	14.6
479	14.33370	18.10	0.614	6.229	88.0	1.9512	24	666.0	20.2	383.32	13.11	21.4
480	5.82401	18.10	0.532	6.242	64.7	3.4242	24	666.0	20.2	396.90	10.74	23.0
481	5.70818	18.10	0.532	6.750	74.9	3.3317	24	666.0	20.2	393.07	7.74	23.7
482	5.73116	18.10	0.532	7.061	77.0	3.4106	24	666.0	20.2	395.28	7.01	25.0
483	2.81838	18.10	0.532	5.762	40.3	4.0983	24	666.0	20.2	392.92	10.42	21.8
484	2.37857	18.10	0.583	5.871	41.9	3.7240	24	666.0	20.2	370.73	13.34	20.6
485	3.67367	18.10	0.583	6.312	51.9	3.9917	24	666.0	20.2	388.62	10.58	21.2
486	5.69175	18.10	0.583	6.114	79.8	3.5459	24	666.0	20.2	392.68	14.98	19.1
487	4.83567	18.10	0.583	5.905	53.2	3.1523	24	666.0	20.2	388.22	11.45	20.6
488	0.15086	27.74	0.609	5.454	92.7	1.8209	4	711.0	20.1	395.09	18.06	15.2
489	0.18337	27.74	0.609	5.414	98.3	1.7554	4	711.0	20.1	344.05	23.97	7.0
490	0.20746	27.74	0.609	5.093	98.0	1.8226	4	711.0	20.1	318.43	29.68	8.1
491	0.10574	27.74	0.609	5.983	98.8	1.8681	4	711.0	20.1	390.11	18.07	13.6
492	0.11132	27.74	0.609	5.983	83.5	2.1099	4	711.0	20.1	396.90	13.35	20.1
493	0.17331	9.69	0.585	5.707	54.0	2.3817	6	391.0	19.2	396.90	12.01	21.8
494	0.27957	9.69	0.585	5.926	42.6	2.3817	6	391.0	19.2	396.90	13.59	24.5
495	0.17899	9.69	0.585	5.670	28.8	2.7986	6	391.0	19.2	393.29	17.60	23.1
496	0.28960	9.69	0.585	5.390	72.9	2.7986	6	391.0	19.2	396.90	21.14	19.7
497	0.26838	9.69	0.585	5.794	70.6	2.8927	6	391.0	19.2	396.90	14.10	18.3
498	0.23912	9.69	0.585	6.019	65.3	2.4091	6	391.0	19.2	396.90	12.92	21.2
499	0.17783	9.69	0.585	5.569	73.5	2.3999	6	391.0	19.2	395.77	15.10	17.5
500	0.22438	9.69	0.585	6.027	79.7	2.4982	6	391.0	19.2	396.90	14.33	16.8
501	0.06263	11.93	0.573	6.593	69.1	2.4786	1	273.0	21.0	391.99	9.67	22.4
502	0.04527	11.93	0.573	6.120	76.7	2.2875	1	273.0	21.0	396.90	9.08	20.6
503	0.06076	11.93	0.573	6.976	91.0	2.1675	1	273.0	21.0	396.90	5.64	23.9
504	0.10959	11.93	0.573	6.794	89.3	2.3889	1	273.0	21.0	393.45	6.48	22.0
505	0.04741	11.93	0.573	6.030	80.8	2.5050	1	273.0	21.0	396.90	7.88	11.9

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PRTATIO	B	LSTAT	MEDV
CRIM	1.00	-0.20	0.41	-5.59e-02	0.42	-0.22	0.35	-0.38	6.26e-01	0.58	0.29	-0.39	0.46	-0.39
ZN	-0.20	1.00	-0.53	-4.27e-02	-0.52	0.31	-0.57	0.66	-3.12e-01	-0.31	-0.39	0.18	-0.41	0.36
INDUS	0.41	-0.53	1.00	6.29e-02	0.76	-0.39	0.64	-0.71	5.95e-01	0.72	0.38	-0.36	0.60	-0.48
CHAS	-0.06	-0.04	0.06	1.00e+00	0.09	0.09	0.09	-0.10	-7.37e-03	-0.04	-0.12	0.05	-0.05	0.18
NOX	0.42	-0.52	0.76	9.12e-02	1.00	-0.30	0.73	-0.77	6.11e-01	0.67	0.19	-0.38	0.59	-0.43
RM	-0.22	0.31	-0.39	9.13e-02	-0.30	1.00	-0.24	0.21	-2.10e-01	-0.29	-0.36	0.13	-0.61	0.70
AGE	0.35	-0.57	0.64	8.65e-02	0.73	-0.24	1.00	-0.75	4.56e-01	0.51	0.26	-0.27	0.60	-0.38
DIS	-0.38	0.66	-0.71	-9.92e-02	-0.77	0.21	-0.75	1.00	-4.95e-01	-0.53	-0.23	0.29	-0.50	0.25
RAD	0.63	-0.31	0.60	-7.37e-03	0.61	-0.21	0.46	-0.49	1.00e+00	0.91	0.46	-0.44	0.49	-0.38
TAX	0.58	-0.31	0.72	-3.56e-02	0.67	-0.29	0.51	-0.53	9.10e-01	1.00	0.46	-0.44	0.54	-0.47
PRTATIO	0.29	-0.39	0.38	-1.22e-01	0.19	-0.36	0.26	-0.23	4.65e-01	0.46	1.00	-0.18	0.37	-0.51
B	-0.39	0.18	-0.36	4.88e-02	-0.38	0.13	-0.27	0.29	-4.44e-01	-0.44	-0.18	1.00	-0.37	0.33
LSTAT	0.46	-0.41	0.60	-5.39e-02	0.59	-0.61	0.60	-0.50	4.89e-01	0.54	0.37	-0.37	1.00	-0.74
MEDV	-0.39	0.36	-0.48	1.75e-01	-0.43	0.70	-0.38	0.25	-3.82e-01	-0.47	-0.51	0.33	-0.74	1.00