In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
# Switch matplotlib to its 'ggplot' style sheet for the plots in this notebook.
plt.style.use('ggplot')
In [2]:
from sklearn import datasets
# Fetch the California housing dataset through scikit-learn's dataset loaders.
data = datasets.fetch_california_housing()
In [3]:
# List the attribute names available on the fetched dataset object.
dir(data)
Out[3]:
In [4]:
# Wrap the raw feature matrix in a DataFrame, labelling columns with the feature names.
data_df = pd.DataFrame(data.data, columns=data.feature_names)
In [5]:
# Preview the first rows of the feature table.
data_df.head()
Out[5]:
In [6]:
# Attach the regression target as an extra 'target' column next to the features.
data_df['target'] = data.target
In [7]:
# (rows, columns) of the table, now including the 'target' column.
data_df.shape
Out[7]:
In [8]:
# Preview the first rows again, this time with the 'target' column present.
data_df.head()
Out[8]:
In [9]:
# Histogram of the target values.
data_df['target'].plot.hist()
Out[9]:
In [10]:
# Correlation of every feature with the target, drawn as a bar chart.
# The 'target' column itself is dropped first: its correlation with itself is
# always 1 and adds nothing to a feature-vs-target comparison.
data_df.drop('target', axis=1).corrwith(data_df['target']).plot(kind='bar', rot=30)
Out[10]:
In [11]:
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Baseline: linear regression on three columns (MedInc, AveBedrms, Latitude),
# scored with R^2 over 10 cross-validation folds; train scores are kept too.
model = Pipeline([
    ('reg', LinearRegression()),
])
# `random_state` only has an effect together with `shuffle=True`. On an
# unshuffled KFold scikit-learn rejects the seed (older releases ignore it),
# so shuffling is enabled to make the seeded split both valid and reproducible.
fold = KFold(n_splits=10, shuffle=True, random_state=12345)
X = data_df.loc[:, ['MedInc', 'AveBedrms', 'Latitude']].values
y = data_df.target.values
results = cross_validate(
    model, X, y, cv=fold, scoring='r2', return_train_score=True
)
In [12]:
# Turn the cross_validate output into a table (one row per fold) and
# compare train vs. test score per fold as a bar chart.
results = pd.DataFrame(results)
results[['train_score', 'test_score']].plot.bar()
Out[12]:
In [37]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import RMSprop
def build_model():
    """Build and compile the feed-forward network used as the regressor.

    Layers, in order: BatchNormalization on the inputs, Dense(256, relu),
    Dropout(0.2), Dense(128, relu), Dropout(0.2) and a final Dense(1) with no
    activation argument. The model is compiled with RMSprop and mean squared
    error as both the loss and the reported metric.

    Returns:
        The compiled ``Sequential`` model; a new one is created on every call.
    """
    model = Sequential()
    # Add model layers.
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(
        # NOTE(review): `lr` is the legacy argument name; newer Keras releases
        # call it `learning_rate` - confirm which one the installed version expects.
        optimizer=RMSprop(lr=1.5 * 1E-3),
        loss='mean_squared_error',
        metrics=['mean_squared_error'])
    return model
# Neural-network regressor: the Keras model from build_model wrapped as a
# scikit-learn estimator, trained on every column except 'target' and scored
# with R^2 over 10 cross-validation folds; train scores are kept too.
model = Pipeline([
    ('reg', KerasRegressor(build_fn=build_model, epochs=50, batch_size=128, verbose=False)),
])
# `random_state` only has an effect together with `shuffle=True`. On an
# unshuffled KFold scikit-learn rejects the seed (older releases ignore it),
# so shuffling is enabled to make the seeded split both valid and reproducible.
fold = KFold(n_splits=10, shuffle=True, random_state=12345)
# Features and target are both cast to float32 before being handed to the model.
X = data_df.drop('target', axis=1).values.astype(np.float32)
y = data_df.target.values.astype(np.float32)
results = cross_validate(
    model, X, y, cv=fold, scoring='r2', return_train_score=True
)
In [38]:
# Tabulate the per-fold cross_validate output, then chart train vs. test score.
results = pd.DataFrame(results)
score_columns = ['train_score', 'test_score']
results[score_columns].plot.bar()
Out[38]:
In [39]:
# Column-wise average of the results table across all folds.
results.mean()
Out[39]: