In [11]:
import logging
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBClassifier
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
# Display/reproducibility configuration: fixed-point numpy output with 6
# decimals, and a fixed global RNG seed for repeatable runs.
np.set_printoptions(precision=6, suppress=True)
np.random.seed(1000)
# logging.basicConfig(level=logging.DEBUG)  # uncomment for verbose debug logging
In [2]:
# Load the Boston housing dataset from sklearn.
# NOTE(review): load_boston was deprecated in sklearn 1.0 and removed in 1.2;
# this notebook needs sklearn < 1.2 (or a switch to another regression dataset).
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# Wrap the split arrays in a heamy Dataset (test labels are held back).
dataset = Dataset(X_train, y_train, X_test)

# First-stage models: RandomForest & LinearRegression.
# NOTE(review): LinearRegression's `normalize` parameter was removed in sklearn 1.2.
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack the two models: returns a new dataset built from out-of-fold predictions.
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Second stage: train LinearRegression on the stacked predictions.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
# Fix: the original bound both the predictions and the validation output to the
# same name `results`, silently clobbering the predictions.
predictions = stacker.predict()

# Validate the stacker using 10-fold cross-validation scored with MAE.
validation_scores = stacker.validate(k=10, scorer=mean_absolute_error)
In [4]:
class BostonDataset(Dataset):
    """heamy Dataset subclass: preprocess() supplies the Boston train/test split."""

    def preprocess(self):
        # heamy invokes this hook to obtain (X_train, y_train, X_test, y_test).
        raw = load_boston()
        features, target = raw['data'], raw['target']
        tr_X, te_X, tr_y, te_y = train_test_split(
            features, target, test_size=0.1, random_state=111)
        return tr_X, tr_y, te_X, te_y

BostonDataset()
Out[4]:
In [5]:
def boston_dataset():
    """Preprocessor callable for heamy: returns (X_train, y_train, X_test, y_test)."""
    raw = load_boston()
    tr_X, te_X, tr_y, te_y = train_test_split(
        raw['data'], raw['target'], test_size=0.1, random_state=111)
    return tr_X, tr_y, te_X, te_y

# A Dataset can also be built lazily from a preprocessor function.
Dataset(preprocessor=boston_dataset)
Out[5]:
In [4]:
# Build a heamy Dataset directly from pre-split arrays (test labels omitted).
dataset = Dataset(X_train, y_train, X_test)
dataset
Out[4]:
In [23]:
# Wrap sklearn / xgboost estimators as heamy models (Regressor / Classifier).
# NOTE(review): LinearRegression's `normalize` parameter was removed in sklearn 1.2.
linreg = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)
linreg, xgbclf
Out[23]:
In [24]:
# A custom estimator can be a plain function: heamy calls it with the split
# arrays and treats its return value as the model's predictions.
# NOTE(review): Sequential/Dense/BatchNormalization/Dropout/Activation are never
# imported in this notebook (presumably Keras), and the kwargs used here
# (output_dim, init, nb_epoch) are Keras 1.x era — this cell will not run
# without those imports and a matching Keras version. Confirm before reuse.
def mlp_model(X_train, X_test, y_train, y_test=None):
    net = Sequential()
    net.add(Dense(256, input_shape=(X_train.shape[1],)))
    net.add(BatchNormalization())
    net.add(Dropout(0.5))
    net.add(Dense(output_dim=1, init='glorot_uniform'))
    net.add(Activation('linear'))
    net.compile(loss='mean_squared_error', optimizer="RMSprop")
    net.fit(X_train, y_train.values, nb_epoch=10, batch_size=128, verbose=1,
            validation_data=(X_test, y_test.values))
    return net.predict(X_test)

Regressor(dataset=dataset, estimator=mlp_model)
Out[24]:
In [7]:
# A custom estimator can also be a Regressor subclass that overrides
# `estimator` (template — the model-building body was elided in the original).
class MLPRegressor(Regressor):
    def estimator(self, X_train, X_test, y_train, y_test=None):
        # ... build and fit `model` here ...
        return model.predict(X_test)

MLPRegressor(dataset=dataset)
Out[7]:
Define a model and validate it
In [26]:
# Base model used by the validation examples below.
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
In [27]:
# 5-fold cross-validation (shuffled, unstratified) scored with MAE.
res = model_rf.validate(mean_absolute_error,k=5,shuffle=True,stratify=False)
In [10]:
# Holdout validation: a randomly sampled 20% of the data is the test set.
res = model_rf.validate(mean_absolute_error, test_size=0.20)
In [29]:
# Holdout validation with explicitly chosen row indices: first 250 rows train,
# rows 250-332 test.
train_index = np.arange(250)
test_index = np.arange(250, 333)
res = model_rf.validate(mean_absolute_error, indices=(train_index, test_index))
Initialize pipeline with some models
In [30]:
# Rebuild the dataset lazily from the preprocessor, define three base
# regressors, and combine them into a pipeline.
dataset = Dataset(preprocessor=boston_dataset)
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15},name='knn')
pipeline = ModelsPipeline(model_rf, model_lr, model_knn)
In [31]:
# Search for blending weights that minimize MAE, then blend predictions with them.
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)
In [32]:
# Average the base models' predictions for the test set...
result = pipeline.mean().execute()
# ...or cross-validate the averaged predictions (10 folds, MAE).
_ = pipeline.mean().validate(mean_absolute_error,10)
In [33]:
# Arbitrary combiner: element-wise maximum across the base models' predictions.
result = pipeline.apply(lambda x: np.max(x,axis=0)).execute()
result
Out[33]:
In [9]:
# --- 1st level: three base regressors trained on the raw dataset ---
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15}, name='knn')
pipeline = ModelsPipeline(model_rf, model_lr, model_knn)
stack_ds = pipeline.stack(k=5, seed=111)

# --- 2nd level: models trained on the out-of-fold stacked predictions ---
stack_rf = Regressor(dataset=stack_ds, estimator=RandomForestRegressor, parameters={'n_estimators': 15}, name='rf')
stack_lr = Regressor(dataset=stack_ds, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
stack_pipeline = ModelsPipeline(stack_rf, stack_lr)

# --- 3rd level: weighted blend of the 2nd-level models, validated with MAE ---
# (removed a stray `print('---')` debug leftover from the original cell)
weights = stack_pipeline.find_weights(mean_absolute_error)
result = stack_pipeline.weight(weights).validate(mean_absolute_error, 10)