In [11]:
import logging
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBClassifier

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)

np.random.seed(1000)
# logging.basicConfig(level=logging.DEBUG)

Minimalist example:


In [2]:
# load boston dataset from sklearn
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression 
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')

# Stack two models 
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.stack(k=10,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10-fold cross-validation
results = stacker.validate(k=10,scorer=mean_absolute_error)


Metric: mean_absolute_error
Folds accuracy: [2.5409484630187498, 1.5968722187619016, 1.956704827743762, 1.8927907834120281, 2.7337753076007165, 2.7029209330510935, 1.6230094648637268, 2.5274361322273666, 2.4850130256874818, 2.281401529471093]
Mean accuracy: 2.23408726858
Standard Deviation: 0.410827978086
Variance: 0.168779627578
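Blending is a lighter-weight alternative to stacking: instead of out-of-fold predictions, each model trains on part of the data and predicts the held-out remainder. A minimal sketch, assuming heamy's `ModelsPipeline.blend` keeps its usual `proportion`/`seed` signature:

# Blend the same two models instead of stacking them
# (each model trains on 80% of the data; its predictions on the
# held-out 20% become the second-stage training set)
blend_ds = pipeline.blend(proportion=0.2, seed=111)
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
results = blender.validate(k=10, scorer=mean_absolute_error)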

Dataset creation

Class-based

In [4]:
class BostonDataset(Dataset):
    def preprocess(self):
        data = load_boston()
        X, y = data['data'], data['target']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
        return X_train, y_train, X_test, y_test
BostonDataset()


Out[4]:
BostonDataset(1f75270001f1c408a9341958f62cf493)
Function-based

In [5]:
def boston_dataset():
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
    return X_train, y_train, X_test, y_test
Dataset(preprocessor=boston_dataset)


Out[5]:
boston_dataset(a32eb71bbba4a4ca5e54a13e00b3fd81)
Minimal working example, no cache

In [4]:
dataset = Dataset(X_train, y_train, X_test)
dataset


Out[4]:
Dataset(c9b316f827981b3d0b53f8ab139234ea)
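A dataset built from in-memory arrays like this one skips heamy's on-disk cache entirely. For preprocessor-backed datasets the cache can also be turned off explicitly; a sketch assuming the constructor's `use_cache` flag keeps its default heamy semantics:

# opt out of on-disk caching for a preprocessor-backed dataset
dataset_nocache = Dataset(preprocessor=boston_dataset, use_cache=False)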

Model creation

For models that support the scikit-learn API

In [23]:
linreg = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)
linreg, xgbclf


Out[23]:
(LinearRegression(34d4a8b31c27c12cbee90430d34f6e38),
 XGBClassifier(e09a1c5931ad63447f68420beffe1a63))
Function-based definition

In [24]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation

def mlp_model(X_train, X_test, y_train, y_test=None):
    model = Sequential()

    model.add(Dense(256, input_shape=(X_train.shape[1],)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1, kernel_initializer='glorot_uniform'))
    model.add(Activation('linear'))

    model.compile(loss='mean_squared_error', optimizer='RMSprop')
    # targets are plain numpy arrays here, so no .values accessor is needed
    model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=1,
              validation_data=(X_test, y_test))
    return model.predict(X_test)
Regressor(dataset=dataset, estimator=mlp_model)


Out[24]:
mlp_model(03860b84c0263516981a901aedbdc1d1)
Class-based definition

In [7]:
class MLPRegressor(Regressor):
    def estimator(self, X_train, X_test, y_train, y_test=None):
        # ....
        return model.predict(X_test)
MLPRegressor(dataset=dataset)


Out[7]:
MLPRegressor(3d51fd0fddac221d82f22edb5ecd893c)
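Since the cell above elides the model code, here is a self-contained class-based sketch with the same `estimator` signature, wrapping a plain scikit-learn model (the class name and `n_neighbors` value are illustrative):

class KNNRegressor(Regressor):
    def estimator(self, X_train, X_test, y_train, y_test=None):
        # any fit/predict logic works here
        model = KNeighborsRegressor(n_neighbors=5)
        model.fit(X_train, y_train)
        return model.predict(X_test)
KNNRegressor(dataset=dataset, name='knn_custom')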

Model validation

Define model


In [26]:
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
K-fold cross-validation

In [27]:
res = model_rf.validate(mean_absolute_error,k=5,shuffle=True,stratify=False)


Metric: mean_absolute_error
Folds accuracy: [2.2178901098901109, 2.0261318681318681, 2.5353626373626379, 2.4249890109890115, 2.3894945054945058]
Mean accuracy: 2.31877362637
Standard Deviation: 0.178331507546
Variance: 0.0318021265837
Validation using a holdout dataset

In [10]:
# Use randomly sampled 20% of the data as a holdout dataset
res = model_rf.validate(mean_absolute_error, test_size=0.20)


Metric: mean_absolute_error
Accuracy: 2.18037260752

In [29]:
# Custom indices for holdout
train_index = np.arange(250)
test_index = np.arange(250, 333)

res = model_rf.validate(mean_absolute_error,indices=(train_index,test_index))


Metric: mean_absolute_error
Accuracy: 2.37060240964
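The `indices` argument accepts any deterministic split, for example a time-ordered holdout when rows are chronological (an illustrative sketch reusing the arrays loaded earlier):

# hold out the last 20% of rows instead of a random sample
split = int(len(X_train) * 0.8)
res = model_rf.validate(mean_absolute_error,
                        indices=(np.arange(split), np.arange(split, len(X_train))))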

More on pipelines

Initialize pipeline with some models


In [30]:
dataset = Dataset(preprocessor=boston_dataset)

model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15},name='knn')

pipeline = ModelsPipeline(model_rf, model_lr, model_knn)
Weighted average

In [31]:
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)


Best Score (mean_absolute_error): 2.13885516123
Best Weights: [ 0.883654  0.116346  0.      ]
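The fitted weights can also be applied to the test-set predictions, assuming `weight(...)` returns the same result object as `mean()` below:

# weighted blend of the three models on the test set
result = pipeline.weight(weights).execute()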
Simple mean

In [32]:
# get predictions for the test set
result = pipeline.mean().execute()
# or validate with 10-fold cross-validation
_ = pipeline.mean().validate(mean_absolute_error, 10)


Metric: mean_absolute_error
Folds accuracy: [3.3973174238660273, 2.7173869162889996, 2.2419615273624673, 3.0460731991343413, 3.69471451536146, 3.7883397204515954, 2.1306147334648409, 2.9458670951396022, 2.2279531241149551, 2.5849058757589845]
Mean accuracy: 2.87751341309
Standard Deviation: 0.574079708111
Variance: 0.329567511265
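For strictly positive targets a geometric mean can be more robust to outlying predictions; a sketch assuming heamy's `gmean()` shares the `mean()` interface:

# geometric mean of the three models' predictions
_ = pipeline.gmean().validate(mean_absolute_error, 10)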
Custom function

In [33]:
result = pipeline.apply(lambda x: np.max(x,axis=0)).execute()
result


Out[33]:
array([ 28.892009,  29.37481 ,  33.466667,  29.166667,  47.077483,
        26.761489,  34.97431 ,  20.713333,  24.627809,  14.088647,
        27.172003,  46.706623,  23.34    ,  31.115211,  20.074106,
        24.520919,  26.813333,  26.28    ,  19.326667,  22.926667,
        14.781457,  27.214832,  22.796856,  32.649007,  16.96    ,
        19.510596,  43.784106,  25.313333,  26.133333,  13.452318,
        32.067085,  10.633333,  22.453333,  28.96824 ,  44.8     ,
        31.2     ,  10.826667,  44.594702,  24.29462 ,  16.87743 ,
        27.84    ,  20.377483,  25.1     ,  23.053333,  23.606667,
        34.122363,  30.254863,  17.62    ,  43.774834,  20.236077,
        23.432906])
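`apply` takes any function over the stacked prediction matrix (models along axis 0), so other combiners validate the same way; for instance a median:

# median across models, validated with 10-fold cross-validation
_ = pipeline.apply(lambda x: np.median(x, axis=0)).validate(mean_absolute_error, 10)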

3-level stacking


In [9]:
# 1st level
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15},name='knn')

pipeline = ModelsPipeline(model_rf,model_lr,model_knn)
stack_ds = pipeline.stack(k=5,seed=111)

# 2nd level
stack_rf = Regressor(dataset=stack_ds, estimator=RandomForestRegressor, parameters={'n_estimators': 15},name='rf')
stack_lr = Regressor(dataset=stack_ds, estimator=LinearRegression, parameters={'normalize': True},name='lr')
stack_pipeline = ModelsPipeline(stack_rf,stack_lr)

# 3rd level
weights = stack_pipeline.find_weights(mean_absolute_error)
print('---')
result = stack_pipeline.weight(weights).validate(mean_absolute_error,10)


Best Score (mean_absolute_error): 2.04313642617
Best Weights: [ 0.356845  0.643155]
---
Metric: mean_absolute_error
Folds accuracy: [2.1941236696859359, 1.6244554310298898, 1.8894005943524717, 2.0743091479239579, 2.4599829226741226, 2.7573855144072361, 1.6437354143971163, 2.333320517625137, 2.2536311445247388, 2.4101536603595477]
Mean accuracy: 2.1640498017
Standard Deviation: 0.343969088383
Variance: 0.118314733763