scikit-learn Demo


In [3]:
%matplotlib notebook

import json
import time
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from yellowbrick.datasets import load_occupancy

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import BaseEstimator, TransformerMixin

Data Loading


In [4]:
df = load_occupancy(return_dataset=True).to_dataframe()
df.head()


Out[4]:
             datetime  temperature  relative humidity  light     CO2  humidity  occupancy
0 2015-02-04 17:51:00        23.18            27.2720  426.0  721.25  0.004793          1
1 2015-02-04 17:51:59        23.15            27.2675  429.5  714.00  0.004783          1
2 2015-02-04 17:53:00        23.15            27.2450  426.0  713.50  0.004779          1
3 2015-02-04 17:54:00        23.15            27.2000  426.0  708.25  0.004772          1
4 2015-02-04 17:55:00        23.10            27.2000  426.0  704.50  0.004757          1

In [ ]:
df.describe()

In [ ]:
sns.pairplot(hue="occupancy", data=df)

In [ ]:
X = df.drop(columns=["datetime", "occupancy"])
y = df["occupancy"]

print(X.shape)
print(y.shape)
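
Since the models below are scored with macro F1 and stratified folds, it is worth checking how balanced the target is first. A quick sketch, assuming the y series defined above:

In [ ]:
# Fraction of observations in each occupancy class
y.value_counts(normalize=True)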

Custom Transformer


In [11]:
class ExtractDailyFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self, extract_weekday=True, extract_hour=True):
        # Assign hyperparameters directly: calling set_params() in __init__
        # invokes get_params() before the attributes exist, which is what
        # raises a FutureWarning in scikit-learn.
        self.extract_weekday = extract_weekday
        self.extract_hour = extract_hour
        
    def fit(self, X, y=None):
        self.date_range_ = {}
        for col in X.columns:
            series = pd.to_datetime(X[col])
            self.date_range_[col] = [
                series.min(), series.max()
            ]
        return self
        
    def transform(self, X):
        """
        Extracts weekday and/or hour features from every datetime column
        seen during fit. Assumes X is a DataFrame containing those columns.
        """
        cols = []
        for col in self.date_range_:
            series = pd.to_datetime(X[col])
            if self.extract_weekday:
                # Monday=0 through Sunday=6
                weekdays = series.dt.weekday
                weekdays.name = f"{col}_weekday"
                cols.append(weekdays)
            if self.extract_hour:
                hours = series.dt.hour
                hours.name = f"{col}_hour"
                cols.append(hours)

        if len(cols) > 0:
            return pd.concat([X] + cols, axis=1)
        return X
    

extractor = ExtractDailyFeatures().fit(df[["datetime"]])
extractor.transform(df[["datetime"]]).head()


Out[11]:
             datetime  datetime_weekday  datetime_hour
0 2015-02-04 17:51:00                 2             17
1 2015-02-04 17:51:59                 2             17
2 2015-02-04 17:53:00                 2             17
3 2015-02-04 17:54:00                 2             17
4 2015-02-04 17:55:00                 2             17
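
To use the transformer inside a modeling pipeline, one option is a ColumnTransformer that applies it to the datetime column and passes the sensor readings through. This wiring is a sketch, not part of the original demo; note that transform() also returns the raw datetime column, which a later step would still need to drop or encode before scaling:

In [ ]:
from sklearn.compose import ColumnTransformer

# Illustrative wiring only: extract weekday/hour from the datetime column
# and pass the sensor readings through unchanged.
features = ColumnTransformer([
    ("daily", ExtractDailyFeatures(), ["datetime"]),
], remainder="passthrough")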

Initial Model Triples


In [ ]:
def train_score(model, X, y):
    """
    This function reflects the experimental methodology for this specific dataset
    """
    start = time.time()
    cv = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1_macro')
    cv_end = time.time()
    
    model.fit(X, y)
    fit_end = time.time()
    
    return model, {
        "f1_macro": scores,
        "model_name": model[-1].__class__.__name__,
        "params": model.get_params(),
        "cv_time": cv_end - start,
        "fit_time": fit_end - cv_end,
        "timestamp": start,
        "description": "trained as part of the XBUS 505 Session 3 demo",
    }


def save_model(model, info):
    name = f"{info['model_name']}-{info['timestamp']}"
    with open(f"{name}.pickle", "wb") as f:
        joblib.dump(model, f)

    with open(f"{name}.json", "w") as f:
        # default=str falls back to str() for values that aren't JSON
        # serializable, e.g. the numpy scores array and the estimator
        # objects inside params.
        json.dump(info, f, default=str)


def generate_models(classifiers, rescalers, X, y, save=False):
    for classifier in classifiers:
        for rescaler in rescalers:
            model = Pipeline([
                ("scale", rescaler()),
                ("model", classifier()),
            ])
            
            model, info = train_score(model, X, y)
            if save:
                save_model(model, info)
            print(f"{info['model_name']}: {info['f1_macro'].mean():0.4f}")

In [ ]:
classifiers = [
    GaussianNB, RandomForestClassifier, LogisticRegression
]

rescalers = [
    StandardScaler, MinMaxScaler
]


generate_models(classifiers, rescalers, X, y)
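
Once a combination looks promising, the same pipeline pattern can be fit on all of the data and used for prediction. A sketch; the pairing of StandardScaler with RandomForestClassifier here is illustrative, not a claim about which combination scored best:

In [ ]:
model = Pipeline([
    ("scale", StandardScaler()),
    ("model", RandomForestClassifier()),
]).fit(X, y)

# Predict occupancy for the five most recent observations
model.predict(X.tail())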