In [3]:
%matplotlib notebook
import json
import time
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.datasets import load_occupancy
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
In [4]:
df = load_occupancy(return_dataset=True).to_dataframe()
df.head()
Out[4]:
In [ ]:
df.describe()
In [ ]:
sns.pairplot(hue="occupancy", data=df)
In [ ]:
X = df[[col for col in df.columns if col not in ("occupancy", "datetime")]]
y = df["occupancy"]
print(X.shape)
print(y.shape)
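The cross-validation later in the notebook stratifies on the target and scores with f1_macro, which presumes some class imbalance; a quick, illustrative check of the class proportions (standard pandas, nothing beyond what is already imported):
In [ ]:
# Sketch: inspect the balance of the occupancy target, which motivates
# the StratifiedKFold splitting and f1_macro scoring used below.
y.value_counts(normalize=True)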
In [11]:
class ExtractDailyFeatures(BaseEstimator, TransformerMixin):
    """
    Derives weekday and hour-of-day columns from datetime columns.
    """

    def __init__(self, extract_weekday=True, extract_hour=True):
        # Assign constructor arguments directly, per scikit-learn convention,
        # so that get_params/set_params work as expected.
        self.extract_weekday = extract_weekday
        self.extract_hour = extract_hour

    def fit(self, X, y=None):
        # Record the observed date range of each fitted datetime column
        self.date_range_ = {}
        for col in X.columns:
            series = pd.to_datetime(X[col])
            self.date_range_[col] = [series.min(), series.max()]
        return self

    def transform(self, X):
        """
        Assumes X is a DataFrame whose fitted columns hold datetime values;
        appends a weekday and/or hour column for each of them.
        """
        cols = []
        for col in self.date_range_:
            series = pd.to_datetime(X[col])
            if self.extract_weekday:
                weekdays = series.apply(lambda d: d.weekday())
                weekdays.name = f"{col}_weekday"
                cols.append(weekdays)
            if self.extract_hour:
                hours = series.apply(lambda d: d.hour)
                hours.name = f"{col}_hour"
                cols.append(hours)
        if len(cols) > 0:
            return pd.concat([X] + cols, axis=1)
        return X


extractor = ExtractDailyFeatures().fit(df[["datetime"]])
extractor.transform(df[["datetime"]]).head()
Out[11]:
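Because the flags are constructor arguments, they work with the usual scikit-learn parameter interface; as an illustrative sketch, weekday extraction can be switched off so only the hour column is appended:
In [ ]:
# Sketch: toggle a constructor flag; fit_transform is supplied by
# TransformerMixin (fit followed by transform).
ExtractDailyFeatures(extract_weekday=False).fit_transform(df[["datetime"]]).head()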
In [ ]:
def train_score(model, X, y):
    """
    Cross-validates the model with stratified 12-fold CV, then refits it on
    all of the data; this reflects the experimental methodology for this
    specific dataset.
    """
    start = time.time()
    cv = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring="f1_macro")
    cv_end = time.time()
    model.fit(X, y)
    fit_end = time.time()
    return model, {
        "f1_macro": scores,
        "model_name": model[-1].__class__.__name__,
        "params": model.get_params(),
        "cv_time": cv_end - start,
        "fit_time": fit_end - cv_end,
        "timestamp": start,
        "description": "trained as part of the XBUS 505 Session 3 demo",
    }


def save_model(model, info):
    name = f"{info['model_name']}-{info['timestamp']}"
    with open(f"{name}.pickle", "wb") as f:
        joblib.dump(model, f)  # joblib.dump requires a filename or file object
    with open(f"{name}.json", "w") as f:
        # Convert the scores array to a list and stringify anything else that
        # is not JSON-serializable (e.g. the estimator objects in params).
        json.dump({**info, "f1_macro": info["f1_macro"].tolist()}, f, default=str)


def generate_models(classifiers, rescalers, X, y, save=False):
    for classifier in classifiers:
        for rescaler in rescalers:
            model = Pipeline([
                ("scale", rescaler()),
                ("model", classifier()),
            ])
            model, info = train_score(model, X, y)
            if save:
                save_model(model, info)
            print(f"{info['model_name']}: {info['f1_macro'].mean():0.4f}")
In [ ]:
classifiers = [
GaussianNB, RandomForestClassifier, LogisticRegression
]
rescalers = [
StandardScaler, MinMaxScaler
]
generate_models(classifiers, rescalers, X, y)
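The datetime column was excluded from X earlier, so ExtractDailyFeatures is never actually used in the comparison above. As a hedged follow-on sketch, the derived weekday and hour columns could be added and the same comparison rerun; the raw datetime column is dropped because the scalers require numeric input:
In [ ]:
# Sketch: enrich the features with weekday/hour columns derived from the
# datetime column, then rerun the same model comparison.
X_dates = ExtractDailyFeatures().fit_transform(df[["datetime"]])
X_time = pd.concat([X, X_dates.drop(columns=["datetime"])], axis=1)
generate_models(classifiers, rescalers, X_time, y)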