In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ClassBalance
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ClassificationReport
from ipywidgets import interact
In [2]:
import os
from yellowbrick.download import download_all
## Directory, relative to the current working directory, holding the data sets
FIXTURES = os.path.join(os.getcwd(), "data")

## Lookup table: dataset name -> location of its CSV file on disk
datasets = dict(
    occupancy=os.path.join(FIXTURES, "occupancy", "occupancy.csv"),
)
def load_data(name, download=True):
    """
    Read the dataset registered under ``name`` into a pandas data frame.

    The CSV location is looked up in the module-level ``datasets`` mapping.
    When that file is not on disk, ``download_all()`` is called if
    ``download`` is true; otherwise a ``ValueError`` is raised.
    """
    csv_path = datasets[name]
    missing = not os.path.exists(csv_path)

    # Guard clause: the file is absent and the caller opted out of downloading
    if missing and not download:
        message = (
            "'{}' dataset has not been downloaded, "
            "use the download.py module to fetch datasets"
        )
        raise ValueError(message.format(name))

    # Fetch the data sets before attempting to read the file
    if missing:
        download_all()

    return pd.read_csv(csv_path)
In [3]:
# Load the classification data set
data = load_data("occupancy")

# Specify the features of interest and the classes of the target
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ["unoccupied", "occupied"]

# Extract the numpy arrays from the data frame. ``.values`` is used because
# ``as_matrix()`` was deprecated in pandas 0.23 and removed in pandas 1.0.
X = data[features].values
y = data.occupancy.values

# Create the train and test data; the fixed random_state keeps the split
# identical every time the notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
In [4]:
# Index the visualizer classes by their class name so that the name chosen
# in the widget can be mapped back to the class to instantiate.
models = dict(
    (cls.__name__, cls)
    for cls in (ClassBalance, ClassPredictionError, ClassificationReport)
)
@interact(model=list(models.keys()))
def graph_classifers(model="ClassBalance"):
    """
    Draw the visualizer selected by name for a random forest classifier.

    ``model`` is a key of the ``models`` mapping. The chosen visualizer wraps
    a ``RandomForestClassifier``, is fit on ``X_train``/``y_train``, scored on
    ``X_test``/``y_test`` and finalized. Returns the matplotlib axes created
    for the figure.
    """
    _, axes = plt.subplots(ncols=1, figsize=(15, 5))

    # Hand the new axes to the visualizer explicitly instead of relying on
    # pyplot's implicit "current axes"; the fixed random_state makes the
    # fitted forest (and therefore the figure) reproducible across re-runs.
    viz = models[model](
        RandomForestClassifier(random_state=42), ax=axes, classes=classes
    )
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()
    return axes
In [ ]: