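This notebook takes `examples/examples.ipynb` as its starting point and shows how several
yellowbrick feature visualizers can be chained together in a single `VisualPipeline`.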
In [1]:
# Render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline
# Load the autoreload extension and reload all modules before executing code,
# so edits to local source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
In [2]:
import os
import sys
sys.path.append("..")
sys.path.append("../..")
import numpy as np
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt
In [3]:
from download import download_all
# Directory (under the current working directory) that holds the data sets
FIXTURES = os.path.join(os.getcwd(), "data")

# Map each data set name to its CSV file, laid out as <FIXTURES>/<name>/<name>.csv
datasets = {
    name: os.path.join(FIXTURES, name, name + ".csv")
    for name in ("credit", "concrete", "occupancy", "mushroom")
}
def load_data(name, download=True):
    """
    Read one of the registered data sets into a pandas DataFrame.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping; an unknown key
        raises ``KeyError`` from the lookup.
    download : bool, default True
        When the CSV file is not on disk, call ``download_all()`` first if
        True; otherwise raise ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the data set's CSV path.
    """
    csv_path = datasets[name]
    is_missing = not os.path.exists(csv_path)

    # Refuse early when the file is absent and fetching is not allowed
    if is_missing and not download:
        raise ValueError((
            "'{}' dataset has not been downloaded, "
            "use the download.py module to fetch datasets"
        ).format(name))

    # Otherwise fetch the data sets before attempting to read the file
    if is_missing:
        download_all()

    return pd.read_csv(csv_path)
In [4]:
# Read the occupancy (classification) data set and report its row count
data = load_data("occupancy")
print(data.shape[0])
data.head()
Out[4]:
In [5]:
# Columns to visualize and the display labels for the two target classes
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ["unoccupied", "occupied"]

# Work with just the first 1000 rows as a small demo sample
X = data[features].head(1000)
y = data.occupancy.head(1000)
In [6]:
from yellowbrick.features import (Rank1D,
Rank2D,
ScatterViz,
RadViz,
ParallelCoordinates,
JointPlotVisualizer,
PCADecomposition
)
from yellowbrick.pipeline import VisualPipeline
DataFrame version: fit the pipeline on the pandas DataFrame `X`.
In [7]:
# One panel per visualizer, laid out on a 3x2 grid of axes
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))

pl = VisualPipeline([
    # Feature ranking goes in the left column
    ("rank1d", Rank1D(features=features, ax=ax[0, 0])),
    ("rank2d", Rank2D(features=features, ax=ax[1, 0])),
    # Class-colored multivariate views go in the right column
    ("pcoords", ParallelCoordinates(features=features, classes=classes, ax=ax[0, 1])),
    ("radviz", RadViz(features=features, classes=classes, ax=ax[1, 1])),
    # Bottom row: scatter of the first two features, then PCA.
    # JointPlotVisualizer is left out; PCA uses the bottom-right panel.
    ("scatter", ScatterViz(features=features[:2], classes=classes, ax=ax[2, 0])),
    ("pca", PCADecomposition(ax=ax[2, 1])),
])

# Trailing semicolon suppresses the repr of the returned value
pl.fit_transform_show(X, y);
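NumPy version: fit the same pipeline on the underlying array `X.values`, selecting the scatter plot columns by index.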
In [8]:
# One panel per visualizer, laid out on a 3x2 grid of axes
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))

pl = VisualPipeline([
    # Feature ranking goes in the left column
    ("rank1d", Rank1D(features=features, ax=ax[0, 0])),
    ("rank2d", Rank2D(features=features, ax=ax[1, 0])),
    # Class-colored multivariate views go in the right column
    ("pcoords", ParallelCoordinates(features=features, classes=classes, ax=ax[0, 1])),
    ("radviz", RadViz(features=features, classes=classes, ax=ax[1, 1])),
    # Bottom row: scatter of columns 0 and 1 (chosen by position), then PCA.
    # JointPlotVisualizer is left out; PCA uses the bottom-right panel.
    ("scatter", ScatterViz(x=0, y=1, classes=classes, ax=ax[2, 0])),
    ("pca", PCADecomposition(ax=ax[2, 1])),
])

# Pass the raw values rather than the DataFrame; the semicolon hides the repr
pl.fit_transform_show(X.values, y);
In [ ]: