In [1]:
import os
import sys
# Modify the path
sys.path.append("..")
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
In [3]:
occupancy = pd.read_csv('data/occupancy/occupancy.csv')
features = [
"temperature", "relative humidity", "light", "C02", "humidity"
]
classes = ["unoccupied", "occupied"]
X = occupancy[features]
y = occupancy['occupancy']
# Create the train and test data
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
In [4]:
from yellowbrick.contrib.scatter import ScatterVisualizer
visualizer = ScatterVisualizer(
x="light", y="C02", alpha=0.1, classes=classes
)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()
In [8]:
import itertools
import numpy as np
from yellowbrick.features.base import DataVisualizer
from yellowbrick.utils import is_dataframe, is_structured_array
from yellowbrick.utils import has_ndarray_int_columns
from yellowbrick.exceptions import YellowbrickValueError
from yellowbrick.style.colors import resolve_colors
##########################################################################
# Quick Methods
##########################################################################
def scatterviz(X,
y=None,
ax=None,
alpha = 1,
features=None,
classes=None,
color=None,
colormap=None,
markers=None,
**kwargs):
"""Displays a bivariate scatter plot.
This helper function is a quick wrapper to utilize the ScatterVisualizer
(Transformer) for one-off analysis.
Parameters
----------
X : ndarray or DataFrame of shape n x m
A matrix of n instances with m features
y : ndarray or Series of length n, default: None
An array or series of target or class values
ax : matplotlib axes, default: None
The axes to plot the figure on.
features : list of strings, default: None
The names of two features or columns.
More than that will raise an error.
classes : list of strings, default: None
The names of the classes in the target
color : list or tuple of colors, default: None
Specify the colors for each individual class
colormap : string or matplotlib cmap, default: None
Sequential colormap for continuous target
markers : iterable of strings, default: ,+o*vhd
Matplotlib style markers for points on the scatter plot points
alpha : float, default: 1.0
Specify a transparency where 1 is completely opaque and 0 is completely
transparent. This property makes densely clustered points more visible.
Returns
-------
ax : matplotlib axes
Returns the axes that the parallel coordinates were drawn on.
"""
# Instantiate the visualizer
visualizer = ScatterVisualizer(ax, features, classes, color, colormap,
markers, **kwargs)
# Fit and transform the visualizer (calls draw)
visualizer.fit(X, y, **kwargs)
visualizer.transform(X)
# Return the axes object on the visualizer
return visualizer.ax
##########################################################################
# Static ScatterVisualizer Visualizer
##########################################################################
class ScatterVisualizer(DataVisualizer):
"""
ScatterVisualizer is a bivariate feature data visualization algorithm that
plots using the Cartesian coordinates of each point.
Parameters
----------
ax : a matplotlib plot, default: None
The axis to plot the figure on.
x : string, default: None
The feature name that corresponds to a column name or index postion
in the matrix that will be plotted against the x-axis
y : string, default: None
The feature name that corresponds to a column name or index postion
in the matrix that will be plotted against the y-axis
features : a list of two feature names to use, default: None
List of two features that correspond to the columns in the array.
The order of the two features correspond to X and Y axes on the
graph. More than two feature names or columns will raise an error.
If a DataFrame is passed to fit and features is None, feature names
are selected that are the columns of the DataFrame.
classes : a list of class names for the legend, default: None
If classes is None and a y value is passed to fit then the classes
are selected from the target vector.
color : optional list or tuple of colors to colorize points, default: None
Use either color to colorize the points on a per class basis or
colormap to color them on a continuous scale.
colormap : optional string or matplotlib cmap to colorize points, default: None
Use either color to colorize the points on a per class basis or
colormap to color them on a continuous scale.
markers : iterable of strings, default: ,+o*vhd
Matplotlib style markers for points on the scatter plot points
kwargs : keyword arguments passed to the super class.
alpha : float, default: 1.0
Specify a transparency where 1 is completely opaque and 0 is completely
transparent. This property makes densely clustered points more visible.
These parameters can be influenced later on in the visualization
process, but can and should be set as early as possible.
"""
def __init__(self,
ax=None,
x=None,
y=None,
alpha = 1,
features=None,
classes=None,
color=None,
colormap=None,
markers=None,
**kwargs):
"""
Initialize the base scatter with many of the options required in order
to make the visualization work.
"""
super(ScatterVisualizer, self).__init__(ax, features, classes, color,
colormap, **kwargs)
self.x = x
self.y = y
self.alpha = alpha
self.markers = itertools.cycle(
kwargs.pop('markers', (',', '+', 'o', '*', 'v', 'h', 'd')))
self.color = color
self.colormap = colormap
if self.x is not None and self.y is not None and self.features_ is not None:
raise YellowbrickValueError(
'Please specify x,y or features, not both.')
if self.x is not None and self.y is not None and self.features_ is None:
self.features_ = [self.x, self.y]
# Ensure with init that features doesn't have more than two features
if features is not None:
if len(features) != 2:
raise YellowbrickValueError(
'ScatterVisualizer only accepts two features.')
def fit(self, X, y=None, **kwargs):
"""
The fit method is the primary drawing input for the parallel coords
visualization since it has both the X and y data required for the
viz and the transform method does not.
Parameters
----------
X : ndarray or DataFrame of shape n x m
A matrix of n instances with 2 features
y : ndarray or Series of length n
An array or series of target or class values
kwargs : dict
Pass generic arguments to the drawing method
Returns
-------
self : instance
Returns the instance of the transformer/visualizer
"""
_, ncols = X.shape
if ncols == 2:
X_two_cols = X
if self.features_ is None:
self.features_ = ["Feature One", "Feature Two"]
# Handle the feature names if they're None.
elif self.features_ is not None and is_dataframe(X):
X_two_cols = X[self.features_].as_matrix()
# handle numpy named/ structured array
elif self.features_ is not None and is_structured_array(X):
X_selected = X[self.features_]
X_two_cols = X_selected.copy().view((np.float64, len(X_selected.dtype.names)))
# handle features that are numeric columns in ndarray matrix
elif self.features_ is not None and has_ndarray_int_columns(self.features_, X):
f_one, f_two = self.features_
X_two_cols = X[:, [int(f_one), int(f_two)]]
else:
raise YellowbrickValueError("""
ScatterVisualizer only accepts two features, please
explicitly set these two features in the init kwargs or
pass a matrix/ dataframe in with only two columns.""")
# Store the classes for the legend if they're None.
if self.classes_ is None:
# TODO: Is this the most efficient method?
self.classes_ = [str(label) for label in np.unique(y)]
# Draw the instances
self.draw(X_two_cols, y, **kwargs)
# Fit always returns self.
return self
def draw(self, X, y, **kwargs):
"""Called from the fit method, this method creates a scatter plot that
draws each instance as a class or target colored point, whose location
is determined by the feature data set.
"""
# Set the axes limits
self.ax.set_xlim([-1,1])
self.ax.set_ylim([-1,1])
# set the colors
color_values = resolve_colors(
n_colors=len(self.classes_),
colormap=self.colormap,
colors=self.color
)
colors = dict(zip(self.classes_, color_values))
# Create a data structure to hold the scatter plot representations
to_plot = {}
for kls in self.classes_:
to_plot[kls] = [[], []]
# Add each row of the data set to to_plot for plotting
# TODO: make this an independent function for override
for i, row in enumerate(X):
row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
x_, y_ = row_[0], row_[1]
kls = self.classes_[y[i]]
to_plot[kls][0].append(x_)
to_plot[kls][1].append(y_)
# Add the scatter plots from the to_plot function
# TODO: store these plots to add more instances to later
# TODO: make this a separate function
for i, kls in enumerate(self.classes_):
self.ax.scatter(
to_plot[kls][0],
to_plot[kls][1],
marker=next(self.markers),
color=colors[kls],
label=str(kls),
alpha=self.alpha,
**kwargs)
self.ax.axis('equal')
def finalize(self, **kwargs):
"""
Finalize executes any subclass-specific axes finalization steps.
The user calls show and show calls finalize.
Parameters
----------
kwargs: generic keyword arguments.
"""
# Divide out the two features
feature_one, feature_two = self.features_
# Set the title
self.set_title('Scatter Plot: {0} vs {1}'.format(
str(feature_one), str(feature_two)))
# Add the legend
self.ax.legend(loc='best')
self.ax.set_xlabel(str(feature_one))
self.ax.set_ylabel(str(feature_two))
# Alias for ScatterViz
ScatterViz = ScatterVisualizer
In [9]:
visualizer = ScatterVisualizer(x="light", y="C02", alpha=0.5, classes=classes, size=(500,350))
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()
In [10]:
visualizer = ScatterVisualizer(
x="temperature", y="light", alpha=0.7, classes=classes
)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()
In [ ]: