This notebook takes examples/examples.ipynb as its starting point.



In [1]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:

    
import os
import sys

sys.path.append("..")
sys.path.append("../..")

import numpy as np 
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt









    



/Users/pschafer/.pyenv/versions/3.6.2/envs/yellowbrick/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Dataset



In [3]:

    
from download import download_all 

## Directory (under the current working directory) holding the test data sets
FIXTURES = os.path.join(os.getcwd(), "data")

## Dataset name -> CSV path; each set lives at <FIXTURES>/<name>/<name>.csv
datasets = {
    name: os.path.join(FIXTURES, name, name + ".csv")
    for name in ("credit", "concrete", "occupancy", "mushroom")
}

def load_data(name, download=True):
    """
    Read the dataset registered under ``name`` into a pandas DataFrame.

    The CSV location is looked up in the module-level ``datasets`` mapping.
    When the file is not on disk and ``download`` is truthy, ``download_all()``
    is called before reading; when ``download`` is falsy a ValueError is
    raised instead. A name missing from ``datasets`` raises KeyError.
    """
    csv_path = datasets[name]
    is_missing = not os.path.exists(csv_path)

    # Guard clause: the file is absent and the caller opted out of downloading
    if is_missing and not download:
        message = (
            "'{}' dataset has not been downloaded, "
            "use the download.py module to fetch datasets"
        )
        raise ValueError(message.format(name))

    if is_missing:
        download_all()

    frame = pd.read_csv(csv_path)
    return frame



In [4]:

    
# Read the occupancy (classification) data set, report its row count,
# then display the first rows as the cell output
data = load_data(name='occupancy')
print(data.shape[0])
data.head()









    



20560






    Out[4]:







  
    
      
      datetime
      temperature
      relative humidity
      light
      C02
      humidity
      occupancy
    
  
  
    
      0
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      1
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1
    
    
      2
      2015-02-04 17:53:00
      23.15
      27.2450
      426.0
      713.50
      0.004779
      1
    
    
      3
      2015-02-04 17:54:00
      23.15
      27.2000
      426.0
      708.25
      0.004772
      1
    
    
      4
      2015-02-04 17:55:00
      23.10
      27.2000
      426.0
      704.50
      0.004757
      1



In [5]:

    
# Columns used as inputs to the visualizers, and labels for the target classes
features = [
    "temperature",
    "relative humidity",
    "light",
    "C02",
    "humidity",
]
classes = ['unoccupied', 'occupied']

# Keep only the first 1000 rows as a small sample for the demo
X = data[features].head(1000)
y = data["occupancy"].head(1000)

Plot all the things



In [6]:

    
from yellowbrick.features import (Rank1D, 
                                  Rank2D,
                                  ScatterViz,
                                  RadViz,
                                  ParallelCoordinates,
                                  JointPlotVisualizer,
                                  PCADecomposition
                                 )
from yellowbrick.pipeline import VisualPipeline

DataFrame version: the pipeline is fit on the pandas DataFrame `X`.



In [7]:

    
# A 3x2 grid of axes; each visualizer in the pipeline is handed its own cell
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))

# Left column: rank1d, rank2d, scatter. Right column: pcoords, radviz, pca.
pl = VisualPipeline([
    ('rank1d', Rank1D(ax=ax[0, 0], features=features)),
    ('rank2d', Rank2D(ax=ax[1, 0], features=features)),
    ('pcoords', ParallelCoordinates(ax=ax[0, 1], features=features, classes=classes)),
    ('radviz', RadViz(ax=ax[1, 1], features=features, classes=classes)),
    ('scatter', ScatterViz(ax=ax[2, 0], features=features[:2], classes=classes)),
    # Disabled step; it targeted the same ax[2,1] slot that pca now uses:
    # ('joinplot', JointPlotVisualizer(feature=features[0], ax=ax[2,1])),
    ('pca', PCADecomposition(ax=ax[2, 1])),
])

# Trailing semicolon keeps the call's return value out of the cell output
pl.fit_transform_show(X, y);

NumPy version: the pipeline is fit on `X.values`, the NumPy array underlying `X`.



In [8]:

    
# A 3x2 grid of axes; each visualizer in the pipeline is handed its own cell
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 16))

# Same layout as the DataFrame cell, but the scatter step selects its two
# columns by position (x=0, y=1) because the input has no column names.
pl = VisualPipeline([
    ('rank1d', Rank1D(ax=ax[0, 0], features=features)),
    ('rank2d', Rank2D(ax=ax[1, 0], features=features)),
    ('pcoords', ParallelCoordinates(ax=ax[0, 1], features=features, classes=classes)),
    ('radviz', RadViz(ax=ax[1, 1], features=features, classes=classes)),
    ('scatter', ScatterViz(ax=ax[2, 0], x=0, y=1, classes=classes)),
    # Disabled step, kept as written (note the `cvax` keyword):
    # ('joinplot', JointPlotVisualizer(feature=features[0], cvax=ax[2,1])),
    ('pca', PCADecomposition(ax=ax[2, 1])),
])

# Trailing semicolon keeps the call's return value out of the cell output
pl.fit_transform_show(X.values, y);



In [ ]:

	datetime	temperature	relative humidity	light	C02	humidity	occupancy
0	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
1	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1
2	2015-02-04 17:53:00	23.15	27.2450	426.0	713.50	0.004779	1
3	2015-02-04 17:54:00	23.15	27.2000	426.0	708.25	0.004772	1
4	2015-02-04 17:55:00	23.10	27.2000	426.0	704.50	0.004757	1