This notebook takes examples/examples.ipynb as its starting point.


In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.append("..")
sys.path.append("../..")

import numpy as np 
import pandas as pd
import yellowbrick as yb

In [3]:
from yellowbrick.features.rankd import Rank1D, Rank2D, rank1d, rank2d

In [4]:
# !pip install pandas requests nose

Dataset


In [5]:
# %run download.py

In [6]:
from download import download_all 

## Root directory of the test data sets, resolved against the current
## working directory
FIXTURES = os.path.join(os.getcwd(), "data")

## Dataset name -> CSV path, laid out as <FIXTURES>/<name>/<name>.csv
datasets = {
    name: os.path.join(FIXTURES, name, name + ".csv")
    for name in ("credit", "concrete", "occupancy", "mushroom")
}

def load_data(name, download=True):
    """
    Loads the named dataset from its CSV file into a pandas DataFrame.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping.
    download : bool, default True
        What to do when the CSV file is missing: call ``download_all()``
        to fetch the data sets if True, raise if False.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the dataset's path.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``datasets``; the message lists the
        available names.
    ValueError
        If the file is missing and ``download`` is False.
    """
    # Look the path up explicitly so an unknown or mistyped name reports the
    # valid choices instead of a bare KeyError carrying only the bad key.
    try:
        path = datasets[name]
    except KeyError:
        raise KeyError(
            "unknown dataset '{}', expected one of: {}".format(
                name, ", ".join(map(str, datasets))
            )
        ) from None

    # Fetch the data when it is missing, unless the caller opted out
    if not os.path.exists(path):
        if not download:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))
        download_all()

    # Return the data frame
    return pd.read_csv(path)

In [7]:
# Load the classification data set ("credit"); load_data downloads the
# data sets first if the CSV is missing. Then preview the first rows.
data = load_data('credit') 
data.head()


Out[7]:
limit sex edu married age apr_delay may_delay jun_delay jul_delay aug_delay ... jul_bill aug_bill sep_bill apr_pay may_pay jun_pay jul_pay aug_pay sep_pay default
0 20000 2 2 1 24 2 2 -1 -1 -2 ... 0 0 0 0 689 0 0 0 0 1
1 120000 2 2 2 26 -1 2 0 0 0 ... 3272 3455 3261 0 1000 1000 1000 0 2000 1
2 90000 2 2 2 34 0 0 0 0 0 ... 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
3 50000 2 2 1 37 0 0 0 0 0 ... 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
4 50000 1 2 1 57 -1 0 -1 0 0 ... 20940 19146 19131 2000 36681 10000 9000 689 679 0

5 rows × 24 columns


In [8]:
# Columns of interest, grouped by name family; the list order is the
# column order of X
features = [
    # account-level columns
    'limit', 'sex', 'edu', 'married', 'age',
    # *_delay columns, April through September
    'apr_delay', 'may_delay', 'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay',
    # *_bill columns, April through September
    'apr_bill', 'may_bill', 'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill',
    # *_pay columns, April through September
    'apr_pay', 'may_pay', 'jun_pay', 'jul_pay', 'aug_pay', 'sep_pay',
]

# Feature frame and the `default` target column
X = data[features]
y = data['default']

Rank1D

New visualizer


In [9]:
# DataFrame input with no `features` argument:
# get features from column names...
visualizer = Rank1D(algorithm='shapiro')
visualizer.fit_transform_show(X, y);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [10]:
# Raw numpy version: X.values / y.values are passed instead of the pandas
# objects, so the feature names are supplied explicitly via `features`
visualizer = Rank1D(algorithm='shapiro', features=features)
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [11]:
# numpy version, no feature names: neither column names nor a `features`
# argument are available to the visualizer
visualizer = Rank1D(algorithm='shapiro')
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [12]:
# disable tick labels via show_feature_names=False
visualizer = Rank1D(algorithm='shapiro', show_feature_names=False)
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Vertical orientation


In [13]:
# Vertical orientation (orient='v'), DataFrame input:
# get features from column names...
visualizer = Rank1D(algorithm='shapiro', orient='v')
visualizer.fit_transform_show(X, y);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [14]:
# Raw numpy version, vertical orientation: feature names are supplied
# explicitly via `features`
visualizer = Rank1D(algorithm='shapiro', features=features, orient='v')
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [15]:
# numpy version, no feature names, vertical orientation
visualizer = Rank1D(algorithm='shapiro', orient='v')
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [16]:
# disable tick labels via show_feature_names=False, vertical orientation
visualizer = Rank1D(algorithm='shapiro', show_feature_names=False, orient='v')
visualizer.fit_transform_show(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Quick methods


In [17]:
# Quick method on the DataFrame, no `features` argument:
# get features from column names...
rank1d(X, y);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [18]:
# Raw numpy version: the arrays carry no column names, so the feature
# names are supplied explicitly via `features` (without it this cell is
# the same call as the "no feature names" case)
rank1d(X.values, y.values, features=features);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [19]:
# numpy version, no feature names: no `features` argument is passed
rank1d(X.values, y.values);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [20]:
# disable tick labels via show_feature_names=False
rank1d(X.values, y.values, show_feature_names=False);


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Quick methods, vertical orientation


In [21]:
# Quick method, vertical orientation (orient='v'), DataFrame input:
# get features from column names...
rank1d(X, y, orient='v');


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [22]:
# Raw numpy version, vertical orientation: the arrays carry no column
# names, so the feature names are supplied explicitly via `features`
# (without it this cell is the same call as the "no feature names" case)
rank1d(X.values, y.values, features=features, orient='v');


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [23]:
# numpy version, no feature names, vertical orientation
rank1d(X.values, y.values, orient='v');


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [24]:
# disable tick labels via show_feature_names=False, vertical orientation
rank1d(X.values, y.values, show_feature_names=False, orient='v');


/Users/pschafer/.virtualenvs/yellowbrick/lib/python3.6/site-packages/scipy/stats/morestats.py:1326: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Rank2D

Fixes the order of the tick labels and uses the feature names as the labels.


In [25]:
# DataFrame input with no `features` argument:
# get features from column names...
visualizer = Rank2D()
visualizer.fit_transform_show(X, y);



In [26]:
# raw numpy version: X.values / y.values are passed instead of the pandas
# objects, so the feature names are supplied explicitly via `features`
visualizer = Rank2D(features=features)
visualizer.fit_transform_show(X.values, y.values);



In [27]:
# numpy version, no feature names: neither column names nor a `features`
# argument are available to the visualizer
visualizer = Rank2D()
visualizer.fit_transform_show(X.values, y.values);



In [28]:
# disable tick labels via show_feature_names=False
visualizer = Rank2D(show_feature_names=False)
visualizer.fit_transform_show(X.values, y.values);


Quick method


In [29]:
# Quick method on the DataFrame (no target passed):
# get features from column names...
rank2d(X);



In [30]:
# raw numpy version: feature names are supplied explicitly via `features`
rank2d(X.values, features=features);



In [31]:
# numpy version, no feature names: no `features` argument is passed
rank2d(X.values);



In [32]:
# disable tick labels via show_feature_names=False (DataFrame input here,
# unlike the other "disable tick labels" cells, which pass X.values)
rank2d(X, show_feature_names=False);