This notebook takes examples/examples.ipynb
as its starting point.
In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
import os
import sys
sys.path.append("..")
sys.path.append("../..")
import numpy as np
import pandas as pd
import yellowbrick as yb
In [3]:
from yellowbrick.features.rankd import Rank1D, Rank2D, rank1d, rank2d
In [4]:
# !pip install pandas requests nose
In [5]:
# %run download.py
In [6]:
from download import download_all
## Root directory holding the test data sets (relative to the working directory)
FIXTURES = os.path.join(os.getcwd(), "data")

## Map each dataset name to its CSV file: <FIXTURES>/<name>/<name>.csv
datasets = {
    key: os.path.join(FIXTURES, key, key + ".csv")
    for key in ("credit", "concrete", "occupancy", "mushroom")
}
def load_data(name, download=True):
    """
    Loads the passed in dataset by name and returns it as a data frame.

    If the CSV file for the dataset is not on disk and ``download`` is true,
    ``download_all()`` is called to fetch the missing files before reading.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping.
    download : bool, default: True
        Whether to call ``download_all()`` when the file does not exist.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the dataset's path.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``datasets``.
    ValueError
        If the file is missing and ``download`` is false, or if it is still
        missing after ``download_all()`` has run.
    """
    # Get the path from the datasets, naming the valid choices on a bad key
    if name not in datasets:
        raise KeyError(
            "unknown dataset '{}', expected one of: {}".format(
                name, ", ".join(sorted(datasets))
            )
        )
    path = datasets[name]

    # Check if the data exists, otherwise download or raise
    if not os.path.exists(path):
        if not download:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))

        download_all()

        # Re-check after downloading so a failed or incomplete download is
        # reported here rather than as a less specific error from read_csv.
        if not os.path.exists(path):
            raise ValueError(
                "'{}' dataset was not found at '{}' after downloading".format(
                    name, path
                )
            )

    # Return the data frame
    return pd.read_csv(path)
In [7]:
# Read the "credit" data set into a DataFrame and preview its first rows
data = load_data("credit")
data.head()
Out[7]:
In [8]:
# Columns of interest, kept in their original order
features = [
    "limit", "sex", "edu", "married", "age",
    # *_delay columns
    "apr_delay", "may_delay", "jun_delay", "jul_delay", "aug_delay", "sep_delay",
    # *_bill columns
    "apr_bill", "may_bill", "jun_bill", "jul_bill", "aug_bill", "sep_bill",
    # *_pay columns
    "apr_pay", "may_pay", "jun_pay", "jul_pay", "aug_pay", "sep_pay",
]

# X holds the selected feature columns; y is the `default` column
X = data[features]
y = data["default"]
In [9]:
# DataFrame input: the feature names are taken from the column names
visualizer = Rank1D(algorithm="shapiro")
visualizer.fit_transform_show(X, y);
In [10]:
# Raw numpy input, with the feature names supplied explicitly via `features`
visualizer = Rank1D(
    algorithm="shapiro",
    features=features,
)
visualizer.fit_transform_show(X.values, y.values);
In [11]:
# Raw numpy input with no `features` list, so no feature names are supplied
visualizer = Rank1D(algorithm="shapiro")
visualizer.fit_transform_show(X.values, y.values);
In [12]:
# Raw numpy input with the tick labels disabled via show_feature_names=False
visualizer = Rank1D(
    algorithm="shapiro",
    show_feature_names=False,
)
visualizer.fit_transform_show(X.values, y.values);
In [13]:
# orient="v" variant, DataFrame input: feature names come from the column names
visualizer = Rank1D(
    algorithm="shapiro",
    orient="v",
)
visualizer.fit_transform_show(X, y);
In [14]:
# orient="v" variant, raw numpy input with feature names passed via `features`
visualizer = Rank1D(
    algorithm="shapiro",
    features=features,
    orient="v",
)
visualizer.fit_transform_show(X.values, y.values);
In [15]:
# orient="v" variant, raw numpy input with no feature names supplied
visualizer = Rank1D(
    algorithm="shapiro",
    orient="v",
)
visualizer.fit_transform_show(X.values, y.values);
In [16]:
# orient="v" variant, raw numpy input with tick labels disabled
visualizer = Rank1D(
    algorithm="shapiro",
    show_feature_names=False,
    orient="v",
)
visualizer.fit_transform_show(X.values, y.values);
In [17]:
# Quick-method form (rank1d function instead of the Rank1D class) on the
# DataFrame X: feature names are taken from the column names.
rank1d(X, y);
In [18]:
# Raw numpy version: the arrays carry no column names, so the names are passed
# explicitly via `features`, matching the Rank1D and rank2d raw-numpy examples
# in this notebook (without it this cell duplicates the "no feature names" one).
rank1d(X.values, y.values, features=features);
In [19]:
# Quick-method form on raw numpy arrays with no `features` argument, so no
# feature names are supplied.
rank1d(X.values, y.values);
In [20]:
# Quick-method form on raw numpy arrays with the tick labels disabled via
# show_feature_names=False.
rank1d(X.values, y.values, show_feature_names=False);
In [21]:
# Quick method with orient="v" on the DataFrame: names come from the columns
rank1d(
    X,
    y,
    orient="v",
);
In [22]:
# Raw numpy version with orient='v': the arrays carry no column names, so the
# names are passed explicitly via `features`, matching the Rank1D raw-numpy
# example (without it this cell duplicates the "no feature names" one).
rank1d(X.values, y.values, features=features, orient='v');
In [23]:
# Quick method with orient="v" on raw numpy arrays, no feature names supplied
rank1d(
    X.values,
    y.values,
    orient="v",
);
In [24]:
# Quick method with orient="v" on raw numpy arrays, tick labels disabled
rank1d(
    X.values,
    y.values,
    show_feature_names=False,
    orient="v",
);
In [25]:
# Rank2D with default arguments on the DataFrame X and target y: feature names
# are taken from the column names.
visualizer = Rank2D()
visualizer.fit_transform_show(X, y);
In [26]:
# Rank2D on raw numpy arrays (X.values, y.values); the feature names are
# supplied explicitly through the `features` argument.
visualizer = Rank2D(features=features)
visualizer.fit_transform_show(X.values, y.values);
In [27]:
# Rank2D on raw numpy arrays with no `features` argument, so no feature names
# are supplied.
visualizer = Rank2D()
visualizer.fit_transform_show(X.values, y.values);
In [28]:
# Rank2D on raw numpy arrays with the tick labels disabled via
# show_feature_names=False.
visualizer = Rank2D(show_feature_names=False)
visualizer.fit_transform_show(X.values, y.values);
In [29]:
# Quick-method form (rank2d function instead of the Rank2D class) on the
# DataFrame X: feature names are taken from the column names. No y is passed.
rank2d(X);
In [30]:
# Quick-method form on the raw numpy array X.values, with the feature names
# supplied explicitly through the `features` argument.
rank2d(X.values, features=features);
In [31]:
# Quick-method form on the raw numpy array X.values with no `features`
# argument, so no feature names are supplied.
rank2d(X.values);
In [32]:
# Quick-method form with the tick labels disabled via show_feature_names=False.
# NOTE(review): this cell passes the DataFrame X, whereas the other
# "disable tick labels" cells pass X.values — confirm this is intentional.
rank2d(X, show_feature_names=False);