In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("./../..")
In [66]:
%reload_ext yellowbrick
%matplotlib inline
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import OneHotEncoder
In [34]:
# Read the attribute listing that accompanies the horse-colic data; its
# "Attribute" column supplies the column names assigned to the data frame
# in the next cell.
headers = pd.read_csv("./horse-colic.attrs")
headers["Attribute"].values
Out[34]:
In [69]:
# UCI horse-colic data file: whitespace-delimited text in which "?" marks a
# missing value.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data'

# Retrieve Data Set
# - header=None: the file has no header line, so the first record must be kept
#   as data (without it, read_csv consumes that record as column names and the
#   assignment to df.columns below silently discards it).
# - sep=r"\s+": split on runs of whitespace (replaces the deprecated
#   delim_whitespace=True).
# - na_values="?": convert the missing-value marker to NaN while parsing
#   instead of patching the frame in place afterwards.
df = pd.read_csv(url, sep=r"\s+", header=None, na_values="?")

# Assigning the names after loading raises if the attribute list and the
# number of parsed columns disagree.
df.columns = headers.Attribute.values
df.shape
Out[69]:
In [111]:
# Extract the underlying numpy array. DataFrame.as_matrix() was deprecated and
# later removed from pandas; .values returns the same array.
matrix = df.values
In [112]:
# Inspect the array extracted from df in the previous cell.
matrix
Out[112]:
In [113]:
# a = np.asarray([[320, True], [400, False], [350, True], [360, True], [340, True], [340, True], [425, False], [380, False], [365, True]])
# print (a)
In [114]:
# matrix[matrix==np.nan] = -999999
In [125]:
# Cast every entry to float so that np.isnan can be applied to the array in
# the cells below.
nan_matrix = matrix.astype(float)
In [134]:
# Number of NaN entries in each column of nan_matrix: build the boolean NaN
# mask once and sum it down the rows.
nan_col_counts = np.isnan(nan_matrix).sum(axis=0).tolist()
nan_col_counts
Out[134]:
In [148]:
ind = np.arange(len(nan_col_counts))  # the x locations for the bars
width = 0.5  # the width of the bars

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)

# Centre each bar on its tick position so the column label sits directly
# underneath the bar it describes (the bars were previously shifted left by
# width/2 relative to the ticks).
ax.bar(ind, nan_col_counts, width, align='center', color='black')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Count')
ax.set_title('Missing Values by Column')
ax.set_xticks(ind)
ax.set_xticklabels(headers.Attribute.values, rotation='vertical')

# No legend: there is a single unlabeled series, so ax.legend() had no
# handles to show.
plt.show()
In [195]:
# yellowbrick.features.missing
# Missing values visualizers
#
# Author: Nathan Danielsen <nathan.danielsen@gmail.com>
# Created: Fri Mar 24 8:17:36 2018 -0500
#
# Copyright (C) 2018 District Data Labs
# For license information, see LICENSE.txt
#
# ID: missing.py [] nathan.danielsen@gmail.com $
"""
Implementation of missing values visualizers
To Include:
- Bar
- Density Matrix (by time, specifiable index)
- Heatmap
"""
##########################################################################
## Imports
##########################################################################
import numpy as np
import matplotlib.pyplot as plt
from yellowbrick.utils import is_dataframe
from yellowbrick.utils import is_structured_array
from yellowbrick.features.base import DataVisualizer
# from yellowbrick.style.colors import resolve_colors
##########################################################################
## Feature Visualizer
##########################################################################
class MissingValuesBarVisualizer(DataVisualizer):
    """
    Bar chart of the number of missing (NaN) values in each feature column.

    Parameters
    ----------
    ax : default: None
        Axes to draw on; forwarded to ``DataVisualizer.__init__``.

    x, y : default: None
        Accepted for signature compatibility; not used by this visualizer.

    features : list-like, default: None
        Labels of the columns, used to select columns from a data frame or
        structured array and to label the x-axis ticks. When None, ``fit``
        uses the data frame's column labels, or positional column indices
        for plain arrays.

    classes, color, colormap : default: None
        Forwarded unchanged to ``DataVisualizer.__init__``.

    kwargs : dict
        Additional keyword arguments forwarded to ``DataVisualizer.__init__``.
    """

    def __init__(self,
                 ax=None,
                 x=None,
                 y=None,
                 features=None,
                 classes=None,
                 color=None,
                 colormap=None,
                 **kwargs):
        """
        Forward the shared visualizer arguments to ``DataVisualizer``.
        """
        super(MissingValuesBarVisualizer, self).__init__(ax, features, classes, color,
                                                         colormap, **kwargs)

    def fit(self, X, y=None, **kwargs):
        """
        Count the NaN entries in every column of ``X`` and draw the bar chart.

        TODO if y, then color code the missing values in the chart?

        Parameters
        ----------
        X : ndarray, structured array or DataFrame with n_rows x n_columns
            Data to inspect. Every value must be convertible to float.

        y : array-like, default: None
            Only used to derive ``classes_`` when no classes were supplied.

        kwargs : dict
            Passed through to ``draw``.

        Returns
        -------
        self : MissingValuesBarVisualizer
        """
        if is_dataframe(X):
            if self.features_ is None:
                # No explicit features: label the bars with the frame's columns.
                self.features_ = list(X.columns)
            else:
                X = X[self.features_]
            # Always reduce a data frame to a plain array; iterating over the
            # transpose of a frame would yield column labels, not columns.
            X = X.values
        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X = X_selected.copy().view((np.float64, len(X_selected.dtype.names)))

        nan_matrix = np.asarray(X).astype(float)

        # Derive the dimensions from the data passed in, never from notebook
        # globals; there is one bar (and one default label) per column.
        ncols = nan_matrix.shape[1]

        if self.features_ is None:
            self.features_ = list(range(ncols))

        if self.classes_ is None:
            # TODO: Is this the most efficient method?
            self.classes_ = [str(label) for label in np.unique(y)]

        # Sum the boolean NaN mask down the rows: one count per column.
        self.nan_col_counts = np.isnan(nan_matrix).sum(axis=0).tolist()

        # Draw the instances
        self.draw(X, y, **kwargs)

        # Fit always returns self.
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method draws one black bar per
        column whose height is that column's missing value count.

        ``X`` and ``y`` are accepted for interface compatibility; the bar
        heights come from ``self.nan_col_counts`` computed in ``fit``.
        """
        # Bar positions are computed locally so the method does not depend on
        # a module-level ``ind`` variable.
        ind = np.arange(len(self.nan_col_counts))
        width = 0.5  # the width of the bars

        # Centre the bars on the positions that finalize() uses for the ticks.
        self.ax.bar(ind, self.nan_col_counts, width, align='center',
                    color='black')
        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.
        """
        # Set the title
        self.set_title(
            'Missing Values by Column'
        )
        ind = np.arange(len(self.features_))  # the x locations for the groups

        # Label the count axis and place one tick per feature
        self.ax.set_ylabel('Count')
        self.ax.set_xticks(ind)
        self.ax.set_xticklabels(self.features_, rotation='vertical')

        # No legend is added: the chart has a single unlabeled series, so a
        # legend call would have no handles to display.
In [196]:
# Build the bar visualizer with the attribute names as tick labels, then fit
# it on the raw array and render the figure (fit returns the visualizer, so
# the calls can be chained).
viz = MissingValuesBarVisualizer(features=headers["Attribute"].values)
viz.fit(matrix).poof()
In [210]:
# Re-derive the float copy of the data and look at its first row. (The bare
# `matrix` expression that used to open this cell had no effect, since only
# the last expression of a cell is displayed.)
nan_matrix = matrix.astype(float)
nan_matrix[0]
Out[210]:
In [266]:
In [ ]:
In [271]:
# Unpack the two dimensions of the data array: row count, then column count.
nrows, ncols = matrix.shape
In [290]:
# yellowbrick.features.missing
# Missing values visualizers
#
# Author: Nathan Danielsen <nathan.danielsen@gmail.com>
# Created: Fri Mar 24 8:17:36 2018 -0500
#
# Copyright (C) 2018 District Data Labs
# For license information, see LICENSE.txt
#
# ID: missing.py [] nathan.danielsen@gmail.com $
"""
Implementation of missing values visualizers
To Include:
- Bar
- Density Matrix (by time, specifiable index)
- Heatmap
"""
##########################################################################
## Imports
##########################################################################
import numpy as np
import matplotlib.pyplot as plt
from yellowbrick.utils import is_dataframe
from yellowbrick.utils import is_structured_array
from yellowbrick.features.base import DataVisualizer
# from yellowbrick.style.colors import resolve_colors
##########################################################################
## Feature Visualizer
##########################################################################
class MissingValueDensity(DataVisualizer):
    """
    Dispersion plot of missing (NaN) values: one marker per missing cell,
    positioned by row index on the x-axis and by feature on the y-axis.

    Parameters
    ----------
    ax : default: None
        Axes to draw on; forwarded to ``DataVisualizer.__init__``.

    x, y : default: None
        Accepted for signature compatibility; not used by this visualizer.

    features : list-like, default: None
        Labels of the columns, used to select columns from a data frame or
        structured array and to label the y-axis ticks. When None, ``fit``
        uses the data frame's column labels, or positional column indices
        for plain arrays.

    classes, color, colormap : default: None
        Forwarded unchanged to ``DataVisualizer.__init__``.

    kwargs : dict
        Additional keyword arguments forwarded to ``DataVisualizer.__init__``.
    """

    def __init__(self,
                 ax=None,
                 x=None,
                 y=None,
                 features=None,
                 classes=None,
                 color=None,
                 colormap=None,
                 **kwargs):
        """
        Forward the shared visualizer arguments to ``DataVisualizer``.
        """
        super(MissingValueDensity, self).__init__(ax, features, classes, color,
                                                  colormap, **kwargs)

    def fit(self, X, y=None, **kwargs):
        """
        Locate every NaN entry of ``X`` and draw the dispersion plot.

        TODO if y, then color code the missing values in the chart?

        Parameters
        ----------
        X : ndarray, structured array or DataFrame with n_rows x n_columns
            Data to inspect. Every value must be convertible to float.

        y : array-like, default: None
            Only used to derive ``classes_`` when no classes were supplied.

        kwargs : dict
            Passed through to ``draw``.

        Returns
        -------
        self : MissingValueDensity
        """
        if is_dataframe(X):
            if self.features_ is None:
                # No explicit features: label the rows with the frame's columns.
                self.features_ = list(X.columns)
            else:
                X = X[self.features_]
            # Always reduce a data frame to a plain array; iterating over the
            # transpose of a frame would yield column labels, not columns.
            X = X.values
        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X = X_selected.copy().view((np.float64, len(X_selected.dtype.names)))

        nan_matrix = np.asarray(X).astype(float)

        # Derive the dimensions from the data passed in, never from notebook
        # globals; there is one default label per column.
        ncols = nan_matrix.shape[1]

        if self.features_ is None:
            self.features_ = list(range(ncols))

        if self.classes_ is None:
            # TODO: Is this the most efficient method?
            self.classes_ = [str(label) for label in np.unique(y)]

        nan_mask = np.isnan(nan_matrix)
        # Per-column NaN totals, kept as an attribute alongside the locations.
        self.nan_col_counts = nan_mask.sum(axis=0).tolist()
        # One (row index, column index) pair per missing cell.
        self.nan_locs = np.argwhere(nan_mask)

        # Draw the instances
        self.draw(X, y, **kwargs)

        # Fit always returns self.
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method draws one vertical-bar marker
        per missing cell, at x = row index and y = column index.

        ``X`` and ``y`` are accepted for interface compatibility; the marker
        positions come from ``self.nan_locs`` computed in ``fit``.
        """
        # Slice the location pairs by column instead of unpacking a zip, so
        # that data without any missing value yields an empty plot rather
        # than an unpacking error, and the ``y`` argument is not rebound.
        nan_locs = np.asarray(self.nan_locs).reshape(-1, 2)
        row_idx = nan_locs[:, 0]
        col_idx = nan_locs[:, 1]

        self.ax.scatter(row_idx, col_idx, alpha=0.5, marker="|")
        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.
        """
        # Set the title
        self.set_title(
            'Dispersion of Missing Values by Feature'
        )
        ind = np.arange(len(self.features_))  # the y locations for the features

        # The x-axis carries the row position of each missing value (it is
        # not a count); the y-axis has one tick per feature.
        self.ax.set_xlabel('Position by index')
        self.ax.set_yticks(ind)
        self.ax.set_yticklabels(self.features_)

        # No legend is added: the scatter series is unlabeled, so a legend
        # call would have no handles to display.
In [291]:
# Set the default figure size to 5 x 10 (tall and narrow) before building the
# plot. This goes through the already-imported pyplot module rather than
# pulling in pylab, and it changes the global default for later figures too.
plt.rcParams['figure.figsize'] = (5, 10)

viz = MissingValueDensity(features=headers.Attribute.values)
viz.fit(matrix)
viz.poof()
In [ ]: