User testing with the Wine Quality data set from the UCI repository. The data consist of wine quality categorization (on a scale of 1-12) with 10 features. This is the white wine data set.
In [2]:
import os
import sys
# Modify the path
sys.path.append("..")
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt
import numpy as np
In [3]:
# Read the white wine CSV into a DataFrame (path is relative to the working directory)
wine_csv_path = 'Dropbox/DataScience/YellowBrick/whitewine.csv'
wine = pd.read_csv(wine_csv_path)
In [4]:
# Feature Analysis Imports
# NOTE that all these are available for import from the `yellowbrick.features` module
from yellowbrick.features.rankd import Rank2D
from yellowbrick.features.radviz import RadViz
from yellowbrick.features.pcoords import ParallelCoordinates
In [59]:
# Preview the first five rows of the data frame
wine.head(n=5)
Out[59]:
In [6]:
# Summary statistics for the 'quality' column
wine['quality'].describe()
Out[6]:
In [68]:
# Add a new column holding a binary class for wine quality:
# wines rated strictly above the median quality are 'high', all others 'low'.
winemed = wine.quality.median()
# Use the computed median as the threshold (instead of a hard-coded magic number)
# and a vectorized comparison rather than a row-by-row apply.
wine['qualitybin'] = np.where(wine['quality'] > winemed, 'high', 'low')
In [70]:
# Summary (count / unique / top / freq) of the binary quality column
wine['qualitybin'].describe()
Out[70]:
In [65]:
# Load the data set used for the feature-ranking visualizers
data = wine
# Specify the features of interest
features = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
]
# Extract the numpy arrays from the data frame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
X = data[features].to_numpy()
y = data['quality'].to_numpy()
In [66]:
# Display the target array y
y
Out[66]:
Rank 2D
In [48]:
# Rank2D: pairwise feature ranking using the covariance algorithm
rank2d_options = dict(features=features, algorithm='covariance')
visualizer = Rank2D(**rank2d_options)
# Fit the visualizer on the data
visualizer.fit(X, y)
# Run the transform step on the same feature matrix
visualizer.transform(X)
# Render the figure
visualizer.show()
In [49]:
# Rank2D: pairwise feature ranking using the Pearson algorithm
rank2d_options = dict(features=features, algorithm='pearson')
visualizer = Rank2D(**rank2d_options)
# Fit the visualizer on the data
visualizer.fit(X, y)
# Run the transform step on the same feature matrix
visualizer.transform(X)
# Render the figure
visualizer.show()
RadViz
In [73]:
# Load the classification data set
data = wine
# Specify the features of interest and the classes of the target
features = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
]
# Class names are listed in sorted label order ('high' sorts before 'low'):
# yellowbrick pairs the names with the sorted unique values of y, so any other
# order would attach the wrong name to each class in the legend.
classes = ['high', 'low']
# Extract the numpy arrays from the data frame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
X = data[features].to_numpy()
y = data['qualitybin'].to_numpy()
In [74]:
# Display the target array y
y
Out[74]:
In [75]:
# RadViz: radial projection of the features, labelled by class
radviz_options = dict(classes=classes, features=features)
visualizer = RadViz(**radviz_options)
# Fit the visualizer on the data
visualizer.fit(X, y)
# Run the transform step on the same feature matrix
visualizer.transform(X)
# Render the figure
visualizer.show()
Parallel Coordinates
In [52]:
# ParallelCoordinates: one line per instance across the feature axes, labelled by class
pcoords_options = dict(classes=classes, features=features)
visualizer = ParallelCoordinates(**pcoords_options)
# Fit the visualizer on the data
visualizer.fit(X, y)
# Run the transform step on the same feature matrix
visualizer.transform(X)
# Render the figure
visualizer.show()
Regressor Evaluation
In [22]:
# Regression Evaluation Imports
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from yellowbrick.regressor import PredictionError, ResidualsPlot
In [28]:
# Load the data
df = wine
feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
]
target_name = 'quality'
# Get the X and y data from the DataFrame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
X = df[feature_names].to_numpy()
y = df[target_name].to_numpy()
# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the plots below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [29]:
# Residuals plot for a Ridge regression model
ridge = Ridge()
visualizer = ResidualsPlot(ridge)
# Fit the wrapped model on the training split
visualizer.fit(X_train, y_train)
# Score the model on the held-out test split
visualizer.score(X_test, y_test)
# Render the figure, keeping the return value of show()
g = visualizer.show()
Prediction Error Plot
In [49]:
# Load the data
df = wine
feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
]
target_name = 'quality'
# Get the X and y data from the DataFrame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
X = df[feature_names].to_numpy()
y = df[target_name].to_numpy()
# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the plots below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [50]:
# Prediction error plot for a Lasso regression model
lasso = Lasso()
visualizer = PredictionError(lasso)
# Fit the wrapped model on the training split
visualizer.fit(X_train, y_train)
# Score the model on the held-out test split
visualizer.score(X_test, y_test)
# Render the figure, keeping the return value of show()
g = visualizer.show()
Classifier Evaluation
In [79]:
# Classifier Evaluation Imports
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance
In [80]:
# Load the classification data set
data = wine
# Specify the features of interest and the classes of the target
features = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
]
# `classes` must be the list of distinct class names, not the per-row label column.
# Names are given in sorted label order ('high' sorts before 'low'), which is the
# order in which the fitted classifier reports its classes.
classes = ['high', 'low']
# Extract the numpy arrays from the data frame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
X = data[features].to_numpy()
y = data['qualitybin'].to_numpy()
# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the report below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [81]:
# Classification report for a Gaussian naive Bayes model
bayes = GaussianNB()
visualizer = ClassificationReport(bayes, classes=classes)
# Fit the wrapped model on the training split
visualizer.fit(X_train, y_train)
# Score the model on the held-out test split
visualizer.score(X_test, y_test)
# Render the figure, keeping the return value of show()
g = visualizer.show()
In [ ]: