notebook.community

Edit and run

User testing with the Wine Quality data set from the UCI repository Data consist of wine quality catogorization (on a scale of 1-12) with 10 features. This is the white wine data set.



In [2]:

    
import os
import sys 

# Modify the path 
sys.path.append("..")

import pandas as pd
import yellowbrick as yb 
import matplotlib.pyplot as plt
import numpy as np



In [3]:

    
wine = pd.read_csv('Dropbox/DataScience/YellowBrick/whitewine.csv')



In [4]:

    
# Feature Analysis Imports 
# NOTE that all these are available for import from the `yellowbrick.features` module 
from yellowbrick.features.rankd import Rank2D 
from yellowbrick.features.radviz import RadViz 
from yellowbrick.features.pcoords import ParallelCoordinates









    



/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [59]:

    
wine.head()









    Out[59]:






  
    
      
      fixed acidity
      volatile acidity
      citric acid
      residual sugar
      chlorides
      free sulfur dioxide
      total sulfur dioxide
      density
      pH
      sulphates
      alcohol
      quality
      qualitybin
    
  
  
    
      0
      7.0
      0.27
      0.36
      20.7
      0.045
      45.0
      170.0
      1.0010
      3.00
      0.45
      8.8
      6
      low
    
    
      1
      6.3
      0.30
      0.34
      1.6
      0.049
      14.0
      132.0
      0.9940
      3.30
      0.49
      9.5
      6
      low
    
    
      2
      8.1
      0.28
      0.40
      6.9
      0.050
      30.0
      97.0
      0.9951
      3.26
      0.44
      10.1
      6
      low
    
    
      3
      7.2
      0.23
      0.32
      8.5
      0.058
      47.0
      186.0
      0.9956
      3.19
      0.40
      9.9
      6
      low
    
    
      4
      7.2
      0.23
      0.32
      8.5
      0.058
      47.0
      186.0
      0.9956
      3.19
      0.40
      9.9
      6
      low



In [6]:

    
wine.quality.describe()









    Out[6]:





count    4898.000000
mean        5.877909
std         0.885639
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64



In [68]:

    
#add a new column to create a binary class for wine quality 
winemed = wine.quality.median()
wine['qualitybin'] = wine['quality'].apply(lambda x: str('high') if x > 6 else str('low'))



In [70]:

    
wine.qualitybin.describe()









    Out[70]:





count     4898
unique       2
top        low
freq      3838
Name: qualitybin, dtype: object



In [65]:

    
# Load the classification data set
data = wine

# Specify the features of interest
features = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide',
        'density', 'pH', 'sulphates', 'alcohol'
    ]

# Extract the numpy arrays from the data frame
X = data[features].as_matrix()
y = data['quality'].as_matrix()



In [66]:

    
y









    Out[66]:





array([6, 6, 6, ..., 6, 7, 6])

Rank 2D



In [48]:

    
# Instantiate the visualizer with the Covariance ranking algorithm
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.show()    # Draw/show/show the data



In [49]:

    
# Instantiate the visualizer with the Pearson ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.show()                   # Draw/show/show the data

RadViz



In [73]:

    
# Load the classification data set
data = wine

# Specify the features of interest and the classes of the target
features = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide',
        'density', 'pH', 'sulphates', 'alcohol'
    ]
classes = ['low','high']

# Extract the numpy arrays from the data frame
X = data[features].as_matrix()
y = data.qualitybin.as_matrix()



In [74]:

    
y









    Out[74]:





array(['low', 'low', 'low', ..., 'low', 'high', 'low'], dtype=object)



In [75]:

    
# Instantiate the visualizer
visualizer = RadViz(classes=classes, features=features)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Draw/show/show the data









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-75-2a8bc7586859> in <module>()
      2 visualizer = RadViz(classes=classes, features=features)
      3 
----> 4 visualizer.fit(X, y)      # Fit the data to the visualizer
      5 visualizer.transform(X)   # Transform the data
      6 visualizer.show()         # Draw/show/show the data

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/features/base.py in fit(self, X, y, **kwargs)
    182 
    183         # Draw the instances
--> 184         self.draw(X, y, **kwargs)
    185 
    186         # Fit always returns self.

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/features/radviz.py in draw(self, X, y, **kwargs)
    181             row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
    182             xy   = (s * row_).sum(axis=0) / row.sum()
--> 183             kls = self.classes_[y[i]]
    184 
    185             to_plot[kls][0].append(xy[0])

TypeError: list indices must be integers or slices, not str

Parallel Coordinates



In [52]:

    
# Instantiate the visualizer
visualizer = ParallelCoordinates(classes=classes, features=features)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Draw/show/show the data









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-52-4fa6ca6340e5> in <module>()
      2 visualizer = ParallelCoordinates(classes=classes, features=features)
      3 
----> 4 visualizer.fit(X, y)      # Fit the data to the visualizer
      5 visualizer.transform(X)   # Transform the data
      6 visualizer.show()         # Draw/show/show the data

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/features/base.py in fit(self, X, y, **kwargs)
    182 
    183         # Draw the instances
--> 184         self.draw(X, y, **kwargs)
    185 
    186         # Fit always returns self.

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/features/pcoords.py in draw(self, X, y, **kwargs)
    175             # TODO: How to map classmap to labels?
    176             label = y[idx] # Get the label for the row
--> 177             label = self.classes_[label]
    178 
    179             if label not in used_legends:

TypeError: list indices must be integers or slices, not str

Regressor Evaluation



In [22]:

    
# Regression Evaluation Imports

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split

from yellowbrick.regressor import PredictionError, ResidualsPlot



In [28]:

    
# Load the data
df = wine
feature_names = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide',
        'density', 'pH', 'sulphates', 'alcohol'
    ]

target_name = 'quality'

# Get the X and y data from the DataFrame
X = df[feature_names].as_matrix()
y = df[target_name].as_matrix()

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [29]:

    
# Instantiate the linear model and visualizer
ridge = Ridge()
visualizer = ResidualsPlot(ridge)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.show()             # Draw/show/show the data

Prediction Error Plot



In [49]:

    
# Load the data
df = wine
feature_names = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide',
        'density', 'pH', 'sulphates', 'alcohol'
    ]

target_name = 'quality'

# Get the X and y data from the DataFrame
X = df[feature_names].as_matrix()
y = df[target_name].as_matrix()

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [50]:

    
# Instantiate the linear model and visualizer
lasso = Lasso()
visualizer = PredictionError(lasso)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.show()             # Draw/show/show the data

Classifier Evaluation



In [79]:

    
# Classifier Evaluation Imports

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance



In [80]:

    
# Load the classification data set
data = wine

# Specify the features of interest and the classes of the target
features = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide',
        'density', 'pH', 'sulphates', 'alcohol'
    ]

classes = data['qualitybin']

# Extract the numpy arrays from the data frame
X = data[features].as_matrix()
y = data['qualitybin'].as_matrix()

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [81]:

    
# Instantiate the classification model and visualizer
bayes = GaussianNB()
visualizer = ClassificationReport(bayes, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.show()             # Draw/show/show the data









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-81-4c8e0ef2dbcf> in <module>()
      4 
      5 visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
----> 6 visualizer.score(X_test, y_test)  # Evaluate the model on the test data
      7 g = visualizer.show()             # Draw/show/show the data

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/classifier.py in score(self, X, y, **kwargs)
    133         self.scores = map(lambda s: dict(zip(self.classes_, s)), self.scores[0:3])
    134         self.scores = dict(zip(keys, self.scores))
--> 135         return self.draw(y, y_pred)
    136 
    137     def draw(self, y, y_pred):

/Users/tuulimorrill/anaconda3/lib/python3.6/site-packages/yellowbrick/classifier.py in draw(self, y, y_pred)
    154         self.matrix = []
    155         for cls in self.classes_:
--> 156             self.matrix.append([self.scores['precision'][cls],self.scores['recall'][cls],self.scores['f1'][cls]])
    157 
    158         for column in range(len(self.matrix)+1):

KeyError: 'high'



In [ ]:

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality	qualitybin
0	7.0	0.27	0.36	20.7	0.045	45.0	170.0	1.0010	3.00	0.45	8.8	6	low
1	6.3	0.30	0.34	1.6	0.049	14.0	132.0	0.9940	3.30	0.49	9.5	6	low
2	8.1	0.28	0.40	6.9	0.050	30.0	97.0	0.9951	3.26	0.44	10.1	6	low
3	7.2	0.23	0.32	8.5	0.058	47.0	186.0	0.9956	3.19	0.40	9.9	6	low
4	7.2	0.23	0.32	8.5	0.058	47.0	186.0	0.9956	3.19	0.40	9.9	6	low