In [1]:

    
%matplotlib inline

Predicting Abalone Snail Sex Using Physical Characteristics

The data comes from the Machine Learning Repository of the University of California, Irvine (UCI): https://archive.ics.uci.edu/ml/datasets/Abalone



In [3]:

    
import os
import sys 

# Modify the path 
sys.path.append("..")

import numpy as np
import pandas as pd
import yellowbrick as yb 
import matplotlib.pyplot as plt



In [4]:

    
# Switch to the directory that the relative data path ("./life.csv") is
# resolved against. The location can be overridden by setting the
# YELLOWBRICK_HOME environment variable; when it is unset, the original
# hardcoded checkout path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get("YELLOWBRICK_HOME", "/Users/lisacombs/Documents/yellowbrick/"))



In [5]:

    
## Read the CSV from the current working directory into a DataFrame

data = pd.read_csv(filepath_or_buffer="./life.csv")
data.head(n=5)  # preview the first five rows (5 is also head()'s default)



In [6]:

    
# Use only M/F, no infants, and make the variable numeric.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the loaded frame (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['sex'].isin(['M', 'F'])].copy()

# Encode the target: 'M' -> 0, anything else (only 'F' remains here) -> 1.
data['sex'] = np.where(data['sex'] == 'M', 0, 1)



In [7]:

    
# Feature Analysis Imports 
# NOTE that all these are available for import from the `yellowbrick.features` module 
from yellowbrick.features.rankd import Rank2D 
from yellowbrick.features.radviz import RadViz 
from yellowbrick.features.pcoords import ParallelCoordinates









    



//anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [8]:

    
data.columns.tolist()  # column labels; every name except 'sex' carries a leading space









    Out[8]:





['sex',
 ' length',
 ' diameter',
 ' height',
 ' w_weight',
 ' s_weight',
 ' v_weight',
 ' sh_weight',
 ' rings']



In [9]:

    
# Specify the features of interest.
# The labels keep the leading space reported by list(data) for these columns.
features = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
    ' rings',
]

# Extract the numpy arrays from the data frame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[features].values
y = data['sex'].values



In [10]:

    
# Instantiate the Rank2D visualizer with the covariance ranking algorithm,
# labelling the columns of X with the names in `features`.
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.show()    # Draw/show the figure



In [11]:

    
# Instantiate the Rank2D visualizer with the Pearson ranking algorithm,
# labelling the columns of X with the names in `features`.
visualizer = Rank2D(features=features, algorithm='pearson')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.show()    # Draw/show the figure



In [12]:

    
# Specify the features of interest and the classes of the target.
# The labels keep the leading space reported by list(data) for these columns.
features = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
    ' rings',
]

# Listed in the order of the numeric encoding of 'sex' (M -> 0, F -> 1).
classes = ['M', 'F']

# Extract the numpy arrays from the data frame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[features].values
y = data['sex'].values



In [13]:

    
# Instantiate the RadViz visualizer (the redundant chained
# `visualizer = visualizer = ...` assignment has been collapsed to one).
visualizer = RadViz(classes=classes, features=features)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Draw/show the figure



In [14]:

    
# Instantiate the ParallelCoordinates visualizer (the redundant chained
# `visualizer = visualizer = ...` assignment has been collapsed to one).
visualizer = ParallelCoordinates(classes=classes, features=features)

visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.show()         # Draw/show the figure



In [15]:

    
# Regression Evaluation Imports 

from sklearn.linear_model import Ridge, Lasso 
from sklearn.model_selection import train_test_split

from yellowbrick.regressor import PredictionError, ResidualsPlot



In [16]:

    
# Regression set-up: the 'sex' column is not used here.

# Predictors. The target column (' sh_weight') is deliberately NOT listed:
# including it would hand the model the answer as an input (target leakage)
# and make the residuals plot meaningless.
feature_names = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' rings',
]

target_name = ' sh_weight'

# Get the X and y data from the DataFrame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[feature_names].values
y = data[target_name].values

# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the plots that follow,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [17]:

    
# Instantiate the linear model and wrap it in a ResidualsPlot visualizer
ridge = Ridge()
visualizer = ResidualsPlot(ridge)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data 
g = visualizer.show()             # Draw/show the figure; the return value is kept in g



In [18]:

    
# Regression set-up with ' rings' as the target; every other measurement
# column is used as a predictor.
feature_names = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
]

target_name = ' rings'

# Get the X and y data from the DataFrame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[feature_names].values
y = data[target_name].values

# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the plots that follow,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [19]:

    
# Instantiate the linear model and wrap it in a PredictionError visualizer
lasso = Lasso()
visualizer = PredictionError(lasso)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data 
g = visualizer.show()             # Draw/show the figure; the return value is kept in g









    



//anaconda/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)



In [20]:

    
# Classifier Evaluation Imports 

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance



In [21]:

    
# Specify the features of interest and the classes of the target.
# The labels keep the leading space reported by list(data) for these columns.
features = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
    ' rings',
]

# Listed in the order of the numeric encoding of 'sex' (M -> 0, F -> 1).
classes = ['M', 'F']

# Extract the numpy arrays from the data frame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[features].values
y = data['sex'].values

# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the report that follows,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [22]:

    
# Instantiate the classification model and wrap it in a ClassificationReport
# visualizer, labelling the target values with the names in `classes`.
bayes = GaussianNB()
visualizer = ClassificationReport(bayes, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data 
g = visualizer.show()             # Draw/show the figure; the return value is kept in g



In [23]:

    
# Specify the features of interest and the classes of the target.
# The labels keep the leading space reported by list(data) for these columns.
features = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
    ' rings',
]

# Listed in the order of the numeric encoding of 'sex' (M -> 0, F -> 1).
classes = ['M', 'F']

# Extract the numpy arrays from the data frame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[features].values
y = data['sex'].values

# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the curve that follows,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [24]:

    
# Instantiate the classification model and wrap it in a ROCAUC visualizer.
# Unlike the other classifier cells, no `classes` labels are passed here.
logistic = LogisticRegression()
visualizer = ROCAUC(logistic)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data 
g = visualizer.show()             # Draw/show the figure; the return value is kept in g



In [25]:

    
# Specify the features of interest and the classes of the target.
# The labels keep the leading space reported by list(data) for these columns.
features = [
    ' length',
    ' diameter',
    ' height',
    ' w_weight',
    ' s_weight',
    ' v_weight',
    ' sh_weight',
    ' rings',
]

# Listed in the order of the numeric encoding of 'sex' (M -> 0, F -> 1).
classes = ['M', 'F']

# Extract the numpy arrays from the data frame.
# .values is used instead of as_matrix(), which is deprecated and has been
# removed from current pandas releases.
X = data[features].values
y = data['sex'].values

# Create the train and test data (80/20 split).
# A fixed random_state makes the split, and therefore the plot that follows,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [26]:

    
# Instantiate the classification model and wrap it in a ClassBalance
# visualizer, labelling the target values with the names in `classes`.
# NOTE(review): this passes an estimator to ClassBalance and imports it from
# yellowbrick.classifier; newer Yellowbrick releases appear to expose
# ClassBalance from yellowbrick.target without a model argument - confirm
# against the installed version.
forest = RandomForestClassifier()
visualizer = ClassBalance(forest, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data 
g = visualizer.show()             # Draw/show the figure; the return value is kept in g

	sex	length	diameter	height	w_weight	s_weight	v_weight	sh_weight	rings
0	M	0.455	0.365	0.095	0.5140	0.2245	0.1010	0.150	15
1	M	0.350	0.265	0.090	0.2255	0.0995	0.0485	0.070	7
2	F	0.530	0.420	0.135	0.6770	0.2565	0.1415	0.210	9
3	M	0.440	0.365	0.125	0.5160	0.2155	0.1140	0.155	10
4	I	0.330	0.255	0.080	0.2050	0.0895	0.0395	0.055	7