In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
# Dataset: https://archive.ics.uci.edu/ml/datasets/Iris/
# IRIS Dataset Size: 150 samples
# Train: 70% Eval: 30%
In [3]:
data_path = r'..\Data\ClassExamples\Iris'
In [4]:
df = pd.read_csv(os.path.join(data_path, 'iris.data.csv'))
In [5]:
df.head()
Out[5]:
In [6]:
df.tail()
Out[6]:
In [7]:
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
In [8]:
l[:5]
Out[8]:
In [9]:
df = df.iloc[l]
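The 70/30 train/eval split mentioned in the earlier comment is not performed in this notebook; both CSVs written below contain every shuffled row. A minimal sketch of how the shuffled frame could be sliced 70/30 (the names df_train_70 and df_eval_30 are hypothetical, for illustration only):
split_point = int(0.7 * len(df))      # 70% of the 150 rows = 105
df_train_70 = df.iloc[:split_point]   # first 70% of the shuffled rows for training
df_eval_30 = df.iloc[split_point:]    # remaining 30% for evaluation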
In [10]:
df.head()
Out[10]:
In [11]:
df.to_csv(os.path.join(data_path, 'iris_data_train.csv'),
index = True,
index_label = 'Row',
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])
In [12]:
df.to_csv(os.path.join(data_path,'iris_data_classifier_test.csv'),
index = True,
index_label = 'Row',
columns = ['sepal_length','sepal_width','petal_length','petal_width'])
In [13]:
df['class'].value_counts()
Out[13]:
In [14]:
setosa = df['class'] == 'Iris-setosa'
versicolor = df['class'] == 'Iris-versicolor'
virginica = df['class'] == 'Iris-virginica'
In [15]:
setosa.head()
Out[15]:
In [16]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(df[setosa].sepal_length,
y = df[setosa].sepal_width,
label = 'setosa',
color = 'g')
plt.scatter(df[versicolor].sepal_length,
y = df[versicolor].sepal_width,
label = 'versicolor',
color = 'r')
plt.scatter(df[virginica].sepal_length,
y = df[virginica].sepal_width,
label = 'virginica',
color = 'b')
plt.xlabel('length')
plt.ylabel('width')
plt.title('sepal')
plt.grid(True)
plt.legend()
Out[16]:
In [17]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(df[setosa].petal_length,
y = df[setosa].petal_width,
label = 'setosa',
color = 'g')
plt.scatter(df[versicolor].petal_length,
y = df[versicolor].petal_width,
label = 'versicolor',
color = 'r')
plt.scatter(df[virginica].petal_length,
y = df[virginica].petal_width,
label = 'virginica',
color = 'b')
plt.xlabel('length')
plt.ylabel('width')
plt.title('petal')
plt.grid(True)
plt.legend()
Out[17]:
In [18]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[setosa].petal_length,
df[versicolor].petal_length,
df[virginica].petal_length],
bins = 10,
label = ['setosa',
'versicolor',
'virginica'])
plt.title('petal length')
plt.xlabel('petal length')
plt.ylabel('count')
plt.legend()
Out[18]:
In [19]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[setosa].petal_width,
df[versicolor].petal_width,
df[virginica].petal_width],
bins = 10,
label = ['setosa',
'versicolor',
'virginica'])
plt.title('petal width')
plt.xlabel('petal width')
plt.ylabel('count')
plt.legend()
Out[19]:
In [20]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[setosa].sepal_length,
df[versicolor].sepal_length,
df[virginica].sepal_length],
bins = 10,
label = ['setosa',
'versicolor',
'virginica'])
plt.title('sepal length')
plt.xlabel('sepal length')
plt.ylabel('count')
plt.legend()
Out[20]:
In [21]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[setosa].sepal_width,
df[versicolor].sepal_width,
df[virginica].sepal_width],
bins = 10,
label = ['setosa',
'versicolor',
'virginica'])
plt.title('sepal width')
plt.xlabel('sepal width')
plt.ylabel('count')
plt.legend()
Out[21]:
In [22]:
df_predict_default = pd.read_csv(
os.path.join(
data_path,
'output_default',
'bp-yVKPO2ydD0u-iris_data_train.csv.gz'))
df_predict_default.index = df_predict_default.tag
In [23]:
df_predict_default.head()
Out[23]:
In [24]:
def predicted_class(row):
    if row['Iris-setosa'] >= row['Iris-versicolor'] and row['Iris-setosa'] >= row['Iris-virginica']:
        return "Iris-setosa"
    if row['Iris-versicolor'] >= row['Iris-setosa'] and row['Iris-versicolor'] >= row['Iris-virginica']:
        return "Iris-versicolor"
    return "Iris-virginica"
In [25]:
lst_predicted = []
for index, row in df_predict_default.iterrows():
    lst_predicted.append(predicted_class(row))
df_predict_default['predicted_default'] = lst_predicted
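For reference, the same highest-score selection can be written without an explicit loop; a sketch, assuming the score columns are named exactly 'Iris-setosa', 'Iris-versicolor', and 'Iris-virginica' (idxmax breaks ties in favor of the left-most column, matching predicted_class):
score_cols = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# idxmax(axis = 1) returns, for each row, the name of the column holding the largest score
df_predict_default['predicted_default'] = df_predict_default[score_cols].idxmax(axis = 1)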
In [26]:
df_predict_default.head()
Out[26]:
In [27]:
df_predict_numeric = pd.read_csv(
os.path.join(
data_path,
'output_numeric',
'bp-K58XKrCYvk4-iris_data_train.csv.gz'))
df_predict_numeric.index = df_predict_numeric.tag
In [28]:
lst_predicted = []
for index, row in df_predict_numeric.iterrows():
    lst_predicted.append(predicted_class(row))
df_predict_numeric['predicted_numeric'] = lst_predicted
In [29]:
df_predict_numeric.head()
Out[29]:
In [30]:
print('Confusion matrix - Actual versus prediction with bin recipe')
cf_bin_recipe = pd.crosstab(df['class'],
df_predict_default.predicted_default)
In [31]:
cf_bin_recipe
Out[31]:
Prediction with the default recipe is good
In [32]:
print('Confusion matrix - Actual versus prediction with numeric recipe')
cf_num_recipe = pd.crosstab(df['class'],
df_predict_numeric.predicted_numeric)
In [33]:
cf_num_recipe
Out[33]:
Versicolor: 30 examples were misclassified as Virginica with the numeric recipe
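To inspect which rows were misclassified, the predictions can be aligned with the actual labels by index; a sketch, assuming df_predict_numeric is indexed by the same row tags that index df (as the crosstab above already relies on):
# Align numeric-recipe predictions with the actual class by row index,
# then keep only the rows where they disagree
pred_numeric = df_predict_numeric['predicted_numeric'].reindex(df.index)
misclassified = df[df['class'] != pred_numeric]
misclassified.head()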
In [34]:
def print_metrics(cf_matrix):
    # Note: AWS ML computes all of these for you; this demo shows how they are calculated.
    # Total samples = sum of every cell in the matrix
    total_samples = cf_matrix.sum(axis = 1).sum()
    # Diagonal contains the correct class predictions
    accuracy = np.diag(cf_matrix).sum() / total_samples
    print('Accuracy: {0:0.3f}'.format(accuracy))
    print('\n')
    # TPR, Recall = True Positives / Actual Positives
    recall = np.diag(cf_matrix) / cf_matrix.sum(axis = 1)
    print('recall')
    print(recall)
    print('\n')
    # Precision = True Positives / Predicted Positives
    precision = np.diag(cf_matrix) / cf_matrix.sum(axis = 0)
    print('precision')
    print(precision)
    print('\n')
    f1_scores = 2 * recall * precision / (recall + precision)
    print('f1 scores')
    print(f1_scores)
    print('\n')
    print('average f1 score {0:0.3f}'.format(f1_scores.mean()))
    print('\n')
In [35]:
print_metrics(cf_bin_recipe)
In [36]:
print_metrics(cf_num_recipe)
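As a cross-check on print_metrics, the same per-class precision, recall, and F1 can be computed with scikit-learn; a minimal sketch, assuming scikit-learn is installed and the prediction index aligns with df.index:
from sklearn.metrics import classification_report
y_true = df['class']
y_pred = df_predict_default['predicted_default'].reindex(df.index)
print(classification_report(y_true, y_pred))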
In [37]:
df_predict_numeric.predicted_numeric.value_counts()
Out[37]:
In [38]:
df_predict_default.predicted_default.value_counts()
Out[38]:
F1 Score is a binary classification metric: the harmonic mean of precision and recall
F1 Score = 2 X Precision X Recall / (Precision + Recall)
A higher F1 Score reflects better predictive accuracy
Multi-Class Evaluation
Average of the class-wise F1 Scores
Baseline F1 Score = a hypothetical model that always predicts the most frequent class
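For example, if one class has precision 0.90 and recall 0.60, its F1 Score is 2 X 0.90 X 0.60 / (0.90 + 0.60) = 0.72; the multi-class score is the average of such per-class values.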
Visualization - Confusion Matrix - Available on the AWS ML Console
Matrix: rows = true class, columns = predicted class
Cell color on the diagonal indicates the correct prediction % for that class
Cell color off the diagonal indicates the incorrect prediction %
The last column is the F1 score for that class; the second-to-last column is the true class distribution
The last row is the predicted class distribution
Up to 10 classes are shown, listed from most frequent to least frequent
For more than 10 classes, the 9 most frequent classes are shown and the remaining classes are collapsed into "other"
You can download the confusion matrix through a URL on the Explore Performance page under Evaluations
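To approximate the percentage view shown on the console, the crosstab computed above can be row-normalized; a sketch using cf_num_recipe:
# Divide each row by the number of actual examples of that class,
# so the diagonal shows the per-class correct-prediction percentage
cf_pct = cf_num_recipe.div(cf_num_recipe.sum(axis = 1), axis = 0) * 100
cf_pct.round(1)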