Iris Classification with Logistic Regression


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from models import linear_model, logistic_model, log_cost, log_cost_dev, gd_update
from models import binary_confusion_matrix, std_normalize, binary_accuracy, create_parameters, data_normalize
from sklearn.model_selection import train_test_split

%matplotlib inline
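
models.py is not included in this notebook, so as a reading aid, here is a minimal sketch of what the logistic-regression helpers used below might look like. It assumes X has shape (samples, features), W has shape (features, 1), and b is a scalar; the conf triples printed during training are consistent with binary_confusion_matrix returning (precision, recall, F1), e.g. 2 * 0.95 * 1.0 / (0.95 + 1.0) = 0.974.

import numpy as np

def logistic_model(X, W, b):
    # sigmoid over the linear model: h = 1 / (1 + e^-(XW + b))
    return 1.0 / (1.0 + np.exp(-(X @ W + b)))

def log_cost(h, y):
    # mean binary cross-entropy
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

def log_cost_dev(X, y, h):
    # gradients of the cost with respect to W and b
    m = X.shape[0]
    return X.T @ (h - y) / m, np.mean(h - y)

def gd_update(W, b, dW, db, lr):
    # one batch gradient-descent step
    return W - lr * dW, b - lr * db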

1) Prepare data


In [2]:
df = pd.read_csv('./data/iris.csv')
df = df.reindex(np.random.permutation(df.index))
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 72 to 38
Data columns (total 6 columns):
Id               150 non-null int64
SepalLengthCm    150 non-null float64
SepalWidthCm     150 non-null float64
PetalLengthCm    150 non-null float64
PetalWidthCm     150 non-null float64
Species          150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 8.2+ KB

In [3]:
df['IsSetosa'] = df['Species'].apply(lambda a: 1.0 if a == 'Iris-setosa' else 0.0)
data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'IsSetosa']]
data.head()


Out[3]:
SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm IsSetosa
72 6.3 2.5 4.9 1.5 0.0
99 5.7 2.8 4.1 1.3 0.0
21 5.1 3.7 1.5 0.4 1.0
39 5.1 3.4 1.5 0.2 1.0
147 6.5 3.0 5.2 2.0 0.0

In [4]:
train, test = train_test_split(data, test_size=0.2)
train_X = np.array(train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
train_y = np.array(train[['IsSetosa']])

In [5]:
np.mean(train_X, axis=0)


Out[5]:
array([ 5.87333333,  3.01833333,  3.82083333,  1.20833333])

In [6]:
train_stds, train_means = std_normalize(train_X)

In [7]:
np.mean(train_X, axis=0)


Out[7]:
array([ -7.51250913e-16,   1.62832710e-15,   5.92118946e-17,
         1.25825276e-16])

In [8]:
np.std(train_X, axis=0)


Out[8]:
array([ 1.,  1.,  1.,  1.])
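
Judging by the before/after means and stds above, std_normalize standardizes the array in place (per-feature mean 0, std 1) and returns the statistics so the same transform can later be applied to the test set via data_normalize. A minimal sketch under that assumption, with the return order matching how it is unpacked above:

def std_normalize(X):
    # standardize X in place and return the per-feature statistics
    stds, means = np.std(X, axis=0), np.mean(X, axis=0)
    X -= means
    X /= stds
    return stds, means

def data_normalize(X, stds, means):
    # apply the training-set statistics to new data, in place
    X -= means
    X /= stds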

2) Train
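
The loop below fits the model by batch gradient descent on the binary cross-entropy cost. With $h = \sigma(XW + b)$ over $m$ samples:

$$ J(W, b) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h^{(i)} + \left(1 - y^{(i)}\right) \log \left(1 - h^{(i)}\right) \right] $$

$$ \frac{\partial J}{\partial W} = \frac{1}{m} X^T (h - y), \qquad \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \left(h^{(i)} - y^{(i)}\right) $$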


In [9]:
feature_size = train_X.shape[1]
sample_count = train_X.shape[0]

W, b = create_parameters(feature_size)

threshold = 0.5
lr = 0.01

for epoch in range(0, 1000):
    h = logistic_model(train_X, W, b)
    dW, db = log_cost_dev(train_X, train_y, h)
    W, b = gd_update(W, b, dW, db, lr)
    if (epoch + 1) % 100 == 0:
        cur_cost = log_cost(h, train_y)
        conf = binary_confusion_matrix(h, train_y, threshold=threshold)
        print('epoch: {0}, cost: {1}, conf: {2}'.format(epoch + 1, cur_cost, conf))

predictions = logistic_model(train_X, W, b)
final_cost = log_cost(predictions, train_y)
conf = binary_confusion_matrix(predictions, train_y, threshold=threshold)
print('training finished!')
print('final cost: {0}, conf: {1}'.format(final_cost, conf))


epoch: 100, cost: 1.3745801338925958, conf: (0.04477611940298507, 0.07894736842105263, 0.05714285714285714)
epoch: 200, cost: 0.674898608867079, conf: (0.42105263157894735, 0.631578947368421, 0.5052631578947367)
epoch: 300, cost: 0.4110166387166828, conf: (0.6379310344827587, 0.9736842105263158, 0.7708333333333335)
epoch: 400, cost: 0.29126399010419846, conf: (0.7450980392156863, 1.0, 0.8539325842696629)
epoch: 500, cost: 0.22517771606799214, conf: (0.8636363636363636, 1.0, 0.9268292682926829)
epoch: 600, cost: 0.18350381003692745, conf: (0.9047619047619048, 1.0, 0.9500000000000001)
epoch: 700, cost: 0.15485671561715814, conf: (0.95, 1.0, 0.9743589743589743)
epoch: 800, cost: 0.13397360852104612, conf: (0.95, 1.0, 0.9743589743589743)
epoch: 900, cost: 0.11809009119945889, conf: (1.0, 1.0, 1.0)
epoch: 1000, cost: 0.10561324578302447, conf: (1.0, 1.0, 1.0)
training finished!
final cost: 0.10550201197717494, conf: (1.0, 1.0, 1.0)
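
By epoch 900 the training set is classified perfectly, which is expected: setosa is linearly separable from the other two species on these four features.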

3) Try the test data


In [10]:
test_X = np.array(test[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
test_y = np.array(test[['IsSetosa']])
data_normalize(test_X, train_stds, train_means)

In [11]:
test_h = logistic_model(test_X, W, b)
test_cost = log_cost(test_h, test_y)
test_conf = binary_confusion_matrix(test_h, test_y, threshold=threshold)
print('test cost: {0}, conf: {1}'.format(test_cost, test_conf))


test cost: 0.12952589901785055, conf: (0.9230769230769231, 1.0, 0.9600000000000001)

So far this is a binary classifier for setosa only; we want to generalize binary classification to multiple classes.

Iris, one-vs-all

1) Prepare the data again


In [12]:
df['Species'].unique()


Out[12]:
array(['Iris-versicolor', 'Iris-setosa', 'Iris-virginica'], dtype=object)

In [29]:
df['IsSetosa'] = df['Species'].apply(lambda a: 1.0 if a == 'Iris-setosa' else 0.0)
df['IsVersicolor'] = df['Species'].apply(lambda a: 1.0 if a == 'Iris-versicolor' else 0.0)
df['IsVirginica'] = df['Species'].apply(lambda a: 1.0 if a == 'Iris-virginica' else 0.0)
data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'IsSetosa', 'IsVersicolor', 'IsVirginica']]

train, test = train_test_split(data, test_size=0.2)
train_X = np.array(train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
train_y0 = np.array(train[['IsSetosa']])
train_y1 = np.array(train[['IsVersicolor']])
train_y2 = np.array(train[['IsVirginica']])
train_y_all = np.array(train[['IsSetosa', 'IsVersicolor', 'IsVirginica']])

test_X = np.array(test[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
test_y_all = np.array(test[['IsSetosa', 'IsVersicolor', 'IsVirginica']])

# keep the (stds, means) return order consistent with the earlier std_normalize cell
x_stds, x_means = std_normalize(train_X)
data_normalize(test_X, x_stds, x_means)

2) Define some utilities


In [30]:
def train_lr_classifier(X, y, lr=0.01, threshold=0.5, epochs=1000, step_size=100):
    feature_size = X.shape[1]
    sample_count = y.shape[0]
    W, b = create_parameters(feature_size)
    
    for epoch in range(0, epochs):
        h = logistic_model(X, W, b)
        dW, db = log_cost_dev(X, y, h)
        W, b = gd_update(W, b, dW, db, lr)
        if (epoch + 1) % step_size == 0:
            cur_cost = log_cost(h, y)
            conf = binary_confusion_matrix(h, y, threshold=threshold)
            print('epoch: {0}, cost: {1}, conf: {2}'.format(epoch + 1, cur_cost, conf))

    predictions = logistic_model(X, W, b)
    final_cost = log_cost(predictions, y)
    conf = binary_confusion_matrix(predictions, y, threshold=threshold)
    print('training finished!')
    print('final cost: {0}, conf: {1}'.format(final_cost, conf))
    return W, b

In [31]:
m0 = train_lr_classifier(train_X, train_y0, lr=0.01, threshold=0.5)


epoch: 100, cost: 0.6413593778580737, conf: (0.8333333333333334, 0.23255813953488372, 0.3636363636363636)
epoch: 200, cost: 0.3239125928763921, conf: (0.9629629629629629, 0.6046511627906976, 0.7428571428571429)
epoch: 300, cost: 0.19547085770561048, conf: (1.0, 0.8837209302325582, 0.9382716049382717)
epoch: 400, cost: 0.136598131506859, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 500, cost: 0.10514600594178773, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 600, cost: 0.08612278759634744, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 700, cost: 0.07351856281845054, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 800, cost: 0.06459017480374314, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 900, cost: 0.05794039815033709, conf: (1.0, 0.9767441860465116, 0.988235294117647)
epoch: 1000, cost: 0.052792391408487065, conf: (1.0, 0.9767441860465116, 0.988235294117647)
training finished!
final cost: 0.05274675309628947, conf: (1.0, 0.9767441860465116, 0.988235294117647)

In [32]:
m1 = train_lr_classifier(train_X, train_y1, lr=0.01, threshold=0.5, epochs=50000, step_size=10000)


epoch: 10000, cost: 0.4672810189697581, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
epoch: 20000, cost: 0.46612468084009984, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
epoch: 30000, cost: 0.4659093728560644, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
epoch: 40000, cost: 0.46579602875281123, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
epoch: 50000, cost: 0.46573324705435276, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
training finished!
final cost: 0.46573324684480516, conf: (0.64, 0.43243243243243246, 0.5161290322580645)
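
The plateau around cost 0.466 is not a bug: versicolor, unlike setosa, is not linearly separable from the other two species, so a single linear decision boundary can only do so well on this one-vs-rest problem.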

In [33]:
m2 = train_lr_classifier(train_X, train_y2, lr=0.01, threshold=0.5, epochs=50000, step_size=10000)


epoch: 10000, cost: 0.12692241149644967, conf: (0.95, 0.95, 0.9500000000000001)
epoch: 20000, cost: 0.09590092286288818, conf: (0.9512195121951219, 0.975, 0.9629629629629629)
epoch: 30000, cost: 0.08151045623289317, conf: (0.9512195121951219, 0.975, 0.9629629629629629)
epoch: 40000, cost: 0.07295167490350239, conf: (0.9512195121951219, 0.975, 0.9629629629629629)
epoch: 50000, cost: 0.06715221727815238, conf: (0.9512195121951219, 0.975, 0.9629629629629629)
training finished!
final cost: 0.067151727629488, conf: (0.9512195121951219, 0.975, 0.9629629629629629)
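
train_lr_classifier returns the learned (W, b) pair, but nothing above combines the three binary classifiers into one multi-class predictor. A minimal one-vs-all sketch (the names one_vs_all_predict, pred, and true are illustrative, not from models.py): score each class with its classifier and take the argmax.

def one_vs_all_predict(X, classifiers):
    # one probability column per class; the highest-scoring class wins
    scores = np.hstack([logistic_model(X, W, b) for W, b in classifiers])
    return np.argmax(scores, axis=1)

pred = one_vs_all_predict(test_X, [m0, m1, m2])
true = np.argmax(test_y_all, axis=1)
print('one-vs-all test accuracy: {0}'.format(np.mean(pred == true)))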

Classifying multiple classes with softmax

What is softmax?

$$ \mathrm{softmax}(x)_j = \frac{e^{x_j}}{\sum_{i=1}^{m} e^{x_i}} $$

See details in models.py
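
For readers without models.py at hand, here is a minimal, numerically stable sketch of the two softmax helpers used below, assuming X of shape (samples, features), W of shape (features, classes), and one-hot labels:

def softmax_regression_model(X, W, b):
    # row-wise softmax over the class scores; subtracting the row max
    # avoids overflow in exp() without changing the result
    z = X @ W + b
    z -= np.max(z, axis=1, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=1, keepdims=True)

def crossentropy_cost(h, y):
    # mean cross-entropy against one-hot labels
    return -np.mean(np.sum(y * np.log(h), axis=1))

With one-hot labels, the gradient of this cost with respect to the scores z = XW + b reduces to (h - y) / m, the same form as the binary case.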


In [36]:
import models as ml

In [37]:
feature_size = train_X.shape[1]
sample_count = train_X.shape[0]
class_count = train_y_all.shape[1]

W, b = ml.create_parameters(feature_size, class_count)

for epoch in range(0, 100000):
    h = ml.softmax_regression_model(train_X, W, b)
    dW, db = ml.crossentropy_cost_dev(train_X, train_y_all, h)
    W, b = ml.gd_update(W, b, dW, db, lr=0.01)
    if (epoch + 1) % 10000 == 0:
        cur_cost = ml.crossentropy_cost(h, train_y_all)
        cur_acc = ml.categorical_accuracy(h, train_y_all)
        print('epoch: {0}, cost: {1}, acc: {2}'.format(epoch + 1, cur_cost, cur_acc))

predictions = ml.softmax_regression_model(train_X, W, b)
final_cost = ml.crossentropy_cost(predictions, train_y_all)
final_acc = ml.categorical_accuracy(predictions, train_y_all)
print('training finished!')
print('train cost: {0}, acc: {1}'.format(final_cost, final_acc))

test_h = ml.softmax_regression_model(test_X, W, b)
test_cost = ml.crossentropy_cost(test_h, test_y_all)
test_acc = ml.categorical_accuracy(test_h, test_y_all)
print('test cost: {0}, acc: {1}'.format(test_cost, test_acc))


epoch: 10000, cost: 0.040805551375077395, acc: 0.975
epoch: 20000, cost: 0.02969192477577395, acc: 0.975
epoch: 30000, cost: 0.02494988618992539, acc: 0.975
epoch: 40000, cost: 0.02219898562705884, acc: 0.9833333333333333
epoch: 50000, cost: 0.020356746091358142, acc: 0.9833333333333333
epoch: 60000, cost: 0.019015789622029135, acc: 0.9833333333333333
epoch: 70000, cost: 0.01798504255574665, acc: 0.9833333333333333
epoch: 80000, cost: 0.017161654145229403, acc: 0.9833333333333333
epoch: 90000, cost: 0.016484771512794216, acc: 0.9833333333333333
epoch: 100000, cost: 0.01591581752130277, acc: 0.9833333333333333
training finished!
train cost: 0.015915765086367127, acc: 0.9833333333333333
test cost: 0.026905784702824006, acc: 0.9666666666666667

Let's take a look at a few predictions.


In [47]:
np.argmax(ml.softmax_regression_model(train_X[0:4], W, b), axis=1)


Out[47]:
array([1, 0, 1, 1])

In [48]:
np.argmax(train_y_all[0:4], axis=1)


Out[48]:
array([1, 0, 1, 1])
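
The predicted classes match the labels on all four samples. This argmax comparison is presumably what categorical_accuracy computes; a one-line sketch:

def categorical_accuracy(h, y):
    # fraction of samples whose predicted class matches the one-hot label
    return np.mean(np.argmax(h, axis=1) == np.argmax(y, axis=1))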
