Linear Classification (Irises)


In [43]:
# third party
import matplotlib.pyplot as pyplot
import numpy
import pandas
import seaborn
from sklearn import datasets
from sklearn import preprocessing
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

In [29]:
%matplotlib inline

In [2]:
# Fetch the bundled iris dataset and unpack the feature matrix and labels.
iris = datasets.load_iris()
x_iris = iris.data
y_iris = iris.target

In [4]:
print(x_iris.shape)
print(y_iris.shape)


(150, 4)
(150,)

In [5]:
print(x_iris[0], y_iris[0])


(array([ 5.1,  3.5,  1.4,  0.2]), 0)

In [8]:
print(iris.target_names)


['setosa' 'versicolor' 'virginica']

Model with First Two Attributes

The first model will be a linear model with two input attributes.

First, take all the rows and the first two columns of the x data.


In [10]:
x, y = x_iris[:, :2], y_iris

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

In [13]:
print(x_train.shape, y_train.shape)


((112, 2), (112,))

Standardize the Data


In [30]:
# Wrap the (still unstandardized) training features in a DataFrame for
# easier plotting and inspection; columns 0/1 are sepal length/width.
frame = pandas.DataFrame(x_train)
frame.head()


Out[30]:
     0    1
0  5.0  2.3
1  4.9  3.1
2  6.3  2.3
3  5.8  2.6
4  6.2  2.9

In [42]:
seaborn.set_style('whitegrid')
# seaborn.distplot is deprecated (removed in seaborn 0.14); with hist=False
# the equivalent call is kdeplot, which draws only the smoothed density curve.
axes = seaborn.kdeplot(frame[0], label=iris.feature_names[0])
axes = seaborn.kdeplot(frame[1], ax=axes, label=iris.feature_names[1])
title = axes.set_title("First Two Attributes")


<matplotlib.figure.Figure at 0x7f5f66b09110>

Looking at the plot, you can see that the two attributes have different ranges, so the data will be standardized. By subtracting the mean from each data point and dividing by the standard deviation for that column, each column is changed to have a mean of 0 and a standard deviation of 1.


In [ ]:
# Fit the standardizer on the training split only, then apply the same
# transform to both splits (fitting on all data would leak test-set
# statistics into training). NOTE: this overwrites x_train/x_test in
# place, so re-running this cell on already-scaled data re-scales it —
# restart-and-run-all is required for a clean state.
scalar = preprocessing.StandardScaler().fit(x_train)
x_train = scalar.transform(x_train)
x_test = scalar.transform(x_test)

In [46]:
numpy.unique(y_train)


Out[46]:
array([0, 1, 2])

In [49]:
iris.target_names


Out[49]:
array(['setosa', 'versicolor', 'virginica'], 
      dtype='|S10')

In [51]:
iris.feature_names[:2]


Out[51]:
['sepal length (cm)', 'sepal width (cm)']

In [100]:
def print_irises():
    """Scatter-plot sepal length vs. sepal width, one color per species.

    Reads the module-level ``frame`` (unstandardized training features),
    ``y_train`` labels, and ``iris`` dataset built in earlier cells.
    Returns the matplotlib axes so callers can draw on top of the scatter.
    """
    figure = pyplot.figure()
    axe = figure.gca()
    axe.set_xlabel(iris.feature_names[0])
    axe.set_ylabel(iris.feature_names[1])
    colors = 'r b g'.split()
    for index, name in enumerate(iris.target_names):
        # select the rows belonging to this species
        x_data = frame[0][y_train == index]
        y_data = frame[1][y_train == index]
        # use the enumerated species name directly rather than re-indexing
        # iris.target_names (the original left `name` unused)
        axe.plot(x_data, y_data, '{0}o'.format(colors[index]), label=str(name))
    title = axe.set_title("{0} vs {1}".format(iris.feature_names[0], iris.feature_names[1]))
    legend = axe.legend()
    return axe
axe = print_irises()


<matplotlib.figure.Figure at 0x7f5f660aa490>

Stochastic Gradient Descent

First we need to re-think the problem as a binary classification problem - Can we predict whether a flower is a setosa or not using our two features? Given only two features, it doesn't appear that we can separate the versicolor from the virginica using lines, but we might be able to separate setosa from the other two species.


In [82]:
## imports

from sklearn.linear_model import SGDClassifier

In [87]:
numpy.unique(y_train)


Out[87]:
array([0, 1, 2])

Since we don't care about separating versicolor from virginica and setosa is 0, we can re-do the y-training set to only have 0's (setosa) and 1's (not setosa).


In [90]:
y_train = pandas.Series(y_train)

In [91]:
y_binary = y_train.apply(lambda x: 0 if x == 0 else 1)

In [96]:
len(y_binary[y_binary == 0]) == len(y_train[y_train==0])
len(y_binary) == len(y_train)
y_binary.unique()


Out[96]:
array([1, 0])

In [97]:
# SGD is stochastic: without a seed the fitted coefficients change on every
# run. Pin random_state (same 33 convention as train_test_split above) so
# the boundary printed/plotted below is reproducible.
classifier = SGDClassifier(random_state=33)
classifier = classifier.fit(x_train, y_binary)

In [106]:
intercept = classifier.intercept_[0]
w_1, w_2 = classifier.coef_[0]
print("{0} + {1} x_1 + {2} x_2 = 0".format(intercept, w_1, w_2))


-32.2312061614 + 103.271327774 x_1 + -176.39512508 x_2 = 0

In [109]:
print("{0} + {1} x_1 = {2} x_2".format(intercept, w_1, -w_2))


-32.2312061614 + 103.271327774 x_1 = 176.39512508 x_2

In [110]:
print("({0} + {1} x_1)/{2} = x_2".format(intercept, w_1, -w_2))


(-32.2312061614 + 103.271327774 x_1)/176.39512508 = x_2

In [113]:
axe = print_irises()
# BUG FIX: the classifier was fit on *standardized* features, but
# print_irises plots the raw (unstandardized) data. Evaluating the boundary
# equation directly on raw-unit x values draws the line in the wrong place.
# Instead: standardize x, solve for x_2 in standardized space, then map the
# result back to raw units before plotting.
x_plot = numpy.linspace(4, 8, 100)
x_standardized = (x_plot - scalar.mean_[0]) / scalar.scale_[0]
y_standardized = (intercept + x_standardized * w_1) / -w_2
y_plot = y_standardized * scalar.scale_[1] + scalar.mean_[1]
line = axe.plot(x_plot, y_plot)


<matplotlib.figure.Figure at 0x7f5f65082e10>