In [43]:
# third party
import matplotlib.pyplot as pyplot
import numpy
import pandas
import seaborn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
In [29]:
%matplotlib inline
In [2]:
iris = datasets.load_iris()
x_iris, y_iris = iris.data, iris.target
In [4]:
print(x_iris.shape)
print(y_iris.shape)
In [5]:
print(x_iris[0], y_iris[0])
In [8]:
print(iris.target_names)
The first model will be a linear model with two input attributes.
First, get all the rows and the first two columns for the x data.
In [10]:
x, y = x_iris[:, :2], y_iris
In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
In [13]:
print(x_train.shape, y_train.shape)
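Since test_size was 0.25, the training set should hold 112 of the 150 rows and the test set the remaining 38. A quick check, using only the variables already defined:
In [ ]:
# the held-out test set should have the remaining 25% of the rows
print(x_test.shape, y_test.shape)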
In [30]:
frame = pandas.DataFrame(x_train)
frame.head()
Out[30]:
In [42]:
seaborn.set_style('whitegrid')
axes = seaborn.kdeplot(x=frame[0], label=iris.feature_names[0])
axes = seaborn.kdeplot(x=frame[1], ax=axes, label=iris.feature_names[1])
legend = axes.legend()
title = axes.set_title("First Two Attributes")
Looking at the plot, you can see that the two attributes have different ranges, so the data will be standardized. Subtracting the column's mean from each data point and then dividing by the column's standard deviation transforms each column to have a mean of 0 and a standard deviation of 1.
In [ ]:
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# re-build the frame so that the later plots use the standardized values
frame = pandas.DataFrame(x_train)
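To verify that the scaler behaved as described, each training column should now have a mean of (approximately) 0 and a standard deviation of (approximately) 1:
In [ ]:
# each standardized column should have mean ~0 and standard deviation ~1
print(x_train.mean(axis=0))
print(x_train.std(axis=0))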
In [46]:
numpy.unique(y_train)
Out[46]:
array([0, 1, 2])
In [49]:
iris.target_names
Out[49]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
In [51]:
iris.feature_names[:2]
Out[51]:
['sepal length (cm)', 'sepal width (cm)']
In [100]:
def print_irises():
    """Scatter-plot the training data, colored by species."""
    figure = pyplot.figure()
    axe = figure.gca()
    axe.set_xlabel(iris.feature_names[0])
    axe.set_ylabel(iris.feature_names[1])
    colors = 'r b g'.split()
    # plot each species' points in its own color
    for index, name in enumerate(iris.target_names):
        x_data = frame[0][y_train == index]
        y_data = frame[1][y_train == index]
        axe.plot(x_data, y_data, '{0}o'.format(colors[index]),
                 label=str(name))
    title = axe.set_title("{0} vs {1}".format(iris.feature_names[0],
                                              iris.feature_names[1]))
    legend = axe.legend()
    return axe
axe = print_irises()
First, we need to re-think the problem as a binary classification problem: can we predict whether a flower is a setosa or not using our two features? Given only two features, it doesn't appear that we can separate versicolor from virginica with a line, but we might be able to separate setosa from the other two species.
In [82]:
## imports
from sklearn.linear_model import SGDClassifier
In [87]:
numpy.unique(y_train)
Out[87]:
array([0, 1, 2])
Since we don't care about separating versicolor from virginica, and setosa is class 0, we can re-code the y training set to contain only 0's (setosa) and 1's (not setosa).
In [90]:
y_train = pandas.Series(y_train)
In [91]:
y_binary = y_train.apply(lambda x: 0 if x == 0 else 1)
In [96]:
assert len(y_binary[y_binary == 0]) == len(y_train[y_train == 0])
assert len(y_binary) == len(y_train)
y_binary.unique()
Out[96]:
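As an aside, the same re-coding can be done without apply, using a vectorized comparison, which is more idiomatic pandas. A quick sketch (y_binary_alternate is just an illustrative name):
In [ ]:
# vectorized equivalent: 1 wherever the class isn't setosa (0)
y_binary_alternate = (y_train != 0).astype(int)
assert (y_binary_alternate == y_binary).all()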
In [97]:
# note: SGD is stochastic, so the fitted coefficients vary from run to run
classifier = SGDClassifier()
classifier = classifier.fit(x_train, y_binary)
In [106]:
intercept = classifier.intercept_[0]
w_1, w_2 = classifier.coef_[0]
print("{0} + {1} x_1 + {2} x_2 = 0".format(intercept, w_1, w_2))
In [109]:
print("{0} + {1} x_1 = {2} x_2".format(intercept, w_1, -w_2))
In [110]:
print("({0} + {1} x_1)/{2} = x_2".format(intercept, w_1, -w_2))
In [113]:
axe = print_irises()
x_plot = numpy.linspace(frame[0].min(), frame[0].max(), 100)
y_plot = (intercept + x_plot * w_1)/-w_2
line = axe.plot(x_plot, y_plot)
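Finally, to classify a new flower, its measurements have to go through the same scaler before being passed to the classifier. A minimal sketch (the sepal measurements are made-up values):
In [ ]:
# a hypothetical flower: sepal length 5.0 cm, sepal width 3.5 cm
new_flower = scaler.transform([[5.0, 3.5]])
print(classifier.predict(new_flower))  # 0 is setosa, 1 is not-setosa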