Imports



In [93]:

    
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import numpy
from scipy.stats import sem

Load Data



In [3]:

    
iris = datasets.load_iris()



In [4]:

    
print(iris.data.shape)



In [5]:

    
print(iris.target.shape)









    



(150,)



In [7]:

    
print(iris.data[0], iris.target[1])









    



[ 5.1  3.5  1.4  0.2] 0



In [10]:

    
print(iris.target_names)









    



['setosa' 'versicolor' 'virginica']



In [11]:

    
print(iris.feature_names)









    



['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

Split into Test and Train



In [14]:

    
X, Y = iris.data[:, :2], iris.target



In [16]:

    
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=66)

Scale the X-data



In [18]:

    
scaler = preprocessing.StandardScaler().fit(X)



In [19]:

    
x_train = scaler.transform(x_train)



In [20]:

    
x_test = scaler.transform(x_test)

Plot



In [52]:

    
colors = 'r w b'.split()



In [55]:

    
fig = plt.figure()
axe = fig.gca()
# separate by flower type
handles = []
for target in range(len(colors)):
    xs = x_train[:, 0][y_train == target]
    ys = x_train[:, 1][y_train == target]
    handles.append(axe.scatter(xs, ys, c=colors[target], label=iris.target_names[target]))

axe.legend(handles, iris.target_names, scatterpoints=1)
xl = axe.set_xlabel('Sepal Length')
yl = axe.set_ylabel('Sepal Width')

Stochastic Gradient Descent



In [60]:

    
classifier = SGDClassifier()
classifier = classifier.fit(x_train, y_train)



In [61]:

    
print(classifier.coef_)









    



[[-22.20245135  12.6759546 ]
 [ -8.21257535 -11.07290648]
 [ 18.653168   -13.35873436]]



In [62]:

    
print(classifier.intercept_)









    



[-11.15155373  -9.08544534  -3.43205237]

There are three entries for the coefficents and intercepts because there are three targets, so it picks each target and fits a line that separates it from the other two categories.

Plot the fit



In [75]:

    
x_min, x_max = x_train[:, 0].min() - .5, x_train[:, 0].max() + .5



In [76]:

    
y_min, y_max = x_train[:, 1].min() - .5, x_train[:,1].max() + .5



In [77]:

    
xs = numpy.arange(x_min, x_max, 0.5)



In [78]:

    
for plot in range(3):
    figure = plt.figure()
    axe = figure.gca()
    axe.set_title('Class {0} versus the rest'.format(iris.target_names[plot]))
    axe.set_xlabel('Sepal Length')
    axe.set_ylabel('Sepal Width')
    axe.set_xlim(x_min, x_max)
    axe.set_ylim(y_min, y_max)
    axe.scatter(x_train[:, 0], x_train[:, 1], c=y_train)
    ys = (-classifier.intercept_[plot] - xs * classifier.coef_[plot, 0]) / classifier.coef_[plot, 1]
    axe.plot(xs, ys)

Evaluating the Classifier



In [81]:

    
y_train_predict = classifier.predict(x_train)
print(metrics.accuracy_score(y_train, y_train_predict))



In [82]:

    
y_predict = classifier.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict))









    



0.710526315789



In [85]:

    
print(metrics.classification_report(y_test, y_predict, target_names=iris.target_names))









    



             precision    recall  f1-score   support

     setosa       0.93      0.93      0.93        14
 versicolor       0.67      0.17      0.27        12
  virginica       0.57      1.00      0.73        12

avg / total       0.73      0.71      0.66        38



In [86]:

    
print(metrics.confusion_matrix(y_test, y_predict))









    



[[13  1  0]
 [ 1  2  9]
 [ 0  0 12]]

Cross-Validation



In [90]:

    
classifier = Pipeline([('scaler', preprocessing.StandardScaler()),
                      ('linear_model', SGDClassifier())])



In [91]:

    
cross_validation = KFold(X.shape[0], 5, shuffle=True, random_state=666)
scores = cross_val_score(classifier, X, Y, cv=cross_validation)



In [92]:

    
print(scores)









    



[ 0.8         0.7         0.8         0.76666667  0.7       ]



In [94]:

    
print("Mean Score: {0:.3f} (+/- {1:.3f})".format(numpy.mean(scores), sem(scores)))









    



Mean Score: 0.753 (+/- 0.023)



In [ ]: