In [93]:
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import numpy
from scipy.stats import sem
Load Data
In [3]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
In [4]:
# Dimensions of the feature array.
print(iris.data.shape)
In [5]:
# Dimensions of the target array.
print(iris.target.shape)
In [7]:
# Peek at the first sample: its feature values and the target stored at the
# same index (the features and the target must come from the same row).
print(iris.data[0], iris.target[0])
In [10]:
# Names associated with the target values.
print(iris.target_names)
In [11]:
# Names of the feature columns.
print(iris.feature_names)
In [14]:
# Keep only the first two feature columns (the plots further down label them
# sepal length and sepal width); the targets are used unchanged.
X = iris.data[:, :2]
Y = iris.target
In [16]:
# Hold out a quarter of the samples for testing; the fixed seed keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=66,
)
In [18]:
# Learn the scaling statistics from the training split only. Fitting on the
# full X would let the held-out test samples influence the scaler (data
# leakage) and bias the test scores reported later.
# NOTE: run this cell before the cells that overwrite x_train with its
# scaled version.
scaler = preprocessing.StandardScaler().fit(x_train)
In [19]:
# Standardize the training features with the fitted scaler.
# NOTE(review): this rebinds x_train, so re-running this cell scales the
# already-scaled data a second time.
x_train = scaler.transform(x_train)
In [20]:
# Apply the same fitted scaler to the test features.
# NOTE(review): this rebinds x_test, so re-running this cell scales the
# already-scaled data a second time.
x_test = scaler.transform(x_test)
In [52]:
# One matplotlib color code per target value, indexed by target.
colors = ['r', 'w', 'b']
In [55]:
# Scatter plot of the training samples, drawn one flower type at a time so
# each type gets its own color and legend entry.
fig = plt.figure()
axe = fig.gca()

handles = []
for target, color in enumerate(colors):
    # Boolean mask selecting the training rows that belong to this target.
    in_class = y_train == target
    xs = x_train[in_class, 0]
    ys = x_train[in_class, 1]
    points = axe.scatter(xs, ys, c=color, label=iris.target_names[target])
    handles.append(points)

axe.legend(handles, iris.target_names, scatterpoints=1)
# Assigning the return values keeps the notebook from echoing the Text objects.
xl = axe.set_xlabel('Sepal Length')
yl = axe.set_ylabel('Sepal Width')
In [60]:
# Fit a linear classifier trained with stochastic gradient descent on the
# training split.
# The optimizer is stochastic, so without a fixed seed the learned
# coefficients (and every score derived from them) change from run to run.
# The seed value matches the one used for the train/test split.
classifier = SGDClassifier(random_state=66).fit(x_train, y_train)
In [61]:
# Learned feature weights of the fitted classifier.
print(classifier.coef_)
In [62]:
# Learned intercept terms of the fitted classifier.
print(classifier.intercept_)
There are three entries for the coefficients and intercepts because there are three target classes: for each class in turn, the classifier fits a line that separates that class from the other two (one-versus-rest).
In [75]:
# Horizontal plot range: extent of the first training feature column,
# padded by 0.5 on each side.
column_0 = x_train[:, 0]
x_min = column_0.min() - .5
x_max = column_0.max() + .5
In [76]:
# Vertical plot range: extent of the second training feature column,
# padded by 0.5 on each side.
column_1 = x_train[:, 1]
y_min = column_1.min() - .5
y_max = column_1.max() + .5
In [77]:
# x positions at which the decision lines are evaluated.
# linspace includes both endpoints, so the lines span the whole horizontal
# plot range; arange with a 0.5 step excluded x_max and could stop up to
# half a unit short of the right-hand axis limit.
xs = numpy.linspace(x_min, x_max, num=50)
In [78]:
# One figure per class: every training point plus the line the classifier
# learned to separate that class from the others.
for plot, class_name in enumerate(iris.target_names):
    weight_x, weight_y = classifier.coef_[plot]
    bias = classifier.intercept_[plot]

    figure = plt.figure()
    axe = figure.gca()
    axe.set_title('Class {0} versus the rest'.format(class_name))
    axe.set_xlabel('Sepal Length')
    axe.set_ylabel('Sepal Width')
    axe.set_xlim(x_min, x_max)
    axe.set_ylim(y_min, y_max)
    axe.scatter(x_train[:, 0], x_train[:, 1], c=y_train)

    # Decision line: solve weight_x * x + weight_y * y + bias = 0 for y.
    ys = (-bias - xs * weight_x) / weight_y
    axe.plot(xs, ys)
In [81]:
# Accuracy on the same data the classifier was fitted on.
y_train_predict = classifier.predict(x_train)
train_accuracy = metrics.accuracy_score(y_train, y_train_predict)
print(train_accuracy)
In [82]:
# Accuracy on the held-out test split.
y_predict = classifier.predict(x_test)
test_accuracy = metrics.accuracy_score(y_test, y_predict)
print(test_accuracy)
In [85]:
# Per-class report for the test predictions, labelled with the flower names.
report = metrics.classification_report(
    y_test,
    y_predict,
    target_names=iris.target_names,
)
print(report)
In [86]:
# Confusion matrix comparing the true test targets with the predictions.
confusion = metrics.confusion_matrix(y_test, y_predict)
print(confusion)
In [90]:
# Scaling and classification bundled into one estimator, so the scaler is
# re-fitted inside whatever data the pipeline is fitted on.
# The SGD step is stochastic; seeding it keeps the scores reproducible
# between runs. The seed value matches the one used for the train/test split.
# NOTE: this rebinds `classifier`, replacing the previously fitted model.
classifier = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('linear_model', SGDClassifier(random_state=66)),
])
In [91]:
# Five shuffled folds over all samples, seeded so the fold assignment is
# repeatable, then one score per fold for the pipeline.
# NOTE(review): this is the legacy sklearn.cross_validation.KFold signature
# (sample count first, then n_folds); confirm the installed scikit-learn
# version before porting to sklearn.model_selection.
cross_validation = KFold(len(X), n_folds=5, shuffle=True, random_state=666)
scores = cross_val_score(classifier, X, y=Y, cv=cross_validation)
In [92]:
# Scores returned by cross_val_score, one per fold.
print(scores)
In [94]:
# Summarize the fold scores: mean, with the standard error of the mean as
# the +/- figure.
mean_score = numpy.mean(scores)
standard_error = sem(scores)
print("Mean Score: {0:.3f} (+/- {1:.3f})".format(mean_score, standard_error))
In [ ]: