In [10]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris dataset bundled with scikit-learn and pull out the
# feature matrix and the target vector.
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target
# Sanity check: overall shapes, then the first sample with its label.
print(X_iris.shape, y_iris.shape)
print(X_iris[0], y_iris[0])
In [2]:
# `sklearn.cross_validation` was removed from scikit-learn; the splitter
# utilities live in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Keep only the first two feature columns so the problem can be drawn in 2-D.
X, y = X_iris[:, :2], y_iris
# Hold out 25% of the samples for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train.shape, y_train.shape)
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [5]:
import matplotlib.pyplot as plt
# Scatter the (scaled) training samples, one colour per target class.
colors = ['red', 'greenyellow', 'blue']
# enumerate replaces the Python-2-only xrange(len(colors)) loop.
for i, color in enumerate(colors):
    # Boolean mask selects the samples whose label equals class index i.
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, c=color)
# Legend entries follow the order in which the classes were plotted above.
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()
In [7]:
from sklearn.linear_model import SGDClassifier
# Fit a linear classifier trained with stochastic gradient descent on the
# training split. The seed is pinned (same value as the train/test split)
# so the coefficients printed below, and every metric derived from `clf`
# later on, are reproducible on a fresh kernel.
clf = SGDClassifier(random_state=33)
clf.fit(X_train, y_train)
print(clf.coef_)
print(clf.intercept_)
In [18]:
# Draw one panel per class showing the training samples together with the
# line where that class's linear decision function is zero.
x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
xs = np.arange(x_min, x_max, 0.5)
fig, axes = plt.subplots(1, 3)
fig.set_size_inches(10, 6)
for i in [0, 1, 2]:
    ax = axes[i]
    ax.set_aspect('equal')
    ax.set_title('Class ' + str(i) + ' versus the rest')
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    # Draw through the panel's own Axes object: the pyplot-level
    # plt.scatter / plt.plot calls only target the current axes, so every
    # layer would otherwise end up on a single panel.
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.prism)
    # Solve coef[i, 0] * x + coef[i, 1] * y + intercept[i] = 0 for y.
    ys = (-clf.intercept_[i] - xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
    # The old `hold=True` keyword is gone from matplotlib; artists added to
    # the same Axes are always overlaid.
    ax.plot(xs, ys)
plt.show()
In [20]:
# Test with a new flower with sepal length 4.7 and sepal width 3.1
# (same column order as the training features).
# The sample is wrapped in an outer list so the scaler receives a 2-D
# (n_samples, n_features) input; a bare 1-D list is rejected by scikit-learn.
new_flower = scaler.transform([[4.7, 3.1]])
print(clf.predict(new_flower))
print(clf.decision_function(new_flower))
In [22]:
from sklearn import metrics
# Accuracy on the same samples the classifier was fitted on.
y_train_pred = clf.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
print(train_accuracy)
In [24]:
# Accuracy on the held-out test split.
y_pred = clf.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)
In [25]:
# Classification report for the test-set predictions, with rows labelled
# by the iris class names.
report = metrics.classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)
In [26]:
# Confusion matrix of true test labels against the predicted labels.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)
In [30]:
# `sklearn.cross_validation` was removed from scikit-learn; KFold and
# cross_val_score live in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
# Scaling is placed inside the pipeline so it is re-fitted on the training
# part of every fold. The classifier seed is pinned so the fold scores are
# reproducible.
clf = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('linear_model', SGDClassifier(random_state=33)),
])
# model_selection.KFold takes the number of folds via n_splits; the sample
# count is no longer a constructor argument (it is read from the data).
cv = KFold(n_splits=5, shuffle=True, random_state=33)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)
In [32]:
from scipy.stats import sem
def mean_score(scores):
    """Summarise a collection of scores as a one-line string.

    Parameters
    ----------
    scores : sequence of numbers
        The scores to summarise (e.g. one score per cross-validation fold).

    Returns
    -------
    str
        ``"Mean score: <mean> (+/- <sem>)"`` where ``<mean>`` is
        ``np.mean(scores)`` and ``<sem>`` is ``scipy.stats.sem(scores)``,
        both formatted with three decimal places.
    """
    return "Mean score: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores))
# Report the cross-validation scores as a single summary line.
summary = mean_score(scores)
print(summary)
In [ ]: