In [44]:
from sklearn import datasets, decomposition
import numpy as np
import matplotlib.pyplot as plt
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
print(X_iris.shape, y_iris.shape)
print(X_iris[0], y_iris[0], np.unique(y_iris))
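(Aside, not in the original notebook: iris is a scikit-learn Bunch, so we can also ask it what the four columns and the three labels mean.)
In [ ]:
# sketch: inspect the attribute and class names bundled with the dataset
print(iris.feature_names)
print(iris.target_names)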
In [17]:
colors = ['red', 'greenyellow', 'blue']
for i in range(len(colors)):
    px = X_iris[:, 0][y_iris == i]
    py = X_iris[:, 1][y_iris == i]
    plt.scatter(px, py, c=colors[i])
plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()
In [6]:
from sklearn.model_selection import train_test_split
# Get dataset with only the first two attributes
X, y = X_iris[:,:2], y_iris
# Split the dataset into a training and a testing set
# The test set will be 25% of the data, taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train.shape, y_train.shape)
print(X_train.mean(), X_train.std())
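(Aside: with three balanced classes, a stratified split keeps the class proportions equal in both sets; a minimal sketch using train_test_split's stratify parameter, not used in the rest of the notebook.)
In [ ]:
# sketch: stratified variant of the split above
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, y, test_size=0.25, random_state=33, stratify=y)
print(np.bincount(ys_train), np.bincount(ys_test))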
In [7]:
from sklearn.preprocessing import StandardScaler
# Standardize the features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.mean(), X_train.std())
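(Aside: the cell above pools both features into a single mean and std; a per-feature check shows what StandardScaler actually did.)
In [ ]:
# sketch: each column should now have mean ~0 and std ~1
print(X_train.mean(axis=0), X_train.std(axis=0))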
In [18]:
# create the linear model classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)
# print learned coefficients
print(clf.coef_)
print(clf.intercept_)
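(Aside: SGDClassifier handles the three classes with a one-versus-rest scheme, so the shapes below should be (3, 2) and (3,): one separating line per class.)
In [ ]:
# sketch: one row of coefficients and one intercept per binary classifier
print(clf.coef_.shape, clf.intercept_.shape)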
In [26]:
x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
xs = np.arange(x_min,x_max,0.5)
fig, axes = plt.subplots(1,3)
fig.set_size_inches(10,6)
for i in [0, 1, 2]:
    axes[i].set_aspect('equal')
    axes[i].set_title('Class ' + str(i) + ' versus the rest')
    axes[i].set_xlabel('Sepal length')
    axes[i].set_ylabel('Sepal width')
    axes[i].set_xlim(x_min, x_max)
    axes[i].set_ylim(y_min, y_max)
    plt.sca(axes[i])
    for j in range(len(colors)):
        px = X_train[:, 0][y_train == j]
        py = X_train[:, 1][y_train == j]
        plt.scatter(px, py, c=colors[j])
    # decision line for class i: w0*x + w1*y + b = 0, solved for y
    ys = (-clf.intercept_[i] - xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
    plt.plot(xs, ys)
plt.show()
In [29]:
print(clf.predict(scaler.transform([[4.7, 3.1]])))
# the decision_function gives a confidence score for each class
print(clf.decision_function(scaler.transform([[4.7, 3.1]])))
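(Aside: predict simply returns the class whose one-versus-rest score is largest, which we can check with argmax; a sketch.)
In [ ]:
# sketch: the argmax of the decision scores should match clf.predict above
scores_new = clf.decision_function(scaler.transform([[4.7, 3.1]]))
print(np.argmax(scores_new, axis=1))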
In [31]:
from sklearn import metrics
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
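(Aside: comparing with the accuracy on the training set gives a rough sense of overfitting; a sketch.)
In [ ]:
# sketch: accuracy on the data the classifier was trained on
print(metrics.accuracy_score(y_train, clf.predict(X_train)))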
In [54]:
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.matshow(confusion_matrix)
plt.show()
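(Aside, assuming scikit-learn >= 0.22: ConfusionMatrixDisplay draws the same matrix with the class names on the axes.)
In [ ]:
# sketch: labeled confusion-matrix plot (newer API than plt.matshow)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay(confusion_matrix, display_labels=iris.target_names).plot()
plt.show()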
In [43]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
# create a composite estimator made by a pipeline of the standardization and the linear model
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_model', SGDClassifier())
])
# create a k-fold cross-validation iterator of k=5 folds
cv = KFold(n_splits=5, shuffle=True, random_state=33)
# by default the score used is the one returned by the score method of the estimator (accuracy)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)
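(Aside: the five scores are easier to read as a mean with its standard error; a sketch using scipy.stats.sem.)
In [ ]:
# sketch: summarize the k-fold scores
from scipy.stats import sem
print(scores.mean(), sem(scores))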
In [51]:
n_components = 2
pca = decomposition.PCA(n_components)
X_pca = pca.fit_transform(X_iris)  # keep the projected data for the plot below
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
plt.show()
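(Aside: the projected data itself is the interesting part of PCA; a sketch plotting the samples in the two-component space, reusing the colors list from above.)
In [ ]:
# sketch: scatter the iris samples along the first two principal components
for i in range(len(colors)):
    px = X_pca[:, 0][y_iris == i]
    py = X_pca[:, 1][y_iris == i]
    plt.scatter(px, py, c=colors[i])
plt.legend(iris.target_names)
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.show()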