Read in data


In [44]:
from sklearn import datasets, decomposition
import numpy as np
import matplotlib.pyplot as plt

# load the bundled iris dataset: 150 samples, 4 features, 3 classes
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
print X_iris.shape, y_iris.shape
print X_iris[0], y_iris[0], np.unique(y_iris)


(150, 4) (150,)
[ 5.1  3.5  1.4  0.2] 0 [0 1 2]
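
A quick sanity check on the loaded dataset object, which also carries the feature and class names (a minimal sketch):

In [ ]:
# inspect the dataset's metadata: feature names and class names
print iris.feature_names
print iris.target_names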

Plot data


In [17]:
# scatter plot of sepal length versus sepal width, one color per class
colors = ['red', 'greenyellow', 'blue']

for i in xrange(len(colors)):
    px = X_iris[:, 0][y_iris == i]
    py = X_iris[:, 1][y_iris == i]
    plt.scatter(px, py, c=colors[i])

plt.legend(iris.target_names)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()
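
For comparison, the same kind of scatter plot works for the petal measurements, columns 2 and 3 of X_iris (a minimal sketch reusing the colors list from the cell above):

In [ ]:
# scatter plot of petal length versus petal width, one color per class
for i in xrange(len(colors)):
    px = X_iris[:, 2][y_iris == i]
    py = X_iris[:, 3][y_iris == i]
    plt.scatter(px, py, c=colors[i])
plt.legend(iris.target_names)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.show()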

Divide the data into training and testing sets


In [6]:
from sklearn.cross_validation import train_test_split

# Keep only the first two attributes (sepal length and sepal width)
X, y = X_iris[:, :2], y_iris
# Split the dataset into a training and a testing set;
# the test set will be 25% of the data, taken at random
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print X_train.shape, y_train.shape
print X_train.mean(), X_train.std()


(112, 2) (112,)
4.41964285714 1.53080506864
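
train_test_split does not stratify by class, so it is worth checking how the three classes ended up distributed across the two splits (a minimal sketch):

In [ ]:
# per-class sample counts in the training and testing sets
print np.bincount(y_train)
print np.bincount(y_test)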

Preprocess the data


In [7]:
from sklearn.preprocessing import StandardScaler
# Standardize the features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)
print X_train.mean(), X_train.std()


1.74463618155e-15 1.0
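
The fitted scaler keeps the per-feature statistics it applies; a sketch of inspecting them, assuming a scikit-learn release of this vintage where the attributes are named mean_ and std_ (newer releases use scale_ instead of std_):

In [ ]:
# per-feature mean and standard deviation learned from the training set
print scaler.mean_
print scaler.std_  # named scale_ in newer scikit-learn releases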

Train a linear classifier with stochastic gradient descent


In [18]:
# create the linear model classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)
# print the learned coefficients
print clf.coef_
print clf.intercept_


[[-28.56232699  15.06870628]
 [ -8.94402784  -8.14000854]
 [ 14.04159132 -12.8156682 ]]
[-17.62477802  -2.35658325  -9.7570213 ]
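
Each row of coef_ with the matching entry of intercept_ defines one one-versus-rest linear score, and decision_function is just that affine map; a minimal sketch checking it by hand (manual_scores is a name used only here):

In [ ]:
# recompute the decision scores manually: one column per class,
# score_i = x . w_i + b_i for each one-vs-rest classifier i
manual_scores = np.dot(X_train, clf.coef_.T) + clf.intercept_
print np.allclose(manual_scores, clf.decision_function(X_train))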

Plot the decision curves


In [26]:
x_min, x_max = X_train[:, 0].min() - .5, X_train[:, 0].max() + .5
y_min, y_max = X_train[:, 1].min() - .5, X_train[:, 1].max() + .5
xs = np.arange(x_min,x_max,0.5)
fig, axes = plt.subplots(1,3)
fig.set_size_inches(10,6)
for i in [0,1,2]:
    axes[i].set_aspect('equal')
    axes[i].set_title('Class ' + str(i) + ' versus the rest')
    axes[i].set_xlabel('Sepal length')
    axes[i].set_ylabel('Sepal width')
    axes[i].set_xlim(x_min, x_max)
    axes[i].set_ylim(y_min, y_max)
    plt.sca(axes[i])
    for j in xrange(len(colors)):
        px = X_train[:, 0][y_train == j]
        py = X_train[:, 1][y_train == j]
        plt.scatter(px, py, c=colors[j])
    ys = (-clf.intercept_[i]-xs*clf.coef_[i,0])/clf.coef_[i,1]
    plt.plot(xs,ys,hold=True)
plt.show()
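
An alternative view is to shade the whole plane by the class predict assigns to each point; a sketch assuming the fitted clf and the grid bounds from the cell above:

In [ ]:
# evaluate the classifier on a dense grid and shade each region
# by its predicted class
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3)
for j in xrange(len(colors)):
    plt.scatter(X_train[:, 0][y_train == j],
                X_train[:, 1][y_train == j], c=colors[j])
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()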

Evaluate a single case


In [29]:
print clf.predict(scaler.transform([[4.7, 3.1]]))
# the decision_function gives a confidence score for each class
print clf.decision_function(scaler.transform([[4.7, 3.1]]))


[0]
[[ 19.77232705   8.13983962 -28.65250296]]
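
predict simply picks the class with the largest one-versus-rest score, as the scores above show; a minimal sketch:

In [ ]:
# the predicted class is the argmax of the decision scores
print np.argmax(clf.decision_function(scaler.transform([[4.7, 3.1]])))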

Evaluate the performance on test data


In [31]:
from sklearn import metrics
y_pred = clf.predict(X_test)
print metrics.accuracy_score(y_test, y_pred)


0.684210526316
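
Comparing against the accuracy on the training set gives a feel for how much the classifier is overfitting; a minimal sketch:

In [ ]:
# accuracy on the data the classifier was trained on, for comparison
print metrics.accuracy_score(y_train, clf.predict(X_train))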

Evaluate the results


In [54]:
print metrics.classification_report(y_test, y_pred, target_names=iris.target_names)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print confusion_matrix

plt.matshow(confusion_matrix)
plt.show()


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.43      0.27      0.33        11
  virginica       0.65      0.79      0.71        19

avg / total       0.66      0.68      0.66        38

[[ 8  0  0]
 [ 0  3  8]
 [ 0  4 15]]
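
The report's numbers can be read straight off the confusion matrix: for versicolor, 3 of the 3 + 4 = 7 samples predicted as versicolor are correct (precision 3/7 ≈ 0.43), and 3 of its 11 true samples are recovered (recall 3/11 ≈ 0.27). A minimal sketch of the same arithmetic:

In [ ]:
# versicolor is class index 1:
# precision = true positives / predicted positives (column sum)
# recall    = true positives / actual positives    (row sum)
print confusion_matrix[1, 1] / float(confusion_matrix[:, 1].sum())
print confusion_matrix[1, 1] / float(confusion_matrix[1, :].sum())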

Pipe everything together and do a k-fold cross-validation


In [43]:
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline

# create a composite estimator: a pipeline of the standardization and the linear model
clf = Pipeline([
        ('scaler', StandardScaler()),
        ('linear_model', SGDClassifier())
])
# create a k-fold cross-validation iterator with k=5 folds
cv = KFold(X.shape[0], 5, shuffle=True, random_state=33)
# by default the score used is the one returned by score method of the estimator (accuracy)
scores = cross_val_score(clf, X, y, cv=cv)
print scores


[ 0.73333333  0.63333333  0.73333333  0.66666667  0.6       ]
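
Cross-validation results are usually summarized as a mean with an error bar; a minimal sketch using the standard error of the mean from scipy:

In [ ]:
from scipy.stats import sem

# mean cross-validation accuracy and its standard error
print np.mean(scores), sem(scores)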

Reduce dimensionality with PCA


In [51]:
n_components = 2
pca = decomposition.PCA(n_components)
pca.fit_transform(X_iris)
# plot the variance explained by each of the two retained components
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
plt.show()
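
The fitted PCA can also be used to look at the samples themselves in the reduced space; a sketch projecting the data onto the first two components (X_pca is a name used only here):

In [ ]:
# project the data onto the two principal components, colored by class
X_pca = pca.transform(X_iris)
for i in xrange(len(colors)):
    plt.scatter(X_pca[y_iris == i, 0], X_pca[y_iris == i, 1], c=colors[i])
plt.legend(iris.target_names)
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.show()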
