In [4]:
"""seeds dataset"""


Out[4]:
'seeds dataset'

In [6]:
import numpy as np


def load_dataset(dataset_name):
    '''
    data,labels = load_dataset(dataset_name)

    Load a given dataset

    Returns
    -------
    data : numpy ndarray
    labels : list of str
    '''
    data = []
    labels = []
    with open('./ch02/{0}.tsv'.format(dataset_name)) as ifile:
        for line in ifile:
            tokens = line.strip().split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    data = np.array(data)
    labels = np.array(labels)
    return data, labels


feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficien',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')

In [7]:
"""Classifying with scikit-learn
* fit(features, labels): this is the learning step and fits the parameters of the model
* predict(features): this method can only be called after fit and returns a prediction for one or more inputs
"""


Out[7]:
'Classifying with scikit-learn\n* fit(features, labels): this is the learning step and fits the parameters of the model\n* predict(features): this method can only be called after fit and returns a prediction for one or more inputs\n'

In [8]:
"""K nearest neighbour classification
"""

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)

In [11]:
"""Cross validation"""
from sklearn.cross_validation import KFold
kf = KFold(len(features), n_folds=5, shuffle=True)

means = []
for training, testing in kf:
   # We learn a model for this fold with `fit` and then apply it to the
   # testing data with `predict`:
   classifier.fit(features[training], labels[training])
   prediction = classifier.predict(features[testing])

   # np.mean on an array of booleans returns fraction
 # of correct decisions for this fold:
   curmean = np.mean(prediction == labels[testing])
   means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))


Mean accuracy: 91.0%

In [13]:
"""Normalization"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

classifier = KNeighborsClassifier(n_neighbors=1)
"""
The Pipeline constructor takes a list of pairs (str,clf).
Each pair corresponds to a step in the pipeline: the first element is a string naming the step, 
while the second element is the object that performs the transformation.
Advanced usage of the object uses these names to refer to different steps.
"""
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])

means = []
for training,testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))


Mean accuracy: 93.3%

In [21]:
"""k(from 1 ~ 100) knn"""

import numpy as np
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsClassifier

from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


features, labels = load_dataset('seeds')

# Values of k to consider: all in 1 .. 100
ks = np.arange(1, 101)

# We build a classifier object here with the default number of neighbors
# (It happens to be 5, but it does not matter as we will be changing it below
classifier = KNeighborsClassifier()
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])

# accuracies will hold our results
accuracies = []
for k in ks:
    # set the classifier parameter
    classifier.set_params(knn__n_neighbors=k)
    crossed = cross_val_score(classifier, features, labels)

    # Save only the average
    accuracies.append(crossed.mean())

accuracies = np.array(accuracies)
print accuracies

# Scale the accuracies by 100 to plot as a percentage instead of as a fraction
# plt.plot(ks, accuracies*100)
# plt.xlabel('Value for k (nr. of neighbors)')
# plt.ylabel('Accuracy (%)')
# plt.savefig('figure6.png')
# plt.show()


[ 0.89432367  0.88546699  0.9041868   0.9041868   0.92351047  0.9281401
  0.92330918  0.90881643  0.91847826  0.92793881  0.91827697  0.91847826
  0.91344605  0.91827697  0.91344605  0.91344605  0.91827697  0.92310789
  0.90861514  0.91827697  0.91344605  0.90378422  0.90378422  0.88929147
  0.89412238  0.89412238  0.88929147  0.88446055  0.89412238  0.88929147
  0.89412238  0.8989533   0.90378422  0.90378422  0.90861514  0.89412238
  0.90861514  0.90378422  0.90378422  0.90378422  0.90861514  0.8989533
  0.91344605  0.90378422  0.91344605  0.90861514  0.91827697  0.8989533
  0.91344605  0.90378422  0.91827697  0.90861514  0.91344605  0.90378422
  0.90861514  0.8989533   0.89432367  0.89412238  0.89432367  0.89915459
  0.89915459  0.90881643  0.91364734  0.90881643  0.91364734  0.91364734
  0.90398551  0.91364734  0.90881643  0.89915459  0.91364734  0.90398551
  0.90398551  0.89915459  0.90881643  0.90398551  0.90398551  0.90398551
  0.90398551  0.90881643  0.90398551  0.89432367  0.89432367  0.88949275
  0.88949275  0.89432367  0.88466184  0.88949275  0.89412238  0.88929147
  0.88929147  0.88929147  0.88466184  0.875       0.87983092  0.88929147
  0.87962963  0.87037037  0.87540258  0.88023349]

In [ ]: