In [4]:
"""seeds dataset"""
Out[4]:
In [6]:
import numpy as np
def load_dataset(dataset_name, base_dir='./ch02'):
    '''
    data, labels = load_dataset(dataset_name)

    Load a tab-separated dataset: every column except the last is parsed as
    a float feature, and the last column is kept as the class label.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset; the file read is ``<base_dir>/<dataset_name>.tsv``.
    base_dir : str, optional
        Directory holding the ``.tsv`` file (default ``'./ch02'``, matching
        the book's chapter layout).

    Returns
    -------
    data : numpy ndarray of float, shape (n_samples, n_features)
    labels : numpy ndarray of str, one label per sample
        (the original docstring said "list of str", but the values are
        converted to a numpy array before returning)
    '''
    data = []
    labels = []
    with open('{0}/{1}.tsv'.format(base_dir, dataset_name)) as ifile:
        for line in ifile:
            tokens = line.strip().split('\t')
            # All columns but the last are numeric features:
            data.append([float(tk) for tk in tokens[:-1]])
            # The last column is the class label:
            labels.append(tokens[-1])
    data = np.array(data)
    labels = np.array(labels)
    return data, labels
# Human-readable names for the seven seed measurements, in column order.
feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',  # fixed typo: was 'coefficien'
    'length of kernel groove',
]
features, labels = load_dataset('seeds')
In [7]:
"""Classifying with scikit-learn
* fit(features, labels): this is the learning step and fits the parameters of the model
* predict(features): this method can only be called after fit and returns a prediction for one or more inputs
"""
Out[7]:
In [8]:
"""K nearest neighbour classification
"""
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)
In [11]:
"""Cross validation"""
from sklearn.cross_validation import KFold
kf = KFold(len(features), n_folds=5, shuffle=True)
means = []
for training, testing in kf:
# We learn a model for this fold with `fit` and then apply it to the
# testing data with `predict`:
classifier.fit(features[training], labels[training])
prediction = classifier.predict(features[testing])
# np.mean on an array of booleans returns fraction
# of correct decisions for this fold:
curmean = np.mean(prediction == labels[testing])
means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
In [13]:
"""Normalization"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
classifier = KNeighborsClassifier(n_neighbors=1)
"""
The Pipeline constructor takes a list of pairs (str,clf).
Each pair corresponds to a step in the pipeline: the first element is a string naming the step,
while the second element is the object that performs the transformation.
Advanced usage of the object uses these names to refer to different steps.
"""
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
means = []
for training,testing in kf:
# We learn a model for this fold with `fit` and then apply it to the
# testing data with `predict`:
classifier.fit(features[training], labels[training])
prediction = classifier.predict(features[testing])
# np.mean on an array of booleans returns fraction
# of correct decisions for this fold:
curmean = np.mean(prediction == labels[testing])
means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
In [21]:
"""k(from 1 ~ 100) knn"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
features, labels = load_dataset('seeds')
# Values of k to consider: all in 1 .. 100
ks = np.arange(1, 101)
# We build a classifier object here with the default number of neighbors
# (It happens to be 5, but it does not matter as we will be changing it below
classifier = KNeighborsClassifier()
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
# accuracies will hold our results
accuracies = []
for k in ks:
# set the classifier parameter
classifier.set_params(knn__n_neighbors=k)
crossed = cross_val_score(classifier, features, labels)
# Save only the average
accuracies.append(crossed.mean())
accuracies = np.array(accuracies)
print accuracies
# Scale the accuracies by 100 to plot as a percentage instead of as a fraction
# plt.plot(ks, accuracies*100)
# plt.xlabel('Value for k (nr. of neighbors)')
# plt.ylabel('Accuracy (%)')
# plt.savefig('figure6.png')
# plt.show()
In [ ]: