In [ ]:
"""将ch02_01模型抽成函数"""
In [7]:
# This function was called ``learn_model`` in the first edition
def fit_model(features, labels):
    '''Learn a simple threshold model (a "decision stump").

    Parameters
    ----------
    features : ndarray of shape (n_examples, n_features)
        Numeric feature matrix.
    labels : boolean ndarray of shape (n_examples,)
        Target class for each example.

    Returns
    -------
    (threshold, feature_index, reverse)
        `reverse` means the model predicts with `feature <= threshold`
        instead of `feature > threshold`.
    '''
    # Initialize to a value worse than any achievable accuracy:
    best_acc = -1.0
    # loop over all the features
    for fi in range(features.shape[1]):
        # Every observed value of the feature is a candidate threshold;
        # copy before sorting so we do not reorder the caller's data.
        thresh = features[:, fi].copy()
        thresh.sort()
        for t in thresh:
            pred = (features[:, fi] > t)
            # measure the accuracy of this
            # BUG FIX: compare against `labels`, not `~labels` — the original
            # made `acc` and `rev_acc` identical, so `reverse` was never set
            # and the search maximized the accuracy of the *inverted* labels.
            acc = (pred == labels).mean()
            # Also try the negated test (feature <= threshold):
            rev_acc = (pred == ~labels).mean()
            if rev_acc > acc:
                acc = rev_acc
                reverse = True
            else:
                reverse = False
            if acc > best_acc:
                best_acc = acc
                best_fi = fi
                best_t = t
                best_reverse = reverse
    # a model is a threshold, a feature index, and a direction flag
    return best_t, best_fi, best_reverse
# This function was called ``apply_model`` in the first edition
def predict(model, features):
    '''Apply a learned threshold model to a feature matrix.

    `model` is the (threshold, feature_index, reverse) triple returned
    by `fit_model`; returns a boolean array of per-example predictions.
    '''
    threshold, feature_index, reverse = model
    values = features[:, feature_index]
    # A reversed model tests `<=` instead of `>`:
    return (values <= threshold) if reverse else (values > threshold)
def accuracy(features, labels, model):
    '''Fraction of examples on which `model` predicts `labels` correctly.'''
    return np.mean(labels == predict(model, features))
In [ ]:
"""held out"""
In [8]:
import numpy as np
from sklearn.datasets import load_iris
data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]
# We are going to remove the setosa examples as they are too easy:
is_setosa = (labels == 'setosa')
features = features[~is_setosa]
labels = labels[~is_setosa]
# Now we classify virginica vs non-virginica
is_virginica = (labels == 'virginica')
# Split the data in two: testing and training
# (even positions -> testing, odd positions -> training; 50 examples each)
testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...]
# Training is the negation of testing: i.e., datapoints not used for testing,
# will be used for training
training = ~testing
# Fit on the training split only, using fit_model/accuracy defined above:
model = fit_model(features[training], is_virginica[training])
train_accuracy = accuracy(features[training], is_virginica[training], model)
# Accuracy on held-out data is the honest estimate of generalization:
test_accuracy = accuracy(features[testing], is_virginica[testing], model)
print('''\
Training accuracy was {0:.1%}.
Testing accuracy was {1:.1%} (N = {2}).
'''.format(train_accuracy, test_accuracy, testing.sum()))
In [ ]:
"""holding out data and cross validation"""
In [9]:
# Leave-one-out cross-validation: hold out each example in turn, fit on
# the rest, and count how often the held-out example is classified right.
correct = 0.0
n_examples = len(features)
for held_out in range(n_examples):
    # Train on everything except the example at position `held_out`:
    training = np.ones(n_examples, bool)
    training[held_out] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(n_examples)
print('Accuracy: {0:.1%}'.format(acc))
In [ ]:
"""stump"""
In [10]:
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
from sklearn.datasets import load_iris
data = load_iris()
features = data.data
labels = data.target_names[data.target]

# Setosa is trivially separable, so drop it and keep the harder problem:
is_setosa = (labels == 'setosa')
features = features[~is_setosa]
labels = labels[~is_setosa]
is_virginica = (labels == 'virginica')

# Exhaustive search for the best decision stump.
# Start below any possible accuracy so the first candidate always wins:
best_acc = -1.0
for fi in range(features.shape[1]):
    # Every value the feature takes is a candidate threshold; sort a copy
    # so thresholds are tried in increasing order without touching the data.
    thresh = features[:, fi].copy()
    thresh.sort()
    for t in thresh:
        # Direct test: predict virginica when the feature exceeds t
        pred = (features[:, fi] > t)
        acc = (pred == is_virginica).mean()
        # Negated test: predict virginica when the feature is <= t
        acc_neg = ((~pred) == is_virginica).mean()
        negated = acc_neg > acc
        if negated:
            acc = acc_neg
        # Keep the best (feature, threshold, direction) seen so far:
        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_is_negated = negated
print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format(
    best_t, data.feature_names[best_fi], best_fi, best_acc))
In [ ]:
"""
To think of the problem at a higher abstraction level, "What makes up a classification model?"
We can break it up into three parts:
1. The structure of the model: how exactly will a model make decisions?
2. The search procedure: how do we find the model we need to use?
3. The gain or loss function: how do we decide which of the possibilities tested should be returned?
"""