In [ ]:
"""将ch02_01模型抽成函数"""

In [7]:
# This function was called ``learn_model`` in the first edition
def fit_model(features, labels):
    '''Learn a simple threshold model'''
    best_acc = -1.0
    # loop over all the features
    for fi in range(features.shape[1]):
        thresh = features[:, fi].copy()
        thresh.sort()
        for t in  thresh:
            pred = (features[:, fi] > t)
            # measure the accuracy of this
            acc = (pred == ~labels).mean()
            
            rev_acc = (pred == ~labels).mean()
            if rev_acc > acc:
                acc = rev_acc
                reverse = True
            else:
                reverse = False
            if acc > best_acc:
                best_acc = acc
                best_fi = fi
                best_t = t
                best_reverse = reverse
    # a model is a threshold and an index
    return best_t, best_fi, best_reverse

# This function was called ``apply_model`` in the first edition
def predict(model, features):
    '''Apply a learned model'''
    # a model is a pair as returned by fit_model
    t, fi, reverse = model
    if reverse:
        return features[:, fi] <= t
    else:
        return features[:, fi] > t


def accuracy(features, labels, model):
    '''Compute the accuracy of the model'''
    preds = predict(model, features)
    return np.mean(preds == labels)

In [ ]:
"""held out"""

In [8]:
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]

# We are going to remove the setosa examples as they are too easy:
is_setosa = (labels == 'setosa')
features = features[~is_setosa]
labels = labels[~is_setosa]

# Now we classify virginica vs non-virginica
is_virginica = (labels == 'virginica')

# Split the data in two: testing and training
testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...]

# Training is the negation of testing: i.e., datapoints not used for testing,
# will be used for training
training = ~testing

model = fit_model(features[training], is_virginica[training])
train_accuracy = accuracy(features[training], is_virginica[training], model)
test_accuracy = accuracy(features[testing], is_virginica[testing], model)

print('''\
Training accuracy was {0:.1%}.
Testing accuracy was {1:.1%} (N = {2}).
'''.format(train_accuracy, test_accuracy, testing.sum()))


Training accuracy was 48.0%.
Testing accuracy was 56.0% (N = 50).


In [ ]:
"""holding out data and cross validation"""

In [9]:
correct = 0.0

for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct/float(len(features))
print('Accuracy: {0:.1%}'.format(acc))


Accuracy: 99.0%

In [ ]:
"""stump"""

In [10]:
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from sklearn.datasets import load_iris
data = load_iris()
features = data.data
labels = data.target_names[data.target]


is_setosa = (labels == 'setosa')
features = features[~is_setosa]
labels = labels[~is_setosa]
is_virginica = (labels == 'virginica')


# Initialize to a value that is worse than any possible test
best_acc = -1.0

# Loop over all the features
for fi in range(features.shape[1]):
    # Test every possible threshold value for feature fi
    thresh = features[:, fi].copy()

    # Test them in order
    thresh.sort()
    for t in thresh:

        # Generate predictions using t as a threshold
        pred = (features[:, fi] > t)

        # Accuracy is the fraction of predictions that match reality
        acc = (pred == is_virginica).mean()

        # We test whether negating the test is a better threshold:
        acc_neg = ((~pred) == is_virginica).mean()
        if acc_neg > acc:
            acc = acc_neg
            negated = True
        else:
            negated = False

        # If this is better than previous best, then this is now the new best:

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_is_negated = negated

print('Best threshold is {0} on feature {1} (index {2}), which achieves accuracy of {3:.1%}.'.format(
    best_t, data.feature_names[best_fi], best_fi, best_acc))


Best threshold is 1.6 on feature petal width (cm) (index 3), which achieves accuracy of 94.0%.

In [ ]:
"""
To think of the problem at a higher abstraction level, "What makes up a classification model?" 
We can break it up into three parts:

1. The structure of the model: how excactly will a model make decisions?
2. The search procedure: how do we find the model we need to use?
3. The gain or loss function: how do we decide which of the possibilities tested should be returned?

"""