In [15]:
# enable showing matplotlib image inline
%matplotlib inline
In [5]:
def load_data():
    """Fetch the scikit-learn handwritten digits dataset (8x8 grayscale images)."""
    from sklearn import datasets
    return datasets.load_digits()

digits = load_data()
print(digits.data.shape)
In [16]:
def show_image(image):
    """Render one digit image in a small (3x3 inch) figure.

    Uses the reversed gray colormap so high pixel values draw dark,
    like ink on paper.
    """
    import matplotlib.pyplot as plt
    plt.figure(1, figsize=(3, 3))
    plt.imshow(image, interpolation='nearest', cmap=plt.cm.gray_r)
    plt.show()

show_image(digits.images[0])
In [7]:
def make_model():
    """Build a fresh, unfitted SGD linear classifier.

    alpha=0.0001 is the regularization strength; fit_intercept=True
    lets the model learn a bias term.
    """
    from sklearn.linear_model import SGDClassifier
    return SGDClassifier(alpha=0.0001, fit_intercept=True)

classifier = make_model()
In [9]:
def split_dataset(dataset, test_size=0.3):
    """Split a dataset into train/test partitions.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes (e.g. sklearn Bunch).
    test_size : float, fraction of samples held out for testing (default 0.3).

    Returns
    -------
    (train, test) : two ``DataSet`` namedtuples, each with ``data``/``target``.
    """
    from collections import namedtuple
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split now lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    DataSet = namedtuple("DataSet", ["data", "target"])
    # random_state=0 keeps the split reproducible across re-runs
    train_d, test_d, train_t, test_t = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=0)
    return DataSet(train_d, train_t), DataSet(test_d, test_t)

# use 30% of data to test the model
training_set, test_set = split_dataset(digits, 0.3)
print("dataset is split to train/test = {0} -> {1}, {2}".format(
    len(digits.data), len(training_set.data), len(test_set.data))
)
In [10]:
# Train the SGD classifier on the 70% training split.
# NOTE(review): the next cell replaces `classifier` with a model loaded
# from disk, discarding this fit — confirm which model later cells intend to use.
classifier.fit(training_set.data, training_set.target)
Out[10]:
Load the previously trained model from disk
In [11]:
# load model
def load_model(path="./machine.pkl"):
    """Load a previously persisted classifier from disk.

    Parameters
    ----------
    path : str, location of the joblib pickle (default "./machine.pkl",
        written by the dump cell at the end of this notebook).

    Returns
    -------
    The deserialized estimator.
    """
    # sklearn.externals.joblib was removed in scikit-learn 0.23;
    # prefer the standalone joblib package, fall back for old installs.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    # NOTE(review): joblib.load unpickles arbitrary code — only load trusted files.
    return joblib.load(path)

classifier = load_model()
In [12]:
def calculate_accuracy(model, dataset):
    """Return the accuracy of ``model`` on ``dataset`` (fields: data, target)."""
    from sklearn import metrics
    predictions = model.predict(dataset.data)
    return metrics.accuracy_score(dataset.target, predictions)

print(calculate_accuracy(classifier, training_set))
print(calculate_accuracy(classifier, test_set))
In [13]:
def show_confusion_matrix(model, dataset):
    """Print a per-class precision/recall/F1 report for ``model`` on ``dataset``.

    NOTE: despite its name this prints sklearn's classification_report, not a
    confusion matrix; the public name is kept so existing callers still work.
    """
    from sklearn.metrics import classification_report
    predicted = model.predict(dataset.data)
    # derive the label list from the data instead of hard-coding 10 classes,
    # so the helper works for any labelled classification dataset
    labels = sorted(set(dataset.target))
    target_names = ["#{0}".format(i) for i in labels]
    print(classification_report(dataset.target, predicted, target_names=target_names))

show_confusion_matrix(classifier, digits)
In [17]:
def plot_learning_curve(model_func, dataset):
    """Plot mean train/validation score versus training-set fraction.

    Parameters
    ----------
    model_func : zero-argument callable returning a fresh, unfitted estimator.
    dataset : object with ``data`` and ``target`` attributes.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    # sklearn.learning_curve was removed in scikit-learn 0.20;
    # the function now lives in sklearn.model_selection.
    from sklearn.model_selection import learning_curve
    # train on 10%, 20%, ..., 100% of the data, 5-fold cross-validated
    sizes = [i / 10 for i in range(1, 11)]
    train_sizes, train_scores, valid_scores = learning_curve(
        model_func(), dataset.data, dataset.target, train_sizes=sizes, cv=5)
    take_means = lambda s: np.mean(s, axis=1)  # average over the 5 CV folds
    plt.plot(sizes, take_means(train_scores), label="training")
    plt.plot(sizes, take_means(valid_scores), label="test")
    plt.ylim(0, 1.1)
    plt.title("learning curve")
    plt.legend(loc="lower right")
    plt.show()

plot_learning_curve(make_model, digits)
In [18]:
def tuning_model(model_func, dataset):
    """Grid-search SGD hyperparameters and return the best fitted estimator.

    Searches loss function and regularization strength with 5-fold CV,
    printing each candidate's mean weighted-F1 score, best first.
    """
    # sklearn.grid_search was removed in scikit-learn 0.20;
    # GridSearchCV now lives in sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
    candidates = [
        {"loss": ["hinge", "log"],  # NOTE(review): "log" was renamed "log_loss" in sklearn >= 1.1
         "alpha": [1e-5, 1e-4, 1e-3]
         }]
    searcher = GridSearchCV(model_func(), candidates, cv=5, scoring="f1_weighted")
    searcher.fit(dataset.data, dataset.target)
    # grid_scores_ was removed in 0.20; cv_results_ exposes the same information
    results = searcher.cv_results_
    ranked = sorted(
        zip(results["params"], results["mean_test_score"], results["std_test_score"]),
        key=lambda r: r[1], reverse=True)
    for params, mean_score, std_score in ranked:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score / 2, params))
    return searcher.best_estimator_

tuned_classifier = tuning_model(make_model, digits)
In [73]:
# Persist the tuned model so the load_model() cell above can restore it.
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package, falling back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(tuned_classifier, "./machine.pkl")
Out[73]:
In [ ]: