# Number Recognizer

その予測を行うためのモデルを、以下のステップに沿って作成していきます。

``````

In [15]:

# Render matplotlib figures inline in the notebook output
# (required in classic Jupyter; newer versions do this by default).
%matplotlib inline

``````

``````

In [5]:

from sklearn import datasets

# Load the handwritten-digits dataset: 1,797 8x8 grayscale images,
# flattened into 64-feature vectors, with target labels 0-9.
# (The original cell had a stray top-level `return dataset`, which is a
# SyntaxError and never defined `digits`, used by every later cell.)
digits = datasets.load_digits()

# Sanity check: (n_samples, n_features) should be (1797, 64).
print(digits.data.shape)

``````
``````

(1797, 64)

``````
``````

In [16]:

def show_image(image):
    """Render a single digit image as a small grayscale figure."""
    from matplotlib import pyplot as plt

    plt.figure(1, figsize=(3, 3))
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.show()

# Preview the first digit in the dataset.
show_image(digits.images[0])

``````
``````

``````

## Create the Model

``````

In [7]:

def make_model():
    """Build a fresh, untrained SGD-based linear classifier."""
    from sklearn.linear_model import SGDClassifier

    return SGDClassifier(alpha=0.0001, fit_intercept=True)

classifier = make_model()

``````

## Training the Model

``````

In [9]:

def split_dataset(dataset, test_size=0.3):
    """Split a dataset into train/test subsets.

    Returns a pair of (data, target) named tuples:
    (training_set, test_set). Uses random_state=0 so the split
    is reproducible.
    """
    from collections import namedtuple
    # NOTE: sklearn.cross_validation is a deprecated module; kept to match
    # the sklearn version this notebook was run with.
    from sklearn import cross_validation

    DataSet = namedtuple("DataSet", ["data", "target"])
    parts = cross_validation.train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=0)
    train_d, test_d, train_t, test_t = parts

    return DataSet(train_d, train_t), DataSet(test_d, test_t)

# use 30% of data to test the model
training_set, test_set = split_dataset(digits, 0.3)
print("dataset is splited to train/test = {0} -> {1}, {2}".format(
    len(digits.data), len(training_set.data), len(test_set.data))
)

``````
``````

dataset is splited to train/test = 1797 -> 1257, 540

``````
``````

In [10]:

# Train the SGD classifier on the 70% training split.
# NOTE(review): make_model() sets no random_state, so exact fitted
# weights may vary between runs.
classifier.fit(training_set.data, training_set.target)

``````
``````

Out[10]:

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
eta0=0.0, fit_intercept=True, l1_ratio=0.15,
learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
penalty='l2', power_t=0.5, random_state=None, shuffle=True,
verbose=0, warm_start=False)

``````

``````

In [11]:

import os
from sklearn.externals import joblib

``````

## Evaluate the Model

``````

In [12]:

def calculate_accuracy(model, dataset):
    """Return the model's accuracy score on a (data, target) set."""
    from sklearn import metrics

    predictions = model.predict(dataset.data)
    return metrics.accuracy_score(dataset.target, predictions)

# Compare accuracy on the training split vs. the held-out test split.
print(calculate_accuracy(classifier, training_set))
print(calculate_accuracy(classifier, test_set))

``````
``````

0.964996022275
0.961111111111

``````
``````

In [13]:

def show_confusion_matrix(model, dataset):
    """Print per-class precision/recall/f1 for the model on a dataset.

    NOTE(review): despite the name, this prints a classification
    report, not a confusion matrix.
    """
    from sklearn.metrics import classification_report

    labels = ["#{0}".format(i) for i in range(0, 10)]
    predictions = model.predict(dataset.data)
    print(classification_report(dataset.target, predictions, target_names=labels))

show_confusion_matrix(classifier, digits)

``````
``````

precision    recall  f1-score   support

#0       0.99      0.99      0.99       178
#1       0.85      0.99      0.92       182
#2       0.99      0.98      0.99       177
#3       0.99      0.95      0.97       183
#4       0.98      0.98      0.98       181
#5       0.95      0.99      0.97       182
#6       1.00      0.94      0.97       181
#7       0.99      0.99      0.99       179
#8       0.94      0.91      0.93       174
#9       0.98      0.90      0.94       180

avg / total       0.97      0.96      0.96      1797

``````
``````

In [17]:

def plot_learning_curve(model_func, dataset):
    """Plot 5-fold cross-validated train/test scores vs. training-set fraction."""
    import matplotlib.pyplot as plt
    import numpy as np
    # NOTE: sklearn.learning_curve is a deprecated module; kept to match
    # the sklearn version this notebook was run with.
    from sklearn.learning_curve import learning_curve

    fractions = [i / 10 for i in range(1, 11)]
    _, train_scores, valid_scores = learning_curve(
        model_func(), dataset.data, dataset.target,
        train_sizes=fractions, cv=5)

    # Average the cv-fold scores for each training-set size.
    plt.plot(fractions, train_scores.mean(axis=1), label="training")
    plt.plot(fractions, valid_scores.mean(axis=1), label="test")
    plt.ylim(0, 1.1)
    plt.title("learning curve")
    plt.legend(loc="lower right")
    plt.show()

plot_learning_curve(make_model, digits)

``````
``````

``````

## Tuning the Model

``````

In [18]:

def tuning_model(model_func, dataset):
    """Grid-search loss/alpha for the model; return the best estimator found.

    Prints every candidate's mean cv score (best first) before returning.
    """
    # NOTE: sklearn.grid_search is a deprecated module; kept to match
    # the sklearn version this notebook was run with.
    from sklearn.grid_search import GridSearchCV

    param_grid = [
        {"loss": ["hinge", "log"],
         "alpha": [1e-5, 1e-4, 1e-3]
        }]

    searcher = GridSearchCV(model_func(), param_grid, cv=5, scoring="f1_weighted")
    searcher.fit(dataset.data, dataset.target)

    ranked = sorted(searcher.grid_scores_, key=lambda s: s[1], reverse=True)
    for params, mean_score, scores in ranked:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

    return searcher.best_estimator_

tuned_classifier = tuning_model(make_model, digits)

``````
``````

0.902 (+/-0.011) for {'loss': 'log', 'alpha': 0.0001}
0.902 (+/-0.014) for {'loss': 'log', 'alpha': 0.001}
0.897 (+/-0.016) for {'loss': 'hinge', 'alpha': 0.001}
0.895 (+/-0.018) for {'loss': 'hinge', 'alpha': 0.0001}
0.883 (+/-0.029) for {'loss': 'hinge', 'alpha': 1e-05}
0.882 (+/-0.023) for {'loss': 'log', 'alpha': 1e-05}

``````

## Store the Model

``````

In [73]:

from sklearn.externals import joblib

# Persist the tuned classifier to disk; joblib also writes companion
# .npy files (see the Out[73] listing) for the estimator's numpy arrays.
joblib.dump(tuned_classifier, "./machine.pkl")

``````
``````

Out[73]:

['./machine.pkl',
'./machine.pkl_01.npy',
'./machine.pkl_02.npy',
'./machine.pkl_03.npy',
'./machine.pkl_04.npy']

``````
``````

In [ ]:

``````