In [1]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports edited modules before each cell runs
%load_ext autoreload
%autoreload 2
# Make the local project package importable
import sys
import os
current_path = os.getcwd()
sys.path.append(os.path.join(current_path, "../../")) # project root, two levels above the working directory
image_url、moodを左端に設定したファイルから学習データを読み込みます。なお、今回の値はRekognitionのスコアであり、全項目が同じ範囲の値を取るため、正規化は行いません。
In [2]:
def read_data(path, ignore_columns):
    """Load a tab-separated training file with a one-line header.

    The first line is decoded as UTF-8 and split on tabs to obtain the column
    names. The remaining lines are parsed numerically with ``np.genfromtxt``
    (rows with the wrong number of columns are skipped, not fatal).

    Args:
        path: Path of the file to read.
        ignore_columns: Iterable of zero-based column indices to leave out.

    Returns:
        A ``(header, y, X)`` tuple where ``header`` holds the column names
        except column 0 and the ignored columns, ``y`` is the first loaded
        column and ``X`` is a 2-D array of the remaining loaded columns.
    """
    import numpy as np

    # A set accepts any iterable (list, tuple, ...) and gives O(1) membership tests.
    ignored = set(ignore_columns)

    # Read from the `path` argument, not from a notebook-level global.
    with open(path, "rb") as f:
        raw_header = f.readline().decode("utf-8").replace("\r", "").replace("\n", "").split("\t")
        columns = [c for c in range(len(raw_header)) if c not in ignored]
        data = np.genfromtxt(f, invalid_raise=False, usecols=columns)

    # genfromtxt returns a 1-D array for a single data row; keep the result 2-D
    # so the column slicing below works for any row count >= 1.
    data = np.atleast_2d(data)

    # Column 0 becomes y, so its name is dropped from the feature header.
    header = [h for i, h in enumerate(raw_header) if i != 0 and i not in ignored]
    y = data[:, 0]
    X = data[:, 1:]
    return header, y, X
# Load the training file from <project root>/data. read_data returns
# (header, y, X), so `moods` is the target vector and `labels` the feature matrix.
data_file = os.path.join(current_path, "../../data/photo_to_mood.txt")
header, moods, labels = read_data(data_file, [1])
print(header, moods.shape, labels.shape, sep="\n")
今回扱うのは画像の分類問題になります。そこで、分類問題でよく使われるSupport Vector Machineを利用します。
特徴量の数が多いため、有効なものに限って使用します。
In [3]:
def select_features(feature_count, X, y, header):
    """Pick the ``feature_count`` best-scoring features using ``f_classif``.

    Args:
        feature_count: Number of features to keep (``k`` of ``SelectKBest``).
        X: Feature matrix; ``header[i]`` is taken as the name of column ``i``.
        y: Target values.
        header: Names of the feature columns.

    Returns:
        A list of ``(name, score)`` pairs for the kept features, ordered from
        the highest score to the lowest.
    """
    from sklearn.feature_selection import SelectKBest, f_classif

    selector = SelectKBest(f_classif, k=feature_count)
    selector.fit(X, y)

    # Boolean mask over the columns of X: True where the feature was kept.
    mask = selector.get_support()
    kept_names = [name for idx, name in enumerate(header) if mask[idx]]

    ranked = list(zip(kept_names, selector.scores_[mask]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
# Keep the 10 best-scoring features, then map each kept name back to its
# column index in the feature matrix (ordered by descending score).
scores = select_features(10, labels, moods, header)
selected_features = [header.index(name) for name, _score in scores]
print(scores, selected_features, sep="\n")
データとモデルがそろったため、学習させてみます。
パラメーターはGrid Searchで探索します。
In [6]:
def create_model(X, y, selected=()):
    """Grid-search a linear SVM and return the best estimator.

    The data is split 80/20 into train and test parts (``random_state=42``).
    A 2-fold grid search over ``C`` in {1, 10, 100}, scored with weighted F1,
    runs on the training part. Each candidate's mean score and half its
    standard deviation are printed, followed by a classification report on
    the held-out test part.

    Args:
        X: Feature matrix.
        y: Target values.
        selected: Column indices of ``X`` to train on; empty means all columns.

    Returns:
        The best estimator found by the grid search, refitted by GridSearchCV.
    """
    try:
        from sklearn.model_selection import GridSearchCV, train_test_split
    except ImportError:
        # Legacy scikit-learn releases only ship the old module layout.
        from sklearn.cross_validation import train_test_split
        from sklearn.grid_search import GridSearchCV
    from sklearn.metrics import classification_report
    from sklearn import svm

    X_c = X if len(selected) == 0 else X[:, selected]
    x_train, x_test, y_train, y_test = train_test_split(X_c, y, test_size=0.2, random_state=42)

    candidates = [{'kernel': ['linear'], 'C': [1, 10, 100]}]
    clf = GridSearchCV(svm.SVC(C=1), candidates, cv=2, scoring="f1_weighted")
    clf.fit(x_train, y_train)

    # Normalise both result formats to (params, mean score, std of fold scores).
    if hasattr(clf, "cv_results_"):
        results = clf.cv_results_
        summary = zip(results["params"], results["mean_test_score"], results["std_test_score"])
    else:
        # Legacy GridSearchCV exposes per-fold scores through grid_scores_.
        summary = ((params, mean_score, fold_scores.std())
                   for params, mean_score, fold_scores in clf.grid_scores_)

    for params, mean_score, std_score in sorted(summary, key=lambda s: s[1], reverse=True):
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score / 2, params))

    model = clf.best_estimator_
    y_predict = model.predict(x_test)
    print(classification_report(y_test, y_predict))
    return model
# NOTE: despite the names, `labels` is the feature matrix (X) and `moods` the
# target vector (y); training uses only the columns listed in selected_features.
model = create_model(labels, moods, selected_features)
print(model)
最後に、学習させたモデルを保存します。アプリケーション側で、その結果を確認してみてください。
In [7]:
def save(model, path="./machine.pkl"):
    """Persist the trained model with joblib and print the features it uses.

    Args:
        model: The fitted estimator to serialise.
        path: Destination file; defaults to ``./machine.pkl``.
    """
    try:
        import joblib
    except ImportError:
        # Legacy scikit-learn releases bundle joblib under sklearn.externals.
        from sklearn.externals import joblib
    joblib.dump(model, path)
    # `header` and `selected_features` are notebook-level globals; the names are
    # printed in column order so the consuming application can build its input.
    print([header[s] for s in sorted(selected_features)])


save(model)
In [ ]: