In [1]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# FIX: sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; their contents now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image

# Silence pandas' chained-assignment warning (see the tag-consolidation cell below)
pd.set_option('chained_assignment', None)
plt.style.use('ggplot')
plt.rc('xtick.major', size=0)  # hide major tick marks on both axes
plt.rc('ytick.major', size=0)
In [3]:
# Load the training data
# (presumably one row per user with per-tag counts and a gender label —
#  columns used below include tag names and 'gender_male')
user_tags = pd.read_csv("user_tags_merge.csv")
In [4]:
# Display the loaded data (full frame; use .head() if the table is large)
user_tags
Out[4]:
In [5]:
X = user_tags[['person', 'food']] # extract only the 'person' and 'food' tags as features X
X.tail()
Out[5]:
In [6]:
y = user_tags['gender_male'] # extract gender as the target y (1 = male, 0 = female; see scatter cell)
In [7]:
# Scatter plot of the 'person' and 'food' tags, coloured by gender.
# BUG FIX: the original `np.random.seed = 0` rebinds the `seed` attribute to
# the int 0 (destroying np.random.seed for the whole session) instead of
# seeding the RNG; it must be called: np.random.seed(0).
np.random.seed(0)
xmin, xmax = -2, 12
ymin, ymax = -2, 17
index_male = y[y == 1].index    # male samples
index_female = y[y == 0].index  # female samples
fig, ax = plt.subplots()
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Add a small vertical jitter so overlapping integer tag counts stay visible
sc = ax.scatter(X.loc[index_male, 'food'],
                X.loc[index_male, 'person'] + (np.random.rand(len(index_male)) - 0.5) * 0.1,
                color='b', label='male', alpha=0.3)
sc = ax.scatter(X.loc[index_female, 'food'],
                X.loc[index_female, 'person'] + (np.random.rand(len(index_female)) - 0.5) * 0.1,
                color='r', label='female', alpha=0.3)
ax.set_xlabel('food')    # x-axis label
ax.set_ylabel('person')  # y-axis label
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.legend(bbox_to_anchor=(1.4, 1.03))
plt.show()
In [8]:
X = user_tags[['nail', 'person', 'sport', 'food','coffee','cake','beer','sky']] # update features X (add more tags)
y = user_tags["gender_male"]
In [10]:
# Cross-validation helper.
def cross_val(clf, X, y, K, random_state=0):
    """Return the per-fold scores of `clf` under shuffled K-fold cross-validation.

    clf          : estimator (classifier) to evaluate
    X            : feature matrix
    y            : target vector
    K            : number of folds
    random_state : seed controlling the fold shuffling (default 0, for reproducibility)
    """
    # FIX: modern sklearn.model_selection.KFold takes n_splits as its first
    # argument; the removed sklearn.cross_validation.KFold(n, K, ...) form
    # (sample count first) no longer exists.
    cv = KFold(n_splits=K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    return scores
In [12]:
# Evaluate logistic regression while sweeping the number of CV folds
clf = LogisticRegression()  # use logistic regression as the classifier
for n_folds in range(2, 12):  # run cross-validation with K = 2 .. 11
    fold_scores = cross_val(clf, X, y, n_folds)
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print(n_folds)
    print('Scores:', fold_scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(mean_score, spread))
In [13]:
# Same sweep with a shallow decision tree as the classifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=2)
for k in range(2, 12):
    cv_scores = cross_val(clf, X, y, k)
    print(k)
    print('Scores:', cv_scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(cv_scores.mean(), cv_scores.std() * 2))
In [14]:
X = user_tags[['nail','hair', 'person', 'sport', 'food','night','coffee','wedding','cake','beer', 'dog', 'animal', 'tree','blossom','cat', 'flower','sky','nature','cherry']] # update features X (add more tags)
y = user_tags["gender_male"]
In [15]:
# Consolidate related tags into broader categories.
# FIX: X is a column slice of user_tags, so the assignments below were chained
# assignment into a possible view (the pandas warning for this is globally
# silenced in the setup cell). Take an explicit copy first so every write is
# an unambiguous, warning-free operation on an independent frame; downstream
# values are unchanged.
X = X.copy()
X['animal'] = X['animal'] + X['dog'] + X['cat']   # merge pet tags into 'animal'
X['cosme'] = X['hair'] + X['nail']                # grooming tags -> new 'cosme' column
X['nature'] = X['nature'] + X['sky'] + X['flower'] + X['tree'] + X['blossom'] + X['cherry']
# Drop the source columns that were merged above
X = X.drop(['nail', 'hair', 'dog', 'cat', 'sky', 'flower', 'tree', 'blossom', 'cherry'], axis=1)
In [16]:
# Re-evaluate logistic regression on the consolidated tag features
clf = LogisticRegression()
n_folds_range = range(2, 12)
for num_splits in n_folds_range:
    result = cross_val(clf, X, y, num_splits)
    print(num_splits)
    print('Scores:', result)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(result.mean(), result.std() * 2))
In [17]:
# Export the classifier to a file.
# FIX: sklearn.externals.joblib was removed in scikit-learn 0.23; import
# joblib directly (it is a hard runtime dependency of scikit-learn, so no
# new package is required).
import joblib
clf = LogisticRegression()
clf.fit(X, y)                # fit the classifier on all available data
joblib.dump(clf, 'clf.pkl')  # serialize the fitted model to clf.pkl
Out[17]:
In [ ]: