In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
# NOTE(review): despite the ".db" extension this file is read as a pickle.
# Unpickling can execute arbitrary code, so only load it from a trusted source.
df = pd.read_pickle("df.db")
In [4]:
df.head()
Out[4]:
In [5]:
# Feature matrix: every column from position 3 onward (columns 0-2 are excluded).
# NOTE(review): confirm the target column '流行' is not among these columns,
# otherwise the label would leak into the features.
X = df.iloc[:, 3:]
In [6]:
X.head()
Out[6]:
In [7]:
y = df['流行']
In [8]:
y.head()
Out[8]:
In [9]:
from sklearn.model_selection import train_test_split
In [10]:
# 80/20 hold-out split. The seed is fixed so the split, and every score derived
# from it further down, is identical on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)
In [11]:
from sklearn.linear_model import LogisticRegression
In [12]:
clf = LogisticRegression()
In [13]:
clf.fit(X_train, y_train)
Out[13]:
In [14]:
y_train_pred = clf.predict(X_train)
In [15]:
from sklearn.metrics import accuracy_score
In [16]:
accuracy_score(y_train, y_train_pred)
Out[16]:
In [17]:
y_val_pred = clf.predict(X_val)
In [18]:
accuracy_score(y_val, y_val_pred)
Out[18]:
In [19]:
from sklearn.metrics import confusion_matrix
In [20]:
cm = confusion_matrix(y_val, y_val_pred)
In [21]:
cm
Out[21]:
In [22]:
cm_t = confusion_matrix(y_train, y_train_pred)
cm_t
Out[22]:
F1 = 2 / (1/適合率 + 1/再現率) = 2 * 適合率 * 再現率 / (適合率 + 再現率)
(2 * 0.93 * 0.94) / (0.93 + 0.94) ≈ 0.93
(2 * 0.61 * 0.55) / (0.61 + 0.55) ≈ 0.58
In [24]:
from sklearn.metrics import classification_report
In [25]:
print(classification_report(y_val, y_val_pred))
In [26]:
def report(y, pred):
    """Print accuracy, the confusion matrix and the classification report.

    Args:
        y: Ground-truth labels.
        pred: Predicted labels to compare against ``y``.

    Returns:
        None. The three summaries are written to stdout, in that order.
    """
    # Overall accuracy first, then the per-class breakdowns.
    print(accuracy_score(y, pred))
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred))
In [27]:
report(y_train, y_train_pred)
In [28]:
def fit_to_pred(clf, X_train, X_val, y_train, y_val):
    """Fit ``clf`` on the training split and print a report for both splits.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train: Features used for fitting.
        X_val: Held-out features.
        y_train: Labels matching ``X_train``.
        y_val: Labels matching ``X_val``.

    Returns:
        The same ``clf`` object, after fitting.
    """
    # Train on the training split only.
    clf.fit(X_train, y_train)

    # Evaluate on the training data first, then on the held-out data.
    splits = (
        ("y_train_pred: ", X_train, y_train),
        ("y_val_pred: ", X_val, y_val),
    )
    for banner, features, targets in splits:
        predicted = clf.predict(features)
        print(banner)
        report(targets, predicted)

    # Hand the fitted estimator back to the caller.
    return clf
In [29]:
clf = LogisticRegression()
fit_to_pred(clf, X_train, X_val, y_train, y_val)
Out[29]:
In [30]:
from sklearn.svm import SVC
In [31]:
svc = SVC(kernel="linear")
fit_to_pred(svc, X_train, X_val, y_train, y_val)
Out[31]:
In [32]:
k_svc = SVC(kernel="rbf")
fit_to_pred(k_svc, X_train, X_val, y_train, y_val)
Out[32]:
In [33]:
from sklearn.tree import DecisionTreeClassifier
In [34]:
tree = DecisionTreeClassifier(max_depth=2)
fit_to_pred(tree, X_train, X_val, y_train, y_val)
Out[34]:
In [35]:
from sklearn.ensemble import RandomForestClassifier
In [36]:
rf = RandomForestClassifier()
fit_to_pred(rf, X_train, X_val, y_train, y_val)
Out[36]:
In [37]:
from sklearn.neighbors import KNeighborsClassifier
In [38]:
knn = KNeighborsClassifier()
fit_to_pred(knn, X_train, X_val, y_train, y_val)
Out[38]:
In [39]:
from sklearn.model_selection import cross_val_score
In [40]:
from sklearn.model_selection import KFold
In [41]:
# 5-fold shuffled CV. With an integer seed every cross_val_score / GridSearchCV
# call below sees the same folds, so the models are compared on identical splits
# and the scores are reproducible across re-runs.
cv = KFold(5, shuffle=True, random_state=0)
In [42]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv)
Out[42]:
In [43]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv)
Out[43]:
In [44]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv)
Out[44]:
In [45]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv, scoring="f1")
Out[45]:
In [46]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv, scoring="f1")
Out[46]:
In [47]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv, scoring="f1")
Out[47]:
In [48]:
from sklearn.model_selection import GridSearchCV
In [49]:
# Candidate hyper-parameters for the random-forest grid search
# (8 depths x 8 forest sizes = 64 combinations).
param_grid = {
    'max_depth': [2, 3, 4, 5, 10, 15, 20, 30],
    'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40],
}
In [50]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)
In [51]:
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1)
In [52]:
grid_search.fit(X, y)
Out[52]:
In [53]:
grid_search.best_score_
Out[53]:
In [54]:
grid_search.best_params_
Out[54]:
In [55]:
# Grid search optimising F1 instead of accuracy.
# max_depth / n_estimators given here are placeholders; GridSearchCV overrides
# them with each combination from param_grid. The forest is seeded so that the
# ranking of candidates (and therefore best_params_) does not change between runs.
rf = RandomForestClassifier(max_depth=2, n_estimators=2, random_state=0)
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X, y)
Out[55]:
In [56]:
print(grid_search.best_score_)
print(grid_search.best_params_)
In [57]:
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
cross_val_score(rf, X, y, cv=cv, scoring="f1")
Out[57]:
In [58]:
# Final model, fitted on the training split and reported on both splits.
# Seeded so that the estimator trained here (and saved later) is reproducible.
# NOTE(review): max_depth=3 / n_estimators=40 are hard-coded, presumably copied
# from grid_search.best_params_ -- confirm they still match the search result.
rf = RandomForestClassifier(max_depth=3, n_estimators=40, random_state=0)
fit_to_pred(rf, X_train, X_val, y_train, y_val)
Out[58]:
In [59]:
rf.predict(X_val)
Out[59]:
In [60]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is now imported as a standalone package. Fall back to the bundled
# copy only for very old scikit-learn installs without standalone joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
In [61]:
joblib.dump(rf, "clf_rf.db")
Out[61]:
In [ ]: