In [1]:
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; use model_selection
lfw_people = fetch_lfw_people(min_faces_per_person=50, resize=0.5)
X = lfw_people.data
y = lfw_people.target
print(lfw_people.images[0].shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
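Before modeling, it helps to confirm what the loader actually returned. A minimal inspection sketch reusing the variables defined above; `np.bincount` is standard NumPy:

import numpy as np

# Each image is 62x47 pixels at resize=0.5, flattened to 2914 features per row of X.
print(X.shape)
print(lfw_people.target_names)   # the people with at least 50 images
print(np.bincount(y))            # images per person; the classes are imbalanced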
In [2]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
In [3]:
model = Pipeline([('pca', PCA(n_components=100)), ('svc', SVC(kernel='linear', C=1000))])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=lfw_people.target_names))
print(confusion_matrix(y_test, y_pred))
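The pipeline fixes n_components=100 without justification. A quick check of how much pixel variance 100 components actually retain, using standard scikit-learn attributes on the training split above:

from sklearn.decomposition import PCA

pca = PCA(n_components=100).fit(X_train)
# Total fraction of variance captured by the first 100 principal components.
print(pca.explained_variance_ratio_.sum())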
In [4]:
model = Pipeline([('pca', PCA(n_components=100)),  # with PCA it is hard to push performance much higher
                  ('svc', SVC(kernel='rbf', C=1000, gamma=0.00000009))])  # a C this large overfits badly, so shrink it by factors of 10
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=lfw_people.target_names))
print(confusion_matrix(y_test, y_pred))
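Rather than shrinking C by hand in factors of ten, the same search can be automated. A minimal sketch with GridSearchCV over the pipeline above; the grid values are illustrative, not tuned:

from sklearn.model_selection import GridSearchCV

# Bracket the hand-picked C=1000 and gamma=9e-8 in factors of ten.
param_grid = {'svc__C': [1, 10, 100, 1000],
              'svc__gamma': [1e-8, 1e-7, 1e-6, 1e-5]}
search = GridSearchCV(model, param_grid, cv=3)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)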
As an aside: a demonstration that performance can actually improve when PCA is removed.
In [5]:
model = SVC(kernel='rbf', C=1000, gamma=0.00000009)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=lfw_people.target_names))
print(confusion_matrix(y_test, y_pred))
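A single train/test split is weak evidence for the "better without PCA" claim; cross-validation compares the two setups more fairly. A sketch reusing the imports already in scope above (SVC, PCA, Pipeline):

from sklearn.model_selection import cross_val_score

svc = SVC(kernel='rbf', C=1000, gamma=0.00000009)
with_pca = Pipeline([('pca', PCA(n_components=100)), ('svc', svc)])

print(cross_val_score(with_pca, X, y, cv=5).mean())  # pipeline with PCA
print(cross_val_score(svc, X, y, cv=5).mean())       # raw pixels, no PCA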
In [6]:
from sklearn.ensemble import RandomForestClassifier
In [7]:
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=lfw_people.target_names))
print(confusion_matrix(y_test, y_pred))
# Even 0.63 is considered usable random-forest performance here, as long as accuracy stays above the 0.5 mark.
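The 0.5 reference point only makes sense relative to chance, and with imbalanced classes the majority-class baseline is the fairer yardstick. A quick baseline check with scikit-learn's DummyClassifier:

from sklearn.dummy import DummyClassifier

# Accuracy of always predicting the most frequent person in the training set.
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(X_train, y_train)
print(baseline.score(X_test, y_test))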
In [8]:
import numpy as np

a = [1, 2, 3, 4, 5, 6, 7]
In [9]:
np.argsort(a)
Out[9]:
array([0, 1, 2, 3, 4, 5, 6])
In [10]:
np.argsort(a)[::-1][:5]
Out[10]:
array([6, 5, 4, 3, 2])
In [11]:
ids = np.argsort(model.feature_importances_)[::-1][:10]  # argsort orders from smallest to largest, so reverse it for the top 10
list(zip(ids % 47, ids // 47))  # % gives the remainder (column), // the quotient (row) in the 62x47 image
Out[11]:
In [12]:
ids
Out[12]:
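The % 47 / // 47 arithmetic converts flat pixel indices into (column, row) coordinates of the 62x47 image; NumPy's unravel_index does the same conversion in one call. A sketch using the ids array from above:

rows, cols = np.unravel_index(ids, (62, 47))
print(list(zip(cols, rows)))  # identical to the (ids % 47, ids // 47) pairs above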
In [13]:
import matplotlib.pyplot as plt

importances = model.feature_importances_.reshape((62, 47))
plt.imshow(importances, interpolation="nearest", cmap=plt.cm.bone)
# Circle the ten most important pixels found above on top of the importance map.
plt.scatter(ids % 47, ids // 47, facecolors='none', edgecolors='w', linewidths=2, s=300, alpha=0.5)
plt.grid(False)
plt.show()
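Impurity-based feature_importances_ can be biased toward high-variance pixels. As a cross-check, scikit-learn's permutation importance scores features on held-out data instead; a sketch (slow here, since every pixel column is shuffled n_repeats times):

from sklearn.inspection import permutation_importance

result = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=1)
perm = result.importances_mean.reshape((62, 47))
plt.imshow(perm, interpolation="nearest", cmap=plt.cm.bone)
plt.grid(False)
plt.show()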