PCA (Principal Component Analysis) is a dimensionality-reduction technique: it tries to approximate the information contained in the original, higher-dimensional data using as few dimensions as possible.
Mathematically, PCA is equivalent to finding an orthogonal transformation matrix $W$ such that
$$ \hat{X} = X W $$
where $X \in \mathbf{R}^{N\times D}$, $\hat{X} \in \mathbf{R}^{N\times \hat{D}}$ with $\hat{D} < D$, $W \in \mathbf{R}^{D\times \hat{D}}$, and $\hat{X}$ approximates the original data in the reduced number of dimensions as described above.
$W$ can be found from the eigenvalue decomposition of the covariance matrix $X^TX$ (computed after centering $X$). The columns of $W$ are the eigenvectors corresponding to the $\hat{D}$ largest eigenvalues, ordered from the largest eigenvalue down.
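The eigendecomposition procedure above can be sketched directly with NumPy. This is a minimal sketch, not the implementation used by Scikit-Learn; the helper name pca_manual is introduced here only for illustration.

import numpy as np

def pca_manual(X, n_components):
    # center the data so that np.cov gives the covariance matrix of X
    Xc = X - X.mean(axis=0)
    # eigendecomposition of the symmetric covariance matrix
    eigvals, eigvecs = np.linalg.eigh(np.cov(Xc, rowvar=False))
    # sort eigenvectors by decreasing eigenvalue and keep the first n_components as W
    order = np.argsort(eigvals)[::-1]
    W = eigvecs[:, order[:n_components]]
    # project the centered data onto the principal axes
    return Xc @ W

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
print(pca_manual(X, 1))  # matches PCA(1).fit_transform(X) up to sign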
The decomposition subpackage of Scikit-Learn provides a PCA class for principal component analysis; its usage is shown below. After fitting, the following attributes are available:
components_: the principal axes (eigenvectors), one row per component
n_components_: the number of components that were kept
mean_: the per-feature mean estimated from the training data and subtracted before projection
explained_variance_ratio_: the fraction of the total variance explained by each component
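As a quick illustration of these attributes, here is a minimal sketch on the same toy array that is used in the cells below:

import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=2).fit(X)
print(pca.components_)                # principal axes, one row per component
print(pca.n_components_)              # 2
print(pca.mean_)                      # per-feature mean, here [0, 0]
print(pca.explained_variance_ratio_)  # variance share of each component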
In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
plt.scatter(X[:,0], X[:,1], s=100)
plt.xlim(-4,4)
plt.ylim(-3,3)
plt.title("original data")
plt.show()
In [2]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
In [3]:
Z = pca.transform(X)
Z
In [4]:
# eigendecomposition of the covariance matrix estimated by the fitted PCA model
w, V = np.linalg.eig(pca.get_covariance())
In [5]:
# project manually onto the eigenvectors; because this X has zero mean,
# the result matches pca.transform(X) up to the sign and order of the components
V.T.dot(X.T).T
In [6]:
plt.scatter(Z[:,0], Z[:,1], c='r', s=100)
plt.xlim(-4,4)
plt.ylim(-3,3)
plt.title("transformed data")
plt.show()
In [7]:
plt.scatter(Z[:,0], np.zeros_like(Z[:,1]), c='g', s=100)
plt.xlim(-4,4)
plt.ylim(-3,3)
plt.title("transformed and truncated data")
plt.show()
In [8]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:, 2:]  # use only petal length and petal width
plt.scatter(X[:, 0], X[:, 1], c=iris.target, s=200, cmap=plt.cm.jet);
In [9]:
X2 = PCA(2).fit_transform(X)
plt.scatter(X2[:, 0], X2[:, 1], c=iris.target, s=200, cmap=plt.cm.jet)
plt.xlim(-6, 6)
plt.show()
In [10]:
X1 = PCA(1).fit_transform(X)
sns.distplot(X1[iris.target==0], color="b", bins=20, rug=True, kde=False)
sns.distplot(X1[iris.target==1], color="g", bins=20, rug=True, kde=False)
sns.distplot(X1[iris.target==2], color="r", bins=20, rug=True, kde=False)
plt.xlim(-6, 6)
plt.show()
In [11]:
X3 = PCA(2).fit_transform(iris.data)
plt.scatter(X3[:, 0], X3[:, 1], c=iris.target, s=200, cmap=plt.cm.jet);
In [12]:
X4 = PCA(3).fit_transform(iris.data)
from mpl_toolkits.mplot3d import Axes3D
def plot_pca(azim):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X4[:, 0], X4[:, 1], X4[:, 2], c=iris.target, s=100, cmap=plt.cm.jet, alpha=1)
    ax.view_init(20, azim)
plot_pca(-60)
In [13]:
from ipywidgets import widgets
widgets.interact(plot_pca, azim=widgets.IntSlider(min=0, max=180, step=10, value=0));
In [14]:
from sklearn.datasets import load_digits
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
N=2; M=5;
fig = plt.figure(figsize=(10, 4))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for i in range(N):
    for j in range(M):
        k = i * M + j
        ax = fig.add_subplot(N, M, k + 1)
        ax.imshow(digits.images[k], cmap=plt.cm.bone, interpolation="none")
        ax.grid(False)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        plt.title(digits.target_names[k])
In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_digits)
print(X_digits.shape)
print(X_pca.shape)
In [16]:
plt.scatter(X_pca[:,0], X_pca[:,1], c=y_digits, s=100, cmap=plt.cm.jet)
plt.axis("equal")
plt.show()
In [17]:
from mpl_toolkits.mplot3d import Axes3D
def plot_pca2(azim):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=y_digits, s=100, cmap=plt.cm.jet, alpha=1)
    ax.view_init(20, azim)
plot_pca2(-60)
In [18]:
from ipywidgets import widgets
widgets.interact(plot_pca2, azim=widgets.IntSlider(min=0,max=180,step=10,value=0));
In [19]:
N=2; M=5;
fig = plt.figure(figsize=(10,3.2))
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0.05)
for i in range(N):
    for j in range(M):
        k = i * M + j
        p = fig.add_subplot(N, M, k + 1)
        p.imshow(pca.components_[k].reshape((8, 8)), cmap=plt.cm.bone, interpolation='none')
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
Kernel PCA applies PCA to data that has first been passed through a nonlinear transformation $\phi(x)$, in order to improve the separability of the data.
$$ x \;\; \rightarrow \;\; \phi(x) \;\; \rightarrow \;\; \text{PCA} \;\; \rightarrow \;\; z $$
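As a minimal sketch of the $x \rightarrow \phi(x) \rightarrow \text{PCA}$ idea, one can apply an explicit nonlinear feature map and run ordinary PCA on the expanded representation. The feature map phi below (appending the squared radius as an extra feature) is an arbitrary choice for illustration; the KernelPCA class used later performs this step implicitly through a kernel function.

import numpy as np
from sklearn.decomposition import PCA

np.random.seed(0)
X = np.random.multivariate_normal([0, 0], [[1, 0], [0, 1]], 100)

def phi(X):
    # explicit nonlinear feature map: append the squared radius as a third feature
    return np.hstack([X, (X ** 2).sum(axis=1, keepdims=True)])

Z = PCA(n_components=2).fit_transform(phi(X))  # ordinary PCA in the transformed space
print(Z.shape)  # (100, 2)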
In [20]:
A1_mean = [1, 1]
A1_cov = [[2, 1], [1, 1]]
A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)
A2_mean = [5, 5]
A2_cov = [[2, 1], [1, 1]]
A2 = np.random.multivariate_normal(A2_mean, A2_cov, 50)
A = np.vstack([A1, A2])
B_mean = [5, 0]
B_cov = [[0.8, -0.7], [-0.7, 0.8]]
B = np.random.multivariate_normal(B_mean, B_cov, 50)
AB = np.vstack([A, B])
plt.scatter(A[:,0], A[:,1], c='r')
plt.scatter(B[:,0], B[:,1], c='g')
plt.show()
In [21]:
pca = PCA(n_components=2)
pca.fit(AB)
A_transformed = pca.transform(A)
B_transformed = pca.transform(B)
plt.scatter(A_transformed[:,0], A_transformed[:,1], c="r", s=100)
plt.scatter(B_transformed[:,0], B_transformed[:,1], c="g", s=100)
plt.show()
In [22]:
pca = PCA(n_components=1)
pca.fit(AB)
A_transformed = pca.transform(A)
B_transformed = pca.transform(B)
plt.scatter(A_transformed, np.zeros(len(A_transformed)), c="r", s=100)
plt.scatter(B_transformed, np.zeros(len(B_transformed)), c="g", s=100)
plt.show()
In [23]:
sns.distplot(A_transformed, color="b", bins=20, rug=True, kde=False)
sns.distplot(B_transformed, color="g", bins=20, rug=True, kde=False)
plt.show()
In [24]:
from sklearn.decomposition import KernelPCA
In [25]:
kpca = KernelPCA(kernel="cosine", n_components=2)
kpca.fit(AB)
A_transformed2 = kpca.transform(A)
B_transformed2 = kpca.transform(B)
plt.scatter(A_transformed2[:,0], A_transformed2[:,1], c="r", s=100)
plt.scatter(B_transformed2[:,0], B_transformed2[:,1], c="g", s=100)
plt.show()
In [26]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(kernel="cosine", n_components=1)
kpca.fit(AB)
A_transformed2 = kpca.transform(A)
B_transformed2 = kpca.transform(B)
plt.scatter(A_transformed2, np.zeros(len(A_transformed2)), c="r", s=100)
plt.scatter(B_transformed2, np.zeros(len(B_transformed2)), c="g", s=100)
plt.show()
In [27]:
sns.distplot(A_transformed2, color="b", bins=20, rug=True, kde=False)
sns.distplot(B_transformed2, color="g", bins=20, rug=True, kde=False)
plt.show()
In [28]:
from sklearn.datasets import make_circles
np.random.seed(0)
X, y = make_circles(n_samples=400, factor=.3, noise=.05)
reds = y == 0
blues = y == 1
plt.plot(X[reds, 0], X[reds, 1], "ro")
plt.plot(X[blues, 0], X[blues, 1], "bo")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()
In [29]:
kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
kpca.fit(X)
A_transformed2 = kpca.transform(X[reds])
B_transformed2 = kpca.transform(X[blues])
plt.scatter(A_transformed2[:,0], A_transformed2[:,1], c="r", s=100)
plt.scatter(B_transformed2[:,0], B_transformed2[:,1], c="b", s=100)
plt.show()
If the number of components is kept the same as the original dimensionality, the eigenvalues of the covariance matrix of the PCA-transformed data coincide with the eigenvalues of the covariance matrix of the original data. When the number of components has to be reduced, the components with the smallest eigenvalues are dropped first.
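This can be checked numerically. The sketch below uses the iris data as an assumed example (any data set would do): the eigenvalues of the covariance matrix of the full-rank PCA output coincide with those of the original covariance matrix, which are also exposed as explained_variance_.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X_iris = load_iris().data
pca_full = PCA().fit(X_iris)        # keep all components
Z_full = pca_full.transform(X_iris)

eig_original = np.sort(np.linalg.eigvalsh(np.cov(X_iris, rowvar=False)))[::-1]
eig_transformed = np.sort(np.linalg.eigvalsh(np.cov(Z_full, rowvar=False)))[::-1]
print(np.allclose(eig_original, eig_transformed))               # True
print(np.allclose(eig_original, pca_full.explained_variance_))  # True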
In [31]:
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
# fetch_mldata is no longer available; load_wine provides the UCI wine data
# (178 samples, 13 features, 3 classes)
wine = load_wine()
X, y = wine.data, wine.target
pca = PCA().fit(X)
var = pca.explained_variance_
cmap = sns.color_palette()
plt.bar(np.arange(1,len(var)+1), var/np.sum(var), align="center", color=cmap[0])
plt.step(np.arange(1,len(var)+1), np.cumsum(var)/np.sum(var), where="mid", color=cmap[1])
plt.show()
In [32]:
X_pca = PCA(2).fit_transform(X)
cmap = mpl.colors.ListedColormap(sns.color_palette("Set1"))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap=cmap)
plt.show()
In [33]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_pca, y)
xmin, xmax = X_pca[:,0].min(), X_pca[:,0].max()
ymin, ymax = X_pca[:,1].min(), X_pca[:,1].max()
XGrid, YGrid = np.meshgrid(np.arange(xmin, xmax, (xmax-xmin)/1000), np.arange(ymin, ymax, (ymax-ymin)/1000))
ZGrid = np.reshape(clf.predict(np.array([XGrid.ravel(), YGrid.ravel()]).T), XGrid.shape)
cmap = mpl.colors.ListedColormap(sns.color_palette("Set3"))
plt.contourf(XGrid, YGrid, ZGrid, cmap=cmap)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap=cmap)
plt.show()
In [34]:
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
model1 = linear_model.LogisticRegression()
model1.fit(X_digits, y_digits)
pca = decomposition.PCA()
logistic = linear_model.LogisticRegression()
model2 = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
model2.fit(X_digits, y_digits)
In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_digits, model1.predict(X_digits)))
print(classification_report(y_digits, model2.predict(X_digits)))