In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
In [2]:
# Subject names (Japanese, social studies, mathematics, science, music,
# art, health & PE, technology & home economics, English).
# A later cell uses this list as the row index of the component-weight table.
labels = [
    u'国語', u'社会', u'数学',
    u'理科', u'音楽', u'美術',
    u'保体', u'技家', u'英語',
]
# Load the data set from the CSV file (path is relative to the notebook).
data = pd.read_csv("data/tab47.csv")
# Preview the first rows as the cell output.
data.head()
Out[2]:
In [3]:
# Principal component analysis: build a 5-component model and fit it on `data`.
# `result` keeps whatever `fit` returns; the following cells read the fitted
# attributes from it.
pca = PCA(n_components=5)
result = pca.fit(data)
In [4]:
# Explained variance ratio of each component (first row) and its running
# total (second row), both rounded to 3 decimals.
pd.DataFrame(
    np.vstack([
        result.explained_variance_ratio_,
        result.explained_variance_ratio_.cumsum(),
    ]).round(3),
    index=["寄与率", "累積寄与率"],
)
Out[4]:
In [5]:
# Eigenvectors (coefficients of the principal axes).
# `components_` is transposed so that each row can be labelled with a subject
# name; this assumes the CSV columns are in the same order as `labels` -- TODO confirm.
weight = np.transpose(result.components_)
pd.DataFrame(data=weight, index=labels)
Out[5]:
In [6]:
# Principal component scores, rounded to 4 decimals.
# The model was already fitted in an earlier cell, so only project the data
# with `transform`. Calling `fit_transform` here would refit the estimator,
# which is redundant and could make the scores disagree with the explained
# variance ratios and eigenvectors displayed above.
score = np.round(result.transform(data), 4)
pd.DataFrame(score)
Out[6]:
In [7]:
# Scatter plot of the principal component scores.
# `x` and `y` are zero-based column indices into `score`, selecting which two
# components go on the horizontal and vertical axes (the axis labels therefore
# read "Principal Component 0" / "Principal Component 1").
x = 0
y = 1
plt.scatter(score[:, x], score[:, y])
for i in range(len(data)):
    # Annotate every point with its row position in `data`.
    plt.text(score[i, x], score[i, y], str(i), fontsize=12)
plt.xlabel('Principal Component %s' % x)
plt.ylabel('Principal Component %s' % y)
Out[7]: