In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import scale
import scipy
import matplotlib.pyplot as plt
In [2]:
# Load the iris dataset
iris = load_iris()
x = iris['data']
y = iris['target']
# PCA is an unsupervised method: the labels y are kept only for plotting.
# Standardize each feature column to zero mean and unit variance.
x_s = scale(x, with_mean=True, with_std=True, axis=0)
In [3]:
# Compute the correlation matrix of the standardized features
x_c = np.corrcoef(x_s.T)
# Eigen-decompose the correlation matrix.  scipy.linalg.eig returns
# complex eigenvalues (imaginary parts ~0 for a symmetric matrix) in no
# guaranteed order, so take the real parts and sort in descending order
# so that the leading columns really are the top principal components.
eig_var, r_eig_vec = scipy.linalg.eig(x_c)
eig_var = eig_var.real
order = np.argsort(eig_var)[::-1]
eig_var = eig_var[order]
r_eig_vec = r_eig_vec[:, order]
# print() function: the original Python 2 print statements are a
# SyntaxError under Python 3
print('Eigen values \n%s' % (eig_var))
print('\n Eigen vectors \n%s' % (r_eig_vec))
In [4]:
# Projection matrix: the two leading eigenvectors, one per column
w = r_eig_vec[:, :2]
# Reduce the standardized four-dimensional data down to two dimensions
x_rd = np.dot(x_s, w)
In [5]:
# Scatter plot of the data in the new 2-D principal-component space,
# colored by iris class label
plt.figure(1)
plt.scatter(x_rd[:, 0], x_rd[:, 1], c=y)
plt.xlabel('component 1')  # fixed typo: was 'componet 1'
plt.ylabel('component 2')  # fixed typo: was 'componet 2'
plt.show()
In [12]:
# Proportion of variance explained by each component.
# For a correlation matrix the eigenvalues sum to the number of features
# (the trace), so eigenvalue / len(eig_var) is the fraction of total
# variance carried by that component.
# Fixed: Python 2 print statements (SyntaxError in Python 3), header
# typos ('Componet', 'cummulative'), and %0.2f applied to complex
# eigenvalues (TypeError) — use .real.
print('Component,eigen value,% of variance,cumulative %')
cum_per = 0
per_var = 0
for i, e_val in enumerate(eig_var):
    per_var = round((e_val.real / len(eig_var)), 4)
    cum_per += per_var
    print('%d,%0.2f,%0.2f,%0.2f' % (i + 1, e_val.real, per_var * 100, cum_per * 100))
In [ ]: