In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import scale
import scipy
import matplotlib.pyplot as plt

In [2]:
# Load the Iris dataset
data = load_iris()
x = data['data']
y = data['target']

# PCA is an unsupervised method, so the labels are not used;
# standardize each feature to zero mean and unit variance
x_s = scale(x, with_mean=True, with_std=True, axis=0)
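
A quick optional sanity check: after scale(), every column should have zero mean and unit standard deviation.

In [ ]:
# Sanity check: scale() should give zero-mean, unit-variance columns
print(np.allclose(x_s.mean(axis=0), 0.0))  # expect True
print(np.allclose(x_s.std(axis=0), 1.0))   # expect True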

In [3]:
# Compute the correlation matrix
x_c = np.corrcoef(x_s.T)

# Find the eigenvalues and eigenvectors of the correlation matrix
eig_var, r_eig_vec = scipy.linalg.eig(x_c)
print('Eigenvalues:\n%s' % eig_var)
print('\nEigenvectors:\n%s' % r_eig_vec)


Eigenvalues:
[ 2.91081808+0.j  0.92122093+0.j  0.14735328+0.j  0.02060771+0.j]

Eigenvectors:
[[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]
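
Because the correlation matrix is symmetric, scipy.linalg.eigh is a natural alternative: it returns real eigenvalues directly (avoiding the complex dtype above), but sorts them in ascending order, so they need to be reversed. A minimal sketch:

In [ ]:
# eigh exploits symmetry and returns real eigenvalues, sorted ascending
eig_var_h, eig_vec_h = scipy.linalg.eigh(x_c)
eig_var_h = eig_var_h[::-1]      # reverse to descending order
eig_vec_h = eig_vec_h[:, ::-1]   # reorder the eigenvector columns to match
print(eig_var_h)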

In [4]:
# Keep the first two eigenvectors (the two leading principal components)
w = r_eig_vec[:, 0:2]

# Project the original four-dimensional data down to two dimensions
x_rd = x_s.dot(w)
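
As a cross-check, sklearn's PCA should give the same projection up to a per-column sign flip (the sign of a principal component is arbitrary). Note that scipy.linalg.eig does not guarantee any eigenvalue ordering; in the run above they happened to come out largest-first, which is what makes this comparison line up. A sketch:

In [ ]:
# Cross-check the manual projection against sklearn's PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_s)
# each column may differ by a sign flip, so compare absolute values
print(np.allclose(np.abs(x_pca), np.abs(x_rd)))  # expect True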

In [5]:
# Scatter plot of the data in the new two-dimensional space
plt.figure(1)
plt.scatter(x_rd[:, 0], x_rd[:, 1], c=y)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.show()

[Figure: scatter plot of the Iris samples projected onto the first two principal components, points colored by class]
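
To attach a legend with the species names, an optional variant plots each class separately; data['target_names'] holds the label names in the Bunch returned by load_iris:

In [ ]:
# Optional: one scatter call per class so a legend can be attached
plt.figure(2)
for label, name in enumerate(data['target_names']):
    mask = (y == label)
    plt.scatter(x_rd[mask, 0], x_rd[mask, 1], label=name)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.legend()
plt.show()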
In [12]:
# Proportion of variance explained by each component.
# The eigenvalues of a p x p correlation matrix sum to p, so each
# eigenvalue divided by the sum gives the fraction of total variance.
print('Component, eigenvalue, % of variance, cumulative %')
cum_per = 0
for i, e_val in enumerate(eig_var.real):
    # take the real part explicitly: the eigenvalues of a symmetric matrix
    # are real, so this only discards a zero imaginary part (and the
    # ComplexWarning that came with it)
    per_var = round(e_val / eig_var.real.sum(), 4)
    cum_per += per_var
    print('%d, %0.2f, %0.2f, %0.2f' % (i + 1, e_val, per_var * 100, cum_per * 100))


Component, eigenvalue, % of variance, cumulative %
1, 2.91, 72.77, 72.77
2, 0.92, 23.03, 95.80
3, 0.15, 3.68, 99.48
4, 0.02, 0.52, 100.00
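
The loop above can also be written without the running total; a vectorized sketch using numpy's cumsum:

In [ ]:
# Vectorized equivalent: fraction of variance per component and its cumsum
per_var = eig_var.real / eig_var.real.sum()
print(np.round(per_var * 100, 2))
print(np.round(np.cumsum(per_var) * 100, 2))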

In [ ]: