In [1]:
from sklearn import datasets
from sklearn import decomposition

In [2]:
# Load the classic Iris dataset as a sklearn Bunch (features + labels + metadata).
iris = datasets.load_iris()

In [3]:
# Pull out just the feature matrix (150 samples x 4 measurements).
iris_x = iris.data

In [4]:
# PCA with default settings: n_components=None keeps every component.
pca = decomposition.PCA()

In [5]:
# Echo the estimator to display its default hyperparameters.
pca


Out[5]:
PCA(copy=True, n_components=None, whiten=False)

In [6]:
# Fit PCA on the features and project them onto the principal components.
iris_pca = pca.fit_transform(iris_x)

In [10]:
# Compare the first 5 rows of the raw features with their PCA projection.
# Parenthesized print calls run under both Python 2 (parens act as grouping
# around a single argument) and Python 3, unlike the old statement form.
print(iris_x[:5])
print('-'*50)
print(iris_pca[:5])


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
--------------------------------------------------
[[ -2.68420713e+00  -3.26607315e-01   2.15118370e-02   1.00615724e-03]
 [ -2.71539062e+00   1.69556848e-01   2.03521425e-01   9.96024240e-02]
 [ -2.88981954e+00   1.37345610e-01  -2.47092410e-02   1.93045428e-02]
 [ -2.74643720e+00   3.11124316e-01  -3.76719753e-02  -7.59552741e-02]
 [ -2.72859298e+00  -3.33924564e-01  -9.62296998e-02  -6.31287327e-02]]

In [8]:
# Fraction of the total variance captured by each principal component:
# ~92.46% is explained by the first component alone (see Out[8]) --
# a component, i.e. a derived direction, not an original data column.
pca.explained_variance_ratio_


Out[8]:
array([ 0.92461621,  0.05301557,  0.01718514,  0.00518309])

In [11]:
# Keep only the first 2 principal components (columns of the projection);
# all 150 samples (rows) are retained, so the result has shape (150, 2).
pca = decomposition.PCA(n_components=2)
iris_x_prime = pca.fit_transform(iris_x)
iris_x_prime.shape


Out[11]:
(150, 2)

In [12]:
# Total variance retained by the 2-component projection (~97.76%, see Out[12]).
pca.explained_variance_ratio_.sum()


Out[12]:
0.97763177502480336

In [13]:
# get over 98%
pca = decomposition.PCA(n_components=.98)
iris_x_prime = pca.fit(iris_x)
pca.explained_variance_ratio_.sum()


Out[13]:
0.99481691454981014

In [ ]: