In [1]:
from sklearn import datasets
from sklearn import decomposition

In [2]:
# Load the classic Iris dataset as a sklearn Bunch (features + labels + metadata).
iris = datasets.load_iris()

In [3]:
# Pull out just the feature matrix (150 samples x 4 measurements).
iris_x = iris.data

In [4]:
# PCA with default settings: n_components=None keeps every component.
pca = decomposition.PCA()

In [5]:
# Echo the estimator to display its default hyperparameters.
pca


Out[5]:
PCA(copy=True, n_components=None, whiten=False)

In [6]:
# Fit PCA on the features and project them onto the principal components.
iris_pca = pca.fit_transform(iris_x)

In [10]:
# Compare the first 5 rows of the raw features with their PCA projection.
# Parenthesized print calls run under both Python 2 (parens act as grouping
# around a single argument) and Python 3, unlike the old statement form.
print(iris_x[:5])
print('-'*50)
print(iris_pca[:5])


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
--------------------------------------------------
[[ -2.68420713e+00  -3.26607315e-01   2.15118370e-02   1.00615724e-03]
 [ -2.71539062e+00   1.69556848e-01   2.03521425e-01   9.96024240e-02]
 [ -2.88981954e+00   1.37345610e-01  -2.47092410e-02   1.93045428e-02]
 [ -2.74643720e+00   3.11124316e-01  -3.76719753e-02  -7.59552741e-02]
 [ -2.72859298e+00  -3.33924564e-01  -9.62296998e-02  -6.31287327e-02]]

In [8]:
# Fraction of the total variance captured by each principal component:
# ~92.46% is explained by the first component alone (see Out[8]) --
# a component, i.e. a derived direction, not an original data column.
pca.explained_variance_ratio_


Out[8]:
array([ 0.92461621,  0.05301557,  0.01718514,  0.00518309])

In [11]:
# Keep only the first 2 principal components (columns of the projection);
# all 150 samples (rows) are retained, so the result has shape (150, 2).
pca = decomposition.PCA(n_components=2)
iris_x_prime = pca.fit_transform(iris_x)
iris_x_prime.shape


Out[11]:
(150, 2)

In [12]:
# Total variance retained by the 2-component projection (~97.76%, see Out[12]).
pca.explained_variance_ratio_.sum()


Out[12]:
0.97763177502480336

In [13]:
# get over 98%
pca = decomposition.PCA(n_components=.98)
iris_x_prime = pca.fit(iris_x)
pca.explained_variance_ratio_.sum()


Out[13]:
0.99481691454981014

In [ ]: