In [2]:
%matplotlib notebook

Para este exemplo, vamos usar o dataset Iris, que pode ser obtido aqui: https://archive.ics.uci.edu/ml/datasets/Iris


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import decomposition
from sklearn import datasets

In [4]:
# Load the raw Iris file; it has no header row, so the columns get integer labels.
tabela = pd.read_csv(
    "exemplo_7/iris.data",
    sep=',',
    header=None,
)

In [5]:
tabela


Out[5]:
0 1 2 3 4
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
10 5.4 3.7 1.5 0.2 Iris-setosa
11 4.8 3.4 1.6 0.2 Iris-setosa
12 4.8 3.0 1.4 0.1 Iris-setosa
13 4.3 3.0 1.1 0.1 Iris-setosa
14 5.8 4.0 1.2 0.2 Iris-setosa
15 5.7 4.4 1.5 0.4 Iris-setosa
16 5.4 3.9 1.3 0.4 Iris-setosa
17 5.1 3.5 1.4 0.3 Iris-setosa
18 5.7 3.8 1.7 0.3 Iris-setosa
19 5.1 3.8 1.5 0.3 Iris-setosa
20 5.4 3.4 1.7 0.2 Iris-setosa
21 5.1 3.7 1.5 0.4 Iris-setosa
22 4.6 3.6 1.0 0.2 Iris-setosa
23 5.1 3.3 1.7 0.5 Iris-setosa
24 4.8 3.4 1.9 0.2 Iris-setosa
25 5.0 3.0 1.6 0.2 Iris-setosa
26 5.0 3.4 1.6 0.4 Iris-setosa
27 5.2 3.5 1.5 0.2 Iris-setosa
28 5.2 3.4 1.4 0.2 Iris-setosa
29 4.7 3.2 1.6 0.2 Iris-setosa
... ... ... ... ... ...
120 6.9 3.2 5.7 2.3 Iris-virginica
121 5.6 2.8 4.9 2.0 Iris-virginica
122 7.7 2.8 6.7 2.0 Iris-virginica
123 6.3 2.7 4.9 1.8 Iris-virginica
124 6.7 3.3 5.7 2.1 Iris-virginica
125 7.2 3.2 6.0 1.8 Iris-virginica
126 6.2 2.8 4.8 1.8 Iris-virginica
127 6.1 3.0 4.9 1.8 Iris-virginica
128 6.4 2.8 5.6 2.1 Iris-virginica
129 7.2 3.0 5.8 1.6 Iris-virginica
130 7.4 2.8 6.1 1.9 Iris-virginica
131 7.9 3.8 6.4 2.0 Iris-virginica
132 6.4 2.8 5.6 2.2 Iris-virginica
133 6.3 2.8 5.1 1.5 Iris-virginica
134 6.1 2.6 5.6 1.4 Iris-virginica
135 7.7 3.0 6.1 2.3 Iris-virginica
136 6.3 3.4 5.6 2.4 Iris-virginica
137 6.4 3.1 5.5 1.8 Iris-virginica
138 6.0 3.0 4.8 1.8 Iris-virginica
139 6.9 3.1 5.4 2.1 Iris-virginica
140 6.7 3.1 5.6 2.4 Iris-virginica
141 6.9 3.1 5.1 2.3 Iris-virginica
142 5.8 2.7 5.1 1.9 Iris-virginica
143 6.8 3.2 5.9 2.3 Iris-virginica
144 6.7 3.3 5.7 2.5 Iris-virginica
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns


In [6]:
# Replace the integer column labels with descriptive names:
# four measurements followed by the species label.
tabela.columns = [
    'sepal_len',
    'sepal_wid',
    'petal_len',
    'petal_wid',
    'class',
]

In [ ]:
tabela

In [7]:
tabela.tail()


Out[7]:
sepal_len sepal_wid petal_len petal_wid class
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

Agora, vamos separar os dados entre as medidas e as espécies.


In [8]:
# Split the table into the numeric measurements (first four columns) and the
# species label (fifth column). `.iloc` is purely positional; the old `.ix`
# indexer used here originally has been removed from pandas.
X = tabela.iloc[:, 0:4].values
y = tabela.iloc[:, 4].values

In [ ]:
X

In [19]:
y


Out[19]:
array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica'], dtype=object)

In [70]:
# Distinct class labels present in y. A set has no defined order, so the
# position of each species in this list is arbitrary.
nomes = [*set(y)]

In [66]:
tabela.columns


Out[66]:
Index(['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'], dtype='object')

In [88]:
# One colour per species; reused by the scatter plots further down.
colors = ['navy', 'turquoise', 'darkorange']

# Panel titles, in the same order as the measurement columns of X.
titulos = [
    'Sepal Length (cm)',
    'Sepal Width (cm)',
    'Petal Length (cm)',
    'Petal Width (cm)',
]

fig, ax = plt.subplots(2, 2)

# One histogram panel per measurement column, with the species side by side.
# `ax.flat` walks the 2x2 grid row by row, matching the column order of X.
for coluna, (painel, titulo) in enumerate(zip(ax.flat, titulos)):
    # Select the rows of each species using the same `nomes` list that is used
    # for the labels, so data, colours and legend entries cannot get out of step.
    dados_por_classe = [X[y == nome, coluna] for nome in nomes]
    _, _, patches = painel.hist(dados_por_classe, color=colors, label=nomes)
    painel.set_title(titulo)

# All panels share the same species/colour mapping, so a single figure-level
# legend (built from the bar containers of the last panel) is enough.
fig.legend(patches, nomes)


Out[88]:
<matplotlib.legend.Legend at 0x7f0318d0aa90>

Agora, vamos calcular a decomposição em componentes principais:


In [9]:
# Build a PCA estimator with default settings; it is only fitted to the data
# in a later cell.
pca = decomposition.PCA()

In [ ]:
print(pca)

pca agora é um objeto (estimador) do scikit-learn capaz de calcular o PCA; ele ainda não foi ajustado aos dados. Para efetivamente calcularmos os componentes principais de X, fazemos


In [10]:
# Fit the PCA estimator to the measurement matrix X.
pca.fit(X)


Out[10]:
PCA(copy=True, n_components=None, whiten=False)

Daqui pra frente, o objeto pca será onde nossas informações estão armazenadas. Para, por exemplo, verificarmos qual fração da variância total é explicada por cada componente principal (os autovalores da matriz de covariância, normalizados para somar 1), podemos fazer


In [ ]:
# Show the share of the total variance attributed to each principal component.
print(pca.explained_variance_ratio_)

Podemos ver então que o primeiro componente principal explica cerca de 92% da variância dos dados.

Para transformarmos os dados para que fiquem alinhados/projetados nestes componentes principais, usamos


In [11]:
# Project X onto the principal components of the fitted pca object.
Xnew = pca.transform(X)

In [ ]:
print(X)

In [ ]:
print(Xnew)

Agora, queremos visualizar estes dados. Precisamos então selecionar quantos componentes queremos representar. Se quisermos mostrar dois componentes, fazemos


In [89]:
# Unlabelled view of the data projected onto the first two principal components.
# The original passed `cmap=plt.cm.spectral`: that colormap no longer exists in
# Matplotlib, and a colormap has no effect anyway unless `c` is also given.
fig, ax = plt.subplots()
ax.scatter(Xnew[:, 0], Xnew[:, 1])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()



In [33]:
y[0]


Out[33]:
'Iris-setosa'

In [90]:
list(set(y))


Out[90]:
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [91]:
Xnew[y=='Iris-setosa']


Out[91]:
array([[ -2.68420713e+00,  -3.26607315e-01,   2.15118370e-02,
          1.00615724e-03],
       [ -2.71539062e+00,   1.69556848e-01,   2.03521425e-01,
          9.96024240e-02],
       [ -2.88981954e+00,   1.37345610e-01,  -2.47092410e-02,
          1.93045428e-02],
       [ -2.74643720e+00,   3.11124316e-01,  -3.76719753e-02,
         -7.59552741e-02],
       [ -2.72859298e+00,  -3.33924564e-01,  -9.62296998e-02,
         -6.31287327e-02],
       [ -2.27989736e+00,  -7.47782713e-01,  -1.74325619e-01,
         -2.71468037e-02],
       [ -2.82089068e+00,   8.21045110e-02,  -2.64251085e-01,
         -5.00996251e-02],
       [ -2.62648199e+00,  -1.70405349e-01,   1.58015103e-02,
         -4.62817610e-02],
       [ -2.88795857e+00,   5.70798026e-01,  -2.73354061e-02,
         -2.66154143e-02],
       [ -2.67384469e+00,   1.06691704e-01,   1.91533300e-01,
         -5.58909660e-02],
       [ -2.50652679e+00,  -6.51935014e-01,   6.92749958e-02,
         -1.66082478e-02],
       [ -2.61314272e+00,  -2.15206320e-02,  -1.07650353e-01,
         -1.57704569e-01],
       [ -2.78743398e+00,   2.27740189e-01,   2.00327788e-01,
         -7.23508674e-03],
       [ -3.22520045e+00,   5.03279909e-01,  -6.84136292e-02,
         -2.19466641e-02],
       [ -2.64354322e+00,  -1.18619490e+00,   1.44505704e-01,
          1.56980962e-01],
       [ -2.38386932e+00,  -1.34475434e+00,  -2.83730664e-01,
          1.92618171e-03],
       [ -2.62252620e+00,  -8.18089675e-01,  -1.45315989e-01,
          1.64740791e-01],
       [ -2.64832273e+00,  -3.19136668e-01,  -3.33942541e-02,
          7.61182133e-02],
       [ -2.19907796e+00,  -8.79244088e-01,   1.14521465e-01,
          2.53269397e-02],
       [ -2.58734619e+00,  -5.20473639e-01,  -2.19572088e-01,
         -6.90819912e-02],
       [ -2.31053170e+00,  -3.97867822e-01,   2.33695607e-01,
         -1.53237396e-02],
       [ -2.54323491e+00,  -4.40031755e-01,  -2.14836370e-01,
          3.84395001e-02],
       [ -3.21585769e+00,  -1.41615572e-01,  -2.99618982e-01,
          1.85704335e-03],
       [ -2.30312854e+00,  -1.05522678e-01,  -4.56800413e-02,
          1.47245500e-01],
       [ -2.35617109e+00,   3.12095891e-02,  -1.29407576e-01,
         -3.01620265e-01],
       [ -2.50791723e+00,   1.39056340e-01,   2.47116338e-01,
          3.53840813e-02],
       [ -2.46905600e+00,  -1.37887315e-01,  -1.01263079e-01,
          5.59704524e-02],
       [ -2.56239095e+00,  -3.74684563e-01,   7.23591574e-02,
         -1.52402868e-02],
       [ -2.63982127e+00,  -3.19290066e-01,   1.39253374e-01,
          6.51410472e-02],
       [ -2.63284791e+00,   1.90075831e-01,  -4.64664636e-02,
         -1.24611153e-01],
       [ -2.58846205e+00,   1.97393079e-01,   7.12750731e-02,
         -6.04762634e-02],
       [ -2.41007734e+00,  -4.18080008e-01,   1.38388240e-01,
          2.30844170e-01],
       [ -2.64763667e+00,  -8.19982633e-01,  -2.30585604e-01,
         -2.84808954e-01],
       [ -2.59715948e+00,  -1.10002193e+00,  -1.63581913e-01,
         -9.89580706e-02],
       [ -2.67384469e+00,   1.06691704e-01,   1.91533300e-01,
         -5.58909660e-02],
       [ -2.86699985e+00,  -7.71930957e-02,   1.56842350e-01,
          1.62452806e-01],
       [ -2.62522846e+00,  -6.06800008e-01,   2.61163156e-01,
          1.75879875e-01],
       [ -2.67384469e+00,   1.06691704e-01,   1.91533300e-01,
         -5.58909660e-02],
       [ -2.98184266e+00,   4.80250049e-01,  -7.97248074e-02,
         -1.10529508e-02],
       [ -2.59032303e+00,  -2.36059337e-01,   7.39012382e-02,
         -1.45563062e-02],
       [ -2.77013891e+00,  -2.71059420e-01,  -8.42415745e-02,
          9.23646573e-02],
       [ -2.85221108e+00,   9.32865367e-01,   3.40961491e-01,
          3.22650607e-01],
       [ -2.99829644e+00,   3.34307575e-01,  -1.99008425e-01,
         -7.58718213e-02],
       [ -2.40551410e+00,  -1.95917258e-01,  -2.70717070e-01,
          1.73785129e-01],
       [ -2.20883295e+00,  -4.42696030e-01,  -3.03487809e-01,
         -1.85857530e-01],
       [ -2.71566519e+00,   2.42681483e-01,   9.05156060e-02,
          1.42989025e-01],
       [ -2.53757337e+00,  -5.10367545e-01,  -1.71918404e-01,
         -1.92165946e-01],
       [ -2.84032130e+00,   2.20576338e-01,  -9.00613765e-02,
         -6.03928106e-02],
       [ -2.54268576e+00,  -5.86281025e-01,   1.11752678e-02,
         -4.83337025e-02],
       [ -2.70391231e+00,  -1.15010852e-01,   8.26957266e-02,
          3.40995730e-02]])

In [123]:
# Same 2-D projection, now with one colour and one legend entry per species.
fig2, ax2 = plt.subplots()
for color, name in zip(colors, nomes):
    mask = y == name  # rows belonging to this species
    # Label each series with its own species name (the original passed the
    # undefined name `names` here).
    ax2.scatter(Xnew[mask, 0], Xnew[mask, 1], color=color, label=name)
ax2.legend(loc='best', shadow=False, scatterpoints=1)
ax2.set_xlabel('PC1')
ax2.set_ylabel('PC2')
ax2.set_title('PCA of IRIS dataset')


Out[123]:
<matplotlib.text.Text at 0x7f0316c5f400>

In [124]:
colors


Out[124]:
['navy', 'turquoise', 'darkorange']

In [137]:
# Importing Axes3D registers the '3d' projection on older Matplotlib releases;
# on current ones it is registered automatically and the import is harmless.
from mpl_toolkits.mplot3d import Axes3D

# 3-D view of the first three principal components, one colour per species.
fig3d = plt.figure(3)
# Create the 3-D axes through the figure: constructing `Axes3D(fig3d)` directly
# no longer attaches the axes to the figure in recent Matplotlib releases.
ax3d = fig3d.add_subplot(projection='3d')
for color, name in zip(colors, nomes):
    mask = y == name  # rows belonging to this species
    # Label each series with its own species name (the original passed the
    # undefined name `names` here).
    ax3d.scatter(Xnew[mask, 0], Xnew[mask, 1], Xnew[mask, 2], color=color, label=name)
ax3d.set_xlabel('PC1')
ax3d.set_ylabel('PC2')
ax3d.set_zlabel('PC3')
ax3d.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()



In [ ]: