In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
plt.rcParams['figure.figsize'] = 9, 6
In [2]:
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif
In [3]:
iris = datasets.load_iris()
iris.data.shape
Out[3]:
We will use the popular iris flower dataset, which has 150 observations and 4 attributes.
To it we will append 20 randomly generated attributes, which should have only a minimal influence on predicting the dependent variable.
In [4]:
# generate 20 completely random attributes and append them to the original data
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
X = np.hstack((iris.data, E))
y = iris.target
X_indices = np.arange(X.shape[-1])
X_indices
X.shape
Out[4]:
For comparison, let's look at the first two rows of the original and of the extended data.
In [5]:
# original data
iris.data[:2]
Out[5]:
In [6]:
# data extended with 20 additional random attributes
# only the first four columns should carry meaningful information
X[:2]
Out[6]:
Let's try to find the most important attributes. They should be the first four.
In [7]:
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)
Out[7]:
In [8]:
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices, scores)
Out[8]:
We really did manage to find the attributes that are related to the predicted variable (a quick check is sketched below).
Can PCA be used for something similar?
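Before moving on to PCA, a quick hedged check of the claim above: list which columns the selector actually kept. With percentile=10 over the 24 columns this should be a small subset of the original iris columns (indices 0-3).
selected = selector.get_support(indices=True)
print('columns kept by SelectPercentile:', selected)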
In [9]:
from sklearn.decomposition import PCA
In [10]:
import sklearn.datasets as ds
data = ds.load_breast_cancer()['data']
pca_trafo = PCA().fit(data)
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
line, = ax.plot(pca_trafo.explained_variance_ratio_, '--o')
ax.set_yscale('log') # try removing the logarithmic scale and you will see there is probably a problem
ax.set_title('Contribution of the components to the explained variance of the dataset')
Out[10]:
Let's display a heatmap of how strongly the individual features contribute to the individual components, i.e. how strongly they are reflected in them. This should tell us which attribute is reflected most strongly in the resulting data.
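A short aside on the pca_trafo.inverse_transform(np.eye(...)) trick used below: with the default whiten=False it returns the component loadings offset by the per-feature means, which can be verified with a minimal sanity check:
n_feat = data.shape[1]
back = pca_trafo.inverse_transform(np.eye(n_feat))
print(np.allclose(back, pca_trafo.components_ + pca_trafo.mean_))  # expected: True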
In [11]:
import sklearn.datasets as ds
from sklearn.decomposition import PCA
pca_trafo = PCA()
data = ds.load_breast_cancer()['data']
pca_data = pca_trafo.fit_transform(data)
ax = seaborn.heatmap(np.log(pca_trafo.inverse_transform(np.eye(data.shape[1]))), cmap="hot", cbar=False)
ax.set_xlabel('features')
ax.set_ylabel('components')
Out[11]:
The matrix is not completely random: there are three stripes, showing three groups of features that are reflected in the components much more strongly than the others. These appear to be the most important attributes.
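A rough way to put names to those bright stripes (a sketch only; the top-5 cut-off is an arbitrary choice) is to rank the columns of the same map by their mean absolute value:
names = ds.load_breast_cancer()['feature_names']
contrib = np.abs(pca_trafo.inverse_transform(np.eye(data.shape[1]))).mean(axis=0)
print(names[np.argsort(contrib)[::-1][:5]])  # the five most strongly reflected features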
In [14]:
means = np.mean(pca_trafo.inverse_transform(np.eye(data.shape[1])), axis=0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(means)
ax.set_ylabel('mean contrib. in components')
ax.set_xlabel('feature #')
Out[14]:
In [15]:
# PCA tries to explain the variance in the data. If every attribute has a different scale (variance), PCA does not explain the amount of information in an attribute, only its variance.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.std(data, axis=0))
ax.set_ylabel('standard deviation')
ax.set_xlabel('feature #')
# ax.set_yscale('log')
Out[15]:
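A small hedged check of the comment in the cell above: on the unscaled data, the feature with the largest standard deviation should also be the one dominating the first principal component (pca_trafo here is still the fit on the unscaled breast cancer data from In [11]):
print('feature with the largest std:         ', np.argmax(np.std(data, axis=0)))
print('feature dominating the 1st component: ', np.argmax(np.abs(pca_trafo.components_[0])))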
In [16]:
import sklearn.datasets as ds
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler # performs z-normalization of each attribute
z_scaler = StandardScaler()
data = ds.load_breast_cancer()['data']
z_data = z_scaler.fit_transform(data)
pca_trafo = PCA().fit(z_data)
plt.plot(pca_trafo.explained_variance_ratio_, '--o') # fraction of explained variance per component
plt.plot(pca_trafo.explained_variance_ratio_.cumsum(), '--o') # cumulative explained variance, useful for choosing how many components to keep
plt.ylim((0,1.0))
Out[16]:
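As a side note, scikit-learn can also pick the number of components needed to reach a target fraction of explained variance directly; a minimal sketch (the 0.95 threshold is just an illustrative choice):
pca_95 = PCA(n_components=0.95).fit(z_data)
print('components needed for 95 % of the variance:', pca_95.n_components_)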
In [17]:
import sklearn.datasets as ds
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
z_scaler = StandardScaler()
data = ds.load_breast_cancer()['data']
pca_trafo = PCA()
z_data = z_scaler.fit_transform(data)
pca_data = pca_trafo.fit_transform(z_data)
ax = seaborn.heatmap(np.log(pca_trafo.inverse_transform(np.eye(data.shape[1]))), cmap="hot", cbar=False)
ax.set_xlabel('features')
ax.set_ylabel('components')
Out[17]:
In [18]:
means = np.mean(pca_trafo.inverse_transform(np.eye(data.shape[1])), axis=0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(means)
ax.set_ylabel('mean contrib. in components')
ax.set_xlabel('feature #')
Out[18]:
In [19]:
iris = datasets.load_iris()
iris.data.shape
# generate 20 completely random attributes and append them to the original data
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))
X = np.hstack((iris.data, E))
y = iris.target
print('Shape of the original data:', iris.data.shape)
print('Shape of the extended data:', X.shape)
X_indices = np.arange(X.shape[-1])
In [20]:
z_scaler = StandardScaler()
pca_trafo = PCA()
z_data = z_scaler.fit_transform(X)
pca_data = pca_trafo.fit_transform(z_data)
ax = seaborn.heatmap(np.log(pca_trafo.inverse_transform(np.eye(X.shape[1]))), cmap="hot", cbar=False)
ax.set_xlabel('features')
ax.set_ylabel('components')
Out[20]:
No clear trend can be read from this heatmap. Let's also try the average contribution to the components per attribute.
In [21]:
means = np.mean(pca_trafo.inverse_transform(np.eye(X.shape[1])), axis=0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(X_indices, means)
ax.set_ylabel('mean contrib. in components')
ax.set_xlabel('feature #')
Out[21]:
This does not look at all like high importance values for the first four attributes and low ones for the rest. It looks rather random.
This means that PCA does not try to best encode the attributes that are most related to the predicted value, but simply those that carry the most variance. In this case PCA was merely trying to compress the random data as well as possible.
Finding out how much variance there is in the individual attributes can be interesting, but it can be done much more simply than by computing PCA over the whole dataset (see the sketch below).
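That simpler alternative, as a minimal sketch: the per-attribute variances can be read off directly, with no PCA involved. On the extended iris data the four original columns should stand out, because the added noise was drawn from a narrow uniform range.
print(np.var(X, axis=0))  # per-column variance of the extended iris data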
In [ ]: