In [570]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Location of the input table.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# project-relative location before sharing the notebook.
DATA_PATH = '/Users/tillbey/Entwicklung/test/data.csv'

# Load the table; the first CSV column becomes the row index.
data = pd.read_csv(DATA_PATH, index_col=0)

# Quick-look line plot: one line per column, row index along the x-axis.
data.plot()
plt.show()
In [568]:
def make_pca_scatter(data, pos, marker, standardize=False):
    """Fit a 3-component PCA on ``data`` and scatter its rows on the current axes.

    Every row of ``data`` is projected onto the principal components. The
    first two scores of each row are drawn as one point on the current
    pyplot axes and annotated with the row's index label.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per item to plot; all columns are fed to the PCA.
    pos : object
        Unused. Kept so existing positional calls keep working.
    marker : str
        Matplotlib marker style used for every point of this call.
    standardize : bool, default False
        When True, the columns are scaled with ``StandardScaler`` before
        the PCA is fitted. The default fits on the raw values.

    Returns
    -------
    sklearn.decomposition.PCA
        The fitted PCA object.
    """
    colours = ('red', 'green', 'yellow', 'black', 'brown')
    # Fixed label displacement in pixels. With textcoords='offset pixels' the
    # xytext argument is an offset from the point, so it must not depend on
    # the point's data coordinates.
    label_offset = (10, 10)

    values = data.values
    if standardize:
        values = StandardScaler().fit_transform(values)

    pca = PCA(n_components=3)
    scores = pd.DataFrame(pca.fit_transform(values), index=data.index)

    for position, candidate in enumerate(scores.itertuples()):
        # candidate[0] is the index label; [1] and [2] are the first two scores.
        # The palette is cycled so rows beyond the fifth are still drawn.
        colour = colours[position % len(colours)]
        plt.scatter(candidate[1], candidate[2], label=candidate.Index,
                    c=colour, marker=marker)
        plt.annotate(candidate.Index, xy=candidate[1:3], xytext=label_offset,
                     textcoords='offset pixels')
    return pca
In [569]:
# Overlay two PCA projections in a single figure so they can be compared.
fig, ax = plt.subplots(figsize=(6, 4), dpi=100)

# Fit on every row (circles); this fit is kept in `pca` for the cells below.
pca = make_pca_scatter(data, 0, 'o')

# Refit with the 'Le Pen' row left out (crosses); the returned fit is discarded.
without_le_pen = data[data.index != 'Le Pen']
make_pca_scatter(without_le_pen, 1, 'x')

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.tight_layout()
plt.show()
In [557]:
# Share of the total variance attributed to each fitted principal component
# (one entry per component). `pca` is the object returned by
# make_pca_scatter(data, 0, 'o') in the plotting cell above.
pca.explained_variance_ratio_
Out[557]:
In [558]:
# Scree plot: per-component explained variance (bars) together with the
# running total (step line).
var_exp = list(pca.explained_variance_ratio_)
cum_var_exp = list(pca.explained_variance_ratio_.cumsum())

# Derive the x positions from the fitted PCA instead of repeating the
# component count, so this cell keeps working if n_components changes.
component_positions = range(len(var_exp))

plt.figure(figsize=(6, 4))
plt.bar(component_positions, var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(component_positions, cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
In [559]:
# Principal axes of the fitted PCA: one row per component, one column per
# input feature.
pca.components_
Out[559]:
In [560]:
# Number of input features the PCA was fitted on. Read from the shape of
# components_ (n_components x n_features) because the `n_features_` attribute
# is deprecated and absent from newer scikit-learn releases.
pca.components_.shape[1]
Out[560]:
In [561]:
# NOTE(review): `X` is not assigned in any cell visible in this notebook, so
# this line presumably relies on leftover kernel state and would raise
# NameError on a fresh "Restart & Run All" - confirm, then define or remove it.
X
Out[561]:
In [ ]: