In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *
import seaborn as sns
sns.set(font_scale=1.5)
import numpy as np
import pandas as pd
In [2]:
from sklearn.datasets import load_wine

# Load the wine dataset object; it exposes the feature matrix, the targets
# and the feature names as attributes used by the cells below.
wine_datasets = load_wine()

# Show the names of the feature columns.
wine_datasets.feature_names
Out[2]:
In [3]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's feature names.
wine_features = pd.DataFrame(
    wine_datasets.data,
    columns=wine_datasets.feature_names,
)

# Preview the first rows.
wine_features.head()
Out[3]:
In [4]:
# Per-column summary statistics of the raw (un-normalized) features.
wine_features.describe()
Out[4]:
In [5]:
from collections import Counter

# Ground-truth label of every sample.
wine_target = wine_datasets.target

# Tally how many samples carry each label.
Counter(wine_target)
Out[5]:
In [6]:
# Dimensionality of the original data = number of columns in the feature table.
n_features = len(wine_features.columns)

# Number of clusters to look for = number of distinct ground-truth labels.
n_clusters = len(set(wine_target))

(n_features, n_clusters)
Out[6]:
In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature with StandardScaler, then rebuild a DataFrame
# that keeps the original column names.
standardized_values = StandardScaler().fit_transform(wine_features)
normed_wine_features = pd.DataFrame(
    standardized_values,
    columns=wine_datasets.feature_names,
)

# Summary statistics of the standardized features.
normed_wine_features.describe()
Out[7]:
In [8]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps as many components as there are features (no
# reduction yet); its explained-variance ratios are inspected in the
# following cells.
full_pca = (
    PCA(n_components=n_features, random_state=14)
    .fit(normed_wine_features)
)
In [9]:
# Cumulative explained-variance ratio as a function of how many principal
# components are kept.
component_counts = range(1, n_features + 1)
cumulative_variance_ratio = np.cumsum(full_pca.explained_variance_ratio_)

plt.plot(component_counts, cumulative_variance_ratio)
plt.xlabel("Number of features compressed by PCA")
plt.ylabel("Accumulative Variance Ratio")
Out[9]:
In [10]:
# Smallest number of principal components whose cumulative explained-variance
# ratio is strictly greater than the threshold.
threshold_accum_var_ratio = 0.8

# The ratios are non-negative, so their running sum is non-decreasing and can
# be binary-searched.
accum_var_ratio = np.cumsum(full_pca.explained_variance_ratio_)

# searchsorted(side='right') yields the index of the first entry that is
# strictly greater than the threshold; +1 turns that index into a count.
# When no entry exceeds the threshold (e.g. threshold >= 1.0) the search
# returns len(accum_var_ratio); clamp to n_features so the cell falls back to
# "keep every component" instead of failing on int(nan) as the previous
# where/nanmin formulation did.
pca_n_features = min(
    int(np.searchsorted(accum_var_ratio, threshold_accum_var_ratio, side='right')) + 1,
    n_features,
)
pca_n_features
Out[10]:
In [11]:
# Reduce the standardized features to the number of components chosen above.
partial_pca = PCA(n_components=pca_n_features, random_state=14)
pca_scores = partial_pca.fit_transform(normed_wine_features)

# Columns are named x01, x02, ... one per retained component.
pca_column_names = ['x%02d' % k for k in range(1, pca_n_features + 1)]
decomposed_pca_wine_features = pd.DataFrame(pca_scores, columns=pca_column_names)
In [12]:
from sklearn.manifold import TSNE

# Embed the PCA-reduced data into two dimensions with t-SNE so it can be
# drawn as a scatter plot.
pca_tsne = TSNE(n_components=2, random_state=14)
pca_embedding = pca_tsne.fit_transform(decomposed_pca_wine_features)

# Two columns: x01 and x02.
visualized_pca_wine_features = pd.DataFrame(
    pca_embedding,
    columns=['x%02d' % k for k in range(1, 3)],
)
In [13]:
# Scatter the 2-D t-SNE embedding of the PCA-reduced data, one colour per
# ground-truth class, all drawn onto the same axes.
husl_colors = sns.color_palette('husl', 4)
true_labels = np.asarray(wine_target)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples that belong to class c.
    member_rows = np.flatnonzero(true_labels == c)
    ax = visualized_pca_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='class %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[13]:
In [14]:
from sklearn.cluster import KMeans

# Cluster the PCA-reduced data with k-Means (k-means++ seeding, 100 restarts).
pca_km = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=100,
    random_state=14,
)

# Cluster index assigned to every sample.
pca_clusters = pca_km.fit_predict(decomposed_pca_wine_features)
In [15]:
# Same t-SNE embedding as the ground-truth plot, but coloured by the cluster
# that k-Means assigned to each sample.
husl_colors = sns.color_palette('husl', 4)
cluster_ids = np.asarray(pca_clusters)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples assigned to cluster c.
    member_rows = np.flatnonzero(cluster_ids == c)
    ax = visualized_pca_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='cluster %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (k-Means clustering)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[15]:
In [16]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

# Agreement between the ground-truth classes and the PCA + k-Means clusters,
# measured with NMI and AMI (in that order).
pca_nmi_score, pca_ami_score = (
    score_fn(wine_target, pca_clusters)
    for score_fn in (normalized_mutual_info_score, adjusted_mutual_info_score)
)
(pca_nmi_score, pca_ami_score)
Out[16]:
In [17]:
from subspacekmeans import SubspaceKMeans

# Run Subspace k-Means directly on the standardized features (no PCA step).
skm = SubspaceKMeans(
    n_clusters=n_clusters,
    n_jobs=-1,
    random_state=14,
)
skm_clusters = skm.fit_predict(normed_wine_features)

# Display m_, which this notebook treats as the dimension of the
# clustered-space found by the model.
skm.m_
Out[17]:
In [18]:
# Project the standardized features with the fitted Subspace k-Means model;
# the result keeps n_features columns, named x01, x02, ...
skm_projection = skm.transform(normed_wine_features)
skm_column_names = ['x%02d' % k for k in range(1, n_features + 1)]
transformed_wine_features = pd.DataFrame(skm_projection, columns=skm_column_names)
In [19]:
# First two columns of the Subspace k-Means projection, one colour per
# ground-truth class, all drawn onto the same axes.
husl_colors = sns.color_palette('husl', 4)
true_labels = np.asarray(wine_target)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples that belong to class c.
    member_rows = np.flatnonzero(true_labels == c)
    ax = transformed_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='class %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[19]:
In [20]:
# First two columns of the Subspace k-Means projection, coloured by the
# cluster that Subspace k-Means assigned to each sample.
husl_colors = sns.color_palette('husl', 4)
cluster_ids = np.asarray(skm_clusters)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples assigned to cluster c.
    member_rows = np.flatnonzero(cluster_ids == c)
    ax = transformed_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='cluster %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (predicted clusters)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[20]:
In [21]:
# Take the projected columns from position skm.m_ onward (the part this
# notebook calls the Noise-Space) and embed them into two dimensions with
# t-SNE for plotting.
noise_tsne = TSNE(n_components=2, random_state=14)
noise_columns = transformed_wine_features.iloc[:, skm.m_:]

# Two columns: x01 and x02.
visualized_noise_wine_features = pd.DataFrame(
    noise_tsne.fit_transform(noise_columns),
    columns=['x%02d' % k for k in range(1, 3)],
)
In [22]:
# 2-D t-SNE embedding of the Noise-Space columns, one colour per
# ground-truth class, all drawn onto the same axes.
husl_colors = sns.color_palette('husl', 4)
true_labels = np.asarray(wine_target)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples that belong to class c.
    member_rows = np.flatnonzero(true_labels == c)
    ax = visualized_noise_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='class %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[22]:
In [23]:
# 2-D t-SNE embedding of the Noise-Space columns, coloured by the cluster
# that Subspace k-Means assigned to each sample.
husl_colors = sns.color_palette('husl', 4)  # built once instead of once per iteration
cluster_ids = np.asarray(skm_clusters)

ax = None
for c in range(n_clusters):
    # Positional row indices of the samples assigned to cluster c.
    member_rows = np.flatnonzero(cluster_ids == c)
    # The groups here are predicted clusters, not ground-truth classes, so the
    # legend entry reads "cluster N" like the other predicted-cluster plots.
    ax = visualized_noise_wine_features.iloc[member_rows].plot.scatter(
        x='x01', y='x02', color=husl_colors[c], label='cluster %d' % c, ax=ax
    )

# Legend goes outside the plot area, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (predicted clusters)')
# Axis labels use the same wording as the ground-truth Noise-Space plot.
plt.xlabel('1st feature')
plt.ylabel('2nd feature')
Out[23]:
In [24]:
# Agreement between the ground-truth classes and the Subspace k-Means
# clusters, measured with NMI and AMI (in that order).
skm_nmi_score, skm_ami_score = (
    score_fn(wine_target, skm_clusters)
    for score_fn in (normalized_mutual_info_score, adjusted_mutual_info_score)
)
(skm_nmi_score, skm_ami_score)
Out[24]:
In [25]:
# Side-by-side comparison table: one row per method, one column per metric.
nmi_by_method = {
    'PCA': pca_nmi_score,
    'Subspace k-Means': skm_nmi_score,
}
ami_by_method = {
    'PCA': pca_ami_score,
    'Subspace k-Means': skm_ami_score,
}
pd.DataFrame({'NMI': nmi_by_method, 'AMI': ami_by_method})
Out[25]: