In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
import numpy as np
import pandas as pd

Load the wine dataset from scikit-learn


In [2]:
from sklearn.datasets import load_wine

wine_datasets = load_wine()

# feature names
wine_datasets.feature_names


Out[2]:
['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [3]:
# features of wine data
wine_features = pd.DataFrame(wine_datasets.data, columns=wine_datasets.feature_names)

wine_features.head()


Out[3]:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  od280/od315_of_diluted_wines  proline
0    14.23        1.71  2.43               15.6      127.0           2.80        3.06                  0.28             2.29             5.64  1.04                          3.92   1065.0
1    13.20        1.78  2.14               11.2      100.0           2.65        2.76                  0.26             1.28             4.38  1.05                          3.40   1050.0
2    13.16        2.36  2.67               18.6      101.0           2.80        3.24                  0.30             2.81             5.68  1.03                          3.17   1185.0
3    14.37        1.95  2.50               16.8      113.0           3.85        3.49                  0.24             2.18             7.80  0.86                          3.45   1480.0
4    13.24        2.59  2.87               21.0      118.0           2.80        2.69                  0.39             1.82             4.32  1.04                          2.93    735.0

In [4]:
# summary of wine features
wine_features.describe()


Out[4]:
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity         hue  od280/od315_of_diluted_wines      proline
count  178.000000  178.000000  178.000000         178.000000  178.000000     178.000000  178.000000            178.000000       178.000000       178.000000  178.000000                    178.000000   178.000000
mean    13.000618    2.336348    2.366517          19.494944   99.741573       2.295112    2.029270              0.361854         1.590899         5.058090    0.957449                      2.611685   746.893258
std      0.811827    1.117146    0.274344           3.339564   14.282484       0.625851    0.998859              0.124453         0.572359         2.318286    0.228572                      0.709990   314.907474
min     11.030000    0.740000    1.360000          10.600000   70.000000       0.980000    0.340000              0.130000         0.410000         1.280000    0.480000                      1.270000   278.000000
25%     12.362500    1.602500    2.210000          17.200000   88.000000       1.742500    1.205000              0.270000         1.250000         3.220000    0.782500                      1.937500   500.500000
50%     13.050000    1.865000    2.360000          19.500000   98.000000       2.355000    2.135000              0.340000         1.555000         4.690000    0.965000                      2.780000   673.500000
75%     13.677500    3.082500    2.557500          21.500000  107.000000       2.800000    2.875000              0.437500         1.950000         6.200000    1.120000                      3.170000   985.000000
max     14.830000    5.800000    3.230000          30.000000  162.000000       3.880000    5.080000              0.660000         3.580000        13.000000    1.710000                      4.000000  1680.000000

In [5]:
from collections import Counter

# target of wine data
wine_target = wine_datasets.target

Counter(wine_target)


Out[5]:
Counter({0: 59, 1: 71, 2: 48})

In [6]:
# number of features in original wine data
n_features = wine_features.shape[1]
# number of clusters (given by data)
n_clusters = len(Counter(wine_target))

n_features, n_clusters


Out[6]:
(13, 3)

In [7]:
# normalize wine features
from sklearn.preprocessing import StandardScaler

normed_wine_features = pd.DataFrame(
    StandardScaler().fit_transform(wine_features),
    columns=wine_datasets.feature_names
)

normed_wine_features.describe()


Out[7]:
            alcohol     malic_acid            ash  alcalinity_of_ash      magnesium  total_phenols     flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity            hue  od280/od315_of_diluted_wines        proline
count  1.780000e+02   1.780000e+02   1.780000e+02       1.780000e+02   1.780000e+02   1.780000e+02   1.780000e+02          1.780000e+02     1.780000e+02     1.780000e+02   1.780000e+02                  1.780000e+02   1.780000e+02
mean   7.943708e-15   3.592632e-16  -4.066660e-15      -7.983626e-17  -7.983626e-17  -3.991813e-17   9.979533e-16         -5.588538e-16    -1.656602e-15    -3.442939e-16   1.636643e-15                  2.235415e-15  -1.197544e-16
std    1.002821e+00   1.002821e+00   1.002821e+00       1.002821e+00   1.002821e+00   1.002821e+00   1.002821e+00          1.002821e+00     1.002821e+00     1.002821e+00   1.002821e+00                  1.002821e+00   1.002821e+00
min   -2.434235e+00  -1.432983e+00  -3.679162e+00      -2.671018e+00  -2.088255e+00  -2.107246e+00  -1.695971e+00         -1.868234e+00    -2.069034e+00    -1.634288e+00  -2.094732e+00                 -1.895054e+00  -1.493188e+00
25%   -7.882448e-01  -6.587486e-01  -5.721225e-01      -6.891372e-01  -8.244151e-01  -8.854682e-01  -8.275393e-01         -7.401412e-01    -5.972835e-01    -7.951025e-01  -7.675624e-01                 -9.522483e-01  -7.846378e-01
50%    6.099988e-02  -4.231120e-01  -2.382132e-02       1.518295e-03  -1.222817e-01   9.595986e-02   1.061497e-01         -1.760948e-01    -6.289785e-02    -1.592246e-01   3.312687e-02                  2.377348e-01  -2.337204e-01
75%    8.361286e-01   6.697929e-01   6.981085e-01       6.020883e-01   5.096384e-01   8.089974e-01   8.490851e-01          6.095413e-01     6.291754e-01     4.939560e-01   7.131644e-01                  7.885875e-01   7.582494e-01
max    2.259772e+00   3.109192e+00   3.156325e+00       3.154511e+00   4.371372e+00   2.539515e+00   3.062832e+00          2.402403e+00     3.485073e+00     3.435432e+00   3.301694e+00                  1.960915e+00   2.971473e+00
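
The means above are on the order of 1e-15 to 1e-17, i.e. zero up to floating-point noise, and the standard deviations read 1.002821 rather than exactly 1: StandardScaler divides by the population standard deviation (ddof=0), while DataFrame.describe() reports the sample standard deviation (ddof=1). A quick check of the mismatch factor (not a cell from the original notebook):

np.sqrt(178 / 177)  # ≈ 1.002821, the ddof=0 vs. ddof=1 scaling factor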

PCA (for reference)


In [8]:
from sklearn.decomposition import PCA

# fit PCA keeping all components (no dimensionality reduction yet)
full_pca = PCA(n_components=n_features, random_state=14).fit(normed_wine_features)

In [9]:
# plot the number of retained components vs. the cumulative explained variance ratio
plt.plot(range(1, n_features + 1), np.cumsum(full_pca.explained_variance_ratio_))
plt.xlabel("Number of components kept by PCA")
plt.ylabel("Cumulative explained variance ratio")


Out[9]:
<matplotlib.text.Text at 0x114cf3080>
Usually, to decide how many components to keep after PCA, we set a threshold on the cumulative explained variance ratio (for example, 0.8) and take the smallest number of components whose cumulative ratio exceeds it; a one-line sketch follows, and the next cell computes it.
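
Since the cumulative ratio is non-decreasing, np.searchsorted can locate that count directly; a minimal sketch (not a cell from the original notebook):

# first 0-based index whose cumulative ratio strictly exceeds 0.8, plus 1
int(np.searchsorted(np.cumsum(full_pca.explained_variance_ratio_), 0.8, side='right') + 1)  # 5, matching Out[10] below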

In [10]:
threshold_accum_var_ratio = 0.8
# smallest number of components whose cumulative explained variance ratio
# exceeds the threshold (positions that fail become NaN, so nanmin picks
# the first position that passes)
pca_n_features = int(np.nanmin(np.where(
    np.cumsum(full_pca.explained_variance_ratio_) > threshold_accum_var_ratio,
    range(1, n_features + 1), np.nan
)))
pca_n_features


Out[10]:
5

In [11]:
# PCA decomposition
partial_pca = PCA(n_components=pca_n_features, random_state=14)
decomposed_pca_wine_features = pd.DataFrame(
    partial_pca.fit_transform(normed_wine_features),
    columns=['x%02d' % x for x in range(1, pca_n_features + 1)]
)

In [12]:
# Visualize decomposed PCA data with t-SNE
from sklearn.manifold import TSNE

pca_tsne = TSNE(n_components=2, random_state=14)
visualized_pca_wine_features = pd.DataFrame(
    pca_tsne.fit_transform(decomposed_pca_wine_features),
    columns=['x%02d' % x for x in range(1, 3)]
)

In [13]:
ax = None
for c in range(n_clusters):
    ax = visualized_pca_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[13]:
<matplotlib.text.Text at 0x114fdb978>

In [14]:
# k-Means with decomposed data by PCA
from sklearn.cluster import KMeans

pca_km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=100, random_state=14)
pca_clusters = pca_km.fit_predict(decomposed_pca_wine_features)
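
Before scoring, a contingency table gives a quick view of how the predicted clusters line up with the ground-truth classes; a sketch (not a cell from the original notebook):

pd.crosstab(pd.Series(wine_target, name='class'), pd.Series(pca_clusters, name='cluster'))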

In [15]:
ax = None
for c in range(n_clusters):
    ax = visualized_pca_wine_features.iloc[
        list(np.where(np.array(pca_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (k-Means clustering)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[15]:
<matplotlib.text.Text at 0x11526a0b8>

In [16]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

# NMI and AMI score
pca_nmi_score = normalized_mutual_info_score(wine_target, pca_clusters)
pca_ami_score = adjusted_mutual_info_score(wine_target, pca_clusters)
pca_nmi_score, pca_ami_score


Out[16]:
(0.87589846754078748, 0.8716230315171426)
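
For reference, NMI is the mutual information between the two labelings normalized by their entropies. A minimal sketch of the computation, assuming the geometric-mean normalization that was scikit-learn's default when this notebook appears to have been written (newer versions default to the arithmetic mean):

from sklearn.metrics import mutual_info_score
from scipy.stats import entropy

def nmi_sketch(labels_true, labels_pred):
    mi = mutual_info_score(labels_true, labels_pred)  # I(U; V) in nats
    h_true = entropy(np.bincount(labels_true))        # H(U)
    h_pred = entropy(np.bincount(labels_pred))        # H(V)
    return mi / np.sqrt(h_true * h_pred)              # geometric-mean normalization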

Subspace k-Means


In [17]:
from subspacekmeans import SubspaceKMeans

skm = SubspaceKMeans(n_clusters=n_clusters, n_jobs=-1, random_state=14)
skm_clusters = skm.fit_predict(normed_wine_features)
# dimension of clustered-space
skm.m_


Out[17]:
2
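
Subspace k-Means learns an orthogonal rotation of the feature space and splits the rotated axes into an m_-dimensional cluster space (here 2 axes) and a complementary noise space; transform applies that rotation. Assuming transform is a pure (possibly centered) rotation, pairwise distances are preserved, which we can sanity-check (not a cell from the original notebook):

from scipy.spatial.distance import pdist

# pairwise distances are invariant under translation + rotation
np.allclose(pdist(normed_wine_features.values), pdist(skm.transform(normed_wine_features)))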

In [18]:
# Visualize Cluster-Space
transformed_wine_features = pd.DataFrame(
    skm.transform(normed_wine_features),
    columns=['x%02d' % x for x in range(1, n_features + 1)]
)

In [19]:
ax = None
for c in range(n_clusters):
    ax = transformed_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[19]:
<matplotlib.text.Text at 0x1153a35f8>

In [20]:
ax = None
for c in range(n_clusters):
    ax = transformed_wine_features.iloc[
        list(np.where(np.array(skm_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (predicted clusters)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[20]:
<matplotlib.text.Text at 0x1154eaa90>

In [21]:
# Visualize Noise-Space
noise_tsne = TSNE(n_components=2, random_state=14)
visualized_noise_wine_features = pd.DataFrame(
    noise_tsne.fit_transform(transformed_wine_features.iloc[:, skm.m_:]),
    columns=['x%02d' % x for x in range(1, 3)]
)

In [22]:
ax = None
for c in range(n_clusters):
    ax = visualized_noise_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[22]:
<matplotlib.text.Text at 0x1154d9a90>

In [23]:
ax = None
for c in range(n_clusters):
    ax = visualized_noise_wine_features.iloc[
        list(np.where(np.array(skm_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (predicted clusters)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[23]:
<matplotlib.text.Text at 0x115766f98>

In [24]:
# NMI and AMI scores
skm_nmi_score = normalized_mutual_info_score(wine_target, skm_clusters)
skm_ami_score = adjusted_mutual_info_score(wine_target, skm_clusters)
skm_nmi_score, skm_ami_score


Out[24]:
(0.87589846754078737, 0.8716230315171426)

In [25]:
# summary of scores
pd.DataFrame({
    'NMI': {
        'PCA': pca_nmi_score,
        'Subspace k-Means': skm_nmi_score,
    },
    'AMI': {
        'PCA': pca_ami_score,
        'Subspace k-Means': skm_ami_score,
    },
})


Out[25]:
                       AMI       NMI
PCA               0.871623  0.875898
Subspace k-Means  0.871623  0.875898
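
The two pipelines agree to floating-point precision, which suggests they recovered the same partition up to label permutation; a crosstab of the two labelings would confirm (a sketch, not a cell from the original notebook):

pd.crosstab(pd.Series(pca_clusters, name='pca_cluster'), pd.Series(skm_clusters, name='skm_cluster'))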