In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
import numpy as np
import pandas as pd

Load the wine dataset from scikit-learn


In [2]:
from sklearn.datasets import load_wine

wine_datasets = load_wine()

# feature names
wine_datasets.feature_names


Out[2]:
['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [3]:
# features of wine data
wine_features = pd.DataFrame(wine_datasets.data, columns=wine_datasets.feature_names)

wine_features.head()


Out[3]:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  od280/od315_of_diluted_wines  proline
0    14.23        1.71  2.43               15.6      127.0           2.80        3.06                  0.28             2.29             5.64  1.04                          3.92   1065.0
1    13.20        1.78  2.14               11.2      100.0           2.65        2.76                  0.26             1.28             4.38  1.05                          3.40   1050.0
2    13.16        2.36  2.67               18.6      101.0           2.80        3.24                  0.30             2.81             5.68  1.03                          3.17   1185.0
3    14.37        1.95  2.50               16.8      113.0           3.85        3.49                  0.24             2.18             7.80  0.86                          3.45   1480.0
4    13.24        2.59  2.87               21.0      118.0           2.80        2.69                  0.39             1.82             4.32  1.04                          2.93    735.0

In [4]:
# summary of wine features
wine_features.describe()


Out[4]:
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity         hue  od280/od315_of_diluted_wines      proline
count  178.000000  178.000000  178.000000         178.000000  178.000000     178.000000  178.000000            178.000000       178.000000       178.000000  178.000000                    178.000000   178.000000
mean    13.000618    2.336348    2.366517          19.494944   99.741573       2.295112    2.029270              0.361854         1.590899         5.058090    0.957449                      2.611685   746.893258
std      0.811827    1.117146    0.274344           3.339564   14.282484       0.625851    0.998859              0.124453         0.572359         2.318286    0.228572                      0.709990   314.907474
min     11.030000    0.740000    1.360000          10.600000   70.000000       0.980000    0.340000              0.130000         0.410000         1.280000    0.480000                      1.270000   278.000000
25%     12.362500    1.602500    2.210000          17.200000   88.000000       1.742500    1.205000              0.270000         1.250000         3.220000    0.782500                      1.937500   500.500000
50%     13.050000    1.865000    2.360000          19.500000   98.000000       2.355000    2.135000              0.340000         1.555000         4.690000    0.965000                      2.780000   673.500000
75%     13.677500    3.082500    2.557500          21.500000  107.000000       2.800000    2.875000              0.437500         1.950000         6.200000    1.120000                      3.170000   985.000000
max     14.830000    5.800000    3.230000          30.000000  162.000000       3.880000    5.080000              0.660000         3.580000        13.000000    1.710000                      4.000000  1680.000000

In [5]:
from collections import Counter

# target of wine data
wine_target = wine_datasets.target

Counter(wine_target)


Out[5]:
Counter({0: 59, 1: 71, 2: 48})

In [6]:
# number of features in original wine data
n_features = wine_features.shape[1]
# number of clusters (given by data)
n_clusters = len(Counter(wine_target))

n_features, n_clusters


Out[6]:
(13, 3)

In [7]:
# normalize wine features
from sklearn.preprocessing import StandardScaler

normed_wine_features = pd.DataFrame(
    StandardScaler().fit_transform(wine_features),
    columns=wine_datasets.feature_names
)

normed_wine_features.describe()


Out[7]:
            alcohol     malic_acid            ash  alcalinity_of_ash      magnesium  total_phenols     flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity            hue  od280/od315_of_diluted_wines        proline
count  1.780000e+02   1.780000e+02   1.780000e+02       1.780000e+02   1.780000e+02   1.780000e+02   1.780000e+02          1.780000e+02     1.780000e+02     1.780000e+02   1.780000e+02                  1.780000e+02   1.780000e+02
mean   7.943708e-15   3.592632e-16  -4.066660e-15      -7.983626e-17  -7.983626e-17  -3.991813e-17   9.979533e-16         -5.588538e-16    -1.656602e-15    -3.442939e-16   1.636643e-15                  2.235415e-15  -1.197544e-16
std    1.002821e+00   1.002821e+00   1.002821e+00       1.002821e+00   1.002821e+00   1.002821e+00   1.002821e+00          1.002821e+00     1.002821e+00     1.002821e+00   1.002821e+00                  1.002821e+00   1.002821e+00
min   -2.434235e+00  -1.432983e+00  -3.679162e+00      -2.671018e+00  -2.088255e+00  -2.107246e+00  -1.695971e+00         -1.868234e+00    -2.069034e+00    -1.634288e+00  -2.094732e+00                 -1.895054e+00  -1.493188e+00
25%   -7.882448e-01  -6.587486e-01  -5.721225e-01      -6.891372e-01  -8.244151e-01  -8.854682e-01  -8.275393e-01         -7.401412e-01    -5.972835e-01    -7.951025e-01  -7.675624e-01                 -9.522483e-01  -7.846378e-01
50%    6.099988e-02  -4.231120e-01  -2.382132e-02       1.518295e-03  -1.222817e-01   9.595986e-02   1.061497e-01         -1.760948e-01    -6.289785e-02    -1.592246e-01   3.312687e-02                  2.377348e-01  -2.337204e-01
75%    8.361286e-01   6.697929e-01   6.981085e-01       6.020883e-01   5.096384e-01   8.089974e-01   8.490851e-01          6.095413e-01     6.291754e-01     4.939560e-01   7.131644e-01                  7.885875e-01   7.582494e-01
max    2.259772e+00   3.109192e+00   3.156325e+00       3.154511e+00   4.371372e+00   2.539515e+00   3.062832e+00          2.402403e+00     3.485073e+00     3.435432e+00   3.301694e+00                  1.960915e+00   2.971473e+00
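
The means above are on the order of 1e-15 to 1e-17, i.e. zero up to floating-point noise, and the standard deviations read 1.002821 rather than exactly 1: StandardScaler divides by the population standard deviation (ddof=0), while DataFrame.describe() reports the sample standard deviation (ddof=1). A quick check of the mismatch factor (not a cell from the original notebook):

np.sqrt(178 / 177)  # ≈ 1.002821, the ddof=0 vs. ddof=1 scaling factor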

PCA (for reference)


In [8]:
from sklearn.decomposition import PCA

# fit PCA keeping all components (no dimensionality reduction yet)
full_pca = PCA(n_components=n_features, random_state=14).fit(normed_wine_features)

In [9]:
# plot the number of retained components vs. the cumulative explained variance ratio
plt.plot(range(1, n_features + 1), np.cumsum(full_pca.explained_variance_ratio_))
plt.xlabel("Number of components kept by PCA")
plt.ylabel("Cumulative explained variance ratio")


Out[9]:
<matplotlib.text.Text at 0x114cf3080>
Usually, to decide how many components to keep after PCA, we set a threshold on the cumulative explained variance ratio (for example, 0.8) and take the smallest number of components whose cumulative ratio exceeds it; a one-line sketch follows, and the next cell computes it.
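
Since the cumulative ratio is non-decreasing, np.searchsorted can locate that count directly; a minimal sketch (not a cell from the original notebook):

# first 0-based index whose cumulative ratio strictly exceeds 0.8, plus 1
int(np.searchsorted(np.cumsum(full_pca.explained_variance_ratio_), 0.8, side='right') + 1)  # 5, matching Out[10] below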

In [10]:
threshold_accum_var_ratio = 0.8
# smallest number of components whose cumulative explained variance ratio
# exceeds the threshold (positions that fail become NaN, so nanmin picks
# the first position that passes)
pca_n_features = int(np.nanmin(np.where(
    np.cumsum(full_pca.explained_variance_ratio_) > threshold_accum_var_ratio,
    range(1, n_features + 1), np.nan
)))
pca_n_features


Out[10]:
5

In [11]:
# PCA decomposition
partial_pca = PCA(n_components=pca_n_features, random_state=14)
decomposed_pca_wine_features = pd.DataFrame(
    partial_pca.fit_transform(normed_wine_features),
    columns=['x%02d' % x for x in range(1, pca_n_features + 1)]
)

In [12]:
# Visualize decomposed PCA data with t-SNE
from sklearn.manifold import TSNE

pca_tsne = TSNE(n_components=2, random_state=14)
visualized_pca_wine_features = pd.DataFrame(
    pca_tsne.fit_transform(decomposed_pca_wine_features),
    columns=['x%02d' % x for x in range(1, 3)]
)

In [13]:
ax = None
for c in range(n_clusters):
    ax = visualized_pca_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[13]:
<matplotlib.text.Text at 0x114fdb978>

In [14]:
# k-Means with decomposed data by PCA
from sklearn.cluster import KMeans

pca_km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=100, random_state=14)
pca_clusters = pca_km.fit_predict(decomposed_pca_wine_features)
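
Before scoring, a contingency table gives a quick view of how the predicted clusters line up with the ground-truth classes; a sketch (not a cell from the original notebook):

pd.crosstab(pd.Series(wine_target, name='class'), pd.Series(pca_clusters, name='cluster'))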

In [15]:
ax = None
for c in range(n_clusters):
    ax = visualized_pca_wine_features.iloc[
        list(np.where(np.array(pca_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('wine features decomposed by PCA (k-Means clustering)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[15]:
<matplotlib.text.Text at 0x11526a0b8>

In [16]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

# NMI and AMI score
pca_nmi_score = normalized_mutual_info_score(wine_target, pca_clusters)
pca_ami_score = adjusted_mutual_info_score(wine_target, pca_clusters)
pca_nmi_score, pca_ami_score


Out[16]:
(0.87589846754078748, 0.8716230315171426)
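
For reference, NMI is the mutual information between the two labelings normalized by their entropies. A minimal sketch of the computation, assuming the geometric-mean normalization that was scikit-learn's default when this notebook appears to have been written (newer versions default to the arithmetic mean):

from sklearn.metrics import mutual_info_score
from scipy.stats import entropy

def nmi_sketch(labels_true, labels_pred):
    mi = mutual_info_score(labels_true, labels_pred)  # I(U; V) in nats
    h_true = entropy(np.bincount(labels_true))        # H(U)
    h_pred = entropy(np.bincount(labels_pred))        # H(V)
    return mi / np.sqrt(h_true * h_pred)              # geometric-mean normalization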

Subspace k-Means


In [17]:
from subspacekmeans import SubspaceKMeans

skm = SubspaceKMeans(n_clusters=n_clusters, n_jobs=-1, random_state=14)
skm_clusters = skm.fit_predict(normed_wine_features)
# dimension of clustered-space
skm.m_


Out[17]:
2
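
Subspace k-Means learns an orthogonal rotation of the feature space and splits the rotated axes into an m_-dimensional cluster space (here 2 axes) and a complementary noise space; transform applies that rotation. Assuming transform is a pure (possibly centered) rotation, pairwise distances are preserved, which we can sanity-check (not a cell from the original notebook):

from scipy.spatial.distance import pdist

# pairwise distances are invariant under translation + rotation
np.allclose(pdist(normed_wine_features.values), pdist(skm.transform(normed_wine_features)))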

In [18]:
# Visualize Cluster-Space
transformed_wine_features = pd.DataFrame(
    skm.transform(normed_wine_features),
    columns=['x%02d' % x for x in range(1, n_features + 1)]
)

In [19]:
ax = None
for c in range(n_clusters):
    ax = transformed_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[19]:
<matplotlib.text.Text at 0x1153a35f8>

In [20]:
ax = None
for c in range(n_clusters):
    ax = transformed_wine_features.iloc[
        list(np.where(np.array(skm_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Cluster-Space of wine features (predicted clusters)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[20]:
<matplotlib.text.Text at 0x1154eaa90>

In [21]:
# Visualize Noise-Space
noise_tsne = TSNE(n_components=2, random_state=14)
visualized_noise_wine_features = pd.DataFrame(
    noise_tsne.fit_transform(transformed_wine_features.iloc[:, skm.m_:]),
    columns=['x%02d' % x for x in range(1, 3)]
)

In [22]:
ax = None
for c in range(n_clusters):
    ax = visualized_noise_wine_features.iloc[
        list(np.where(np.array(wine_target) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='class %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (Ground Truth)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[22]:
<matplotlib.text.Text at 0x1154d9a90>

In [23]:
ax = None
for c in range(n_clusters):
    ax = visualized_noise_wine_features.iloc[
        list(np.where(np.array(skm_clusters) == c)[0]), :
    ].plot(
        kind='scatter', x='x01', y='x02', color=sns.color_palette('husl', 4)[c], label='cluster %d' % c, ax=ax
    )
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('the Noise-Space of wine features (predicted clusters)')
plt.xlabel('1st feature')
plt.ylabel('2nd feature')


Out[23]:
<matplotlib.text.Text at 0x115766f98>

In [24]:
# NMI and AMI scores
skm_nmi_score = normalized_mutual_info_score(wine_target, skm_clusters)
skm_ami_score = adjusted_mutual_info_score(wine_target, skm_clusters)
skm_nmi_score, skm_ami_score


Out[24]:
(0.87589846754078737, 0.8716230315171426)

In [25]:
# summary of scores
pd.DataFrame({
    'NMI': {
        'PCA': pca_nmi_score,
        'Subspace k-Means': skm_nmi_score,
    },
    'AMI': {
        'PCA': pca_ami_score,
        'Subspace k-Means': skm_ami_score,
    },
})


Out[25]:
                       AMI       NMI
PCA               0.871623  0.875898
Subspace k-Means  0.871623  0.875898
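
The two pipelines agree to floating-point precision, which suggests they recovered the same partition up to label permutation; a crosstab of the two labelings would confirm (a sketch, not a cell from the original notebook):

pd.crosstab(pd.Series(pca_clusters, name='pca_cluster'), pd.Series(skm_clusters, name='skm_cluster'))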