Load wine.csv from the data folder. Run KMeans with n_clusters = 3 and compare the clusters to the Wine column. Then run KMeans and Hierarchical Clustering on the PCA-transformed data and again compare the clusters to the Wine column.
In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
from sklearn.cross_validation import cross_val_score,train_test_split
from sklearn.learning_curve import learning_curve
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import AgglomerativeClustering
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
plt.style.use('fivethirtyeight')
%matplotlib inline
figsize(12,5)
In [139]:
# BUG FIX: the CSV was read from disk twice; read once and copy instead.
# `wine_col1` keeps the untouched frame so the 'Wine' target survives the
# column drop performed in the next cell.
wine = pd.read_csv('../data/wine.csv')
wine_col1 = wine.copy()
In [140]:
# BUG FIX: DataFrame.drop's positional `axis` argument was deprecated and
# removed in pandas 2.0 -- use the explicit `columns=` keyword.
# Features: everything except the target column; labels: the 'Wine' class.
wine = wine.drop(columns='Wine')
wine_label = wine_col1['Wine']
In [141]:
# Sanity check: preview the first five feature rows (rich display, no print).
wine.head()
Out[141]:
In [142]:
# Standardize every feature to zero mean / unit variance. K-Means, PCA and
# Ward clustering are all distance-based, so unscaled features would let the
# largest-magnitude columns dominate.
wine_scale=scale(wine)
In [143]:
#Notes on what these metrics mean
#A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class
#A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster.
#The V-measure is the harmonic mean between homogeneity and completeness:
# cont'd v = 2 * (homogeneity * completeness) / (homogeneity + completeness)
#The Rand Index computes a similarity measure between two clusterings by considering all pairs
#of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
#Silhouette: The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.
#Negative values generally indicate that a sample has been assigned to the wrong cluster,
#as a different cluster is more similar.
In [50]:
# K-Means with k=3 on the scaled features, scored against the true Wine labels.
# random_state pins the centroid initialisation so the scores are reproducible.
km = KMeans(n_clusters=3, random_state=42)
km.fit(wine_scale)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(wine_label, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(wine_label, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(wine_label, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(wine_label, km.labels_))
# BUG FIX: silhouette_score is an *internal* metric -- it takes the data and
# the predicted cluster labels, not the ground-truth classes.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(wine_scale, km.labels_, sample_size=1000))
In [55]:
# Same evaluation with k=2 for comparison against the k=3 run above.
km = KMeans(n_clusters=2, random_state=42)
km.fit(wine_scale)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(wine_label, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(wine_label, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(wine_label, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(wine_label, km.labels_))
# BUG FIX: silhouette uses the predicted cluster labels, not the true classes.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(wine_scale, km.labels_, sample_size=1000))
In [145]:
# 3-D scatter of the first three scaled features, coloured by the true Wine
# class, alongside a k=3 K-Means fit on the same data.
# BUG FIX: the original cell referenced `pca` before any PCA was fitted
# (NameError on Restart & Run All) and computed surf/v0/v1 arrays and a
# `centers` list that were never used -- all of that dead code is removed.
X = wine_scale
y = wine_label
fig = plt.figure(1, figsize=(5, 4))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=60, azim=140)
km = KMeans(n_clusters=3, random_state=42)
km.fit(wine_scale)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y)
# Tick values are meaningless for standardized features; hide them.
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()
In [61]:
# Fit a 3-component PCA on the standardized features and report the share of
# total variance each principal component captures.
pca = PCA(n_components=3).fit(wine_scale)
print(pca.explained_variance_ratio_)
In [ ]:
In [165]:
# K-Means on the 3-component PCA projection, visualised in 3-D and coloured
# by the true Wine labels. Varying n_clusters here shows 3 is the best fit.
# BUG FIX: removed the unused `centers` list, the unused surf arrays, the
# broken `pca.transform(pca.components_[0])` calls (transform expects a 2-D
# array), and the commented-out label block. Also avoids rebinding `X` to the
# reduced data, which previously leaked hidden state into later cells.
X = wine_scale
y = wine_label
fig = plt.figure(1, figsize=(5, 4))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=60, azim=140)
reduced_data = PCA(n_components=3).fit_transform(wine_scale)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=42)
kmeans.fit(reduced_data)
ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c=y)
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()
In [159]:
# Second visualisation: the K-Means decision regions (Voronoi partition) in
# the plane of the first two principal components.
# BUG FIX: the model was fitted on a 3-component projection but predicted on
# a 2-feature mesh (shape mismatch) -- a 2-D demo needs n_components=2.
# BUG FIX: PCA was applied to `X`, which an earlier cell had already rebound
# to reduced data (PCA-of-PCA); fit on the scaled features explicitly.
reduced_data = PCA(n_components=2).fit_transform(wine_scale)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=42)
kmeans.fit(reduced_data)

h = .02  # step size of the mesh; decrease to increase plot resolution

# BUG FIX: the margins were inverted (min()+1 / max()-1 cropped the plot
# *inside* the data); pad outward so every point is visible.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Assign every mesh point to its nearest centroid and colour by cluster.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Mark the centroids with a white X.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
# BUG FIX: adjacent string literals concatenated with no separator; add '\n'.
plt.title('K-means clustering on the Wine dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
In [162]:
# Score the PCA + K-Means clustering against the true Wine labels. Most
# agreement scores improve with PCA, but the silhouette shows the clusters
# still overlap: we know three clusters is right, yet the boundaries between
# them are not cleanly separated.
# BUG FIX: PCA is fitted on the scaled features (not the already-reduced `X`
# left over from an earlier cell), and the silhouette is computed on the
# clustered data with the *predicted* labels, not the ground truth.
reduced_data = PCA(n_components=2).fit_transform(wine_scale)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=42)
kmeans.fit(reduced_data)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(wine_label, kmeans.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(wine_label, kmeans.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(wine_label, kmeans.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(wine_label, kmeans.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(reduced_data, kmeans.labels_, sample_size=1000))
In [148]:
# Agglomerative (Ward) clustering on the scaled features, shown as a 3-D
# scatter of the first three features coloured by the true Wine labels.
# BUG FIX: removed the dead `centers` list, the unused surf arrays, and the
# `pca.transform(pca.components_[...])` lines, which depended on a `pca`
# object from another cell and passed transform a 1-D input. Ends with
# plt.show() so the cell renders the figure instead of an axis repr.
X = wine_scale
y = wine_label
fig = plt.figure(1, figsize=(5, 4))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=60, azim=140)
ward = AgglomerativeClustering(n_clusters=3)
ward.fit(X)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y)
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()
Out[148]:
In [164]:
# Agglomerative (Ward) clustering on the 3-component PCA projection, shown
# in 3-D and coloured by the true Wine labels.
# BUG FIX: removed the dead code (centers, surf arrays, pca.transform misuse,
# commented-out label block) and no longer rebinds `X` to the reduced data,
# which previously leaked hidden state into later cells.
X = wine_scale
y = wine_label
fig = plt.figure(1, figsize=(5, 4))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=60, azim=140)
reduced_data = PCA(n_components=3).fit_transform(wine_scale)
ward = AgglomerativeClustering(n_clusters=3)
ward.fit(reduced_data)
ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c=y)
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
plt.show()
Out[164]:
In [166]:
# Score Agglomerative (Ward) clustering on the 2-D PCA projection against the
# true Wine labels.
# BUG FIX: PCA is fitted on the scaled features (not the already-reduced `X`
# from an earlier cell), and the silhouette is computed on the clustered data
# with the *predicted* labels instead of the ground-truth classes.
reduced_data = PCA(n_components=2).fit_transform(wine_scale)
ward = AgglomerativeClustering(n_clusters=3)
ward.fit(reduced_data)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(wine_label, ward.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(wine_label, ward.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(wine_label, ward.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(wine_label, ward.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(reduced_data, ward.labels_, sample_size=1000))
In [ ]: