Use wine.csv in the data folder.
Run KMeans where n_clusters = 3 and compare the clusters to the Wine column.
Run KMeans and Hierarchical Clustering using data from PCA and compare again the clusters to the Wine column.
In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# This enables inline plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
In [2]:
!pwd
In [3]:
# dataset is in homeworks/data, so we back up a level
df = pd.read_csv('../data/wine.csv')
In [4]:
# do we need to fiddle with a header row or the delimiter (unlikely since csv, but...)?
df.head(5)
Out[4]:
In [5]:
df.tail(5)
# and maybe we'll see how many 'real' categories of wine there are
Out[5]:
In [6]:
# nope! it looks like it read correctly!
df.info()
In [7]:
# ok, so we have a category (Wine), which we'll be omitting for our unsupervised learning, and 13 characteristics for each obs
# because we have 178 entries, and everything shows up as 178 non-null numeric values, we don't appear to have any missing data
# !!!!!!!
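(A quick sanity check of the two claims above; this just reuses the df already loaded.)
In [ ]:
# confirm there are no missing values, and count how many samples fall in each 'real' Wine category
print(df.isnull().sum().sum())    # total number of missing cells -- expect 0
print(df['Wine'].value_counts())  # samples per Wine category -- expect three categories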
In [8]:
from sklearn.model_selection import learning_curve  # (imported but not used below)
from sklearn import feature_extraction
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
In [9]:
X = df.drop('Wine', axis = 1)
y = df['Wine']
In [10]:
kmn = KMeans(n_clusters=3, n_init=20, random_state=24)
kmn.fit(X)
Out[10]:
In [11]:
y_pred = kmn.predict(X)
In [12]:
print(classification_report(y, y_pred))
In [13]:
# oof! looks like we didn't classify any samples as Wine=3?
# Most likely explanation: we DIDN'T SCALE OUR VARIABLES, so only a couple of them are doing any work in the K-Means
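(A small sketch to back up the scaling point: the per-column standard deviations of the raw X span a few orders of magnitude, so the wide-range columns, proline in particular in the usual UCI version of this dataset, dominate the Euclidean distances K-Means works with.)
In [ ]:
# standard deviation of each raw feature, largest first -- the spread is what motivates scale() below
X.std().sort_values(ascending=False)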
In [14]:
X2 = scale(X.values)  # standardize each column to mean 0, variance 1
In [15]:
kmn.fit(X2)
y_pred = kmn.predict(X2)
In [16]:
print(classification_report(y, y_pred))
In [18]:
# actually, that seems to have made things worse?
print(y_pred)
print(y.values)
In [19]:
# NOPE! It's because our predictions are generating values over the set [0, 1, 2] rather than [1, 2, 3]
# and, because it's unsupervised, the algorithm is permuting them in an arbitrary way (could just as easily be
# [2, 0, 1] with a different random seed)
# could use a mapping (a plain dict / Series.map is simpler than 'applymap' here) to adjust y_pred to match y's categories...
# in this case, what each label is 'supposed' to be is obvious from the counts, so it wouldn't be too hard...
# [bunch of error-filled code trying to use applymap]
# hm, let's just set this aside for now, and say "when we know what to label the categories, it doesn't look so bad"
# (a working sketch of the relabeling is in the next cell)
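(A sketch of that relabeling, assuming each cluster lines up with a clear majority Wine category: cross-tabulate the cluster ids against the Wine values, map each cluster to its majority category, and re-run the report.)
In [ ]:
# map each cluster id to the Wine category it mostly contains
ct = pd.crosstab(y_pred, y.values)           # rows: cluster ids (0-2), columns: Wine values (1-3)
label_map = ct.idxmax(axis=1).to_dict()      # e.g. {0: 2, 1: 3, 2: 1} -- the exact mapping depends on the seed
y_pred_mapped = pd.Series(y_pred).map(label_map)
print(classification_report(y, y_pred_mapped))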
In [20]:
# one alternative may be a confusion matrix, without the expectation that the big values will be on the diagonal
# (sketched in the next cell)
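(The raw confusion matrix, as suggested above; note that because y uses labels 1-3 and y_pred uses 0-2, scikit-learn takes the union of the two label sets, so the matrix comes out 4x4 with an empty first row and last column.)
In [ ]:
# confusion matrix of true Wine values vs. raw cluster ids -- look for one big value per row, not a diagonal
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))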
In [21]:
# we imported PCA from scikit earlier, so we just need to implement it
# here, we create three alternatives: one with 2 components, one with 3, and one with "enough components to
# explain 99% of the variance" (passing a float as n_components, per the documentation)
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)
pca99 = PCA(n_components=0.99)
In [22]:
# we use the scaled dataset from above
pca2.fit(X2)
pca3.fit(X2)
pca99.fit(X2)
Out[22]:
In [23]:
print(pca99.explained_variance_ratio_)
# So it looks like the 99% of variance PCA has 12 components (on a variable space of 13 dimensions)
print('variance explained by pca2: ', sum(pca2.explained_variance_ratio_))
print('variance explained by pca3: ', sum(pca3.explained_variance_ratio_))
print('variance explained by pca99: ', sum(pca99.explained_variance_ratio_))
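(A small scree-style plot of the cumulative explained variance from pca99, to make the "where to cut" judgement in the next cell easier to eyeball.)
In [ ]:
# cumulative variance explained vs. number of components
cum_var = np.cumsum(pca99.explained_variance_ratio_)
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('PCA cumulative explained variance');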
In [24]:
# so it looks like the first PCA component is the same no matter how many components we use in our model (variance
# explained is additive)
# from the chain of values for pca99, the 'best' cut point is probably either 2, 4, or 5 components -- since the dataset
# is fairly small (and the score for 2 is only a shade over 50%), I think we probably want to go with 5
pca5 = PCA(n_components=5)
Xpca = pca5.fit_transform(X2)   # fit_transform both fits the model and returns the projected data
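(Quick check of how much variance the chosen 5-component model keeps, mirroring the pca2/pca3/pca99 printout above.)
In [ ]:
print('variance explained by pca5: ', sum(pca5.explained_variance_ratio_))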
In [25]:
# plotting on PCA2 space to visualize clusters
# first: adapted from the scikit-learn documentation (the K-Means digits example) -- we end up fitting KMeans on PCA2 so that we can draw pretty boundaries in the space
reduced_data = PCA(n_components=2).fit_transform(X2)
kmeans = KMeans(init='k-means++', n_clusters=3)
kmeans.fit(reduced_data)
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
# Plot the decision boundary. For that, we will assign a color to each point in the mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the wine dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
In [26]:
# an alternative from Lab 15 -- we can plot in PCA2 space (the first two components) but use our PCA5 data to actually cluster
Y_hat_kmeans = kmeans.fit(Xpca).labels_
In [27]:
plt.scatter(Xpca[:, 0], Xpca[:, 1], c=Y_hat_kmeans)
Out[27]:
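(For the comparison the assignment asks for: the same scatter next to one colored by the actual Wine column. The colors won't necessarily line up between panels, since the cluster ids are an arbitrary permutation of the Wine values.)
In [ ]:
# left: K-Means cluster assignments on the PCA5 data; right: the true Wine categories, in the same 2-D PCA view
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.scatter(Xpca[:, 0], Xpca[:, 1], c=Y_hat_kmeans)
ax1.set_title('K-Means clusters')
ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=y.values)
ax2.set_title('actual Wine category')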
In [28]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform
# not printed as pretty, but the values are correct
distx = squareform(pdist(Xpca, metric='euclidean'))
distx
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
# note: linkage() is given the full square distance matrix here (following the Lab 15 approach); scipy treats its rows
# as observation vectors, so the dendrogram heights are not on the same scale as the raw pairwise distances --
# passing pdist(Xpca) (the condensed distances) or Xpca itself would be the more standard call
R = dendrogram(linkage(distx, method='ward'), color_threshold=100)
plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
In [29]:
# so we can see from the dendrogram that we get three clusters when we set the threshold between about 90 and 180;
# if we go finer, by 70 we're already getting another couple clusters popping up... so even without a priori knowledge
# of the number of types of wine in the dataset, we may have ended up with three classes anyway
# would love to plot these as labels on a scatter, although I guess it wouldn't look too different from the other one...
# (see the sketch in the next cell -- fcluster can pull the three hierarchical cluster labels out of the linkage)
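(A sketch of that comparison: fcluster can cut the same Ward linkage used for the dendrogram into three flat clusters, which we can then crosstab against the Wine column and scatter in PCA space.)
In [ ]:
# extract 3 flat clusters from the hierarchical model and compare them to the Wine column
from scipy.cluster.hierarchy import fcluster
Z_ward = linkage(distx, method='ward')              # same linkage the dendrogram above was built from
Y_hat_hier = fcluster(Z_ward, t=3, criterion='maxclust')
print(pd.crosstab(y, Y_hat_hier))                   # rows: true Wine, columns: hierarchical cluster id
plt.scatter(Xpca[:, 0], Xpca[:, 1], c=Y_hat_hier)
plt.title('hierarchical (Ward) clusters in PCA space');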
In [ ]: