Using wine.csv in the data folder: run KMeans with n_clusters = 3 and compare the clusters to the Wine column. Then run KMeans and Hierarchical Clustering using the data from PCA and compare the clusters to the Wine column again.
In [362]:
# Standard imports
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import preprocessing
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
pd.set_option('display.max_rows', 10)
%matplotlib inline
np.random.seed(123)
In [363]:
print('Pandas:', pd.__version__)
print('Numpy:', np.__version__)
print('Matplotlib:', mpl.__version__)
In [364]:
raw_data = pd.read_csv('../data/wine.csv', delimiter=",")
labels = raw_data.Wine
data = raw_data.drop('Wine', axis=1)
In [365]:
sample_size, features = raw_data.shape
Exploratory Analysis
In [366]:
data.describe()
Out[366]:
In [367]:
scaled_data = preprocessing.scale(data)
scaled_data = pd.DataFrame(scaled_data)
In [368]:
scaled_data.columns = data.columns
In [369]:
raw_data.groupby('Wine').describe()
Out[369]:
In [370]:
raw_data['Wine'].hist(bins=3)
Out[370]:
Distance Matrix & Dendrogram with scaled data
In [371]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform
scaled_data_array = scaled_data.values
# not pretty-printed, but the values are correct
distx = squareform(pdist(scaled_data, metric='euclidean'))
print(distx)
from scipy.cluster.hierarchy import linkage, dendrogram
R = dendrogram(linkage(scaled_data, method='single'), color_threshold=10)
plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
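The dendrogram shows the tree, but the task also asks to compare hierarchical clusters to the Wine column. A minimal sketch of how that comparison could look, assuming we cut a Ward-linkage tree (an assumption; the cell above uses single linkage) into three flat clusters with scipy's fcluster:
from scipy.cluster.hierarchy import fcluster
# assumption: Ward linkage rather than the single linkage used above,
# since it tends to give more balanced groups on this data
Z = linkage(scaled_data, method='ward')
hier_labels = fcluster(Z, t=3, criterion='maxclust')
# cross-tabulate the flat clusters against the true Wine classes
print(pd.crosstab(labels, hier_labels, rownames=['Wine'], colnames=['cluster']))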
Determine whether 3 clusters is a reasonable choice
In [372]:
##### cluster data into K=1..10 clusters #####
#K, KM, centroids,D_k,cIdx,dist,avgWithinSS = kmeans.run_kmeans(X,10)
from scipy.cluster.vq import kmeans,vq
from scipy.spatial.distance import cdist
K = range(1, 11)
# scipy.cluster.vq.kmeans
KM = [kmeans(scaled_data_array,k) for k in K] # apply kmeans 1 to 10
centroids = [cent for (cent,var) in KM] # cluster centroids
D_k = [cdist(scaled_data_array, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]
avgWithinSS = [sum(d)/scaled_data.shape[0] for d in dist]
kIdx = 2  # index into K marking the elbow candidate (K[2] = 3 clusters)
# plot elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(K, avgWithinSS, 'b*-')
ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
tt = plt.title('Elbow for K-Means clustering')
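The elbow plot is one heuristic; an average silhouette sweep is another quick sanity check for K = 3. A minimal sketch (not in the original notebook), reusing the metrics module already imported above:
# hedged sketch: average silhouette score for k = 2..9 on the scaled data
for k in range(2, 10):
    km = KMeans(n_clusters=k, n_init=10, random_state=1).fit(scaled_data)
    score = metrics.silhouette_score(scaled_data, km.labels_, metric='euclidean')
    print(k, round(score, 3))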
Awesome plotting function!
In [373]:
def plot_clusters(orig, pred, nx, ny, kmeans=False, legend=True):
    # plot each predicted cluster in its own colour, using features nx and ny
    p0 = plt.plot(orig[pred == 0, nx], orig[pred == 0, ny], 'ro', label='Wine 1')
    p2 = plt.plot(orig[pred == 2, nx], orig[pred == 2, ny], 'go', label='Wine 2')
    p1 = plt.plot(orig[pred == 1, nx], orig[pred == 1, ny], 'bo', label='Wine 3')
    tt = plt.title('Wine Data set, KMeans clustering with K=3')
    if kmeans:
        # overlay the fitted centroids for the two plotted dimensions
        centroids = kmeans.cluster_centers_
        plt.scatter(centroids[:, nx], centroids[:, ny],
                    marker='^', s=169, linewidths=3,
                    color='orange', zorder=10)
    if legend:
        ll = plt.legend()
    return (p0, p1, p2)
Predict with KMeans (Scaled Data)
In [374]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(scaled_data).labels_
(pl0, pl1, pl2) = plot_clusters(scaled_data.values, Y_hat_kmeans, 0, 5, kmeans)
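To address the stated goal of comparing the clusters to the Wine column, a cross-tabulation plus the adjusted Rand index gives a quick check. A minimal sketch (these comparison calls are not in the original cells):
# hedged sketch: compare the KMeans labels on scaled data to the true Wine classes
print(pd.crosstab(labels, Y_hat_kmeans, rownames=['Wine'], colnames=['cluster']))
print('Adjusted Rand index:', metrics.adjusted_rand_score(labels, Y_hat_kmeans))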
With the original data and class markers
In [375]:
groups = raw_data.groupby('Wine')
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
ax.plot(group.Alcohol, group['Malic.acid'], marker='o', linestyle='', ms=6, label=name)
plt.ylim(0,7)
plt.xlim(10,16)
ax.legend()
plt.show()
Predict with KMeans (Raw Data)
In [376]:
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(data).labels_
print(data.columns)
(pl0, pl1, pl2) = plot_clusters(data.values, Y_hat_kmeans, 0, 1, kmeans)
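On the unscaled data, features with large ranges (Proline in particular) dominate the Euclidean distance, so these clusters usually agree with the Wine column less well than the scaled ones. A hedged sketch to quantify that, assuming the adjusted Rand index is an acceptable yardstick:
# hedged sketch: agreement with the Wine column for raw vs. scaled features
ari_raw = metrics.adjusted_rand_score(labels, Y_hat_kmeans)  # labels fitted on raw data above
km_scaled = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1).fit(scaled_data)
ari_scaled = metrics.adjusted_rand_score(labels, km_scaled.labels_)
print('ARI raw:', round(ari_raw, 3), ' ARI scaled:', round(ari_scaled, 3))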
Perform PCA on the scaled data (though it doesn't matter)
In [377]:
#Check out covariance matrix
#print(np.cov(data,rowvar=False)) #too large so not running
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(scaled_data)
#pca.components_
pca.mean_
Out[377]:
Explained Variance Ratio
In [378]:
plt.plot(pca.explained_variance_ratio_);
print('Explained variance (first 5 components):', sum(pca.explained_variance_ratio_[0:5]))
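A cumulative view makes the choice of component count easier to read off; a small sketch (not in the original cell):
# hedged sketch: cumulative explained variance by number of components
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cum_var) + 1), cum_var, 'o-')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio');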
Let's perform KMeans on PCA Results with 6 components
In [379]:
PCA_data = PCA(n_components=6).fit_transform(scaled_data)
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(PCA_data).labels_
(pl0, pl1, pl2) = plot_clusters(PCA_data, Y_hat_kmeans, 0, 1, kmeans)
In [380]:
PCA_data = PCA(n_components=5).fit_transform(scaled_data)
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(PCA_data).labels_
(pl0, pl1, pl2) = plot_clusters(PCA_data, Y_hat_kmeans, 0, 1, kmeans)
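The task also asks for Hierarchical Clustering on the PCA data, compared once more to the Wine column. A minimal sketch, assuming sklearn's AgglomerativeClustering with Ward linkage as a stand-in for the scipy routines used earlier:
# hedged sketch: agglomerative (Ward) clustering on the 5-component PCA data,
# compared to the Wine column via crosstab and adjusted Rand index
from sklearn.cluster import AgglomerativeClustering
hier = AgglomerativeClustering(n_clusters=3, linkage='ward')
hier_labels = hier.fit_predict(PCA_data)
print(pd.crosstab(labels, hier_labels, rownames=['Wine'], colnames=['cluster']))
print('Adjusted Rand index:', metrics.adjusted_rand_score(labels, hier_labels))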