Using wine.csv in the data folder:
- run KMeans with n_clusters = 3 and compare the clusters to the Wine column;
- run KMeans and Hierarchical Clustering on the PCA-transformed data and compare the clusters to the Wine column again.
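The comparisons below are mostly done by eye on scatter plots. A small helper along the following lines can make them quantitative; this is a sketch rather than part of the assignment (the name compare_to_wine and the choice of adjusted Rand index are ours):
In [ ]:
import pandas as pd
from sklearn.metrics import adjusted_rand_score

def compare_to_wine(true_labels, cluster_labels):
    # contingency table: rows are the Wine classes, columns are the cluster ids
    print(pd.crosstab(pd.Series(true_labels, name='Wine'),
                      pd.Series(cluster_labels, name='cluster')))
    # adjusted Rand index: 1.0 means the clustering matches the Wine classes exactly
    print('adjusted Rand index: %.3f' % adjusted_rand_score(true_labels, cluster_labels))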
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as datasets
from sklearn.cluster import KMeans
%matplotlib inline
In [3]:
wine = pd.read_csv('../data/wine.csv')
wine.info()
In [4]:
# drop the label column ("Wine") so clustering only sees the features
modified_wine = wine.drop("Wine", axis=1)
modified_wine
Out[4]:
In [5]:
# one scatter plot of the Wine label against each feature
for i, col in enumerate(modified_wine.columns, start=1):
    plt.figure(i)
    plt.scatter(wine["Wine"], wine[col])
    plt.title(col);
In [6]:
first_half_wine = wine[["Wine","Alcohol","Malic.acid","Ash","Acl","Mg","Phenols"]]
_ = pd.plotting.scatter_matrix(first_half_wine, diagonal='kde')
In [7]:
second_half_wine = wine[["Wine","Flavanoids","Nonflavanoid.phenols","Proanth","Color.int","Hue","OD","Proline"]]
_ = pd.plotting.scatter_matrix(second_half_wine, diagonal='kde')
In [8]:
_ = pd.plotting.scatter_matrix(wine, diagonal='kde')
In [9]:
from sklearn.cluster import KMeans
# use the modified_wine DataFrame (label column dropped) as the feature matrix
X = modified_wine.values
y = wine["Wine"].values  # true class labels, kept for comparing against the clusters
X
Out[9]:
In [10]:
X[:,1]
Out[10]:
In [11]:
plt.scatter(X[:,1], X[:,12]);
In [12]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(X).labels_
plt.scatter(X[:,1], X[:,12], c=Y_hat_kmeans);
In [13]:
mu = kmeans.cluster_centers_
mu
Out[13]:
In [14]:
plt.scatter(X[:,1], X[:,12], c=Y_hat_kmeans, alpha=0.6)
plt.scatter(mu[:,1], mu[:,12], s=100, c=np.unique(Y_hat_kmeans))
print(mu)
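Beyond the visual check, the raw-feature KMeans labels can be cross-tabulated against the Wine column. This reuses the compare_to_wine helper sketched in the introduction, which is an assumption rather than part of the original notebook:
In [ ]:
# how well do the raw-feature KMeans clusters line up with the Wine column?
compare_to_wine(wine["Wine"].values, Y_hat_kmeans)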
In [15]:
from IPython.core.pylabtools import figsize
%matplotlib inline
figsize(12,5)
In [16]:
# look at the actual covariance matrix
print(np.cov(X, rowvar=False))
In [34]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scaled_X = scale.fit_transform(X)
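Scaling matters here because the features live on very different scales (Proline in particular is much larger than the rest), so the unscaled covariance matrix and PCA would be dominated by it. A quick sanity check on the scaled matrix, not in the original notebook:
In [ ]:
# after scaling, every feature should have (approximately) zero mean and unit variance
print(scaled_X.mean(axis=0).round(3))
print(scaled_X.std(axis=0).round(3))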
In [43]:
from sklearn.decomposition import PCA
pca = PCA()
# pca = PCA(n_components=10)
In [44]:
X_pca = pca.fit_transform(scaled_X)
In [45]:
pca.components_
Out[45]:
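The raw components_ array is hard to read on its own; wrapping it in a DataFrame labelled with the original feature names makes the loadings easier to interpret. A convenience sketch, not in the original notebook:
In [ ]:
# loadings of each principal component on the original (scaled) features
loadings = pd.DataFrame(pca.components_,
                        columns=modified_wine.columns,
                        index=['PC%d' % (i + 1) for i in range(pca.n_components_)])
loadings.head()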
In [46]:
pca.mean_
Out[46]:
In [47]:
# scatter of the first two principal components
_ = plt.scatter(X_pca[:,0], X_pca[:,1])
In [48]:
# How many components are needed to explain 99% of the variance in this dataset?
# Answer: after standardization the variance is spread across many components rather
# than concentrated in one; the cumulative curve below only approaches 99% near the
# last few components (the exact count is computed a couple of cells below).
plt.plot(pca.explained_variance_ratio_, label='per component')
plt.plot(np.cumsum(pca.explained_variance_ratio_), label='cumulative')
plt.legend();
In [49]:
sum(pca.explained_variance_ratio_)
Out[49]:
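To answer the 99% question directly, count how many leading components the cumulative explained variance ratio needs before it crosses 0.99 (a short sketch, not in the original notebook):
In [ ]:
# number of leading components whose cumulative explained variance reaches 99%
cumulative = np.cumsum(pca.explained_variance_ratio_)
n_components_99 = int(np.argmax(cumulative >= 0.99)) + 1
print('components needed for 99%% of the variance: %d' % n_components_99)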
In [50]:
# trying KMeans again, this time on the PCA-transformed data
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, random_state=1)
Y_hat_kmeans = kmeans.fit(X_pca).labels_
mu = kmeans.cluster_centers_
# plot the clusters in the plane of the first two principal components
plt.scatter(X_pca[:,0], X_pca[:,1], c=Y_hat_kmeans, alpha=0.4)
plt.scatter(mu[:,0], mu[:,1], s=100, c=np.unique(Y_hat_kmeans))
Out[50]:
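As before, the PCA-based cluster labels can be compared to the Wine column with the hypothetical compare_to_wine helper from the introduction:
In [ ]:
# compare the PCA-based KMeans clusters to the Wine column
compare_to_wine(wine["Wine"].values, Y_hat_kmeans)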
In [51]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform
# pdist returns the condensed (1-D) distance vector; squareform expands it to a square matrix for display
dist_condensed = pdist(X, metric='euclidean')
distx = squareform(dist_condensed)
distx
Out[51]:
In [52]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
# linkage expects the condensed distance vector, not the square matrix
R = dendrogram(linkage(dist_condensed, method='single'), color_threshold=10)
plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
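To compare the hierarchical clustering to the Wine column as well, the tree can be cut into three flat clusters with SciPy's fcluster; the comparison again uses the hypothetical compare_to_wine helper from the introduction:
In [ ]:
from scipy.cluster.hierarchy import fcluster
# cut the single-linkage tree into 3 flat clusters and compare them to the Wine column
Z = linkage(dist_condensed, method='single')
hier_labels = fcluster(Z, t=3, criterion='maxclust')
compare_to_wine(wine["Wine"].values, hier_labels)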