Task: using wine.csv from the data folder, (1) run KMeans with n_clusters = 3 and compare the resulting clusters to the Wine column; (2) run KMeans and Hierarchical Clustering on the PCA-transformed data and again compare the clusters to the Wine column.
In [209]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
In [210]:
# Load the wine dataset; the path is relative to the notebook's location.
wine_all=pd.read_csv('../data/wine.csv')
In [211]:
# Preview the first five rows to sanity-check the load.
wine_all.head()
Out[211]:
In [212]:
# Summary statistics (count/mean/std/quartiles) for each numeric column.
wine_all.describe()
Out[212]:
In [213]:
# Column dtypes and non-null counts — check for missing values before clustering.
wine_all.info()
In [214]:
# Select all feature columns, dropping the first column ('Wine', the class label).
# .ix was deprecated in pandas 0.20 and removed in 1.0 — use positional .iloc.
X = wine_all.iloc[:, 1:]
In [215]:
# Convert the feature DataFrame to a raw numpy array and display it.
# NOTE(review): rebinds X from DataFrame to ndarray — re-running later cells
# against a fresh kernel requires running from the top.
X=X.values
X
Out[215]:
In [216]:
# Standardize features to zero mean / unit variance so K-Means' Euclidean
# distances are not dominated by large-magnitude columns (e.g. Proline).
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
In [217]:
# Fit the scaler on X and apply the transformation; display the result.
# NOTE(review): overwrites X in place — this cell is not idempotent on re-run.
X = scale.fit_transform(X)
X
Out[217]:
In [218]:
# Configure K-Means: 3 clusters (one per wine cultivar), random centroid
# initialization repeated n_init=10 times, fixed random_state for reproducibility.
kmeans = KMeans(n_clusters=3, init='random', n_init=10 , max_iter = 300, random_state=1)
In [219]:
# Fit K-Means on the standardized features, then keep the per-sample labels.
kmeans.fit(X)
Y_hat_kmeans = kmeans.labels_
In [220]:
# Display the cluster assignment (0/1/2) for each sample.
Y_hat_kmeans
Out[220]:
In [221]:
# Scatter the first two standardized features, colored by K-Means cluster.
# NOTE(review): this shows only 2 of the feature dimensions, not a full view.
plt.scatter(X[:,0], X[:,1], c=Y_hat_kmeans);
In [222]:
# Cluster centroids in the standardized feature space (one row per cluster).
mu = kmeans.cluster_centers_
mu
Out[222]:
In [223]:
# Re-plot the points (semi-transparent) with the 3 centroids overlaid as
# larger markers, colored consistently with their cluster ids.
plt.scatter(X[:,0], X[:,1], c=Y_hat_kmeans, alpha=0.4)
plt.scatter(mu[:,0], mu[:,1], s=100, c=np.unique(Y_hat_kmeans));
In [224]:
# Attach the K-Means labels to the original frame for the cross-tab below.
# NOTE(review): mutates wine_all in place; earlier displays of it are now stale.
wine_all['cluster']= Y_hat_kmeans
In [225]:
# Cross-tabulate cluster id against the true Wine class to judge agreement:
# a clean diagonal-like structure means clusters recover the cultivars.
wine_all.groupby(['cluster','Wine']).count()
Out[225]:
In [226]:
# PCA
# Project the standardized features onto the first 10 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
In [227]:
# Fit PCA on the standardized features and transform them into PC space.
X_pca = pca.fit_transform(X)
In [228]:
# Fraction of total variance captured by the 10 retained components.
pca.explained_variance_ratio_.sum()
Out[228]:
In [229]:
# Component loadings: each row is one principal axis expressed in the
# original (standardized) feature space.
pca.components_
Out[229]:
In [230]:
# Per-feature means subtracted before projection — expected to be ~0 here
# because X was already standardized by StandardScaler above.
pca.mean_
Out[230]:
In [234]:
# First two principal components, colored by the K-Means cluster labels;
# the trailing semicolon suppresses the PathCollection repr.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=Y_hat_kmeans);
In [235]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform
# pdist returns the condensed (1-D) pairwise Euclidean distances;
# squareform expands it to the full symmetric n x n matrix for display.
# not printed as pretty, but the values are correct
distx = squareform(pdist(X_pca, metric='euclidean'))
distx
Out[235]:
In [236]:
# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
# Pass the observations directly: linkage() expects either a condensed
# distance vector (pdist output) or an observation matrix. Feeding it the
# square matrix from squareform() made it treat each n-length row as an
# observation vector (scipy warns the input "looks suspiciously like an
# uncondensed distance matrix"), so the hierarchy was built on the wrong data.
# Ward linkage on the raw PCA coordinates uses Euclidean distance correctly.
R = dendrogram(linkage(X_pca, method='ward'), color_threshold=100)
plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14);
In [ ]: