Use wine.csv in the data folder. Run KMeans with n_clusters = 3 and compare the clusters to the Wine column. Then run KMeans and Hierarchical Clustering using data from PCA and compare the clusters to the Wine column again.
In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
In [66]:
df = pd.read_csv("../data/wine.csv")
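A quick look at the loaded frame helps confirm the target column before clustering; a minimal sketch, assuming the CSV has a Wine column holding the class label (1-3) alongside the chemical measurements:
In [ ]:
# Peek at the columns and check how many samples fall into each Wine class
print(df.head())
df['Wine'].value_counts()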
In [68]:
km = KMeans(n_clusters=3)
In [69]:
X = df.drop('Wine', axis=1).values
Y = df['Wine'].values
In [70]:
km.fit(X)
Out[70]:
In [71]:
km.predict(X)
Out[71]:
In [72]:
Y
Out[72]:
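KMeans cluster IDs are arbitrary integers, so a cross-tabulation against the Wine column is more informative than comparing the labels directly; a minimal sketch using the pd import above:
In [ ]:
# Rows are the true Wine classes, columns are the KMeans cluster IDs;
# a table that is close to diagonal (up to a permutation of the columns)
# means the clusters largely recover the classes.
pd.crosstab(Y, km.predict(X), rownames=['Wine'], colnames=['cluster'])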
In [90]:
#PCA
pca = PCA(2)
In [91]:
X2 = pca.fit_transform(X)
In [92]:
X2.shape
Out[92]:
In [93]:
X2[1]
Out[93]:
In [94]:
X[1]
Out[94]:
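To see how much of the (unscaled) variance the two components keep, the fitted PCA object exposes explained_variance_ratio_; a quick check:
In [ ]:
# Fraction of total variance captured by each of the two components
pca.explained_variance_ratio_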
In [95]:
km2 = KMeans(3)
In [96]:
km2.fit(X2)
Out[96]:
In [97]:
km2.predict(X2)
Out[97]:
In [98]:
plt.scatter(X2[:,0], X2[:,1], c=Y);
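The scatter above is colored by the true Wine labels; plotting the same projection colored by the KMeans labels next to it gives a visual comparison. A sketch:
In [ ]:
# Left: KMeans clusters on the unscaled PCA projection, right: true Wine classes
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(X2[:, 0], X2[:, 1], c=km2.predict(X2))
axes[0].set_title('KMeans clusters')
axes[1].scatter(X2[:, 0], X2[:, 1], c=Y)
axes[1].set_title('Wine column');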
In [102]:
#that is not great, scale it
from sklearn.preprocessing import scale
In [116]:
scaled = scale(df.drop('Wine', axis=1).values)
In [117]:
pca2 = PCA(2)
In [123]:
X3 = pca2.fit_transform(scaled)
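Scaling matters here because the wine features have very different ranges, so unscaled PCA is dominated by the largest-magnitude columns. Comparing the variance ratios of the two fits makes that visible; a quick check:
In [ ]:
# Explained variance of the two components: unscaled fit vs scaled fit
pca.explained_variance_ratio_, pca2.explained_variance_ratio_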
In [125]:
km3 = KMeans(3)
In [126]:
km3.fit(X3)
Out[126]:
In [128]:
km3.predict(X3)
Out[128]:
In [131]:
Y
Out[131]:
In [134]:
plt.scatter(X3[:,0], X3[:,1], c=Y)
Out[134]:
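As with the unscaled run, a cross-tabulation of the km3 labels against the Wine column shows how well clusters on the scaled PCA projection recover the classes; a sketch:
In [ ]:
pd.crosstab(Y, km3.predict(X3), rownames=['Wine'], colnames=['cluster'])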
In [135]:
#Try KMeans and Hierarchical Clustering using data from PCA and compare again the clusters to the Wine column.
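The hierarchical clustering part of the exercise is not carried out above. A minimal sketch with scikit-learn's AgglomerativeClustering on the scaled PCA data (three clusters, Ward linkage), again compared to the Wine column with a crosstab; the pdist/squareform imports at the top suggest scipy.cluster.hierarchy (linkage plus fcluster) would work just as well:
In [ ]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage hierarchical clustering into 3 clusters on the scaled 2-D PCA data
hc = AgglomerativeClustering(n_clusters=3, linkage='ward')
hc_labels = hc.fit_predict(X3)

# Compare the hierarchical clusters to the Wine column
pd.crosstab(Y, hc_labels, rownames=['Wine'], colnames=['cluster'])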