Load wine.csv from the data folder. Run KMeans with n_clusters = 3 and compare the clusters to the Wine column. Then run KMeans and Hierarchical Clustering on the PCA-transformed data and again compare the clusters to the Wine column.
In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from fastcluster import *
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
pd.set_option('display.max_rows', 15)
pd.set_option('display.precision', 4)
np.set_printoptions(precision = 4, suppress = True)
%matplotlib inline
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__
print 'Numpy version ' + np.__version__
In [2]:
wine = pd.read_csv('./wine.csv', sep = ',')
In [3]:
wine.head()
Out[3]:
In [4]:
wine.info()
In [5]:
wine.describe() # No odd values or anything suggesting NaNs, but features sit on very different scales - scale before clustering.
Out[5]:
In [6]:
# Drop column that will not be used in models.
wine_category = wine['Wine'].values
wine.drop('Wine', axis = 1, inplace = True)
In [7]:
wine.head()
Out[7]:
In [8]:
# Correlations - several features are strongly correlated.
wine.corr()
Out[8]:
In [9]:
# The scatter matrix suggests there might be 2 or 3 underlying components.
pd.tools.plotting.scatter_matrix(wine, figsize = (15, 20), alpha = 0.2, diagonal = 'kde');
In [10]:
# Shift the true labels from {1, 2, 3} down to {0, 1, 2} so their range matches
# the KMeans output (the numbering KMeans assigns to clusters is still arbitrary).
wine_category1 = wine_category - 1
In [11]:
# Need to scale data.
scaler = StandardScaler()
In [12]:
wine_std = scaler.fit_transform(wine)
wine_std
Out[12]:
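A quick sanity check on the scaling, sketched below: after StandardScaler every column should have mean of roughly 0 and standard deviation of roughly 1.
In [ ]:
# Each column of the standardized data should now have mean ~0 and std ~1.
print wine_std.mean(axis = 0)
print wine_std.std(axis = 0)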
In [13]:
kmeans = KMeans(n_clusters = 3, n_jobs = -1, random_state = 1) # Fix the seed so the labels below are reproducible.
In [14]:
kmeans1 = kmeans.fit(wine_std)
In [15]:
kmeans1_pred = kmeans1.labels_
print kmeans1_pred
In [16]:
print wine_category1 # Eye-balling against the true values looks ok - a quantitative check follows below.
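Since KMeans numbers its clusters arbitrarily, eye-balling label sequences is fragile. A minimal sketch of a firmer comparison, using the adjusted Rand index (invariant to label permutations) alongside the confusion matrix:
In [ ]:
from sklearn.metrics import adjusted_rand_score, confusion_matrix
# ARI ignores how clusters are numbered: 1.0 = perfect agreement, ~0.0 = chance.
print adjusted_rand_score(wine_category1, kmeans1_pred)
# Rows = true Wine categories, columns = (arbitrarily numbered) KMeans clusters.
print confusion_matrix(wine_category1, kmeans1_pred)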
In [17]:
print kmeans1.cluster_centers_
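The centers above are in standardized units. To read them on the original feature scales, one can map them back through the scaler, for example:
In [ ]:
# Map the cluster centers back to the original feature scales for interpretation.
centers_orig = scaler.inverse_transform(kmeans1.cluster_centers_)
print pd.DataFrame(centers_orig, columns = wine.columns)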
In [18]:
# PCA.
pca = PCA()
In [19]:
# Plot number of components against cumulative explained variance.
# A single full-rank fit suffices: the cumulative sum of the ratios gives the whole curve.
pca.fit(wine_std)
explained_variance = np.cumsum(pca.explained_variance_ratio_)
components = np.arange(1, len(explained_variance) + 1)
plt.plot(components, explained_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.tight_layout()
# Variance is spread thin - about 11 components are needed to cover 99% of it.
# Less surprising than it looks: after standardization every feature contributes
# equally to the total variance, so no single direction can dominate.
In [20]:
# Use scaled data, 11 components.
pca2 = PCA(n_components = 11)
In [21]:
wine_pca = pca2.fit_transform(wine_std)
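A quick check of how much variance the 11 retained components actually cover:
In [ ]:
# Fraction of total variance retained by the 11 components (should be ~0.99).
print pca2.explained_variance_ratio_.sum()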
In [22]:
kmeans2 = KMeans(n_clusters = 3, n_jobs = -1, random_state = 1)
In [23]:
kmeans2_pca = kmeans2.fit(wine_pca)
In [24]:
print 'predicted labels\n', kmeans2_pca.labels_
In [25]:
print 'true labels\n', wine_category1 # Looks ok as well - quantified below.
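The same permutation-invariant comparison as before, now for the PCA-based clustering:
In [ ]:
from sklearn.metrics import adjusted_rand_score
# Agreement between the PCA-based KMeans clusters and the true Wine categories.
print adjusted_rand_score(wine_category1, kmeans2_pca.labels_)
# And agreement between the two KMeans runs (standardized features vs. PCA space).
print adjusted_rand_score(kmeans1_pred, kmeans2_pca.labels_)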
In [39]:
# Plot the first two PCA components, colored by KMeans cluster, with centers overlaid.
plt.scatter(wine_pca[:, 0], wine_pca[:, 1], c = kmeans2_pca.labels_, s = 50, alpha = 0.5)
plt.scatter(kmeans2_pca.cluster_centers_[:,0], kmeans2_pca.cluster_centers_[:,1], s=100, c = np.unique(kmeans2_pca.labels_))
Out[39]:
In [40]:
plt.scatter(wine_pca[:, 0], wine_pca[:, 2], c = kmeans2_pca.labels_, s = 50, alpha = 0.5)
plt.scatter(kmeans2_pca.cluster_centers_[:, 0], kmeans2_pca.cluster_centers_[:,2], s=100, c=np.unique(kmeans2_pca.labels_))
Out[40]:
In [41]:
plt.scatter(wine_pca[:, 3], wine_pca[:, 7], c = kmeans2_pca.labels_, s = 50, alpha = 0.5) # Higher-order components separate the clusters far less clearly.
plt.scatter(kmeans2_pca.cluster_centers_[:,3], kmeans2_pca.cluster_centers_[:,7], s=100, c=np.unique(kmeans2_pca.labels_))
Out[41]:
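As a quantitative complement to eye-balling these scatter plots, the silhouette score (imported above) summarizes cluster separation in one number:
In [ ]:
# Mean silhouette over all samples; values near 1 indicate well-separated clusters.
print silhouette_score(wine_pca, kmeans2_pca.labels_)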
In [ ]:
# Let's try hierarchical clustering. Use scaled PCA data.
In [33]:
# Pairwise Euclidean distances between samples in PCA space (square form, shown for inspection).
distance_matrix = squareform(pdist(wine_pca, metric = 'euclidean'))
distance_matrix
Out[33]:
In [34]:
# Perform Ward clustering and plot the dendrogram. Note that scipy's linkage
# expects raw observations or a condensed distance matrix - passing the square
# matrix above would wrongly treat each row of distances as an observation.
R = dendrogram(linkage(wine_pca, method = 'ward'), labels = wine_category1)
plt.xlabel('points')
plt.ylabel('Height')
plt.suptitle('Cluster Dendrogram', fontsize=14);
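The dendrogram suggests cutting the tree at three clusters. A sketch of that cut with sklearn's AgglomerativeClustering (imported above), compared to the Wine column the same way as before:
In [ ]:
from sklearn.metrics import adjusted_rand_score, confusion_matrix
# Ward linkage on the PCA data, cut at 3 clusters.
hier = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
hier.fit(wine_pca)
print adjusted_rand_score(wine_category1, hier.labels_)
print confusion_matrix(wine_category1, hier.labels_)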
In [ ]: