1. Load `wine.csv` in the data folder.
2. Run `KMeans` where `n_clusters = 3` and compare the clusters to the `Wine` column.
3. Run `KMeans` and Hierarchical Clustering using data from PCA and compare again the clusters to the `Wine` column.
In [70]:
# Standard imports for data analysis packages in Python.
# preprocessing provides the scale() helper used to standardize the features before clustering.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Cap DataFrame displays at 10 rows so printed frames stay short.
pd.set_option('display.max_rows', 10)
In [71]:
# Load the wine dataset into a DataFrame
# (comma-separated file, column names taken from the first row)
wine = pd.read_csv(
    '../data/wine.csv',
    sep=",",
    header=0,
)
In [72]:
# Preview the first rows to check the column names and value formats.
# A bare last expression replaces the Python 2 `print` statement: it runs on
# any kernel and lets the notebook render the frame as a table.
wine.head()
In [6]:
# Check for missing values via the per-column non-null counts.
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print only appended a stray "None" to the output.
wine.info()
In [7]:
# Count how many wines fall into each class of the target column.
# Left as the cell's last expression (no Python 2 `print` statement) so it
# runs on any kernel.
wine.Wine.value_counts()
Run `KMeans` where `n_clusters = 3` and compare the clusters to the `Wine` column.
In [86]:
# KMeans estimator used by the clustering cells further down
from sklearn.cluster import KMeans
# Features: every column except the target. The target is kept aside so the
# clusters can be compared against it afterwards.
X = wine.drop(['Wine'], axis=1)
y = wine['Wine']
# Standardize the features (preprocessing.scale centers each column and
# scales it to unit variance by default)
X_scaled = preprocessing.scale(X)
In [76]:
# Create a bivariate scatterplot matrix of the unscaled features, with a
# kernel density estimate on the diagonal.
# scatter_matrix is exposed under pandas.plotting; the old top-level
# pd.scatter_matrix alias is no longer available in current pandas.
scatter_matrix = pd.plotting.scatter_matrix(X, diagonal="kde")
In [77]:
# Cluster the standardized features into three groups.
# random_state pins the random centroid initialisation so re-runs give the
# same labels.
kmeans = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    random_state=1,
)
# fit_predict fits the model and returns the label assigned to each sample
Y_hat_kmeans = kmeans.fit_predict(X_scaled)
In [78]:
# View the cluster id KMeans assigned to each wine.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(Y_hat_kmeans)
In [79]:
# Import the metrics package from scikit-learn
# (both imports are retained in case other cells rely on these names)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true classes (rows) against the KMeans cluster ids (columns).
# confusion_matrix() builds both axes from the union of the values found in the
# two arrays, so cluster ids that do not coincide with the Wine codes would add
# empty rows/columns. crosstab keeps one row per class and one column per cluster.
cm = pd.crosstab(y, Y_hat_kmeans, rownames=['True label'], colnames=['Predicted label'])
print(cm)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
# Label the ticks with the actual class / cluster values instead of positions
plt.xticks(range(cm.shape[1]), cm.columns)
plt.yticks(range(cm.shape[0]), cm.index)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [ ]:
# The confusion matrix reveals that a total of 59 + 65 + 48 = 172 of the wines were grouped together with the rest of their class.
# Note: The predicted cluster ids do not match the actual Wine labels because KMeans only groups on the X values and numbers its clusters arbitrarily.
# Of the 178 wines, the remaining 178 - 172 = 6 were incorrectly categorized.
In [87]:
# PCA estimator from scikit-learn's decomposition module
from sklearn.decomposition import PCA
# Fit PCA on the standardized features and project them onto the principal
# components; n_components=None (the default) keeps every component.
pca = PCA(n_components=None)
X_pca = pca.fit_transform(X_scaled)
In [88]:
# View the first ten rows of the PCA-transformed data. All components are kept,
# so nothing has been dropped yet; the leading columns are expected to have
# the largest magnitudes.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(X_pca[:10, :])
In [89]:
# Show the share of the total variance explained by each principal component.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(pca.explained_variance_ratio_)
plt.plot(pca.explained_variance_ratio_)
# Axis labels so the figure can be read on its own
plt.xlabel('Principal component (0-based index)')
plt.ylabel('Explained variance ratio');
In [ ]:
# To explain 99% of the variance in the data, we need virtually all of the components.
# However, the top three components alone account for 36% + 19% + 11% = 66% of the variance.
In [90]:
# Plot the wines on the first two principal components, color-coded by their
# KMeans cluster.
# Y_hat_kmeans (fitted on the standardized data in an earlier cell) is used here:
# Y_hat_kmeans_pca is only created in a later cell, so referencing it at this
# point raises a NameError when the notebook is run top to bottom on a fresh kernel.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=Y_hat_kmeans)
plt.xlabel('First principal component')
plt.ylabel('Second principal component');
Run `KMeans` and Hierarchical Clustering using data from PCA and compare again the clusters to the `Wine` column.
In [91]:
# Cluster the PCA-transformed data into three groups, with the same settings
# as the run on the standardized features.
kmeans = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    random_state=1,
)
# fit_predict fits the model and returns the label assigned to each sample
Y_hat_kmeans_pca = kmeans.fit_predict(X_pca)
In [92]:
# View the cluster id KMeans assigned to each wine on the PCA-transformed data.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(Y_hat_kmeans_pca)
In [93]:
# Import the metrics package from scikit-learn
# (both imports are retained in case other cells rely on these names)
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true classes (rows) against the cluster ids found on the
# PCA-transformed data (columns).
# confusion_matrix() builds both axes from the union of the values found in the
# two arrays, so cluster ids that do not coincide with the Wine codes would add
# empty rows/columns. crosstab keeps one row per class and one column per cluster.
cm = pd.crosstab(y, Y_hat_kmeans_pca, rownames=['True label'], colnames=['Predicted label'])
print(cm)
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
# Label the ticks with the actual class / cluster values instead of positions
plt.xticks(range(cm.shape[1]), cm.columns)
plt.yticks(range(cm.shape[0]), cm.index)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [ ]:
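# Using the PCA-transformed dataset was just as accurate as using the original dataset.
# This is expected: PCA() kept every component, so the transform only re-expresses the scaled data on rotated axes and preserves the Euclidean distances that KMeans relies on.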
In [94]:
# Pairwise-distance helpers for hierarchical clustering
from scipy.spatial.distance import pdist, squareform
# pdist returns the condensed pairwise Euclidean distances between the rows of
# X_pca; squareform expands them into a square, symmetric matrix.
distx = squareform(pdist(X_pca, 'euclidean'))
# Display the matrix as the cell output
distx
Out[94]:
In [95]:
# Build the hierarchy with single linkage on the PCA-transformed data and draw it.
from scipy.cluster.hierarchy import linkage, dendrogram
# color_threshold sets the height below which sub-trees get their own color
R = dendrogram(linkage(X_pca, 'single'), color_threshold=10)
plt.suptitle('Cluster Dendrogram', fontsize=14, fontweight='bold')
plt.ylabel('Height')
plt.xlabel('points');
In [ ]: