In [77]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
%matplotlib inline
import time
In [108]:
df = pd.read_csv("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/cleveland.csv", header=None, on_bad_lines='skip')
# Define the features and the outcome.
X = df.iloc[:, :13]
y = df.iloc[:, 13]
# Replace missing values (marked by ?) with a 0.
X = X.replace(to_replace='?', value=0)
# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
y = np.where(y > 0, 1, 0)
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))
print(len(df))
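Replacing the '?' markers with 0 keeps the code simple, but it treats missing values as meaningful zeros and can leave the affected columns with an object dtype. A minimal alternative sketch, assuming the same df loaded above, coerces every feature to numeric and imputes with the column median instead:
In [ ]:
# Hypothetical alternative: coerce '?' to NaN, then impute with column medians.
X_alt = df.iloc[:, :13].apply(pd.to_numeric, errors='coerce')
X_alt = X_alt.fillna(X_alt.median())
print(X_alt.dtypes.unique())  # every column should now be numeric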
In [155]:
%%timeit
X_norm = normalize(X)
# Reduce it to two components.
X_pca = PCA(2).fit_transform(X_norm)
# Calculate predicted values.
y_pred = KMeans(init='k-means++', n_clusters=6).fit_predict(X_pca)
# Plot the solution.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.show()
# Check the solution against the data.
print('Comparing k-means clusters against the data:')
print(pd.crosstab(y_pred, y))
# Accuracy tables.
table = pd.crosstab(y_pred, y, margins=True)
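The crosstab has to be read by eye because the cluster labels are arbitrary. A short sketch of how the agreement could be quantified instead, re-fitting once outside %%timeit (which does not persist variables between cells) and using sklearn's adjusted Rand index and silhouette score:
In [ ]:
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Re-fit once outside %%timeit so y_pred persists.
X_pca = PCA(2).fit_transform(normalize(X))
y_pred = KMeans(init='k-means++', n_clusters=6).fit_predict(X_pca)

# ARI measures agreement with the diagnosis regardless of label numbering;
# the silhouette score measures how well separated the clusters are.
print('Adjusted Rand index:', adjusted_rand_score(y, y_pred))
print('Silhouette score:', silhouette_score(X_pca, y_pred))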
In [156]:
%%timeit X_pca = PCA(2).fit_transform(normalize(X))
# The setup statement above recomputes X_pca (untimed), because variables
# defined inside a %%timeit cell do not persist between cells.
# Each batch will be made up of 200 data points.
minibatchkmeans = MiniBatchKMeans(
    init='random',
    n_clusters=6,
    batch_size=200)
minibatchkmeans.fit(X_pca)
# Predict cluster membership for each observation.
predict_mini = minibatchkmeans.predict(X_pca)
# Check the mini-batch clusters against the data.
print('Comparing mini-batch k-means clusters against the data:')
print(pd.crosstab(predict_mini, y))
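The crosstab above compares the mini-batch clusters with the diagnosis. To compare the two clustering solutions with each other, a sketch (again re-fitting outside %%timeit so both label vectors persist):
In [ ]:
# Fit both models once so their labels can be cross-tabulated.
X_pca = PCA(2).fit_transform(normalize(X))
y_pred = KMeans(init='k-means++', n_clusters=6).fit_predict(X_pca)
predict_mini = MiniBatchKMeans(init='random', n_clusters=6,
                               batch_size=200).fit_predict(X_pca)
print('Comparing k-means and mini-batch k-means solutions:')
print(pd.crosstab(y_pred, predict_mini))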
When we increase the number of clusters, there seems to be more overlap between them than in the two-cluster solution, and more outliers appear within each cluster. I don't find any of these solutions compelling, at least compared to the first example with two clusters: as the number of clusters grows, so does the overlap between them, and the distances between clusters become too small. From a computational perspective, tripling the number of clusters from two to six changes the computation time by less than 7%. Comparing the normal k-means algorithm with the mini-batch version using batches of 200 samples, the former takes about 9.3 times longer than the latter (299 ms per loop vs. 32.1 ms per loop).
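The per-loop times quoted above come from the %%timeit magics. A minimal sketch of the same comparison that isolates just the fit step, using the time module already imported at the top (exact numbers will vary by machine):
In [ ]:
X_pca = PCA(2).fit_transform(normalize(X))

start = time.perf_counter()
KMeans(init='k-means++', n_clusters=6).fit(X_pca)
print('k-means fit: {:.1f} ms'.format((time.perf_counter() - start) * 1000))

start = time.perf_counter()
MiniBatchKMeans(init='random', n_clusters=6, batch_size=200).fit(X_pca)
print('mini-batch fit: {:.1f} ms'.format((time.perf_counter() - start) * 1000))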