In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
In [2]:
import numpy as np
from sklearn import manifold
from sklearn import datasets
# Load the six-class subset of the scikit-learn digits data.
digits = datasets.load_digits(n_class=6)

# NOTE(review): features are divided by 255; load_digits pixel intensities are
# presumably on a 0..16 scale, so confirm this is the intended normalisation.
X, y = digits.data / 255., digits.target
n_samples, n_features = X.shape
n_neighbors = 30

# Project the samples to two dimensions; PCA initialisation and a fixed seed
# are requested so the layout is repeatable across runs.
print("Computing t-SNE embedding")
tsne = manifold.TSNE(
    n_components=2,
    init='pca',
    random_state=0,
)
X_tsne = tsne.fit_transform(X)
In [3]:
# Bivariate density of the two t-SNE coordinates, with the lowest contour
# level left unshaded.
sns.kdeplot(
    X_tsne[:, 0],
    X_tsne[:, 1],
    shade=True,
    shade_lowest=False,
)
Out[3]:
In [45]:
from sklearn.cluster import KMeans
from clumpy.k_medoids import KMedoids
from hdbscan import HDBSCAN
# K-means at three granularities, all seeded identically.
kmeans_4, kmeans_5, kmeans_6 = (
    KMeans(n_clusters=k, random_state=12) for k in (4, 5, 6)
)

# K-medoids counterparts for the 4- and 6-cluster cases.
kmedoids_4, kmedoids_6 = (
    KMedoids(n_clusters=k, random_state=12) for k in (4, 6)
)

# Density-based clustering with default parameters.
hdbscan = HDBSCAN()

# Fit every model on the raw features (not on the t-SNE projection) and keep
# the per-sample assignments for plotting.
labels_4, labels_5, labels_6 = (
    model.fit_predict(X) for model in (kmeans_4, kmeans_5, kmeans_6)
)
labels_hdb = hdbscan.fit_predict(X)
labels_medoids_4, labels_medoids_6 = (
    model.fit_predict(X) for model in (kmedoids_4, kmedoids_6)
)

# Attach an n_clusters attribute to the fitted HDBSCAN object, mirroring the
# constructor argument the other clusterers carry.
# NOTE(review): np.unique counts every distinct value in labels_, so a noise
# label (if HDBSCAN emits one) is counted as a cluster - confirm that is what
# the downstream similarity code expects.
hdbscan.n_clusters = len(np.unique(hdbscan.labels_))
In [5]:
def to_dataframe(labels, embedding=None):
    """Pair each sample's 2-D embedding coordinates with its cluster label.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of the embedding.
    embedding : 2-D array, optional
        Array indexed as ``embedding[:, 0]`` and ``embedding[:, 1]``.
        Defaults to the notebook-level ``X_tsne`` so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` (first two embedding columns) and ``labels``.
    """
    if embedding is None:
        # Resolved at call time, so a re-run of the t-SNE cell is picked up.
        embedding = X_tsne
    return pd.DataFrame({
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'labels': labels,
    })
# Colormap names meant to be indexed by cluster id for per-cluster density
# plots (six entries).
colors = ['Reds', 'Blues', 'Oranges', 'Greens', 'Purples', 'summer']
def plot_cluster(labels):
    """Draw the 2-D embedding as a scatter plot coloured by cluster label.

    Parameters
    ----------
    labels : array-like
        One cluster label per embedded sample; passed to ``to_dataframe`` and
        used as the hue variable.

    Returns
    -------
    None
        The figure is produced as a side effect of the seaborn call.
    """
    data = to_dataframe(labels)
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, and older releases accept the keyword form as well.
    # fit_reg=False leaves a plain scatter with no regression line.
    sns.lmplot(x='x', y='y', hue='labels', data=data, fit_reg=False)
In [6]:
# Embedding coloured by the 4-cluster k-means assignment.
plot_cluster(labels_4)
In [7]:
# Embedding coloured by the 5-cluster k-means assignment.
plot_cluster(labels_5)
In [8]:
# Embedding coloured by the 6-cluster k-means assignment.
plot_cluster(labels_6)
In [9]:
# Embedding coloured by the HDBSCAN assignment (default parameters).
plot_cluster(labels_hdb)
In [46]:
# Embedding coloured by the 4-cluster k-medoids assignment.
plot_cluster(labels_medoids_4)
In [47]:
# Embedding coloured by the 6-cluster k-medoids assignment.
plot_cluster(labels_medoids_6)
In [49]:
from clumpy.similarity import cluster_similarity
from clumpy.similarity.cluster_graph import to_similarity_matrix
# Derive the HDBSCAN cluster count from the fitted labels, the same way the
# fitting cell does, rather than from a hand-typed constant (previously 11)
# that goes stale whenever the data or parameters change.
hdbscan.n_clusters = len(np.unique(hdbscan.labels_))

# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(to_similarity_matrix(kmeans_4, kmeans_5, X))

# Pairwise similarity between fitted clusterers, evaluated on the same data.
print(cluster_similarity(kmeans_4, kmeans_5, X))
print(cluster_similarity(kmeans_4, kmeans_6, X))
print(cluster_similarity(kmeans_5, kmeans_6, X))
print(cluster_similarity(kmedoids_6, kmedoids_4, X))
print(cluster_similarity(kmeans_6, hdbscan, X))
In [39]:
from clumpy.similarity import clusterer_embedding
# Fitted clusterers to compare; the display names below are paired with them
# by position, so both lists must stay in the same order.
algs = [
    kmeans_4,
    kmeans_5,
    kmeans_6,
    hdbscan,
    kmedoids_4,
    kmedoids_6,
]
alg_names = [
    'kmeans_4',
    'kmeans_5',
    'kmeans_6',
    'hdbscan',
    'kmedoids_4',
    'kmedoids_6',
]

# Presumably yields one 2-D point per clusterer - the plotting code reads
# columns 0 and 1 of the result.
embedding = clusterer_embedding(algs, X)
In [54]:
def plot_embedding(embedding, alg_names=None):
    """Scatter one labelled point per row of a 2-D embedding.

    Parameters
    ----------
    embedding : 2-D array
        Only columns 0 and 1 are plotted; row ``i`` is labelled with
        ``alg_names[i]``.
    alg_names : sequence of str, optional
        Legend label for each row. When omitted or empty, rows are labelled
        ``clusterer_0``, ``clusterer_1``, ... so the call still succeeds
        (an empty label list cannot be aligned with a non-empty embedding).

    Returns
    -------
    None
        The figure is produced as a side effect of the seaborn call.
    """
    if alg_names is None or len(alg_names) == 0:
        alg_names = ['clusterer_%d' % i for i in range(len(embedding))]

    data = pd.DataFrame({
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'labels': alg_names,
    })
    # Keyword x/y works on both old and new seaborn; positional does not on
    # newer releases. fit_reg=False leaves a plain scatter.
    g = sns.lmplot(x='x', y='y', data=data, hue='labels', fit_reg=False)
    # Hide the tick marks on both axes.
    g.set(xticks=[], yticks=[])
In [59]:
# One point per clusterer, labelled with the names listed in alg_names.
plot_embedding(embedding, alg_names=alg_names)
In [13]:
# Distinct labels HDBSCAN assigned; the number of entries here is the value
# stored in hdbscan.n_clusters by the fitting cell.
np.unique(hdbscan.labels_)
Out[13]: