HyperLogLogを使ったクラスタリングのための効率的なk推定手法の提案

精度の比較

Irisデータセット(アヤメのデータ)に対してクラスタリングを行う。このデータはラベル数が3であるとあらかじめ分かっているので、HyperKEstimatorでk=3を推定できるかを確認する。


In [1]:
# Cluster the Iris data with k-means twice -- once with the known k=3 and once
# with the k reported by HyperKEstimator -- and draw each clustering in 3D.
import numpy as np
import matplotlib.pyplot as plt
# Kept for its import side effect: older matplotlib releases only register
# the '3d' projection after this module has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

from sklearn.cluster import KMeans
from sklearn import datasets

from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Train the estimator on the samples; getK() supplies the cluster count below.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK())}

# fignum stays a notebook-level counter (one figure per estimator) so that
# following cells can continue the numbering.
fignum = 1
for est in estimators.values():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # Ask the figure for the 3D axes instead of calling Axes3D(fig, ...):
    # recent matplotlib no longer attaches a directly constructed Axes3D to
    # the figure, which leaves the plot empty.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

    est.fit(X)
    labels = est.labels_

    # Builtin float replaces the removed np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # xaxis/yaxis/zaxis replace the removed w_xaxis/w_yaxis/w_zaxis aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum += 1



In [1]:
# Cluster the Iris data with k-means twice -- once with the known k=3 and once
# with the k reported by HyperKEstimator -- and draw each clustering in 3D.
import numpy as np
import matplotlib.pyplot as plt
# Kept for its import side effect: older matplotlib releases only register
# the '3d' projection after this module has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

from sklearn.cluster import KMeans
from sklearn import datasets

from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Train the estimator on the samples; getK() supplies the cluster count below.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK())}

# fignum stays a notebook-level counter (one figure per estimator) so that
# following cells can continue the numbering.
fignum = 1
for est in estimators.values():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # Ask the figure for the 3D axes instead of calling Axes3D(fig, ...):
    # recent matplotlib no longer attaches a directly constructed Axes3D to
    # the figure, which leaves the plot empty.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

    est.fit(X)
    labels = est.labels_

    # Builtin float replaces the removed np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # xaxis/yaxis/zaxis replace the removed w_xaxis/w_yaxis/w_zaxis aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum += 1



In [2]:
# Draw the Iris samples coloured by their true species, with a text label at
# each species' mean position, for comparison with the clustering figures.
import numpy as np
import matplotlib.pyplot as plt
# Kept for its import side effect: older matplotlib releases only register
# the '3d' projection after this module has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

from sklearn.cluster import KMeans
from sklearn import datasets

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Plot the ground truth
# A fresh figure is opened here rather than reusing a figure number defined
# in another cell, so this cell also runs on its own.
fig = plt.figure(figsize=(9, 5))
# Ask the figure for the 3D axes instead of calling Axes3D(fig, ...): recent
# matplotlib no longer attaches a directly constructed Axes3D to the figure.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

for name, label in [('Setosa', 0),
                    ('Versicolour', 1),
                    ('Virginica', 2)]:
    members = y == label
    # Place the caption at the class mean, shifted by 1.5 along the
    # sepal-length axis.
    ax.text3D(X[members, 3].mean(),
              X[members, 0].mean() + 1.5,
              X[members, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results
# (builtin float replaces the removed np.float alias).
y = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)

# xaxis/yaxis/zaxis replace the removed w_xaxis/w_yaxis/w_zaxis aliases.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
plt.show()


実行時間の比較

DPGMM(ディリクレ過程混合ガウスモデル)は、ディリクレ過程を用いてクラスタ数も含めて推定する手法である。この手法と、HyperKEstimatorでkを推定してからk-meansを実行する方法とで、実行時間を比較した。


In [19]:
# Time a Dirichlet-process Gaussian mixture fit against the
# HyperKEstimator + k-means pipeline on the Iris data and plot both curves.
import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
# sklearn.mixture.DPGMM was removed from scikit-learn; BayesianGaussianMixture
# with a Dirichlet-process prior is its documented replacement.
from sklearn.mixture import BayesianGaussianMixture
from sklearn import datasets

from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec


np.random.seed(5)

iris = datasets.load_iris()
X = iris.data

try_count = 5
# Each measurement times `repeats` back-to-back fits, for repeats = 1..try_count-1.
repeat_counts = list(range(1, try_count))
dpgmm_elapsed_times = []
hyper_kmeans_elapsed_times = []
for repeats in repeat_counts:
    ## DPGMM
    # NOTE(review): n_components is left at the library default, as in the
    # original DPGMM() call -- confirm that is the intended truncation level.
    start = time.time()
    for _ in range(repeats):
        dpgmm_model = BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_process')
        dpgmm_model.fit(X)
    dpgmm_elapsed_times.append(time.time() - start)

    ## HyperKEstimator + k-means
    start = time.time()
    for _ in range(repeats):
        kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
        kestimator.train(X)
        # Cluster with the estimated k so the timed pipeline really uses the
        # estimator's output instead of a hard-coded 3.
        kmeans_model = KMeans(n_clusters=kestimator.getK())
        kmeans_model.fit(X)
    hyper_kmeans_elapsed_times.append(time.time() - start)

x = repeat_counts


fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# Red: Dirichlet-process mixture; green: HyperKEstimator + k-means.
axes.plot(x, dpgmm_elapsed_times, 'r')
axes.plot(x, hyper_kmeans_elapsed_times, 'g')
# x is the number of fits timed together; y is elapsed wall-clock seconds.
axes.set_xlabel('x')
axes.set_ylabel('y')
axes.set_title('Comparison between DPGMM and HyperKMeans');



In [ ]: