Iris(アヤメ)データセットに対してクラスタリングを行う。このデータはクラス数が3であることがあらかじめ分かっているので、HyperKEstimatorでk=3と推定できるかを確認する。
In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# Cluster the iris measurements twice -- once with the known k=3 and once with
# the k returned by HyperKEstimator -- and draw one 3D scatter plot per model.

# Seed NumPy's global RNG so the KMeans initialisation is repeatable.
np.random.seed(5)

iris = datasets.load_iris()
X = iris.data

# Estimate the number of clusters from the data; the result is read back with
# getK() below.
# NOTE(review): the meaning of the positional arguments (1, 5, ..., 20) is
# defined inside kHLL and is not visible from this notebook -- confirm.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK())}

fignum = 1
for est in estimators.values():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # Let the figure create and register the 3D axes. Constructing
    # Axes3D(fig, ...) directly does not attach the axes to the figure on
    # current Matplotlib releases, which leaves the plot empty.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

    est.fit(X)
    labels = est.labels_

    # The builtin float replaces the removed np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # Hide tick labels; ax.xaxis/yaxis/zaxis replace the removed w_*axis
    # aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum = fignum + 1
In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# Cluster the iris measurements twice -- once with the known k=3 and once with
# the k returned by HyperKEstimator -- and draw one 3D scatter plot per model.

# Seed NumPy's global RNG so the KMeans initialisation is repeatable.
np.random.seed(5)

iris = datasets.load_iris()
X = iris.data

# Estimate the number of clusters from the data; the result is read back with
# getK() below.
# NOTE(review): the meaning of the positional arguments (1, 5, ..., 20) is
# defined inside kHLL and is not visible from this notebook -- confirm.
kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
kestimator.train(X)

estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_hyper': KMeans(n_clusters=kestimator.getK())}

fignum = 1
for est in estimators.values():
    fig = plt.figure(fignum, figsize=(9, 5))
    plt.clf()
    # Let the figure create and register the 3D axes. Constructing
    # Axes3D(fig, ...) directly does not attach the axes to the figure on
    # current Matplotlib releases, which leaves the plot empty.
    ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

    est.fit(X)
    labels = est.labels_

    # The builtin float replaces the removed np.float alias.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))

    # Hide tick labels; ax.xaxis/yaxis/zaxis replace the removed w_*axis
    # aliases.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    fignum = fignum + 1
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
# Draw the iris samples in 3D, coloured by their true species, with a text
# label placed near the mean position of each species.
np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Plot the ground truth
# A fresh figure is opened here instead of reusing `fignum`, which is only
# defined by another cell and made this cell fail when run on its own.
fig = plt.figure(figsize=(9, 5))
# Let the figure create and register the 3D axes. Constructing
# Axes3D(fig, ...) directly does not attach the axes to the figure on current
# Matplotlib releases, which leaves the plot empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

for name, label in [('Setosa', 0),
                    ('Versicolour', 1),
                    ('Virginica', 2)]:
    members = y == label
    # Place the species name at the class mean, shifted by +1.5 along the
    # second plotted coordinate (sepal length).
    ax.text3D(X[members, 3].mean(),
              X[members, 0].mean() + 1.5,
              X[members, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))

# Reorder the labels to have colors matching the cluster results
# (the builtin float replaces the removed np.float alias).
y = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)

# Hide tick labels; ax.xaxis/yaxis/zaxis replace the removed w_*axis aliases.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
plt.show()
DPGMM(ディリクレ過程混合ガウスモデル)は、ディリクレ過程を用いてクラスタ数も含めて推定する手法である。HyperKEstimatorでkを推定してからKMeansを実行する場合の実行時間を、このDPGMMと比較した。
In [19]:
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import DPGMM
from sklearn import datasets
from kHLL.kestimate import HyperKEstimator
from kHLL.hash.image import md5_for_vec
# Time DPGMM against "estimate k with HyperKEstimator, then run KMeans" on the
# iris data and plot the elapsed wall-clock time of both approaches.
np.random.seed(5)

iris = datasets.load_iris()
X = iris.data

# NOTE(review): DPGMM is not available in newer scikit-learn releases
# (BayesianGaussianMixture is the documented successor). The import at the top
# of this cell has to be migrated together with the calls below -- confirm the
# target scikit-learn version before doing so.
try_count = 5
dpgmm_elapsed_times = []
hyper_kmeans_elapsed_times = []

# Trial i fits each approach i times in a row, for i = 1 .. try_count - 1.
for i in range(1, try_count):
    start = time.time()
    for _repeat in range(i):
        dpgmm_model = DPGMM()
        dpgmm_model.fit(X)
    dpgmm_elapsed_times.append(time.time() - start)

    start = time.time()
    for _repeat in range(i):
        kestimator = HyperKEstimator(1, 5, md5_for_vec, 20)
        kestimator.train(X)
        # Cluster with the k that was just estimated; a hard-coded k here
        # would discard the estimate that is being timed.
        kmeans_model = KMeans(n_clusters=kestimator.getK())
        kmeans_model.fit(X)
    hyper_kmeans_elapsed_times.append(time.time() - start)

# One x position per trial: the number of repeated fits in that trial.
x = list(range(1, try_count))
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(x, dpgmm_elapsed_times, 'r')         # red: DPGMM
axes.plot(x, hyper_kmeans_elapsed_times, 'g')  # green: HyperKEstimator + KMeans
axes.set_xlabel('x')
axes.set_ylabel('y')
axes.set_title('Comparison between DPGMM and HyperKMeans');
In [ ]: