In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
digits_traiin = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', header=None)

In [3]:
digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', header=None)

In [4]:
X_train = digits_traiin[np.arange(64)]
Y_train = digits_traiin[64]

In [7]:
X_tesy = digits_test[np.arange(64)]
Y_test = digits_test[64]

In [8]:
from sklearn.cluster import KMeans #导入Kmeans模型

In [9]:
kmeans = KMeans(n_clusters=10)#设置中心数量为10

In [10]:
kmeans.fit(X_train)


Out[10]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
y_pred = kmeans.predict(X_tesy)

In [12]:
#使用ARI进行聚类性能评估
from sklearn import metrics

In [13]:
print metrics.adjusted_rand_score(Y_test, y_pred)


0.663057794933

In [14]:
#利用轮廓系数评价不同类族数量
from sklearn.metrics import silhouette_score


Out[14]:
<matplotlib.axes.AxesSubplot at 0x11225db50>

In [15]:
#初始化原始数据点
x1 = np.array([1,2,3,1,5,6,5,5,6,7,8,9,7,9])
x2 = np.array([1,3,2,2,8,6,7,6,7,1,2,1,1,3])
X = np.array(zip(x1,x2)).reshape(len(x1),2)

In [29]:



Out[29]:
<matplotlib.collections.PathCollection at 0x113577b10>

In [32]:
plt.subplot(3,2,1) #分割出3*2个子图,并在一号子图作图
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Instances')
plt.scatter(x1,x2)
colors = ['b','g','r','c','m','y','k','b']
markers = ['o','s','D','v','^','p','*','+']
clusters=[2,3,4,5,8]
subplot_counter = 1
sc_scores=[]
#在一号子图做出原始数据点阵分布
for t in clusters:
    subplot_counter+=1
    plt.subplot(3,2,subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i],x2[i],color=colors[l], marker=markers[l],ls='None')
    plt.xlim([0,10])
    plt.ylim([0,10])
    sc_score = silhouette_score(X,kmeans_model.labels_,metric='euclidean')
    sc_scores.append(sc_score)
    plt.title('K=%s,silhouette coefficient=%0.03f'%(t,sc_score))
plt.figure()


Out[32]:
<matplotlib.figure.Figure at 0x11371ab90>
<matplotlib.figure.Figure at 0x11371ab90>

In [33]:
plt.plot(clusters, sc_scores,'*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')
plt.show()



In [34]:
from scipy.spatial.distance import cdist

In [51]:
#使用均匀分布函数随机三个簇,每个10个样本
cluster1 = np.random.uniform(5,15,(2,10))
cluster2 = np.random.uniform(55,65,(2,10))
cluster3 = np.random.uniform(35,45,(2,10))

In [37]:
#绘制30个数据样本的分布图像
X=np.hstack((cluster1,cluster2,cluster3))

In [38]:
X


Out[38]:
array([[ 1.28600192,  0.8485312 ,  1.34406422,  1.1163339 ,  0.67541803,
         0.80242334,  0.72691934,  1.1640541 ,  0.56663667,  0.69349049,
         6.09010394,  6.3401809 ,  6.3397448 ,  5.98153699,  6.10681759,
         5.70601182,  5.54882011,  5.56655686,  5.61394002,  5.99704258,
         3.82811531,  3.94689521,  3.9975668 ,  3.7531011 ,  3.58777756,
         3.90755793,  4.06945004,  4.06648043,  3.99454492,  4.09264447],
       [ 1.39709414,  1.10872917,  0.97124146,  0.88234763,  1.25274393,
         0.59347155,  0.85089681,  1.09675432,  1.15178532,  1.25109913,
         6.43151041,  6.4298607 ,  6.45276557,  6.18723749,  5.70790896,
         6.36031912,  5.62770569,  6.18665773,  5.85017161,  5.52860609,
         3.55426974,  4.15074908,  3.82041377,  4.48434513,  4.25089974,
         3.7135369 ,  4.08399628,  3.62924781,  3.86921546,  3.94085738]])

In [39]:
X = X.T

In [40]:
X


Out[40]:
array([[ 1.28600192,  1.39709414],
       [ 0.8485312 ,  1.10872917],
       [ 1.34406422,  0.97124146],
       [ 1.1163339 ,  0.88234763],
       [ 0.67541803,  1.25274393],
       [ 0.80242334,  0.59347155],
       [ 0.72691934,  0.85089681],
       [ 1.1640541 ,  1.09675432],
       [ 0.56663667,  1.15178532],
       [ 0.69349049,  1.25109913],
       [ 6.09010394,  6.43151041],
       [ 6.3401809 ,  6.4298607 ],
       [ 6.3397448 ,  6.45276557],
       [ 5.98153699,  6.18723749],
       [ 6.10681759,  5.70790896],
       [ 5.70601182,  6.36031912],
       [ 5.54882011,  5.62770569],
       [ 5.56655686,  6.18665773],
       [ 5.61394002,  5.85017161],
       [ 5.99704258,  5.52860609],
       [ 3.82811531,  3.55426974],
       [ 3.94689521,  4.15074908],
       [ 3.9975668 ,  3.82041377],
       [ 3.7531011 ,  4.48434513],
       [ 3.58777756,  4.25089974],
       [ 3.90755793,  3.7135369 ],
       [ 4.06945004,  4.08399628],
       [ 4.06648043,  3.62924781],
       [ 3.99454492,  3.86921546],
       [ 4.09264447,  3.94085738]])

In [42]:
plt.scatter(X[:,0],X[:,1])
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()



In [43]:
#测试9中不同聚类中心数量下,每种情况的聚类质量并作图
K = range(0,10)

In [49]:
meandistortions=[]
X


Out[49]:
array([[ 1.28600192,  1.39709414],
       [ 0.8485312 ,  1.10872917],
       [ 1.34406422,  0.97124146],
       [ 1.1163339 ,  0.88234763],
       [ 0.67541803,  1.25274393],
       [ 0.80242334,  0.59347155],
       [ 0.72691934,  0.85089681],
       [ 1.1640541 ,  1.09675432],
       [ 0.56663667,  1.15178532],
       [ 0.69349049,  1.25109913],
       [ 6.09010394,  6.43151041],
       [ 6.3401809 ,  6.4298607 ],
       [ 6.3397448 ,  6.45276557],
       [ 5.98153699,  6.18723749],
       [ 6.10681759,  5.70790896],
       [ 5.70601182,  6.36031912],
       [ 5.54882011,  5.62770569],
       [ 5.56655686,  6.18665773],
       [ 5.61394002,  5.85017161],
       [ 5.99704258,  5.52860609],
       [ 3.82811531,  3.55426974],
       [ 3.94689521,  4.15074908],
       [ 3.9975668 ,  3.82041377],
       [ 3.7531011 ,  4.48434513],
       [ 3.58777756,  4.25089974],
       [ 3.90755793,  3.7135369 ],
       [ 4.06945004,  4.08399628],
       [ 4.06648043,  3.62924781],
       [ 3.99454492,  3.86921546],
       [ 4.09264447,  3.94085738]])

In [48]:
for i in K:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    meandistortions.append(sum(np.min(cdist(X,kmeans.cluster_centers_,'euclidean'), axis=1))/X.shape[0])


/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.py:88: RuntimeWarning: divide by zero encountered in log
  n_local_trials = 2 + int(np.log(n_clusters))
---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
<ipython-input-48-a2cd9df90603> in <module>()
      1 for i in K:
      2     kmeans = KMeans(n_clusters=i)
----> 3     kmeans.fit(X)
      4     meandistortions.append(sum(np.min(cdist(X,kmeans.cluster_centers_,'euclidean'), axis=1))/X.shape[0])

/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in fit(self, X, y)
    891                 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
    892                 n_jobs=self.n_jobs, algorithm=self.algorithm,
--> 893                 return_n_iter=True)
    894         return self
    895 

/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in k_means(X, n_clusters, init, precompute_distances, n_init, max_iter, verbose, tol, random_state, copy_x, n_jobs, algorithm, return_n_iter)
    344                 X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
    345                 precompute_distances=precompute_distances, tol=tol,
--> 346                 x_squared_norms=x_squared_norms, random_state=random_state)
    347             # determine if these results are the best so far
    348             if best_inertia is None or inertia < best_inertia:

/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _kmeans_single_elkan(X, n_clusters, max_iter, init, verbose, x_squared_norms, random_state, tol, precompute_distances)
    393     # init
    394     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
--> 395                               x_squared_norms=x_squared_norms)
    396     centers = np.ascontiguousarray(centers)
    397     if verbose:

/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _init_centroids(X, k, init, random_state, x_squared_norms, init_size)
    682     if isinstance(init, string_types) and init == 'k-means++':
    683         centers = _k_init(X, k, random_state=random_state,
--> 684                           x_squared_norms=x_squared_norms)
    685     elif isinstance(init, string_types) and init == 'random':
    686         seeds = random_state.permutation(n_samples)[:k]

/Users/wizardholy/soft/dunas/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials)
     86         # specific results for other than mentioning in the conclusion
     87         # that it helped.
---> 88         n_local_trials = 2 + int(np.log(n_clusters))
     89 
     90     # Pick first center randomly

OverflowError: cannot convert float infinity to integer

In [ ]: