In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Download the optdigits training split from the UCI repository.
# header=None: the first line of the file is read as data, not column names.
digits_traiin = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra',
    header=None,
)
In [3]:
# Download the optdigits test split from the UCI repository.
# header=None: the first line of the file is read as data, not column names.
digits_test = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',
    header=None,
)
In [4]:
# Split the training frame: columns 0..63 become the inputs, column 64 the target.
# (Presumably 64 pixel features plus the digit label -- confirm against the
# optdigits description.)
# NOTE: `digits_traiin` is misspelt, but the name is kept because it is the one
# the loading cell defines.
X_train = digits_traiin[list(range(64))]
Y_train = digits_traiin.loc[:, 64]
In [7]:
# Split the test frame the same way: columns 0..63 are the inputs, column 64 the target.
# NOTE: `X_tesy` is misspelt (likely meant `X_test`), but the name is kept because
# the prediction cell reads it.
X_tesy = digits_test[list(range(64))]
Y_test = digits_test.loc[:, 64]
In [8]:
from sklearn.cluster import KMeans #导入Kmeans模型
In [9]:
# Configure k-means with 10 cluster centres.
kmeans = KMeans(
    n_clusters=10,
)
In [10]:
# Fit the cluster centres on the training inputs; the fitted estimator is the
# cell's displayed value.
kmeans.fit(X=X_train)
Out[10]:
In [11]:
# Assign every test sample to its nearest fitted cluster centre.
y_pred = kmeans.predict(X=X_tesy)
In [12]:
# Evaluate clustering performance with the Adjusted Rand Index (ARI)
from sklearn import metrics
In [13]:
# Report the Adjusted Rand Index between the true test labels and the predicted
# cluster ids. The call form of print is used because it behaves the same on
# Python 2 (single argument) and Python 3, whereas the statement form is a
# SyntaxError on Python 3.
print(metrics.adjusted_rand_score(Y_test, y_pred))
In [14]:
# Use the silhouette coefficient to compare different numbers of clusters
from sklearn.metrics import silhouette_score
Out[14]:
In [15]:
#初始化原始数据点
x1 = np.array([1,2,3,1,5,6,5,5,6,7,8,9,7,9])
x2 = np.array([1,3,2,2,8,6,7,6,7,1,2,1,1,3])
X = np.array(zip(x1,x2)).reshape(len(x1),2)
In [29]:
Out[29]:
In [32]:
# Panel 1 of a 3x2 subplot grid: the raw, unclustered data points.
plt.subplot(3,2,1)
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Instances')
plt.scatter(x1,x2)

# One colour/marker pair per cluster label; 8 entries cover the largest k below.
colors = ['b','g','r','c','m','y','k','b']
markers = ['o','s','D','v','^','p','*','+']
# Cluster counts to compare; each one gets its own panel (2..6).
clusters=[2,3,4,5,8]
subplot_counter = 1
sc_scores=[]

for t in clusters:
    subplot_counter += 1
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    # Draw each point with the colour and marker of its assigned cluster.
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
    plt.xlim([0,10])
    plt.ylim([0,10])
    # Silhouette coefficient of this clustering, kept for the summary curve.
    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)
    plt.title('K=%s,silhouette coefficient=%0.03f'%(t,sc_score))

# Start a new, empty current figure so that later pyplot calls no longer target
# the last panel of the grid above.
plt.figure()
Out[32]:
In [33]:
# Summary curve: silhouette coefficient as a function of the number of clusters,
# drawn as a solid line with star markers.
plt.plot(clusters, sc_scores, marker='*', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')
plt.show()
In [34]:
from scipy.spatial.distance import cdist
In [51]:
# Draw three clusters from uniform distributions, 10 samples each.
# Every array is 2 x 10: one row per coordinate, one column per sample.
cluster1 = np.random.uniform(low=5, high=15, size=(2, 10))
cluster2 = np.random.uniform(low=55, high=65, size=(2, 10))
cluster3 = np.random.uniform(low=35, high=45, size=(2, 10))
In [37]:
# Join the three clusters column-wise into a single array holding all 30 samples
# (the distribution of these samples is plotted further below).
X = np.concatenate((cluster1, cluster2, cluster3), axis=1)
In [38]:
# Display the current contents of X as the cell output.
X
Out[38]:
In [39]:
# Put the samples in rows and the coordinates in columns.
# The array is rebuilt from the three clusters instead of transposing X in place:
# `X = X.T` flips the orientation back every second time the cell is executed,
# whereas this form gives the same result however often it is re-run.
X = np.hstack((cluster1, cluster2, cluster3)).T
In [40]:
# Display X again, now after the transpose step.
X
Out[40]:
In [42]:
# Scatter plot of the samples: column 0 of X on the horizontal axis, column 1 on
# the vertical axis.
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
In [43]:
# Evaluate the clustering quality for 9 different numbers of cluster centres
# (k = 1..9) and plot the result.
# The range starts at 1: k-means cannot be fitted with 0 clusters, and starting
# at 0 would also yield 10 candidates instead of the intended 9.
K = range(1,10)
In [49]:
# Start with an empty list of per-k mean distortions, then display X as the
# cell output.
meandistortions = list()
X
Out[49]:
In [48]:
# Mean distortion for every candidate number of clusters in K.
# The list is (re)created here so that re-running this cell does not append a
# second copy of the results to an already filled list.
meandistortions = []
for i in K:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    # Euclidean distance from every sample to its closest cluster centre ...
    nearest_centre_dist = np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)
    # ... averaged over all samples.
    meandistortions.append(nearest_centre_dist.mean())
In [ ]: