In [19]:
%pylab inline
In [6]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
In [38]:
print iris.DESCR
In [14]:
print X[0:10]
In [15]:
print y
In [72]:
# Compute clustering with Means
k_means = KMeans(n_clusters=3)
k_means.fit(X)
Out[72]:
In [73]:
import numpy as np
from sklearn.cluster import KMeans
k_means_labels = k_means.labels_
print k_means_labels
k_means_cluster_centers = k_means.cluster_centers_
print k_means_cluster_centers
k_means_labels_unique = np.unique(k_means_labels)
print k_means_labels_unique
In [74]:
import pylab as pl

# Quick look at the three cluster centers, using only the first two
# feature dimensions of each 4-D center.
fig = pl.figure(1)
for x_val, y_val in k_means_cluster_centers[:, :2]:
    pl.plot(x_val, y_val, 'o')
pl.title('KMeans')
pl.show()
In [75]:
# Demonstrate pairing cluster indices with their plot colors.
# list() calls make the lazy range/zip objects display their contents on Python 3.
print(list(range(3)))
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
print(colors)
print(list(zip(range(3), colors)))
for k, col in zip(range(3), colors):
    print(k, col)
In [76]:
import pylab as pl

# Scatter the samples in the plane of features 2 and 3 (petal measurements),
# colored by their k-means cluster, with each cluster center overplotted.
fig = pl.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
for k, col in enumerate(colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    pl.plot(X[my_members, 2], X[my_members, 3], 'w',
            marker='.', markerfacecolor=col)
    pl.plot(cluster_center[2], cluster_center[3], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
pl.title('KMeans Clustering by Petal')
pl.xticks(())
pl.yticks(())
pl.show()
In [77]:
import pylab as pl

# Same cluster visualization as the petal plot, but over features 0 and 1
# (sepal measurements).
fig = pl.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
for k in range(3):
    col = colors[k]
    members = k_means_labels == k
    center = k_means_cluster_centers[k]
    pl.plot(X[members, 0], X[members, 1], 'w',
            markerfacecolor=col, marker='.')
    pl.plot(center[0], center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
pl.title('KMeans Clustering By Sepal')
pl.xticks(())
pl.yticks(())
pl.show()
In [54]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient over all samples for the 3-cluster fit.
silhouette_avg = silhouette_score(X, k_means_labels)
print("The average silhouette_score is :", silhouette_avg)
In [57]:
from sklearn import cluster  # was relying on an import from a much later cell

# Fit-predict in one call and report the within-cluster sum of squares.
c = cluster.KMeans(n_clusters=3)
k_data = c.fit_predict(X)
print(c.inertia_)
In [60]:
wcss = []
for i in range(2,20):
c = KMeans(n_clusters=i)
k_data = c.fit_predict(X)
wcss.append((i, c.inertia_))
print i, c.inertia_
In [71]:
import matplotlib.pyplot as plt
plotary = [ele[1] for ele in wcss ]
plt.plot(plotary)
plt.ylabel('WSS')
plt.xticks(range(0,20), range(2,22))
plt.show()
In [63]:
silhouette_ary = []
for i in range(2,20):
c = KMeans(n_clusters=i)
c.fit(X)
k_means_labels = c.labels_
silhouette_avg = silhouette_score(X, k_means_labels)
silhouette_ary.append((i, silhouette_avg))
print i, silhouette_avg
In [70]:
import matplotlib.pyplot as plt
plotary = [ele[1] for ele in silhouette_ary]
plt.plot(plotary)
plt.ylabel('Silhoette')
plt.xticks(range(0,20), range(2,22))
plt.show()
In [78]:
print X[0:10]
In [88]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, whiten=True)
pca.fit(X)
Out[88]:
In [89]:
X_reduced = pca.transform(X)
print "Reduced dataset shape:", X_reduced.shape
In [90]:
print X_reduced[0:10]
In [106]:
import pylab as pl
pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)
Out[106]:
In [105]:
print "Meaning of the 2 components:"
for component in pca.components_:
print " + ".join("%.3f x %s" % (value, name)
for value, name in zip(component, iris.feature_names))
In [122]:
from sklearn.feature_extraction.text import CountVectorizer
jieba.add_word('洪智坤')
ary = ['【更新】柯P:洪智坤洩漏公文案還沒看到公文 今處理',
'留洪智坤 柯:殘障求職不易',
'人事處議處洪智坤 柯P:不清楚議處結果']
corpus = []
corpus.extend([' '.join(jieba.cut(ele)) for ele in ary])
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
word = vectorizer.get_feature_names()
for w in word:
print w,
print
print X.toarray()
In [123]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(X)
weight = tfidf.toarray()
for w in word:
print w,
print
print weight
In [177]:
from xml.dom import minidom  # NOTE(review): unused in this cell; kept to avoid breaking anything elsewhere
from xml.etree import ElementTree
import jieba.analyse

# Parse the saved RSS feed. Reading in binary lets ElementTree honor the
# encoding declared in the XML prolog instead of relying on the platform default.
with open('1435449602.xml', 'rb') as f:
    events = ElementTree.fromstring(f.read())

corpus = []
ary = []
for elem in events.findall('./channel/item'):
    guid = elem.find('guid').text
    title = elem.find('title').text
    description = elem.find('description').text
    pubDate = elem.find('pubDate').text
    source = elem.find('source').text
    ary.append(title)
    # Represent each article by its top-20 extracted keywords.
    corpus.append(' '.join(jieba.analyse.extract_tags(description, 20)))
In [178]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
word = vectorizer.get_feature_names()
In [179]:
weight= X.toarray()
In [115]:
#for ele in word:
# print ele,
In [121]:
import math

def tfidf(t, d, D):
    """Term frequency-inverse document frequency of term t.

    t: term (string); d: the document containing t, as a list of terms;
    D: the corpus, a list of documents.
    Raises ZeroDivisionError if t appears in no document of D.
    """
    # Term frequency: count of t relative to the total term count of d.
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
    # Inverse document frequency. math.log replaces sp.log: the scipy
    # numpy-alias functions (scipy.log etc.) were deprecated and removed.
    idf = math.log(float(len(D)) / len([doc for doc in D if t in doc]))
    return tf * idf

a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]
print(tfidf('a', a, D))
print(tfidf('b', abb, D))
print(tfidf('c', abc, D))
In [126]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-count the corpus and convert the counts to tf-idf weights.
# NOTE(review): the name `tfidf` shadows the tfidf() helper defined earlier.
X = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
weight = tfidf.toarray()
In [180]:
print weight
In [181]:
from sklearn import cluster
c = cluster.KMeans(n_clusters=4)
k_data = c.fit_predict(weight)
In [188]:
# List the article titles assigned to cluster 1.
for idx, clusterid in enumerate(k_data):
    if clusterid == 1:
        print(idx, clusterid, ary[idx])
In [136]:
silhouette_ary = []
for i in range(2,20):
c = KMeans(n_clusters=i)
c.fit(weight)
k_means_labels = c.labels_
silhouette_avg = silhouette_score(weight, k_means_labels)
silhouette_ary.append((i, silhouette_avg))
print i, silhouette_avg
In [137]:
import matplotlib.pyplot as plt
plotary = [ele[1] for ele in silhouette_ary]
plt.plot(plotary)
plt.ylabel('Silhoette')
plt.xticks(range(0,20), range(2,22))
plt.show()
In [167]:
from sklearn import cluster
c = cluster.KMeans(n_clusters=4)
k_data = c.fit_predict(weight)
In [168]:
import numpy as np
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(weight)
print pca_data.shape
In [169]:
print pca_data[0:10]
In [170]:
# Scatter the PCA-projected documents, one color per k-means cluster.
fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 5))
point_colors = ['rgbyc'[label] for label in k_data]
axes.scatter(pca_data[:, 0], pca_data[:, 1], c=point_colors, s=200)
axes.set_title('pca plot')
Out[170]:
In [189]:
# List the article titles assigned to cluster 3.
for idx, clusterid in enumerate(k_data):
    if clusterid == 3:
        print(idx, clusterid, ary[idx])
In [ ]: