In [ ]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
#import matplotlib as mpl
import seaborn as sns
%matplotlib inline
Generating our data
In [ ]:
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,center_box=(-10.0, 10.0),random_state=0)
In [ ]:
sns.heatmap(X, xticklabels=False, yticklabels=False, linewidths=0,cbar=False)
Computing the pairwise distances for visualization purposes
We can compute pairwise distances using the sklearn.metrics functions summarized here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
In [ ]:
euclidean_dists = metrics.euclidean_distances(X)
Visualizing the data using the heatmap of pairwise distances
In [ ]:
sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False)
In [ ]:
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
print "The total error of the clustering is: ", error
print '\nCluster labels'
print labels
print '\n Cluster Centroids'
print centroids
There are 3 functions in all the clustering classes, fit() predict() and fit_predict().
fit() is building the model from the training data (e.g. finding the centroids),
predict() is assigning labels to test data after building the model,
fit_predict() is doing both in the same data (e.g in kmeans, it finds the centroids and assigns the labels to the dataset)
In [ ]:
#print original and cluster data
idx = np.argsort(labels)
rX = X[idx,:]
sns.heatmap( rX,xticklabels=False, yticklabels=False, linewidths=0,cbar=False)
In [ ]:
#Rearrange so that all same labels are consecutive
#print labels
#print labels[idx]
rearranged_dists = euclidean_dists[idx,:][:,idx]
sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False)
In [ ]:
error = np.zeros(11)
error[0] = 0;
for k in range(1,11):
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans.fit_predict(X)
error[k] = kmeans.inertia_
plt.plot(range(1,len(error)),error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Error')
Making this a function
In [ ]:
def evaluate_clusters(X,max_clusters):
error = np.zeros(max_clusters+1)
error[0] = 0;
for k in range(1,max_clusters+1):
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans.fit_predict(X)
error[k] = kmeans.inertia_
plt.plot(range(1,len(error)),error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Error')
evaluate_clusters(X,10)
If $T$ is a ground truth label assignment and $C$ the clustering. Let $a$ be: the number of pairs of elements that have the same label in $T$ and the same label in $C$. Also let $b$ be: the number of pairs of elements that have different labels in $T$ and different labels in $C$. Then the Rand Index is: $$\text{RI}(T,C) = \frac{a+b}{n\choose 2} $$
However the RI score does not guarantee that random label assignments will get a value close to zero (esp. if the number of clusters is in the same order of magnitude as the number of samples). To counter this effect we can discount the expected RI $E[\text{RI}]$ of random labelings by defining the adjusted Rand index as follows: $$\text{ARI} = \frac{\text{RI} - E[\text{RI}]}{\max(\text{RI}) - E[\text{RI}]}$$
Range?
In [ ]:
ri = metrics.adjusted_rand_score(labels,y)
print ri
In [ ]:
def ri_evaluate_clusters(X,max_clusters,ground_truth):
ri = np.zeros(max_clusters+1)
ri[0] = 0;
for k in range(1,max_clusters+1):
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans.fit_predict(X)
ri[k] = metrics.adjusted_rand_score(kmeans.labels_,ground_truth)
plt.plot(range(1,len(ri)),ri[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Adjusted Rand Index')
ri_evaluate_clusters(X,10,y)
Advantages/Disadvantages?
If the ground truth labels are not known, evaluation must be performed using the model itself. The Silhouette Coefficient (sklearn.metrics.silhouette_score) is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with better defined clusters. Let $a$ be the mean distance between a sample and all other points in the same class and $b$ be the mean distance between a sample and all other points in the next nearest cluster. Then the Silhoeutte Coefficient for a clustering is: $$s = \frac{b - a}{\max(a, b)}$$
Range?
In [ ]:
sc = metrics.silhouette_score(X, labels, metric='euclidean')
print sc
In [ ]:
def sc_evaluate_clusters(X,max_clusters):
s = np.zeros(max_clusters+1)
s[0] = 0;
s[1] = 0;
for k in range(2,max_clusters+1):
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans.fit_predict(X)
s[k] = metrics.silhouette_score(X,kmeans.labels_,metric='cosine')
plt.plot(range(2,len(s)),s[2:])
plt.xlabel('Number of clusters')
plt.ylabel('Adjusted Rand Index')
#sc_evaluate_clusters(X,10)
In [ ]:
from sklearn.datasets import fetch_20newsgroups
"""
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
'rec.autos',
'rec.sport.baseball'
]"""
categories = ['alt.atheism', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
print news_data.target, len(news_data.target)
print news_data.target_names
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=4, max_df=0.8)
data = vectorizer.fit_transform(news_data.data)
In a large text corpus, some words will be very present (e.g. “the”, “a”, “is” in English) hence carrying very little meaningful information about the actual contents of the document. If we were to feed the direct count data directly to a classifier those very frequent terms would shadow the frequencies of rarer yet more interesting terms.
In order to re-weight the count features into floating point values suitable for usage by a classifier it is very common to use the tf–idf transform.
Tf means term-frequency while tf–idf means term-frequency times inverse document-frequency. This is a originally a term weighting scheme developed for information retrieval (as a ranking function for search engines results), that has also found good use in document classification and clustering.
If $N$ is the total number of documents in the corpus $D$ then
$$\text{idf}(t,D)=\frac{N}{|\{d\in D\mid t\in d \}|}$$$$\text{tf-idf}(t,d)=\text{tf}(t,d)\times \text{idf}(t,D)$$
In [ ]:
print type(data), data.shape
In [ ]:
fig, ax1 = plt.subplots(1,1,figsize=(15,10))
sns.heatmap(data[1:100,1:200].todense(), xticklabels=False, yticklabels=False,
linewidths=0, cbar=False, ax=ax1)
In [ ]:
print news_data.target
print news_data.target_names
In [ ]:
evaluate_clusters(data, 10)
In [ ]:
ri_evaluate_clusters(data,10,news_data.target)
In [ ]:
sc_evaluate_clusters(data,10)
In [ ]:
k=4
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
kmeans.fit_predict(data)
In [ ]:
print("Top terms per cluster:")
asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
order_centroids = asc_order_centroids[:,::-1]
terms = vectorizer.get_feature_names()
for i in range(k):
print "Cluster %d:" % i
for ind in order_centroids[i, :10]:
print ' %s' % terms[ind]
print
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
styles = open("../theme/custom.css", "r").read()
return HTML(styles)
css_styling()
Out[1]: