In [97]:
from IPython.display import Image
Image(filename='Mor Consulting CMYK.jpg')
Out[97]:
In [62]:
%pylab inline
In [98]:
import numpy as np
from sklearn.datasets import make_blobs, make_moons
# Generate sample data
np.random.seed(0)
centres = [[1, 1], [-0.5, 0], [1, -1]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.3)
figure(figsize=(10, 10))
colors = ['r','b','g']
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
In [64]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
##############################################################################
# Compute clustering with 3 Clusters
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
##############################################################################
# Plot result
# For each true centre, find the index of the nearest fitted centre so the
# colours of the two sets of centres line up
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)
# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')
Out[64]:
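The coloured-centre matching above is one way to inspect the fit. As a quick cross-check (an addition, not part of the original run), scikit-learn's adjusted Rand index scores agreement between true and fitted labels regardless of how the cluster indices happen to be permuted:
from sklearn import metrics
# Permutation-invariant agreement between true and fitted labels:
# 1.0 is perfect, ~0.0 is chance level
print(metrics.adjusted_rand_score(labels_true, k_means_3_labels))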
In [65]:
# Generate sample data
np.random.seed(0)
centres = [[1, 0.75], [1, -0.75], [0, 0]]
X0, labels0_true = make_blobs(n_samples=300, centers=[centres[0]], cluster_std=[[0.6, 0.1]])
X1, labels1_true = make_blobs(n_samples=300, centers=[centres[1]], cluster_std=[[0.6, 0.1]])
X2, labels2_true = make_blobs(n_samples=300, centers=[centres[2]], cluster_std=[[0.6, 0.1]])
X = np.concatenate((X0,X1,X2))
labels_true = np.concatenate((labels0_true,labels1_true+1,labels2_true+2))
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
Out[65]:
In [66]:
##############################################################################
# Compute clustering with 3 Clusters
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)
# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
axis('equal')
title('KMeans 3')
Out[66]:
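K-means assumes roughly isotropic (spherical) clusters, which is why the elongated blobs above get cut across their long axis. Below is a minimal sketch (an addition to the notebook) of one common mitigation, rescaling each feature to unit variance before clustering; it is an illustration only and does not fully fix the problem, since the real mismatch is k-means' spherical assumption:
from sklearn.preprocessing import StandardScaler
# Rescale each feature to zero mean and unit variance, then re-run k-means
X_scaled = StandardScaler().fit_transform(X)
k_means_scaled = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_scaled.fit(X_scaled)
print(k_means_scaled.inertia_)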
In [71]:
# Generate sample data
np.random.seed(0)
centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)
colors = ['r','b','g']
figure(figsize=(20, 6.666))
subplot(1,3,1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
subplot(1,3,2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
axis('equal')
subplot(1,3,3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
In [72]:
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)
# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')
Out[72]:
In [73]:
# Generate sample data
np.random.seed(0)
centres = [[1, 1, 0, 0, 0, 0, 0, 0, 0], [-0.5, 0, 0, 0, 0, 0, 0, 0, 0], [1, -1, 0, 0, 0, 0, 0, 0, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)
# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')
Out[73]:
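The plot above shows only the first two of the nine dimensions. As a hypothetical alternative view (not in the original), a 2-D PCA projection displays the dominant structure of the full 9-dimensional data:
from sklearn.decomposition import PCA
# Project the 9-dimensional points onto their two principal components
X_2d = PCA(n_components=2).fit_transform(X)
figure(figsize=(6, 6))
scatter(X_2d[:, 0], X_2d[:, 1], c=k_means_3_labels, marker='o', s=20)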
In [74]:
# Generate sample data
np.random.seed(0)
centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=[[0.4, 0.4, 1.5]] * 3)  # same anisotropic std for each of the 3 centres
colors = ['r','b','g']
figure(figsize=(12, 4))
subplot(1,3,1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
subplot(1,3,2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
axis('equal')
subplot(1,3,3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
axis('equal')
In [75]:
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_
##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres, centres, squared=True)
order = distance.argmin(axis=0)
# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1], c='k', marker='x', s=200)
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
    scatter(centres[k][0], centres[k][1], marker='o', c=col, s=200, alpha=0.8)
title('KMeans 3')
Out[75]:
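The high-variance third dimension carries no cluster information but still contributes to every distance, degrading the assignment. A hypothetical comparison (an addition to the notebook) of clustering on all three dimensions versus only the two informative ones makes the cost visible:
from sklearn import metrics
# Re-fit using only the two informative dimensions and compare label agreement
k_means_xy = KMeans(init='k-means++', n_clusters=3, n_init=10).fit(X[:, :2])
print('all 3 dims :', metrics.adjusted_rand_score(labels_true, k_means_3_labels))
print('2 dims only:', metrics.adjusted_rand_score(labels_true, k_means_xy.labels_))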
In [112]:
X, true_labels = make_moons(n_samples=1000, noise=.05)
scatter(X[:, 0], X[:, 1], c='k', marker='o', s=20)
axis('equal')
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_
for k, col in zip(range(2), colors):
    my_members = k_means_2_labels == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_2_cluster_centers[k]
    scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
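K-means partitions space into convex (Voronoi) cells, so it cannot trace the interlocking crescents above. scikit-learn ships a ready-made spectral clustering estimator that handles this; the sketch below (an addition, with an assumed neighbourhood size of 10) previews the approach the rest of the notebook builds by hand:
from sklearn.cluster import SpectralClustering
# Graph-based clustering on a nearest-neighbour affinity graph
spectral = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=10)
spectral_labels = spectral.fit_predict(X)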
In [76]:
# Generate sample data
np.random.seed(1)
centres = [[1, -1, 1.1, 1, 0, 2, 1], [1, -1, 1, 2, 0, 0, 0], [1, -1, 1.5, 0, 0, 0, 0]]
X, labels_true = make_blobs(n_samples=300, centers=centres, cluster_std = 0.4)
k_means_3a = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3a.fit(X)
k_means_3a_labels = k_means_3a.labels_
k_means_3a_cluster_centres = k_means_3a.cluster_centers_
k_means_3b = KMeans(init='random', n_clusters=3, n_init=1, max_iter=1)
k_means_3b.fit(X)
k_means_3b_labels = k_means_3b.labels_
k_means_3b_cluster_centres = k_means_3b.cluster_centers_
distance_a = euclidean_distances(k_means_3a_cluster_centres, centres, squared=True)
order_a = distance_a.argmin(axis=0)
distance_b = euclidean_distances(k_means_3b_cluster_centres, centres, squared=True)
order_b = distance_b.argmin(axis=0)
# Count correct and incorrect assignments for each run
figure(figsize=(10, 4))
wrong_a = 0
correct_a = 0
wrong_b = 0
correct_b = 0
for k in range(3):
    wrong_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true != k))
    correct_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true == k))
    wrong_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true != k))
    correct_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true == k))
subplot(1,2,1)
bar(range(2),[wrong_a, correct_a])
xticks([0.5,1.5],('Wrong','Correct'))
title('Run 1')
subplot(1,2,2)
bar(range(2),[wrong_b, correct_b])
xticks([0.5,1.5],('Wrong','Correct'))
title('Run 2')
Out[76]:
In [77]:
bar(range(2),[k_means_3a.inertia_, k_means_3b.inertia_])
xticks([0.5,1.5],('Run 1','Run 2'))
title('Inertia')
Out[77]:
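Inertia is simply the sum of squared distances from each point to its assigned centre, so the figure above can be cross-checked by hand (this cell is an added sanity check):
# Recompute run 1's inertia directly from the labels and centres;
# the two printed values should match
manual_inertia = np.sum((X - k_means_3a_cluster_centres[k_means_3a_labels])**2)
print(manual_inertia, k_means_3a.inertia_)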
In [78]:
# Generate sample data
np.random.seed(1)
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.5)
# train 3 different models
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_
k_means_4 = KMeans(init='k-means++', n_clusters=4, n_init=10)
k_means_4.fit(X)
k_means_4_labels = k_means_4.labels_
k_means_4_cluster_centers = k_means_4.cluster_centers_
figure(figsize=(10, 5))
bar(range(3),[k_means_2.inertia_, k_means_3.inertia_, k_means_4.inertia_])
xticks([0.5,1.5,2.5],('k=2','k=3','k=4'))
title("Inertia")
Out[78]:
In [79]:
from sklearn import metrics
# Determine the silhouette scores
k_means_2_silhouette_score = metrics.silhouette_score(X, k_means_2_labels, metric='euclidean')
k_means_3_silhouette_score = metrics.silhouette_score(X, k_means_3_labels, metric='euclidean')
k_means_4_silhouette_score = metrics.silhouette_score(X, k_means_4_labels, metric='euclidean')
figure(figsize=(10, 5))
bar(range(3),[k_means_2_silhouette_score, k_means_3_silhouette_score, k_means_4_silhouette_score])
xticks([0.5,1.5,2.5],('k=2','k=3','k=4'))
title("Silhouette Coefficient")
Out[79]:
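The silhouette score above is an average over all points. As a drill-down (not in the original notebook), metrics.silhouette_samples exposes the per-point values, whose distribution shows how cleanly individual points sit inside their clusters:
# Per-point silhouette values for the k=3 model
sil_samples = metrics.silhouette_samples(X, k_means_3_labels, metric='euclidean')
figure(figsize=(10, 5))
hist(sil_samples, bins=30)
title('Per-sample silhouette values, k=3')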
In [80]:
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)
figure(figsize=(10, 10))
imshow(similarity)
Out[80]:
In [81]:
figure(figsize=(12, 4))
# reorder according to class labels
ind_2 = np.argsort(k_means_2_labels)
subplot(1,3,1)
imshow(similarity[ind_2].T[ind_2])
title('K=2 \n' + str(k_means_2_silhouette_score))
ind_3 = np.argsort(k_means_3_labels)
subplot(1,3,2)
imshow(similarity[ind_3].T[ind_3])
title('K=3 \n' + str(k_means_3_silhouette_score))
ind_4 = np.argsort(k_means_4_labels)
subplot(1,3,3)
imshow(similarity[ind_4].T[ind_4])
title('K=4 \n' + str(k_means_4_silhouette_score))
Out[81]:
In [82]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)
# Build the adjacency matrix: zero the diagonal of the similarity matrix
A = (np.ones(similarity.shape) - np.eye(similarity.shape[0])) * similarity
# then sparsify by keeping only the most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1]  # reverse into descending order
    A[i][A[i] < s[num_neighbours]] = 0
# k-nearest-neighbour graph - note this is a different k to the number of centres:
# keep an edge if vertex i is one of the k nearest neighbours of j OR vice versa
A = np.fmax(A, A.T)
# Alternatively, a mutual k-nearest-neighbour graph keeps an edge only if
# vertex i is one of the k nearest neighbours of j AND vice versa
#A = np.fmin(A, A.T)
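For reference, scikit-learn can build a k-nearest-neighbour graph directly. The sketch below (an addition) produces a 0/1 connectivity version of the OR-symmetrised graph above, rather than the Gaussian-weighted one the loop constructs:
from sklearn.neighbors import kneighbors_graph
# Unweighted kNN adjacency; symmetrise with the same OR rule as above
A_knn = kneighbors_graph(X, n_neighbors=num_neighbours, mode='connectivity').toarray()
A_knn = np.fmax(A_knn, A_knn.T)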
In [83]:
# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))
# Un-normalised Laplacian Matrix
L = D - A
In [84]:
# Perform an eigenvalue decomposition
from scipy.linalg import eig
eig_vals, eig_vecs = eig(L)
sorted_inds = np.argsort(eig_vals.real, axis=0)
In [85]:
bar(range(10), eig_vals.real[sorted_inds[0:10]])
Out[85]:
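A standard fact worth checking here: the multiplicity of the zero eigenvalue of the graph Laplacian equals the number of connected components of the graph. In the added check below, a small numerical tolerance (an arbitrary choice) stands in for exact zero:
# Count (near-)zero eigenvalues to estimate the number of connected components
print(np.sum(eig_vals.real < 1e-8), 'connected components (approximately)')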
In [86]:
F = eig_vecs[:, sorted_inds[0:3]].real  # embed each point via the eigenvectors of the 3 smallest eigenvalues
In [87]:
# Now perform k-means in this more amenable space
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(F)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_
ind_3 = np.argsort(k_means_3_labels)
imshow(similarity[ind_3].T[ind_3])
Out[87]:
In [88]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)
# Build the adjacency matrix: zero the diagonal of the similarity matrix
A = (np.ones(similarity.shape) - np.eye(similarity.shape[0])) * similarity
# then sparsify by keeping only the most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1]  # reverse into descending order
    A[i][A[i] < s[num_neighbours]] = 0
# k-nearest-neighbour graph - note this is a different k to the number of centres:
# keep an edge if vertex i is one of the k nearest neighbours of j OR vice versa
A = np.fmax(A, A.T)
# Alternatively, a mutual k-nearest-neighbour graph keeps an edge only if
# vertex i is one of the k nearest neighbours of j AND vice versa
#A = np.fmin(A, A.T)
# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))
In [89]:
# Unnormalised Laplacian
L = np.matrix(D - A)
# Calculate the inverse square root of D, i.e. D^(-0.5)
D_minus_sqrt = np.matrix(np.diag(1/sqrt(np.sum(A, axis=0))))
# Normalised Laplacian matrix
Lsym = D_minus_sqrt * L * D_minus_sqrt
eig_vals, eig_vecs = eig(Lsym)
sorted_inds = np.argsort(eig_vals.real, axis=0)
F = eig_vecs[:, sorted_inds[0:3]].real
In [90]:
import sklearn.preprocessing as preprocessing
Fnorm = preprocessing.normalize(F, norm='l2')
In [91]:
# Now perform k-means on the row-normalised embedding
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(Fnorm)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_
ind_3 = np.argsort(k_means_3_labels)
imshow(similarity[ind_3].T[ind_3])
Out[91]:
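Since Lsym is symmetric, scipy.linalg.eigh is arguably the more natural tool: it guarantees real eigenvalues and returns them already sorted in ascending order, removing the manual argsort and the complex dtype. A hedged equivalent of the decomposition above:
from scipy.linalg import eigh
# eigh: symmetric eigensolver; eigenvalues come back real and ascending
eig_vals_h, eig_vecs_h = eigh(np.asarray(Lsym))
F_h = eig_vecs_h[:, 0:3]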
In [92]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)
# Build the adjacency matrix: zero the diagonal of the similarity matrix
A = (np.ones(similarity.shape) - np.eye(similarity.shape[0])) * similarity
# then sparsify by keeping only the most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1]  # reverse into descending order
    A[i][A[i] < s[num_neighbours]] = 0
# k-nearest-neighbour graph - note this is a different k to the number of centres:
# keep an edge if vertex i is one of the k nearest neighbours of j OR vice versa
A = np.fmax(A, A.T)
# Alternatively, a mutual k-nearest-neighbour graph keeps an edge only if
# vertex i is one of the k nearest neighbours of j AND vice versa
#A = np.fmin(A, A.T)
# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))
# Unnormalised Laplacian
L = np.matrix(D - A)
In [93]:
# Solve the generalised eigenproblem L u = lambda D u
eig_vals, eig_vecs = eig(L, b=D)
sorted_inds = np.argsort(eig_vals.real, axis=0)
In [94]:
F = eig_vecs[:, sorted_inds[0:3]].real
# Now perform k-means in this more amenable space
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(F)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_
ind_3 = np.argsort(k_means_3_labels)
imshow(similarity[ind_3].T[ind_3])
Out[94]:
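The generalised problem L u = lambda D u solved above is equivalent to the ordinary eigenproblem of the random-walk Laplacian Lrw = D^-1 (D - A), assuming no vertex has zero degree. A hedged check of that equivalence (an addition to the notebook):
# Random-walk Laplacian; requires every vertex to have non-zero degree
Lrw = np.linalg.inv(D).dot(np.asarray(L))
eig_vals_rw, eig_vecs_rw = eig(Lrw)
# The smallest eigenvalues of the two formulations should agree
print(np.sort(eig_vals_rw.real)[:3], np.sort(eig_vals.real)[:3])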
In [113]:
X, true_labels = make_moons(n_samples=1000, noise=.05)
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)
sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)
# Build the adjacency matrix: zero the diagonal of the similarity matrix
A = (np.ones(similarity.shape) - np.eye(similarity.shape[0])) * similarity
# then sparsify by keeping only the most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1]  # reverse into descending order
    A[i][A[i] < s[num_neighbours]] = 0
# k-nearest-neighbour graph - note this is a different k to the number of centres:
# keep an edge if vertex i is one of the k nearest neighbours of j OR vice versa
A = np.fmax(A, A.T)
# Alternatively, a mutual k-nearest-neighbour graph keeps an edge only if
# vertex i is one of the k nearest neighbours of j AND vice versa
#A = np.fmin(A, A.T)
# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))
# Un-normalised Laplacian Matrix
L = D - A
# Perform an eigenvalue decomposition
from scipy.linalg import eig
eig_vals, eig_vecs = eig(L)
sorted_inds = np.argsort(eig_vals.real, axis=0)
bar(range(10), eig_vals.real[sorted_inds[0:10]])
Out[113]:
In [114]:
F = eig_vecs[:, sorted_inds[0:2]].real
# Now perform k-means in this more amenable space
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(F)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_
ind_2 = np.argsort(k_means_2_labels)
imshow(similarity[ind_2].T[ind_2])
Out[114]:
In [116]:
for k, col in zip(range(2), colors):
    my_members = k_means_2_labels == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
# Note: the fitted centres live in the spectral embedding F, not in the
# original data space, so they are not plotted here
axis('equal')
In [ ]: