Title: Simple Clustering With SciPy
Slug: scipy_simple_clustering
Summary: Simple Clustering With SciPy
Date: 2016-05-01 12:00
Category: Python
Tags: Other
Authors: Chris Albon
In [14]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.cluster import vq
In [15]:
# fix the seed of NumPy's global random state so the synthetic data below
# (and anything later that draws from the same global state) is identical
# on every Restart & Run All
SEED = 0
np.random.seed(SEED)
# create 100 coordinate pairs (i.e. two values each), then add 5 to all of them
year_1 = np.random.randn(100, 2) + 5
# create 30 coordinate pairs (i.e. two values each), then subtract 5 from all of them
year_2 = np.random.randn(30, 2) - 5
# create 50 coordinate pairs (i.e. two values each), left centred on the origin
year_3 = np.random.randn(50, 2)
In [16]:
# preview the first three coordinate pairs generated for each year
for label, sample in zip(
    ('year 1 battles:', 'year 2 battles:', 'year 3 battles:'),
    (year_1, year_2, year_3),
):
    print(label, sample[:3])
In [17]:
# stack the three yearly arrays on top of each other (row-wise), so that
# every battle becomes one row of a single two-column array
battles = np.vstack((year_1, year_2, year_3))
In [18]:
# run k-means with k=3 on the stacked battles; vq.kmeans returns
#   1. the coordinates of the centroids it found, and
#   2. the distortion: the mean (non-squared) Euclidean distance between each
#      observation and its nearest centroid -- this is NOT a variance; the
#      name `variance` is kept only because a later cell displays it
centroids, variance = vq.kmeans(battles, k_or_guess=3)
In [19]:
# display the centroid coordinates returned by vq.kmeans
centroids
Out[19]:
In [20]:
# display the second value returned by vq.kmeans; SciPy documents it as the
# distortion (mean Euclidean distance from each observation to its nearest
# centroid), despite the variable's name
variance
Out[20]:
In [21]:
# assign every battle to its nearest centroid:
#   identified -> for each row of battles, the index of the closest row of centroids
#   distance   -> for each row of battles, the distance to that closest centroid
identified, distance = vq.vq(obs=battles, code_book=centroids)
In [22]:
# display the centroid index that vq.vq assigned to each row of battles
identified
Out[22]:
In [23]:
# display, for each row of battles, the distance to its assigned centroid
distance
Out[23]:
In [24]:
# split the battles into one array per cluster label (0, 1 and 2) by
# masking the rows of battles with the labels stored in identified
cluster_1, cluster_2, cluster_3 = (
    battles[identified == label] for label in range(3)
)
In [25]:
# preview the first three battles assigned to each cluster
for members in (cluster_1, cluster_2, cluster_3):
    print(members[:3])
In [26]:
# create a scatter plot where the x-axis is the first column of battles,
# the y-axis is the second column of battles, the marker size is 100, and
# the color of each point is determined by the identified variable
xs, ys = battles[:, 0], battles[:, 1]
plt.scatter(xs, ys, s=100, c=identified)
Out[26]: