Often we have access to a data set that we surmise (or are told) contains information from various subgroups. The data represents 125 distinct events.
Take columns 4, 5, and 6 from the provided data.csv and determine if subgroups do in fact exist, and if so how to define them.
In [2]:
from IPython.display import Image
import os
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import cluster, decomposition
from mpl_toolkits.mplot3d import Axes3D

font = {'size': 20}
matplotlib.rc('font', **font)

# Locate data.csv relative to the notebook's working directory
curdir = !pwd
rootdir = os.path.abspath(curdir[0])

def get_columns(columns):
    """ Yield the requested (1-indexed) columns from each row of data.csv """
    with open(os.path.join(rootdir, 'data.csv')) as f:
        for line in f:
            yield [line.split(',')[i - 1] for i in columns]
Timeseries graphs are the most prevalent visualization of data, but they are limited in their ability to display the interconnection between metrics.
Questions:
In [3]:
def timeseries(col):
    """ Make a simple timeseries view of a column """
    data = np.array(list(get_columns([col])), dtype=np.float)
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(111)
    plt.grid(lw=2)
    plt.plot(data, 'bo')
    plt.xlabel("Column %d" % col)
    plt.show()
In [4]:
timeseries(4)
In [5]:
timeseries(5)
In [6]:
timeseries(6)
In [7]:
def histogram(col):
    """ Flatten the timeseries """
    data = np.array(list(get_columns([col])), dtype=np.float)
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(111)
    plt.grid(lw=2)
    plt.hist(data, bins=20)
    plt.xlabel("Column %d" % col)
    plt.show()
In [8]:
histogram(4)
In [9]:
histogram(5)
In [10]:
histogram(6)
In [11]:
def scatter(x, y):
    """ Scatter plot of one column against another """
    data = np.array(list(get_columns([x,y])), dtype=np.float)
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(111)
    plt.grid(lw=2)
    plt.plot(data[:,0], data[:,1], 'bo')
    plt.xlabel("Column %d" % x)
    plt.ylabel("Column %d" % y)
    plt.show()
In [12]:
scatter(4,5)
In [13]:
scatter(5,6)
In [14]:
def scatter3d(x,y,z):
    """ Scatter plots, now in 3D! """
    data = np.array(list(get_columns([x,y,z])), dtype=np.float)
    fig = plt.figure(figsize=(15,8))
    ax = fig.add_subplot(111, projection='3d')
    plt.plot(data[:,0], data[:,1], data[:,2], 'bo')
    ax.set_xlabel('Column %d' % x)
    ax.set_ylabel('Column %d' % y)
    ax.set_zlabel('Column %d' % z)
    grid_wt = 0.5
    ax.w_xaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    ax.w_yaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    ax.w_zaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    plt.grid(lw=2)
    plt.show()
In [15]:
scatter3d(4,5,6)
It's important to recognize that there is not just one clustering technique that always works. There are a wide variety of methods, and each has its own set of advantages and disadvantages. Getting a feel for the type of data you are working with prior to applying an algorithm can help you make an informed decision.
Here's a nice visualization from scikit-learn of six different clustering algorithms and how they perform on four very different distributions of data.
In [16]:
Image(url='http://scikit-learn.org/0.11/_images/plot_cluster_comparison_11.png')
Out[16]:
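As a minimal sketch of the same idea (not the full scikit-learn comparison script), the cell below runs two of those algorithms, KMeans and DBSCAN, on two synthetic shapes from sklearn.datasets. The dataset generators, cluster counts, and eps values here are illustrative assumptions chosen for these toy shapes, not anything derived from the exercise data.
In [ ]:
# Illustrative mini-comparison: two algorithms on two synthetic data sets.
from sklearn import datasets

np.random.seed(0)
blobs, _ = datasets.make_blobs(n_samples=300, centers=3)   # compact, roughly spherical groups
moons, _ = datasets.make_moons(n_samples=300, noise=0.05)  # interleaved, non-convex shapes

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for row, X in enumerate((blobs, moons)):
    estimators = [
        ('KMeans', cluster.KMeans(n_clusters=2 if row else 3, n_init=10)),
        ('DBSCAN', cluster.DBSCAN(eps=0.3 if row else 1.0)),  # eps values are rough guesses
    ]
    for col, (name, est) in enumerate(estimators):
        est.fit(X)
        axes[row, col].scatter(X[:, 0], X[:, 1], c=est.labels_, s=20)
        axes[row, col].set_title(name)
plt.show()
KMeans tends to do well on the blob-like data but splits the moons along a straight boundary, while a density-based method like DBSCAN can follow the non-convex shapes, which is the point the comparison figure makes.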
Given a set of observations $(x_1, x_2, \ldots, x_n)$, where each observation is a $d$-dimensional real vector, k-means clustering aims to partition the $n$ observations into $k$ sets $(k \leq n)$, $\mathbf{S} = \{S_1, S_2, \ldots, S_k\}$, so as to minimize the within-cluster sum of squares (WCSS):
$\displaystyle \underset{\mathbf{S}} {\operatorname{arg\,min}} \sum_{i=1}^{k} \sum_{\mathbf x_j \in S_i} \left\| \mathbf x_j - \boldsymbol\mu_i \right\|^2 $
where $\boldsymbol\mu_i$ is the mean of points in $S_i$.
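To make the objective concrete, here is a minimal sketch that evaluates the WCSS for a given partition; the toy points and the hard assignment are made-up illustrative values.
In [ ]:
# Evaluate the WCSS objective for a toy partition (illustrative values only).
toy_pts = np.array([[0., 0.], [0., 1.], [5., 5.], [6., 5.]])
toy_labels = np.array([0, 0, 1, 1])           # an assumed hard assignment
wcss = 0.0
for k in np.unique(toy_labels):
    members = toy_pts[toy_labels == k]
    mu = members.mean(axis=0)                 # cluster mean mu_k
    wcss += ((members - mu) ** 2).sum()       # squared distances to mu_k
print(wcss)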
Here, we'll walk through the K-Means clustering algorithm with a 2 dimensional data set to get a feel for how it works and then apply the algorithm on our data set via the scikit-learn module.
In [17]:
def create_points(sets):
    """
    Create test data points for a list of groups
    """
    def gen(n, mu, s):
        return np.random.randn(n * 2).reshape(n, 2) * s + np.array(mu)
    pts = None
    for idx, s in enumerate(sets):
        if pts is None:
            pts = np.hstack([gen(*s), idx * np.ones((s[0],1))])
        else:
            new_pts = np.hstack([gen(*s), idx * np.ones((s[0],1))])
            pts = np.vstack([pts, new_pts])
    np.random.shuffle(pts)
    return pts
def calc_labels(centers, pts):
    """
    Label the points based on the current approximation of the cluster centers
    """
    for pt in pts[:,:2]:
        ss = np.inf
        label = None
        for idx, c in enumerate(np.array(centers)):
            dist = pt - c
            new_ss = np.sum(dist * dist)
            if new_ss < ss:
                label = idx
                ss = new_ss
        yield label
def scatter(pts, centers=None, title=None, find_groups=False):
    """
    Viz the data. Optionally show the center locations and color code the points.
    """
    colors = 'brygk'
    markers = 'o^s*+'
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(111)
    plt.grid(lw=2)
    for g in np.unique(pts[:,2]):
        kwargs = {
            'marker': markers[int(g)],
            's': 120,
        }
        if find_groups:
            kwargs['c'] = [colors[int(i)] for i in calc_labels(centers, pts[pts[:,2] == g])]
        plt.scatter(
            pts[pts[:,2] == g][:,0],
            pts[pts[:,2] == g][:,1],
            **kwargs)
    if centers is not None:
        for idx, center in enumerate(centers):
            plt.text(*center, s='ABCDEF'[idx])
    if title:
        plt.title(title)
    ax.set_aspect('equal')
    plt.show()
In [18]:
sets = [
    (30, (0, 5), 4),
    (30, (30, 10), 4),
    (30, (10, 0), 2),
]
pts = create_points(sets)
scatter(pts, title='The original data set')
In [19]:
centers = [
    (3,10),
    (5,0),
    (10,5),
]
scatter(pts, centers, title='The initial guess for the centers - (Way off)')
In [20]:
scatter(pts, centers, title='Assign groups by which is closest', find_groups=True)
In [21]:
labels = np.array([g for g in calc_labels(centers, pts)])
new_centers = []
for idx, c in enumerate(centers):
    new_centers.append(tuple(pts[labels == idx][:, :2].mean(axis=0)))
centers = new_centers
scatter(pts, centers, title='Move guesses to the center of their cluster and repeat', find_groups=True)
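The cell above performs a single assign-and-update step. In practice, K-Means repeats those two steps until the centers stop moving. Below is a minimal sketch of that loop, reusing the calc_labels helper and the pts and centers defined above; the iteration cap and tolerance are arbitrary choices, and it assumes no cluster ever ends up empty.
In [ ]:
# Repeat the assign/update steps until the centers converge (cap and tolerance are arbitrary).
for iteration in range(100):
    labels = np.array(list(calc_labels(centers, pts)))
    new_centers = [tuple(pts[labels == idx][:, :2].mean(axis=0))   # assumes each cluster keeps members
                   for idx in range(len(centers))]
    shift = np.max(np.abs(np.array(new_centers) - np.array(centers)))
    centers = new_centers
    if shift < 1e-6:       # stop once the centers have effectively stopped moving
        break
scatter(pts, centers, title='After iterating to convergence', find_groups=True)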
In [22]:
def clustering(x, y, z, groups=2, normalize=False):
    """ Show the K-Means clustering result with optional normalization and number of clusters """
    data = np.array(list(get_columns([x,y,z])), dtype=np.float)
    if normalize:
        data = (data - data.mean(axis=0)) / data.std(axis=0)
    kmeans = cluster.KMeans(n_clusters=groups, n_init=15)
    kmeans.fit(data)
    colors = 'rbyg'
    fig = plt.figure(figsize=(15,8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data[:,0], data[:,1], data[:,2], marker='o', c=[colors[g] for g in kmeans.labels_], alpha=0.7, s=40)
    ax.set_xlabel('Column %d' % x)
    ax.set_ylabel('Column %d' % y)
    ax.set_zlabel('Column %d' % z)
    grid_wt = 0.5
    ax.w_xaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    ax.w_yaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    ax.w_zaxis._axinfo.update({'grid' : {'color': (0, 0, 0, grid_wt)}})
    plt.grid(lw=2)
    plt.show()
In [23]:
clustering(4,5,6)
In [24]:
clustering(4,5,6,groups=4)
In [25]:
clustering(4,5,6,groups=3,normalize=True)
Questions:
In [26]:
def principal_component_analysis(cols, groups=2, normalize=False):
    """ Project the columns onto their first two principal components and color by K-Means cluster """
    data = np.array(list(get_columns(cols)), dtype=np.float)
    if normalize:
        data = (data - data.mean(axis=0)) / data.std(axis=0)
    pca = decomposition.PCA(n_components=2)
    pca.fit(data)
    X = pca.transform(data)
    kmeans = cluster.KMeans(n_clusters=groups, n_init=15)
    kmeans.fit(data)
    colors = 'rbyg'
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(111)
    plt.grid(lw=2)
    plt.scatter(X[:,0], X[:,1], marker='o', c=[colors[g] for g in kmeans.labels_], alpha=0.7, s=40)
    ldata = (pca.components_[0][0], cols[0], pca.components_[0][1], cols[1], pca.components_[0][2], cols[2])
    plt.xlabel("x = %g(C%d) %+g(C%d) %+g(C%d)" % ldata)
    ldata = (pca.components_[1][0], cols[0], pca.components_[1][1], cols[1], pca.components_[1][2], cols[2])
    plt.ylabel("y = %g(C%d) %+g(C%d) %+g(C%d)" % ldata)
    plt.show()
In [27]:
principal_component_analysis([4,5,6])
The new $x$ and $y$ are linear combinations of the three columns, with the new $x$ mostly a mapping from column 6.
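If you want to look at the loadings and the captured variance directly rather than reading them off the axis labels, a short sketch that refits the same decomposition outside the plotting helper (component signs may flip between fits):
In [ ]:
# Refit PCA on the raw columns to inspect loadings and explained variance directly.
data = np.array(list(get_columns([4, 5, 6])), dtype=np.float)
pca = decomposition.PCA(n_components=2)
pca.fit(data)
print(pca.components_)                 # rows are the loadings of the new x and y on C4, C5, C6
print(pca.explained_variance_ratio_)   # fraction of variance captured by each component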
In [28]:
principal_component_analysis([4,5,6],groups=3)
The two larger clusters are pretty clear, but since the data wasn't normalized, C6 gets the most weight and the same "vertical" split appears as in the 3D representation.
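A quick way to see why the unnormalized fit is dominated by one column is to compare the per-column spreads; a column with a much larger standard deviation contributes disproportionately to the squared distances K-Means minimizes.
In [ ]:
# Compare per-column spreads; the column with the largest std dominates unnormalized distances.
data = np.array(list(get_columns([4, 5, 6])), dtype=np.float)
print(data.std(axis=0))    # per-column standard deviations for columns 4, 5, 6
print(data.mean(axis=0))   # per-column means, for scale context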
In [29]:
principal_component_analysis([4,5,6],groups=3,normalize=True)
Three groupings seem to be the most appropriate clustering, and this was reasonably straightforward to validate against the 3D graph.
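One way to corroborate the choice of three clusters beyond eyeballing the 3D plot is a quick inertia/silhouette check on the normalized data; this is a standard diagnostic rather than part of the exercise, and the candidate range below is an arbitrary choice.
In [ ]:
# Sanity-check the cluster count with WCSS (inertia) and silhouette scores (range is arbitrary).
from sklearn import metrics

data = np.array(list(get_columns([4, 5, 6])), dtype=np.float)
data = (data - data.mean(axis=0)) / data.std(axis=0)
for k in range(2, 7):
    km = cluster.KMeans(n_clusters=k, n_init=15)
    km.fit(data)
    print(k, km.inertia_, metrics.silhouette_score(data, km.labels_))
A pronounced elbow in the inertia and a peak in the silhouette score at k=3 would support the grouping chosen above.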