In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from __future__ import division
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.grid'] = False
plt.rcParams['figure.figsize'] = (10,16)
import logging
logger = logging.getLogger()
For two points $\mathbf{x} = (x_1, y_1)$ and $\mathbf{y} = (x_2, y_2)$ chosen uniformly at random in the unit square, let $d(\mathbf{x}, \mathbf{y})$ be their Euclidean distance.
Because for any reals: $$\sqrt{\frac{{|x_1|}^2+{|x_2|}^2}{2}} \geq \frac{|x_1|+|x_2|}{2}$$ We have: $$\sqrt{{|x_1 - x_2|}^2+{|y_1 - y_2|}^2} \geq \frac{|x_1 - x_2|+|y_1 - y_2|}{\sqrt{2}}$$ Since $E[|x_1 - x_2|] = E[|y_1 - y_2|] = \int_0^1 \int_0^1 |u - v| \, du \, dv = \frac{1}{3}$, we get: $$E[d(\mathbf{x}, \mathbf{y})] \geq \frac{1}{\sqrt{2}} \cdot \frac{2}{3} = \frac{\sqrt{2}}{3}$$
While: $$\sqrt{{|x_1 - x_2|}^2+{|y_1 - y_2|}^2} \leq |x_1 - x_2|+|y_1 - y_2|$$ So: $$E[d(\mathbf{x}, \mathbf{y})] \leq \frac{2}{3}$$
Above all: $$\frac{\sqrt{2}}{3} \leq E[d(\mathbf{x}, \mathbf{y})] \leq \frac{2}{3}$$
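As a quick sanity check (not part of the original notebook), a Monte Carlo estimate of $E[d(\mathbf{x}, \mathbf{y})]$ for two uniform random points in the unit square lands between the two bounds:
In [ ]:
import numpy as np

n = 10 ** 6
p = np.random.rand(n, 2)                   # n random points in the unit square
q = np.random.rand(n, 2)                   # n more random points
d = np.sqrt(((p - q) ** 2).sum(axis=1))    # Euclidean distance of each pair
print(d.mean())                            # ~0.52, between sqrt(2)/3 ~ 0.471 and 2/3 ~ 0.667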
Hierarchical (agglomerative) clustering can only be used for relatively small datasets.
procedure:
We begin with every point in its own cluster. As time goes on, larger clusters will be constructed by combining two smaller clusters.
Hence we have to decide in advance:
How to represent a cluster?
How to choose which two clusters to merge?
When to stop combining clusters?
There is no substantial change in the options for the stopping criterion and the merging criterion when we move from Euclidean to non-Euclidean spaces.
In [2]:
# Example 7.2
logger.setLevel('WARN')
points = np.array([
[4, 10],
[7, 10],
[4, 8],
[6, 8],
[3, 4],
[10, 5],
[12, 6],
[11, 4],
[2, 2],
[5, 2],
[9, 3],
[12, 3]
],
dtype=float
)
x, y = points[:,0], points[:,1]
cluster = range(len(x))
#cluster_colors = plt.get_cmap('hsv')(np.linspace(0, 1.0, len(cluster)))
cluster_colors = sns.color_palette("hls", len(cluster))
plt.scatter(x, y, c=[cluster_colors[i] for i in cluster])
df_points = pd.DataFrame({
'x': x,
'y': y,
'cluster': cluster
}
)
df_points
Out[2]:
In [3]:
logger.setLevel('WARN')
class Hierarchical_cluster():
    def __init__(self):
        pass

    def clustroid_calc(self, df_points, calc_func=np.mean):
        # One clustroid (here: the per-cluster mean, i.e. the centroid) per current cluster.
        clustroid = df_points.groupby('cluster').aggregate(calc_func)
        logger.info('\n clustroid:{}'.format(clustroid))
        return clustroid

    def candidate_merge(self, clustroid):
        from scipy.spatial.distance import pdist, squareform
        clustroid_array = clustroid.loc[:, ['x', 'y']].values
        dist = squareform(pdist(clustroid_array, 'euclidean'))
        cluster = clustroid.index
        df_dist = pd.DataFrame(dist, index=cluster, columns=cluster)
        df_dist.replace(0, np.nan, inplace=True)
        logger.info('\n dist:{}'.format(df_dist))
        # The pair of clusters with the smallest non-zero distance is merged next.
        flat_index = np.nanargmin(df_dist.values)
        candidate_iloc = np.unravel_index(flat_index, df_dist.shape)
        candidate_loc = [cluster[x] for x in candidate_iloc]
        logger.info('candidate cluster:{}'.format(candidate_loc))
        new_cluster, old_cluster = candidate_loc
        return new_cluster, old_cluster

    def combine(self, df_points, show=False):
        clustroid = self.clustroid_calc(df_points)
        new_cluster, old_cluster = self.candidate_merge(clustroid)
        df_points['cluster'] = df_points['cluster'].replace(old_cluster, new_cluster)
        # Record the merge so the dendrogram can be reconstructed afterwards.
        new_order, old_order = df_points.merge_order[[new_cluster, old_cluster]]
        df_points.at[new_cluster, 'merge_order'] = {'l': new_order, 'r': old_order}
        if show:
            plt.figure()
            plt.scatter(df_points.x, df_points.y,
                        c=[cluster_colors[i] for i in df_points.cluster])
        return df_points

    def cluster(self, df_points, cluster_nums=1, show=False):
        assert cluster_nums > 0, 'The number of clusters should be positive.'
        df_points['merge_order'] = [[x] for x in range(len(df_points.x))]
        while len(set(df_points.cluster)) > cluster_nums:
            df_points = self.combine(df_points, show)
In [4]:
logger.setLevel('WARN')
df_p = df_points.copy()
test = Hierarchical_cluster()
test.cluster(df_p, 1, show=True)
In [5]:
import json
print(json.dumps(df_p.merge_order[0], sort_keys=True, indent=4))
The naive algorithm is $O(n^3)$: the $i$-th iteration computes the distance between each pair of the remaining clusters, for a total of $\sum_{i=2}^{n} \binom{i}{2} = O(n^3)$.
Optimization with a priority queue:
At first, compute the distances between all pairs of points: $O(n^2)$.
Insert the pairs and their distances into a priority queue, so that the smallest distance can be found in one step: $O(n^2)$.
When merging two clusters, remove all entries involving them from the priority queue: $O(n \lg n) = 2n \times O(\lg n)$.
Compute the distances between the new cluster and all remaining clusters, and insert them into the queue: $O(n \lg n)$. Since there are at most $n$ merges, the total running time is $O(n^2 \lg n)$, better than $O(n^3)$ (a sketch using a heap follows below).
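A minimal sketch of the priority-queue idea (not the notebook's own implementation): it merges centroids via a `heapq` heap and, instead of explicitly deleting entries for merged clusters, simply skips stale entries when they are popped (lazy invalidation).
In [ ]:
import heapq
import itertools
import numpy as np

def pq_hierarchical(points, cluster_nums=1):
    """Merge clusters until `cluster_nums` remain, using a heap of pairwise distances."""
    points = np.asarray(points, dtype=float)
    active = {i: [i] for i in range(len(points))}       # cluster id -> member indices
    centroid = {i: points[i].copy() for i in range(len(points))}
    heap = [(np.linalg.norm(points[i] - points[j]), i, j)
            for i, j in itertools.combinations(range(len(points)), 2)]
    heapq.heapify(heap)                                  # O(n^2) initial build
    next_id = len(points)
    while len(active) > cluster_nums and heap:
        dist, a, b = heapq.heappop(heap)                 # smallest distance: O(lg n)
        if a not in active or b not in active:
            continue                                     # stale entry, skip it
        members = active.pop(a) + active.pop(b)
        active[next_id] = members
        centroid[next_id] = points[members].mean(axis=0)
        for c in active:                                 # distances to the new cluster
            if c != next_id:
                heapq.heappush(heap, (np.linalg.norm(centroid[c] - centroid[next_id]),
                                      c, next_id))
        next_id += 1
    return list(active.values())

# e.g. pq_hierarchical(points, cluster_nums=3) groups the 12 points above into 3 clusters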
In [6]:
plt.figure(figsize=(10,16))
plt.imshow(plt.imread('./res/fig7_7.png'))
Out[6]:
We want to pick points that have a good chance of lying in different clusters.
Two approaches:
Cluster a sample of the data, and pick a point from each of the $k$ clusters, e.g., the point closest to the centroid of its cluster.
Pick points that are as far away from one another as possible:
Pick the first point at random;
WHILE there are fewer than k points DO
ADD the point whose minimum distance from the selected points is as large as possible;
END;
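A minimal sketch of this farthest-first initialization in NumPy (not from the text): for every point we keep its minimum distance to the points chosen so far, and repeatedly pick the point for which that minimum is largest.
In [ ]:
import numpy as np

def pick_far_apart(data, k):
    """Pick k initial points: one at random, then repeatedly the point whose
    minimum distance to the already-chosen points is as large as possible."""
    data = np.asarray(data, dtype=float)
    chosen = [np.random.randint(len(data))]
    min_dist = np.linalg.norm(data - data[chosen[0]], axis=1)
    while len(chosen) < k:
        nxt = int(np.argmax(min_dist))           # farthest from all chosen so far
        chosen.append(nxt)
        min_dist = np.minimum(min_dist, np.linalg.norm(data - data[nxt], axis=1))
    return data[chosen]

# e.g. pick_far_apart(points, 3) on the 12 points above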
In [7]:
plt.figure(figsize=(10,16))
plt.imshow(plt.imread('./res/fig7_9.png'))
Out[7]:
We can cluster with $k = 1, 2, 4, 8, \dotsc$ until a measure of cluster quality (e.g., the average diameter) stops improving much between $k$ and $2k$, and then use a binary search within that last range to find the best value for $k$.
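A rough sketch of that strategy, under stated assumptions: `cost(k)` is a hypothetical helper that runs the clustering with $k$ clusters and returns a measure that shrinks as $k$ grows (such as the average cluster diameter), and the measure is assumed to flatten out once $k$ reaches the true number of clusters.
In [ ]:
def find_k(cost, tol=0.1, max_k=1024):
    """Doubling phase followed by a binary search for a good k."""
    # Phase 1: k = 1, 2, 4, 8, ... until the measure changes little from v to 2v.
    v = 1
    while 2 * v <= max_k and cost(2 * v) < (1 - tol) * cost(v):
        v *= 2
    # The true k now lies in (v/2, v]; cost(v) is already on the flat part of the curve.
    baseline, lo, hi = cost(v), max(v // 2, 1), v
    # Phase 2: binary-search for the smallest k whose cost is close to the plateau.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if cost(mid) > (1 + tol) * baseline:     # still far above the plateau: k too small
            lo = mid
        else:
            hi = mid
    return hi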
In [8]:
plt.scatter(df_points.x, df_points.y)
Out[8]:
In [9]:
df_points['cluster'] = 0
df_points
Out[9]:
In [10]:
logger.setLevel('WARN')
class k_means_cluster():
    def __init__(self, max_itier=15):
        self.max_itier = max_itier

    def pick_init_points(self, df_points, k):
        # Pick k random points from the data as the initial cluster centers.
        df_clusters = df_points.sample(k, axis=0)
        df_clusters.reset_index(drop=True, inplace=True)
        df_clusters['cluster'] = df_clusters.index
        return df_clusters

    def assign_point_to_cluster(self, point, df_clusters):
        from scipy.spatial.distance import cdist
        logger.info('\n point:{}\n df_clusters:{}'.format(point, df_clusters))
        p = point[['x', 'y']].to_frame().T
        c = df_clusters.loc[:, ['x', 'y']]
        logger.info('\n p:{}\n c:{}'.format(p, c))
        dist = cdist(p, c)
        logger.info('dist:{}'.format(dist))
        cluster = np.argmin(dist)    # index of the nearest cluster center
        logger.info('cluster:{}'.format(cluster))
        return pd.Series([cluster, point.x, point.y], index=['cluster', 'x', 'y'])

    def calc_centriod(self, df_points):
        centroid = df_points.groupby('cluster').mean()
        logger.info('\n centroid:\n{}'.format(centroid))
        return centroid

    def cluster(self, df_points, k):
        df_clusters = self.pick_init_points(df_points, k)
        for _ in range(self.max_itier):
            df_points = df_points.apply(self.assign_point_to_cluster, args=(df_clusters,), axis=1)
            logger.info('iter: \n df_points:\n{}'.format(df_points))
            clusters = self.calc_centriod(df_points)
            #todo: stop condition
            df_clusters = clusters
        return df_points, df_clusters

test = k_means_cluster()
k = 3
cluster_colors = sns.color_palette("hls", k)
df_points_res, df_clusters_res = test.cluster(df_points, k)
plt.scatter(df_points_res.x, df_points_res.y,
            c=[cluster_colors[i] for i in df_points_res.cluster.astype(int)])
Out[10]:
In [14]:
plt.imshow(plt.imread('./res/fig7_10.png'))
Out[14]:
The points of the data file are read in chunks.
The main-memory data other than the chunk consists of three types of objects:
The Discard Set:
simple summaries of the clusters themselves.
The Compressed Set:
summaries of the points that have been found close to one another, but not close to any cluster. Each represented set of points is called a minicluster.
The Retained Set:
the remaining points, which are neither close to any cluster nor close enough to one another to form a minicluster.
In [15]:
plt.imshow(plt.imread('./res/fig7_11.png'))
Out[15]:
The discard and compressed sets are represented by $2d + 1$ values, if the data is $d$-dimensional.
These numbers are:
The number of points represented, $N$.
The sum of the components of all the points in each dimension: a vector $SUM$ of length $d$.
The sum of the squares of the components of all the points in each dimension: a vector $SUMSQ$ of length $d$.
Our real goal is to represent a set of points by their count, their centroid, and the standard deviation in each dimension.
count: $N$.
centroid in dimension $i$: $SUM_i / N$.
variance in dimension $i$: $SUMSQ_i / N - (SUM_i / N)^2$; the standard deviation is its square root.
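A small sketch (not from the text) of why this representation is convenient: the centroid and standard deviation fall out of $(N, SUM, SUMSQ)$ directly, and the summaries of two sets of points can be merged by simple addition.
In [ ]:
import numpy as np

def summarize(points):
    """BFR-style summary of a set of d-dimensional points: (N, SUM, SUMSQ)."""
    points = np.asarray(points, dtype=float)
    return len(points), points.sum(axis=0), (points ** 2).sum(axis=0)

def centroid_and_std(N, SUM, SUMSQ):
    centroid = SUM / N
    variance = SUMSQ / N - centroid ** 2        # per-dimension variance
    return centroid, np.sqrt(variance)

def merge_summaries(s1, s2):
    """Summaries are additive, which is why BFR keeps sums rather than averages."""
    return s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2]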
First, all points that are sufficiently close to the centroid of some cluster are added to that cluster (the Discard Set).
The points that are not sufficiently close to any centroid are clustered, along with the points already in the Retained Set: groups of such points become miniclusters of the Compressed Set, and isolated points go back to the Retained Set.
Merge miniclusters of the Compressed Set with one another if possible.
Points assigned to the Discard or Compressed Set are written out to secondary memory, together with their cluster assignment.
Finally, if this is the last chunk of input data, the Compressed and Retained Sets must be dealt with: either treat them as outliers, or assign each minicluster and each retained point to the nearest cluster.
Add $p$ to a cluster if it not only has the centroid closest to $p$, but it is very unlikely that, after all the points have been processed, some other cluster centroid will be found to be nearer to $p$.
In general this decision requires a complex statistical calculation.
We can measure the probability that, if $p$ belongs to a cluster, it would be found as far from the centroid of that cluster as it is.
If the points of each cluster are normally distributed and the dimensions are independent $\to$ Mahalanobis distance:
Let $p = [p_1, p_2, \dotsc, p_d]$ be a point and $c = [c_1, c_2, \dotsc, c_d]$ be the centroid of a cluster, with $\sigma_i$ the standard deviation of the cluster's points in the $i$-th dimension.
$$\sqrt{\sum_{i=1}^{d} \left( \frac{p_i - c_i}{\sigma_i} \right)^2 }$$
We choose the cluster whose centroid has the least Mahalanobis distance to $p$, and we add $p$ to that cluster provided the Mahalanobis distance is less than a threshold, i.e., $p$ is close enough in the probabilistic sense.
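A minimal sketch of this test using BFR-style summaries (the threshold of three standard deviations is an illustrative rule of thumb, not a value fixed by the algorithm):
In [ ]:
import numpy as np

def mahalanobis(p, N, SUM, SUMSQ):
    """Mahalanobis distance from point p to the cluster summarized by (N, SUM, SUMSQ)."""
    c = SUM / N
    sigma = np.sqrt(SUMSQ / N - c ** 2)          # per-dimension standard deviation
    return np.sqrt((((p - c) / sigma) ** 2).sum())

def maybe_assign(p, summaries, threshold=3.0):
    """Index of the closest cluster if p is within `threshold`, else None."""
    dists = [mahalanobis(p, N, SUM, SUMSQ) for (N, SUM, SUMSQ) in summaries]
    best = int(np.argmin(dists))
    return best if dists[best] < threshold else None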
Process (key parameters: the fixed fraction by which representative points are moved, and how close two clusters must be in order to merge):
Initialization:
Take a sample of the data and cluster it.
Hierarchical clustering is advisable.
Pick a subset of each cluster as representative points, chosen to be as far from one another as possible within the same cluster.
Move each of the representative points a fixed fraction (e.g., 20%) of the distance between its location and the centroid of its cluster (this step requires a Euclidean space).
Merge: if two clusters have a pair of representative points that are sufficiently close.
Repeat the merging step until no remaining pair of clusters is sufficiently close.
Point assignment:
We assign $p$ to the cluster of the representative point that is closest to $p$.
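A tiny sketch of the "move toward the centroid" step (Euclidean space; the 20% fraction is only an example):
In [ ]:
import numpy as np

def shrink_representatives(reps, cluster_centroid, fraction=0.2):
    """Move each representative point `fraction` of the way from its location
    toward the centroid of its cluster."""
    reps = np.asarray(reps, dtype=float)
    return reps + fraction * (np.asarray(cluster_centroid, dtype=float) - reps)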
In [7]:
plt.figure(figsize=(10,5))
plt.imshow(plt.imread('./res/fig7_12.png'))
plt.figure(figsize=(12,5))
plt.imshow(plt.imread('./res/fig7_13.png'))
plt.figure(figsize=(12,5))
plt.imshow(plt.imread('./res/fig7_14.png'))
Out[7]:
The following features form the representation of a cluster.
$N$, the number of points in the cluster.
The clustroid of the cluster and its ROWSUM, the sum of the squares of the distances to the other points: $ROWSUM(p) = \sum_{p_i \in C} d(p, p_i)^2$ (the clustroid is the point minimizing this sum).
The $k$ points that are closest to the clustroid, and their rowsums.
Assumption: if the clustroid changes as points are added, the new clustroid will be one of these.
The $k$ points that are furthest from the clustroid, and their rowsums.
Assumption: if two clusters are close, then a pair of points distant from their respective clustroids will be close.
The cluster representations are organized into a tree resembling a B-tree or R-tree: each leaf holds as many cluster representations as can fit in a disk block.
An interior node of the cluster tree holds a sample of the clustroids of its subtrees.
Initialization:
Take a main-memory sample and cluster it hierarchically $\to$ a tree $T$.
Select from $T$ certain of its points that represent clusters of approximately some desired size $n$ $\to$ these become the leaves of the cluster-representing tree.
Group clusters with a common ancestor in $T$ into interior nodes.
rebalancing, similar to the reorganization of a B-tree.
To insert a new point $p$, we descend from the root to a leaf, at each level following the child whose sample of clustroids contains the one closest to $p$.
Finally, at the leaf we pick the cluster whose clustroid is closest to $p$, and then:
Add 1 to $N$.
Add $d^2(p, q)$ to the rowsums of all points $q$ maintained in the cluster's representation (see the sketch after this list).
Estimate the rowsum of $p$ itself: $$ROWSUM(p) = ROWSUM(c) + N d^2(p,c)$$ where $c$ is the clustroid and $N$ is the count before adding $p$; the approximation relies on $d^2(p, q) \approx d^2(p, c) + d^2(c, q)$ in a high-dimensional space.
If $p$ is now one of the $k$ points closest to or furthest from the clustroid, it replaces one of the stored points.
If some stored point (possibly $p$) now has a smaller rowsum than the current clustroid, it becomes the new clustroid.
It is possible that the true clustroid is no longer among the stored points, so data on disk is brought into main memory periodically to recompute the cluster features.
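An illustrative sketch of this bookkeeping (the names are my own; `d` is whatever distance function the non-Euclidean space provides, and points are assumed hashable, e.g. tuples):
In [ ]:
def add_point(p, cluster, d):
    """cluster: dict with 'N', 'clustroid', 'points' (the retained points) and
    'rowsum' (point -> rowsum).  Deciding whether p replaces a retained point
    or becomes the new clustroid is omitted here."""
    c = cluster['clustroid']
    # Estimate ROWSUM(p) before counting p itself: d(p,q)^2 ~ d(p,c)^2 + d(c,q)^2.
    cluster['rowsum'][p] = cluster['rowsum'][c] + cluster['N'] * d(p, c) ** 2
    cluster['N'] += 1
    for q in cluster['points']:
        cluster['rowsum'][q] += d(p, q) ** 2    # exact update for the retained points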
The GRGPF Algorithm assumes that there is a limit on the radius $\sqrt{ROWSUM(c)/N}$, where $c$ is the clustroid.
If a cluster's radius grows too large $\to$ split it into two. As in a B-tree, this splitting can ripple all the way up to the root.
If the tree is too large to fit in main memory, we raise the limit on the radius and consider merging pairs of clusters that are sufficiently close. When clusters $c_1$ and $c_2$ are merged into $C = c_1 \cup c_2$, we then:
Recalculate (estimate) the rowsums of the retained points (a small helper sketch follows below):
for $p \in c_1$ (and symmetrically for $c_2$):
$$ROWSUM_C(p) = ROWSUM_{c_1}(p) + N_{c_2} \left( d^2(p,c_1) + d^2(c_1,c_2) \right) + ROWSUM_{c_2}(c_2)$$
Select the new clustroid, its $k$ closest points, and its $k$ furthest points from among the $4k+2$ points retained for $c_1$ and $c_2$ (each contributes its clustroid plus its $k$ closest and $k$ furthest points).
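A tiny helper mirroring the estimate above (argument names are illustrative):
In [ ]:
def merged_rowsum(rowsum_c1_p, n_c2, d_p_c1, d_c1_c2, rowsum_c2_c2):
    """Estimated rowsum of a point p of c1 in the merged cluster C = c1 U c2.
    Arguments: ROWSUM_{c1}(p), N_{c2}, d(p, c1), d(c1, c2), ROWSUM_{c2}(c2)."""
    return rowsum_c1_p + n_c2 * (d_p_c1 ** 2 + d_c1_c2 ** 2) + rowsum_c2_c2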
#todo: exercise
The (simplified) BDMO Algorithm builds on the methodology for counting ones in a stream from Sec. 4.6.
Bucket:
The size of a bucket is the number of points it represents.
Bucket sizes obey the restriction that there are one or two buckets of each size, up to some limit. The sequence of allowable sizes does not need to start with 1, but each allowable size must be twice the previous one.
Bucket sizes are nondecreasing as we go back in time.
The contents:
The size.
The timestamp.
A collection of records, one for each cluster obtained by clustering the points of the bucket:
the number of points in the cluster.
the centroid (or clustroid).
Any other parameters necessary for merging clusters and answering queries.
Drop out-of-date buckets (those whose timestamp is more than $N$ time units before the current time).
Merge the two oldest buckets whenever there are three buckets of the same size (see the sketch below).
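An illustrative sketch of the bucket maintenance rule (sizes doubling, at most two buckets of each size, merge the two oldest of three equal-sized buckets); the clustering of each bucket's points and the merging of their cluster records are abstracted behind a `merge_records` callable:
In [ ]:
def add_bucket(buckets, new_bucket, merge_records, N, now):
    """buckets: newest-first list of dicts with 'size', 'timestamp', 'records'."""
    buckets.insert(0, new_bucket)
    # Drop buckets whose timestamp is more than N time units in the past.
    buckets[:] = [b for b in buckets if now - b['timestamp'] <= N]
    size = new_bucket['size']
    while sum(b['size'] == size for b in buckets) > 2:
        # Merge the two oldest buckets of this size into one of twice the size.
        i, j = [idx for idx, b in enumerate(buckets) if b['size'] == size][-2:]
        merged = {'size': 2 * size,
                  'timestamp': buckets[i]['timestamp'],   # the newer of the two
                  'records': merge_records(buckets[i]['records'], buckets[j]['records'])}
        del buckets[j]
        del buckets[i]
        buckets.insert(i, merged)
        size *= 2      # the merge may create a third bucket of the doubled size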
Three examples:
A k-means approach in a Euclidean space, representing each cluster by its count and centroid.
Assumption: the clusters change very slowly, so when two buckets are merged we can match each cluster in one bucket with the nearest cluster in the other.
If we expect the cluster centroids to migrate more quickly, we can create more than $k$ clusters in each bucket and only combine down to $k$ clusters when answering a query.
A non-Euclidean space with no constraint on the number of clusters.
Represent and merge clusters as in the GRGPF Algorithm.
To answer a query about the most recent $m$ points in the stream ($m \leq N$):
Choose the smallest set of buckets that covers the last $m$ points; these buckets never cover more than the last $2m$ points.
We assume that the extra points included, at most those in positions $m+1$ through $2m$ back in the stream, will not affect the result much.
Or use the more complex bucketing scheme of Sec. 4.6.6 to cover at most the last $m(1+\epsilon)$ points.
Then pool all the clusters of the selected buckets and merge them as usual.
#todo: exercise