In [2]:
import numpy as np
import numpy.random as npr
import pylab as pl
%matplotlib inline
In [3]:
# for reproducibility
npr.seed(2)
In [4]:
# generate some synthetic data
num_centers = 10
dim = 2
centers = np.zeros((num_centers,dim))
sizes = (npr.rand(num_centers)+0.5)
num_points = 10000
for i in xrange(1,num_centers):
centers[i] = centers[i-1] + npr.randn(dim)*3
X = np.zeros((num_points,dim))
Y = np.zeros(num_points)
for i in range(num_points):
c = npr.randint(num_centers)
X[i] = npr.randn(dim)*sizes[c] + centers[c]
Y[i] = c
In [5]:
pl.scatter(X[:,0],X[:,1],c=Y,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],c=range(num_centers))
for i in range(num_centers-1):
cpts = centers[i:i+2]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')
Out[5]:
In [6]:
# to do: replace random walk with a branching process
In [7]:
from sklearn.neighbors import KernelDensity
In [8]:
bandwidth=0.5
kde = KernelDensity(bandwidth)
In [9]:
kde.fit(X)
scores = kde.score_samples(X)
In [10]:
scores.min(),scores.max()
Out[10]:
In [11]:
sigmoid = lambda x : (1 / (1 + np.exp(-x)))
In [12]:
accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
accept_prob_s = sigmoid(accept_prob)
print(accept_prob.min(),accept_prob.max())
print(accept_prob_s.min(),accept_prob_s.max())
In [13]:
pl.scatter(X[:,0],X[:,1],c=(np.log(accept_prob+1)+0.3),linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')
Out[13]:
In [14]:
down_sampled_X = []
down_sampled_Y = []
down_sampled_i = []
for i in range(num_points):
if npr.rand() < accept_prob[i]:
down_sampled_i.append(i)
down_sampled_X.append(X[i])
down_sampled_Y.append(Y[i])
down_sampled_X = np.array(down_sampled_X)
down_sampled_Y = np.array(down_sampled_Y)
In [15]:
len(X)
Out[15]:
In [16]:
len(down_sampled_X)
Out[16]:
In [17]:
kde_2 = KernelDensity(bandwidth)
kde_2.fit(down_sampled_X)
new_densities = kde_2.score_samples(down_sampled_X)
In [18]:
new_densities.shape
Out[18]:
In [19]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=-new_densities.max() - new_densities,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis('off')
Out[19]:
In [20]:
from sklearn.cluster import AgglomerativeClustering
In [21]:
cluster_model = AgglomerativeClustering(10)
cluster_model.fit(down_sampled_X)
Out[21]:
In [22]:
cluster_pred = cluster_model.labels_
In [23]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(centers[:,0],centers[:,1]);
pl.axis
Out[23]:
In [24]:
cluster_centers = []
for i in range(len(set(cluster_pred))):
cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)
In [25]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')
Out[25]:
In [26]:
cluster_true = Y[down_sampled_i]
In [27]:
from sklearn import metrics
In [28]:
# for just the downsampled points
metrics.adjusted_mutual_info_score(cluster_true,cluster_pred)
Out[28]:
In [29]:
def distance(x,y):
return np.sqrt(np.sum((x-y)**2))
In [30]:
def assign_to_nearest_landmark(points,landmarks):
nearest = np.zeros(len(points))
for i,point in enumerate(points):
distances = np.zeros(len(landmarks))
for j,landmark in enumerate(landmarks):
distances[j] = distance(point,landmark)
nearest[i] = np.argmin(distances)
return nearest
In [31]:
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)
In [32]:
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
#pl.colorbar();
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
pl.axis('off')
Out[32]:
In [33]:
# for all points
metrics.adjusted_mutual_info_score(Y,upsampled_cluster_pred)
Out[33]:
In [34]:
from sklearn.cluster import KMeans
In [35]:
kmeans_model = KMeans(10)
In [36]:
kmeans_model.fit(X)
Out[36]:
In [37]:
kmeans_pred = kmeans_model.predict(X)
metrics.adjusted_mutual_info_score(Y,kmeans_pred)
Out[37]:
In [38]:
# compute distance graph
import networkx as nx
def distance_graph(centers):
G = nx.Graph()
for i in range(len(centers)-1):
for j in range(i+1,len(centers)):
G.add_edge(i,j,weight=distance(centers[i],centers[j]))
return G
G = distance_graph(centers)
In [39]:
mst = nx.minimum_spanning_tree(G)
In [40]:
mst.edges()
Out[40]:
In [41]:
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=Y[down_sampled_i],linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1]);
for e in mst.edges():
cpts = centers[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')
Out[41]:
In [42]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
cpts = cluster_centers[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')
Out[42]:
In [43]:
occupancy = np.array([sum(upsampled_cluster_pred==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / (occupancy.max() - occupancy.min())
In [44]:
norm_occupancy.max()
Out[44]:
In [45]:
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
cpts = cluster_centers[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1],
c=cluster_centers[:,0],s=100+(200*norm_occupancy));
pl.axis('off')
Out[45]:
In [46]:
occupancy = np.array([sum(Y==i) for i in range(len(set(upsampled_cluster_pred)))])
norm_occupancy = (occupancy - occupancy.min()) / (occupancy.max() - occupancy.min())
In [47]:
G = distance_graph(centers)
mst = nx.minimum_spanning_tree(G)
for e in mst.edges():
cpts = centers[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(centers[:,0],centers[:,1],
c=centers[:,1],s=100+(200*norm_occupancy));
pl.axis('off')
Out[47]:
Re-doing the thing for a hypothesis of 50 clusters instead
In [97]:
def SPADE(X,num_clusters=50,kde_bandwidth=0.5,native_layout=False):
num_points = len(X)
from time import time
t0 = time()
print("Estimating density...")
# estimate density
kde = KernelDensity(kde_bandwidth)
kde.fit(X)
scores = kde.score_samples(X)
print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
# "down-sample"
print("Down-sampling...")
accept_prob = 1 - (scores - scores.min()) / (scores.max() - scores.min())
down_sampled_X = []
down_sampled_i = []
for i in range(num_points):
if npr.rand() < accept_prob[i]:
down_sampled_i.append(i)
down_sampled_X.append(X[i])
down_sampled_X = np.array(down_sampled_X)
print("Done! Reduced data from {1} to {2} points. Elapsed time: {0:.2f}s".format(time() - t0,len(X),len(down_sampled_X)))
print("Clustering...")
# cluster down-sampled data
cluster_model = AgglomerativeClustering(num_clusters)
cluster_model.fit(down_sampled_X)
# compute cluster centers
cluster_pred = cluster_model.labels_
cluster_centers = []
for i in range(num_clusters):
cluster_centers.append(np.mean(down_sampled_X[cluster_pred==i],0))
cluster_centers = np.array(cluster_centers)
print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
print("Up-sampling...")
# "up-sample" (assign all points to nearest cluster center)
upsampled_cluster_pred = assign_to_nearest_landmark(X,cluster_centers)
# compute normalized cluster occupancies
occupancy = np.array([sum(upsampled_cluster_pred==i) for i in range(num_clusters)])
norm_occupancy = (occupancy - occupancy.min()) / (occupancy.max() - occupancy.min())
print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
# compute MST
print("Computing MST...")
G = distance_graph(cluster_centers)
mst = nx.minimum_spanning_tree(G)
print("Done! Elapsed time: {0:.2f}s".format(time() - t0))
# plot SPADE tree
print("Plotting SPADE tree...")
if X.shape[1] == 2:
print("Plotting overlay on data")
pl.scatter(X[:,0],X[:,1],c=upsampled_cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(cluster_centers[:,0],cluster_centers[:,1]);
for e in mst.edges():
cpts = cluster_centers[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
pl.axis('off')
pl.figure()
# set positions by force-directed layout
pos = nx.graphviz_layout(mst)
positions = np.zeros((len(pos),2))
for p in pos:
positions[p] = pos[p]
# if the data is 2D, we can optionally use cluster centers
if X.shape[1] == 2 and native_layout:
positions = cluster_centers
for e in mst.edges():
cpts = positions[np.array(e)]
pl.plot(cpts[:,0],cpts[:,1],c='black',linewidth=2)
#pl.scatter(down_sampled_X[:,0],down_sampled_X[:,1],c=cluster_pred,linewidths=0,alpha=0.5)
pl.scatter(positions[:,0],positions[:,1],
c=cluster_centers[:,0],s=100+(200*norm_occupancy));
pl.axis('off')
print("Done! Total elapsed time: {0:.2f}s".format(time() - t0))
return cluster_centers,mst
In [86]:
results = SPADE(X[:1000])
In [ ]:
In [ ]:
In [223]:
centers = np.array([[0,0],[1,0],[0,1],[1,1]],dtype=float)
centers -= 0.5
centers = np.vstack((centers,centers*2,centers*3,
centers*4,centers*5,centers*6,
centers*7,centers*8,centers*9,
centers*10,centers*11,centers*12,
centers*13,centers*14,
[0,0]))
In [224]:
centers *= 3
In [225]:
kde = KernelDensity()
kde.fit(centers)
Out[225]:
In [226]:
samples = kde.sample(5000)
In [227]:
pl.scatter(samples[:,0],samples[:,1],linewidths=0,
c=kde.score_samples(samples))
pl.axis('off')
Out[227]:
In [228]:
for i in range(10):
pl.figure()
results = SPADE(samples)