In [6]:
import numpy as np
import pylab as pl
%matplotlib inline
pl.rcParams['font.family']='Serif'
High-level overview of the FLOW-MAP algorithm:
In [7]:
X = np.random.randn(1000,2)
In [8]:
# downsampling:
# 1. Estimating density
# 2. Choosing as a function of density
In [9]:
from sklearn import neighbors
In [10]:
k = 10
In [11]:
knn = neighbors.NearestNeighbors()
In [12]:
knn.fit(X)
Out[12]:
In [12]:
In [13]:
def local_density_k(X,k=10,metric=None):
if metric != None:
bt = neighbors.BallTree(X,200,metric=metric)
neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
else:
neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
distances = np.array(neighbor_graph.mean(1))[:,0]
return 1-((distances - distances.min())/(distances.max() - distances.min()))
In [14]:
def local_density_r(X,r=0.1,metric=None):
if metric != None:
bt = neighbors.BallTree(X,200,metric=metric)
neighbor_graph = neighbors.radius_neighbors_graph(bt,r)
else:
neighbor_graph = neighbors.radius_neighbors_graph(X,r)
counts = np.array(neighbor_graph.sum(1))[:,0]
return ((counts - counts.min())/(counts.max() - counts.min()))
In [15]:
def compute_accept_prob(densities,
outlier_density_percentile=1.0,
target_density_percentile=3.0):
''' densities is a vector of densities '''
OD = np.percentile(densities,outlier_density_percentile)
TD = np.percentile(densities,target_density_percentile)
#print(OD,TD)
accept_prob = np.zeros(len(densities))
for i,LD in enumerate(densities):
if LD < OD:
accept_prob[i] = 0
elif LD > OD and LD <= TD:
accept_prob[i] = 1
elif LD > TD:
accept_prob[i] = TD/LD
# output
return accept_prob
In [16]:
x = np.random.randn(10000)
In [17]:
pl.hist(x,bins=50);
In [17]:
In [18]:
densities = np.random.randn(1000)+10
In [19]:
ap = compute_accept_prob(densities)
In [20]:
pl.scatter(densities,ap)
Out[20]:
In [21]:
densities[(densities<8.1631259434) * (densities>7.71169482942)]
Out[21]:
In [22]:
np.percentile(np.random.randn(1000),50.0)
Out[22]:
In [23]:
def accept_according_to_probs(accept_prob):
''' just output indices'''
return accept_prob > np.random.rand(len(accept_prob))
In [24]:
from sklearn import neighbors
from sklearn.neighbors import KernelDensity
def generate_blobs(num_samples=5000,separation=8):
centers = np.array([[0,0],[1,0],[0,1],[1,1]],dtype=float)
centers -= 0.5
centers = np.vstack((centers,#centers*2,centers*3,
#centers*4,centers*5,centers*6,
#centers*7,centers*8,centers*9,
#centers*10,centers*11,centers*12,
#centers*13,centers*14,
[0,0]))
centers *= separation
kde = KernelDensity()
kde.fit(centers)
samples = kde.sample(num_samples)
density = kde.score_samples(samples)
return samples,density
samples,density = generate_blobs(5000,10)
In [25]:
pl.scatter(samples[:,0],samples[:,1],linewidths=0,alpha=0.1)
Out[25]:
In [26]:
est_density = local_density_k(samples)
In [27]:
pl.hist(est_density,bins=50);
In [28]:
accept_prob = compute_accept_prob(est_density)
In [29]:
accept_ind = accept_according_to_probs(accept_prob)
In [30]:
downsampled = samples[accept_ind]
In [31]:
pl.scatter(downsampled[:,0],downsampled[:,1],linewidths=0,alpha=0.1)
Out[31]:
In [32]:
len(downsampled),len(samples)
Out[32]:
In [32]:
In [33]:
from sklearn.cluster import AgglomerativeClustering
In [34]:
# compare with other clustering algorithms
In [35]:
target_clust_num = 200
cluster_model = AgglomerativeClustering(target_clust_num)
C = cluster_model.fit_predict(downsampled)
In [35]:
In [ ]:
In [36]:
min(C),max(C)
Out[36]:
In [41]:
C==0
Out[41]:
In [42]:
C.shape,X.shape,downsampled.shape
Out[42]:
In [43]:
len(set(C)),max(C)
Out[43]:
In [44]:
X.T.shape
Out[44]:
In [45]:
# compute cluster centers given cluster assignments
def compute_cluster_centers(X,C):
centers = np.zeros((len(set(C)),len(X.T)))
for i in set(C):
points = X[C==i]
centers[i] = np.mean(points,0)
return centers
In [46]:
centers = compute_cluster_centers(downsampled,C)
In [47]:
centers.shape
Out[47]:
In [48]:
from scipy.spatial import distance
pdist = distance.pdist(centers)
distmat = distance.squareform(pdist)
In [49]:
pl.imshow(distmat,interpolation='none')
Out[49]:
In [73]:
r = np.percentile(pdist,1.0)
r
Out[73]:
In [51]:
adj = (distmat < r)
In [75]:
num_neighbors = local_density_r(centers,r)
num_neighbors.shape
Out[75]:
In [82]:
pl.hist(num_neighbors,bins=len(set(num_neighbors)));
In [77]:
sorted_clust_id = sorted(range(len(num_neighbors)),key=lambda i:num_neighbors[i])
In [83]:
min_edges = 2
max_edges = 20
In [91]:
def num_edges(densities,min_edges=2,max_edges=20):
''' pass in an array of densities'''
assert(len(densities)>1)
min_density = np.min(densities)
max_density = np.max(densities)
lambdas = densities / (max_density - min_density)
return np.array(min_edges + lambdas*(max_edges - min_edges),dtype=int)
In [98]:
num_edges_array = num_edges(num_neighbors)
num_edges_array
Out[98]:
In [95]:
nn = neighbors.NearestNeighbors(max_edges+1)
nn.fit(centers)
Out[95]:
In [100]:
num_edges_array[0]
Out[100]:
In [123]:
nn.kneighbors(centers[0],num_edges_array[0]+1)
Out[123]:
In [ ]:
In [106]:
nn.kneighbors(centers[0],num_edges_array[0]+1)[1][0][1:]
Out[106]:
In [107]:
G = nx.Graph()
In [109]:
G.add_edge(0,107)
In [112]:
G.nodes()
Out[112]:
In [138]:
i=0
dist,neigh = nn.kneighbors(centers[i],num_edges_array[i]+1)
dist,neigh
Out[138]:
In [139]:
for j in range(1,len(dist)):
print(dist[j])
In [152]:
def construct_graph(centers,num_edges_array):
G = nx.Graph()
for i in range(len(centers)):
distances,neighbors = nn.kneighbors(centers[i],num_edges_array[i]+1)
distances = distances[0]
neighbors = neighbors[0]
for j in range(1,len(distances)):
G.add_edge(i,neighbors[j],weight=distances[j])
return G
In [153]:
G = construct_graph(centers,num_edges_array)
In [154]:
G.nodes()
Out[154]:
In [155]:
for line in nx.generate_edgelist(G):
print(line)
In [130]:
# replace adj as desired
In [53]:
w = 1/(distmat)
w
Out[53]:
In [54]:
w*adj
Out[54]:
In [55]:
w[w==np.inf]=0
In [56]:
w
Out[56]:
In [57]:
weighted_adj_mat = w*adj
In [58]:
# compute density cluster
In [59]:
import networkx as nx
In [60]:
G = nx.Graph(weighted_adj_mat)
In [156]:
pos = nx.graphviz_layout(G)
In [157]:
positions = np.array(pos.values())
In [158]:
density.shape,positions.shape
Out[158]:
In [158]:
In [159]:
pl.scatter(positions[:,0],positions[:,1])
Out[159]:
In [161]:
nx.draw(G,pos=positions)
In [162]:
nx.draw_networkx_edges(G,pos=positions)
Out[162]:
In [ ]:
# compute r
In [ ]:
In [ ]:
class FlowMap():