Solutions 5 - Examples in Data Clustering

Assignment 1


In [ ]:

Assignment 2

The letters are written in the file files/Indore.jpg. First, preprocessing is needed.


In [1]:
using Images

In [2]:
img=load("files/Indore.jpg")


Out[2]:

In [3]:
img=map(Gray,img)


Out[3]:

In [4]:
# Extract the matrix
A=float(img)


Out[4]:
800×480 Array{Float64,2}:
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 ⋮                                       ⋱                              
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392

In [5]:
sort(A[:])


Out[5]:
384000-element Array{Float64,1}:
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 ⋮                   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   

In [6]:
# Truncate small elements (0 is black, 1 is white)
sum(A.>0.2)


Out[6]:
375220

In [7]:
A.*=(A.<=0.2)


Out[7]:
800×480 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱            ⋮                      
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

In [8]:
sum(A.>0), prod(size(A))


Out[8]:
(8780, 384000)

In [9]:
colorview(Gray,A)


Out[9]:

In [10]:
# Increase the contrast for clustering
A=ones(size(A)).*map(Float64,A.>0)


Out[10]:
800×480 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱            ⋮                      
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0

In [11]:
colorview(Gray,rotl90(A))


Out[11]:

In [12]:
# Create the data points from letters
X=Array{Int}(undef,2,sum(A.==1))


Out[12]:
2×8780 Array{Int64,2}:
 4294967296  4294967296  4294967296  84381184  …  0  80757536  0  80742472  0
   84381616    84382198    84381184  84381248     0         2  0         2  0

In [13]:
ind=findall(A.==1)


Out[13]:
8780-element Array{CartesianIndex{2},1}:
 CartesianIndex(183, 156)
 CartesianIndex(184, 156)
 CartesianIndex(185, 156)
 CartesianIndex(186, 156)
 CartesianIndex(182, 157)
 CartesianIndex(183, 157)
 CartesianIndex(184, 157)
 CartesianIndex(185, 157)
 CartesianIndex(186, 157)
 CartesianIndex(187, 157)
 CartesianIndex(182, 158)
 CartesianIndex(183, 158)
 CartesianIndex(184, 158)
 ⋮                       
 CartesianIndex(173, 342)
 CartesianIndex(174, 342)
 CartesianIndex(175, 342)
 CartesianIndex(176, 342)
 CartesianIndex(177, 342)
 CartesianIndex(172, 343)
 CartesianIndex(173, 343)
 CartesianIndex(174, 343)
 CartesianIndex(175, 343)
 CartesianIndex(176, 343)
 CartesianIndex(173, 344)
 CartesianIndex(174, 344)

In [14]:
ind=findall(A.==1)
for i=1:length(ind)
    X[:,i]=[ind[i][1],ind[i][2]]
end
X=map(Float64,X)


Out[14]:
2×8780 Array{Float64,2}:
 183.0  184.0  185.0  186.0  182.0  …  174.0  175.0  176.0  173.0  174.0
 156.0  156.0  156.0  156.0  157.0     343.0  343.0  343.0  344.0  344.0

In [15]:
# import Pkg; Pkg.add("Clustering")

In [16]:
# Solve it
using Clustering

In [17]:
out=kmeans(X,6)


Out[17]:
KmeansResult{Array{Float64,2},Float64,Int64}([627.666 181.271 … 295.108 422.617; 245.858 251.477 … 257.855 256.781], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [9118.88, 9123.34, 9129.8, 9138.26, 8926.47, 8928.93, 8933.39, 8939.84, 8948.3, 8958.76  …  8233.7, 8222.16, 8212.62, 8462.37, 8444.83, 8429.29, 8415.75, 8404.21, 8628.88, 8613.34], [1595, 876, 1320, 1575, 1906, 1508], [1595, 876, 1320, 1575, 1906, 1508], 2.0614671900966853e7, 11, true)

For plotting we use the function from the notebook K-means Algorithm.


In [21]:
function plotKmeansresult(out::KmeansResult,X::Array)
    k=size(out.centers,2)
    # Clusters
    scatter(X[1,findall(out.assignments.==1)],X[2,findall(out.assignments.==1)])
    for j=2:k
        scatter!(X[1,findall(out.assignments.==j)],
            X[2,findall(out.assignments.==j)],markersize=1)
    end
    # Centers
    scatter!(out.centers[1,:],out.centers[2,:],markercolor=:black)
end


Out[21]:
plotKmeansresult (generic function with 1 method)

In [22]:
using Plots

In [23]:
plotKmeansresult(out,X)


Out[23]:
200 300 400 500 600 700 800 175 200 225 250 275 300 325 y1 y2 y3 y4 y5 y6 y7

Now we try the spectral $k$-partitioning. We:

  • form the Laplacian matrix,
  • compute eigenvectors corresponding to $k$ smallest eigenvalues, and
  • cluster those vectors with $k$-means algorithm.

In [24]:
# Now some functions
using SparseArrays
using LinearAlgebra
function my_weight_matrix(src::Array,dst::Array,weights::Array)
    n=nv(G)
    sparse([src;dst],[dst;src],[weights;weights],n,n)
end

my_laplacian(W::AbstractMatrix)=spdiagm(0=>vec(sum(W,dims=2)))-W

function my_normalized_laplacian(L::AbstractMatrix)
    D=1.0./sqrt.(diag(L))
    n=length(D)
    [L[i,j]*(D[i]*D[j]) for i=1:n, j=1:n]
end


Out[24]:
my_normalized_laplacian (generic function with 1 method)

In [25]:
function plotKpartresult(C::Vector,X::Array)
    k=maximum(C)
    # Clusters
    scatter(X[1,findall(C.==1)],X[2,findall(C.==1)])
    for j=2:k
        scatter!(X[1,findall(C.==j)],X[2,findall(C.==j)])
    end
    scatter!()
end


Out[25]:
plotKpartresult (generic function with 1 method)

In [26]:
# import Pkg; Pkg.add("Distances")

In [27]:
using Distances
S=pairwise(SqEuclidean(),X,dims=2)
# S=pairwise(Cityblock(),X)
β=1


Out[27]:
1

In [28]:
# Weight matrix
W=exp.(-β*S)
# Sum of weights
D=vec(sum(W,dims=2))
# Laplacian matrix
L=my_laplacian(W)
# Normalized Laplacian matrix
Ln=my_normalized_laplacian(L)


Out[28]:
8780×8780 Array{Float64,2}:
  1.0          -0.297331     -0.0148032    …   0.0           0.0        
 -0.297331      1.0          -0.25568          0.0           0.0        
 -0.0148032    -0.25568       1.0              0.0           0.0        
 -0.000115989  -0.0148029    -0.297324         0.0           0.0        
 -0.128317     -0.00549362   -3.70157e-5       0.0           0.0        
 -0.256394     -0.0811091    -0.00403818   …   0.0           0.0        
 -0.0904931    -0.211528     -0.0778166        0.0           0.0        
 -0.00450498   -0.0778096    -0.211508         0.0           0.0        
 -3.15852e-5   -0.004031     -0.0809647        0.0           0.0        
 -3.89101e-8   -3.66928e-5   -0.00544569       0.0           0.0        
 -0.00638751   -0.000273467  -1.8426e-6    …   0.0           0.0        
 -0.0127418    -0.00403082   -0.000200683      0.0           0.0        
 -0.00447838   -0.0104682    -0.00385103       0.0           0.0        
  ⋮                                        ⋱                            
  0.0           0.0           0.0             -0.0123539    -0.00452954 
  0.0           0.0           0.0             -0.00451046   -0.0122197  
  0.0           0.0           0.0          …  -0.000225872  -0.00452158 
  0.0           0.0           0.0             -1.58619e-6   -0.000234623
  0.0           0.0           0.0             -1.9543e-9    -2.13598e-6 
  0.0           0.0           0.0             -0.128314     -0.00636703 
  0.0           0.0           0.0             -0.258631     -0.0948267  
  0.0           0.0           0.0          …  -0.0945298    -0.256099   
  0.0           0.0           0.0             -0.00524063   -0.104909   
  0.0           0.0           0.0             -4.29072e-5   -0.0063467  
  0.0           0.0           0.0              1.0          -0.350673   
  0.0           0.0           0.0             -0.350673      1.0        

In [29]:
# Normalized Laplacian
using Arpack
k=6
m=size(L,1)
λ,Y=eigs(Ln,nev=k,which=:SM, v0=ones(m))
λ


Out[29]:
6-element Array{Float64,1}:
 3.5295473934075124e-16
 3.952388171814223e-16 
 4.079070123880647e-16 
 4.166738476462434e-16 
 4.174351430384205e-16 
 4.246385223227009e-16 

In [30]:
Y=Diagonal(1.0./sqrt.(D))*Y
out=kmeans(Y',k)
plotKpartresult(out.assignments,X)


Out[30]:
200 300 400 500 600 700 800 175 200 225 250 275 300 325 y1 y2 y3 y4 y5 y6