Solutions 5 - Examples in Data Clustering

Assignment 1



In [ ]:

Assignment 2

The letters are written in the file files/Indore.jpg. First, preprocessing is needed.



In [1]:

    
using Images



In [2]:

    
img=load("files/Indore.jpg")









    Out[2]:



In [3]:

    
img=map(Gray,img)









    Out[3]:



In [4]:

    
# Extract the matrix
A=float(img)









    Out[4]:





800×480 Array{Float64,2}:
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 ⋮                                       ⋱                              
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392  …  0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392
 0.980392  0.980392  0.980392  0.980392     0.980392  0.980392  0.980392



In [5]:

    
sort(A[:])









    Out[5]:





384000-element Array{Float64,1}:
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.00784313725490196 
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 0.011764705882352941
 ⋮                   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196   
 0.984313725490196



In [6]:

    
# Truncate small elements (0 is black, 1 is white)
sum(A.>0.2)









    Out[6]:





375220



In [7]:

    
A.*=(A.<=0.2)









    Out[7]:





800×480 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱            ⋮                      
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0



In [8]:

    
sum(A.>0), prod(size(A))









    Out[8]:





(8780, 384000)



In [9]:

    
colorview(Gray,A)









    Out[9]:



In [10]:

    
# Increase the contrast for clustering
A=ones(size(A)).*map(Float64,A.>0)









    Out[10]:





800×480 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 ⋮                        ⋮              ⋱            ⋮                      
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0



In [11]:

    
colorview(Gray,rotl90(A))









    Out[11]:



In [12]:

    
# Create the data points from letters
X=Array{Int}(undef,2,sum(A.==1))









    Out[12]:





2×8780 Array{Int64,2}:
 4294967296  4294967296  4294967296  84381184  …  0  80757536  0  80742472  0
   84381616    84382198    84381184  84381248     0         2  0         2  0



In [13]:

    
ind=findall(A.==1)









    Out[13]:





8780-element Array{CartesianIndex{2},1}:
 CartesianIndex(183, 156)
 CartesianIndex(184, 156)
 CartesianIndex(185, 156)
 CartesianIndex(186, 156)
 CartesianIndex(182, 157)
 CartesianIndex(183, 157)
 CartesianIndex(184, 157)
 CartesianIndex(185, 157)
 CartesianIndex(186, 157)
 CartesianIndex(187, 157)
 CartesianIndex(182, 158)
 CartesianIndex(183, 158)
 CartesianIndex(184, 158)
 ⋮                       
 CartesianIndex(173, 342)
 CartesianIndex(174, 342)
 CartesianIndex(175, 342)
 CartesianIndex(176, 342)
 CartesianIndex(177, 342)
 CartesianIndex(172, 343)
 CartesianIndex(173, 343)
 CartesianIndex(174, 343)
 CartesianIndex(175, 343)
 CartesianIndex(176, 343)
 CartesianIndex(173, 344)
 CartesianIndex(174, 344)



In [14]:

    
ind=findall(A.==1)
for i=1:length(ind)
    X[:,i]=[ind[i][1],ind[i][2]]
end
X=map(Float64,X)









    Out[14]:





2×8780 Array{Float64,2}:
 183.0  184.0  185.0  186.0  182.0  …  174.0  175.0  176.0  173.0  174.0
 156.0  156.0  156.0  156.0  157.0     343.0  343.0  343.0  344.0  344.0



In [15]:

    
# import Pkg; Pkg.add("Clustering")



In [16]:

    
# Solve it
using Clustering



In [17]:

    
out=kmeans(X,6)









    Out[17]:





KmeansResult{Array{Float64,2},Float64,Int64}([627.666 181.271 … 295.108 422.617; 245.858 251.477 … 257.855 256.781], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2  …  2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [9118.88, 9123.34, 9129.8, 9138.26, 8926.47, 8928.93, 8933.39, 8939.84, 8948.3, 8958.76  …  8233.7, 8222.16, 8212.62, 8462.37, 8444.83, 8429.29, 8415.75, 8404.21, 8628.88, 8613.34], [1595, 876, 1320, 1575, 1906, 1508], [1595, 876, 1320, 1575, 1906, 1508], 2.0614671900966853e7, 11, true)

For plotting we use the function from the notebook K-means Algorithm.



In [21]:

    
function plotKmeansresult(out::KmeansResult,X::Array)
    k=size(out.centers,2)
    # Clusters
    scatter(X[1,findall(out.assignments.==1)],X[2,findall(out.assignments.==1)])
    for j=2:k
        scatter!(X[1,findall(out.assignments.==j)],
            X[2,findall(out.assignments.==j)],markersize=1)
    end
    # Centers
    scatter!(out.centers[1,:],out.centers[2,:],markercolor=:black)
end









    Out[21]:





plotKmeansresult (generic function with 1 method)



In [22]:

    
using Plots



In [23]:

    
plotKmeansresult(out,X)









    Out[23]:

Now we try the spectral $k$-partitioning. We:

form the Laplacian matrix,
compute eigenvectors corresponding to $k$ smallest eigenvalues, and
cluster those vectors with $k$-means algorithm.



In [24]:

    
# Now some functions
using SparseArrays
using LinearAlgebra
function my_weight_matrix(src::Array,dst::Array,weights::Array)
    n=nv(G)
    sparse([src;dst],[dst;src],[weights;weights],n,n)
end

my_laplacian(W::AbstractMatrix)=spdiagm(0=>vec(sum(W,dims=2)))-W

function my_normalized_laplacian(L::AbstractMatrix)
    D=1.0./sqrt.(diag(L))
    n=length(D)
    [L[i,j]*(D[i]*D[j]) for i=1:n, j=1:n]
end









    Out[24]:





my_normalized_laplacian (generic function with 1 method)



In [25]:

    
function plotKpartresult(C::Vector,X::Array)
    k=maximum(C)
    # Clusters
    scatter(X[1,findall(C.==1)],X[2,findall(C.==1)])
    for j=2:k
        scatter!(X[1,findall(C.==j)],X[2,findall(C.==j)])
    end
    scatter!()
end









    Out[25]:





plotKpartresult (generic function with 1 method)



In [26]:

    
# import Pkg; Pkg.add("Distances")



In [27]:

    
using Distances
S=pairwise(SqEuclidean(),X,dims=2)
# S=pairwise(Cityblock(),X)
β=1









    Out[27]:





1



In [28]:

    
# Weight matrix
W=exp.(-β*S)
# Sum of weights
D=vec(sum(W,dims=2))
# Laplacian matrix
L=my_laplacian(W)
# Normalized Laplacian matrix
Ln=my_normalized_laplacian(L)









    Out[28]:





8780×8780 Array{Float64,2}:
  1.0          -0.297331     -0.0148032    …   0.0           0.0        
 -0.297331      1.0          -0.25568          0.0           0.0        
 -0.0148032    -0.25568       1.0              0.0           0.0        
 -0.000115989  -0.0148029    -0.297324         0.0           0.0        
 -0.128317     -0.00549362   -3.70157e-5       0.0           0.0        
 -0.256394     -0.0811091    -0.00403818   …   0.0           0.0        
 -0.0904931    -0.211528     -0.0778166        0.0           0.0        
 -0.00450498   -0.0778096    -0.211508         0.0           0.0        
 -3.15852e-5   -0.004031     -0.0809647        0.0           0.0        
 -3.89101e-8   -3.66928e-5   -0.00544569       0.0           0.0        
 -0.00638751   -0.000273467  -1.8426e-6    …   0.0           0.0        
 -0.0127418    -0.00403082   -0.000200683      0.0           0.0        
 -0.00447838   -0.0104682    -0.00385103       0.0           0.0        
  ⋮                                        ⋱                            
  0.0           0.0           0.0             -0.0123539    -0.00452954 
  0.0           0.0           0.0             -0.00451046   -0.0122197  
  0.0           0.0           0.0          …  -0.000225872  -0.00452158 
  0.0           0.0           0.0             -1.58619e-6   -0.000234623
  0.0           0.0           0.0             -1.9543e-9    -2.13598e-6 
  0.0           0.0           0.0             -0.128314     -0.00636703 
  0.0           0.0           0.0             -0.258631     -0.0948267  
  0.0           0.0           0.0          …  -0.0945298    -0.256099   
  0.0           0.0           0.0             -0.00524063   -0.104909   
  0.0           0.0           0.0             -4.29072e-5   -0.0063467  
  0.0           0.0           0.0              1.0          -0.350673   
  0.0           0.0           0.0             -0.350673      1.0



In [29]:

    
# Normalized Laplacian
using Arpack
k=6
m=size(L,1)
λ,Y=eigs(Ln,nev=k,which=:SM, v0=ones(m))
λ









    Out[29]:





6-element Array{Float64,1}:
 3.5295473934075124e-16
 3.952388171814223e-16 
 4.079070123880647e-16 
 4.166738476462434e-16 
 4.174351430384205e-16 
 4.246385223227009e-16



In [30]:

    
Y=Diagonal(1.0./sqrt.(D))*Y
out=kmeans(Y',k)
plotKpartresult(out.assignments,X)









    Out[30]: