KMeans Step-by-Step

First we will define the Point type.


In [1]:
type Point = Point of float []
    with
    override this.ToString() = let (Point ns) = this in sprintf "%A" ns

We define a function which computes the distance, in this case the euclidean distance, between two points.


In [2]:
/// Computes the euclidean distance between two points.
let euclDistance (Point ms) (Point ns) =
    ns
    |> Array.zip ms
    |> Array.sumBy (fun (m, n) -> (m-n)*(m-n))
    |> sqrt
    
/// Simple test for the distance function.
let testEuclDistance () =
    let p1 = Point [|0.; 1.|]
    let p2 = Point [|3.; 5.|]
    printfn "Euclidean Distance %A" (euclDistance p1 p2)
testEuclDistance ()


Euclidean Distance 5.0
Out[2]:

We now define a function that aggregates, in our case it returns the centroid, from a list of points.


In [3]:
/// Creates an array of zero values with the length equal to the dimension of the points.
let zeros = Array.item 0 >> fun (Point ns) -> ns.Length |> Array.zeroCreate<float>

/// Returns the centroid for a set of points.
let aggregate points =
    let z = (0, zeros points)
    let agg ms ns = ns |> Array.zip ms |> Array.map (fun (m, n) -> m + n)

    points
    |> Array.fold (fun (t, ms) (Point ns) -> (t + 1, agg ms ns)) z
    |> fun (ttl, ms) -> if ttl = 0 then ms else ms |> Array.map (fun m -> m / float ttl)
    |> Point
    
/// Quick test for the aggregation function.
let testAggregate () = 
    let p1 = Point [|0.; 1.|]
    let p2 = Point [|3.; 5.|]
    printfn "Aggregate: %A" (aggregate [|p1;p2|])
testAggregate()


Aggregate: Point [|1.5; 3.0|]
Out[3]:

Before moving forward with the k-means functionality, we will need some helper functions. First we will implement an active pattern which determines if a list contains distinct values. This active pattern will be used to build a list of randome indeces in a given set of data, and these indeces will have to be distinct.


In [4]:
/// Helper function which determines if all values in a list are distinct.
let (|Distinct|_|) xs =
    xs 
    |> List.distinct 
    |> List.length 
    |> (=) (xs |> List.length)
    |> function
    | true  -> Some xs
    | false -> None

/// Quick test for the active pattern
let testDistinct () =
    [1;2] |> function Distinct xs -> printfn "%O distinct values" xs | _ -> printfn "failed"
    [1;1] |> function Distinct _ -> printfn "failed" | xs -> printfn "%O not distrinct values" xs
testDistinct()


[1; 2] distinct values
[1; 1] not distrinct values
Out[4]:

The next helper function will generate k random centroids from a list of given points. The function uses a sequence unfold mechanism. In each iteration it generate k random indeces and checks if these values are distinct, using the active pattern defined above.


In [5]:
/// The state of the unfold sequence.
type UnfoldCompleted = | NotCompleted | Completed

/// Returns a set of centroids, randomly determined from  a list of data.
let rndCentroids (data: Point[]) (rnd: System.Random) k =

    /// geenrates a new index between 0 and the length of the data.
    let newIndex () = rnd.Next(data.Length)
    
    /// The function that controls the unfolding sequence.
    /// It generates k random indeces. If these values are distinct
    /// the sequence is completed, otherwise it continues with a new iteration. 
    let unfld = function
        | Completed -> 
            None
        | NotCompleted ->
            [for _ in 1 .. k -> newIndex ()]
            |> function
            | Distinct xs -> Some (xs, Completed)
            | _           -> Some ([], NotCompleted)

    /// Generate a sequence where the last item is a list of indeces.
    /// The centroids are the points at those indeces.
    NotCompleted
    |> Seq.unfold unfld
    |> Seq.last
    |> List.map (fun i -> Array.item i data)
    
/// Quick test for the function which randomly generates the centroids.
let testRndCentroids () =
    let rnd = System.Random ()
    let data = [ (0.0, 1.0); (1.0, 1.0); (10.0, 1.0); (13.0, 3.0); (4.0, 10.0); (5.0, 8.0) ]
    let points = data |> List.map (fun (x, y) -> Point [|x;y|]) |> Array.ofList
    printfn "Centroids: %A" (rndCentroids points rnd 3)

testRndCentroids ()


Centroids: [Point [|5.0; 8.0|]; Point [|1.0; 1.0|]; Point [|13.0; 3.0|]]
Out[5]:

The next function computes, for a given point, the index of the closest centroid from a given list of centroids.


In [6]:
/// Given a point, returns the index of the closest
/// centroid into a given list of centroids.
let closest dist (cs: Point list) (i: Point) =
    cs
    |> List.mapi  (fun i c    -> i, c)
    |> List.minBy (fun (_, c) -> dist c i)
    |> fst
    
/// Quick test function for the closest function.
let testClosest () =

    let points = 
        [ (0.0, 1.0); (1.0, 1.0); (10.0, 1.0); (13.0, 3.0); (4.0, 10.0); (5.0, 8.0) ]
        |> List.map (fun (x, y) -> Point [|x;y|]) 
        |> Array.ofList
        
    let cs = [Array.item 1 points; Array.item 2 points; Array.item 4 points]
    let p = Array.item 0 points
    
    let i = closest euclDistance cs p
    printfn "Centroids: %O" cs
    printfn "Point: %O" p
    printfn "Closest: %d - %O" i (List.item i cs) 
testClosest()


Centroids: [[|1.0; 1.0|]; [|10.0; 1.0|]; [|4.0; 10.0|]]
Point: [|0.0; 1.0|]
Closest: 0 - [|1.0; 1.0|]
Out[6]:

K-means algorithm can be implemented a sequence unfolding. The items of the sequence are the list of centroids and the list of index values in the list of centroids for each of the point in our population. The last item in the sequence is the final result when the set of centroids is stabilized.

We will use a helper type KMRes to store the list of centroids and the list of indeces.


In [7]:
type KMRes = {
    centroids: Point list
    idxs:      int [] }

let indeces res   = res.idxs
let centroids res = res.centroids

Now we are ready to work on the k-means implementation. The function requires 4 arguments:|

  1. dist - a distance function
  2. aggr - an aggregate function
  3. k - the number of centroids
  4. data - the set of points.

As we mentioned, the implementation is using the sequence unfold mechanism.

We start with a helper function, mkIndeces which given a list of centroids, computes the indeces for all data points, using the closest function which uses the given distance function.

The initial state is represented by the randomly generated centroids and the computed indeces (in the list of centroids) for all the points in the data set.

The mkCentroids function computes the new centroids from the list of indeces. Practically groups all the points using their indeces and then uses the aggregate function for each group of points.

We also use two heper active patterns. The NotComplted determines if unfold loop should continue. The second active pattern, SameIndeces determines if the list of indeces hasnt changed during a given iteration, which indidates if the state has stabilized.

The function that controls the unfolding mechanism, unfld, receives as input the internal KMRes state, extracts the current list of indeces, computes the new centroids using these indeces, using this centroids computes the new indeces for all data points, and determines if the new list of indeces is the same with the one from the begining of the iteration.

In the end, the sequence of KMRes is generated using the unfld function, and the last element of the sequence represents the stabilized list of centroids and the indeces for the data points.


In [8]:
/// Implementation of the k-means algorithm, needs as input arguments:
/// dist: a function which returns the distance between two points
/// aggr: a function which aggregates a list of points to one point
/// k   : the number of centroids
/// data: the set of points
let kmeans dist aggr k (data: Point []) =

    let mkIndeces cs = data |> Array.map (closest dist cs)

    /// Gets the list of initial of centroids and the indeces
    /// for each of the data point in the list of centroids.
    let zero = 
        let rnd = System.Random ()
        let cs = rndCentroids data rnd k
        let idxs = mkIndeces cs
        { centroids = cs; idxs = idxs }

    let mkCentroids idxs = 
        idxs
        |> Array.zip     data
        |> Array.groupBy snd
        |> Array.sortBy  fst
        |> Array.map     (snd >> Array.map fst >> aggr)
        |> List.ofArray

    /// Determines if the sequence unfold state
    /// is not completed yet.
    let (|NotCompleted|_|) (dn, res) =
        match dn with
        | Completed -> None
        | _         -> Some res

    let (|SameIndeces|_|) r r' = 
        if (r.idxs = r'.idxs) then Some r' else None

    let mkCompleted    res = Some (res, (Completed, res))
    let mkNotCompleted res = Some (res, (NotCompleted, res))

    /// The unfolding function to be used in the sequence unfold.
    let unfld = function
        | NotCompleted (res: KMRes) -> 
            res
            |> indeces
            |> mkCentroids
            |> fun cs -> { centroids = cs; idxs = mkIndeces cs }
            |> function
            | SameIndeces res r -> mkCompleted r
            | r                 -> mkNotCompleted r
        | _  -> None

    (NotCompleted, zero)
    |> Seq.unfold unfld
    |> Seq.last

let testKMeans () =
    let data = [ (0.0, 1.0); (1.0, 1.0); (10.0, 1.0); (13.0, 3.0); (4.0, 10.0); (5.0, 8.0) ]
    let points = data |> List.map (fun (x, y) -> Point [|x;y|]) |> Array.ofList

    let res = kmeans euclDistance aggregate 3 points
    printfn "Centroids: %A" (centroids res)
    printfn "Indeces: %A" (indeces res)
testKMeans()


Centroids: [Point [|10.0; 1.0|]; Point [|13.0; 3.0|]; Point [|2.5; 5.0|]]
Indeces: [|2; 2; 0; 1; 2; 2|]
Out[8]:

Lets chart the points and the centroids. We will use the XPlot nuget package.


In [9]:
// Install the XPlot nuget package.
#r "nuget:XPlot.Plotly,3.0.1"

// Install the XPlot.GoogleCharts nuget packages.
//#r "nuget:XPlot.GoogleCharts,3.0.1"

open XPlot.Plotly
//open XPlot.GoogleCharts

let xyOfPoints (ps: seq<Point>) =
    ps
    |> Seq.fold (fun (xs, ys) (Point ns) -> (xs @ [Array.item 0 ns], ys @ [Array.item 1 ns])) ([], [])
    
let tracePoints (xs, ys) = 
    Scatter(
            x = xs,
            y = ys,
            mode = "markers",
            marker = Marker(color = "rgb(0, 0, 250)", size = 10))
            
let traceCentroids (xs, ys) = 
    Scatter(
            x = xs,
            y = ys,
            mode = "markers",
            marker = Marker(color = "rgb(250, 0, 0)", size = 10, symbol = "cross"))


Installing package XPlot.Plotly, version 3.0.1.done!

In [10]:
let data = [ (0.0, 1.0); (1.0, 1.0); (10.0, 1.0); (13.0, 3.0); (4.0, 10.0); (5.0, 8.0) ]
let points = data |> List.map (fun (x, y) -> Point [|x;y|]) |> Array.ofList

points
|> xyOfPoints
|> tracePoints
|> Chart.Plot
|> Chart.WithWidth 700
|> Chart.WithHeight 500


Out[10]:

In [11]:
points 
|> kmeans euclDistance aggregate 3 
|> centroids
|> xyOfPoints
|> traceCentroids
|> Chart.Plot
|> Chart.WithWidth 700
|> Chart.WithHeight 500


Out[11]:

In [ ]: