In [1]:
import numpy as np
from scipy.stats import rankdata

Clusterized ranking


In [2]:
M = np.array([
    [5, 3, 1, 2, 8, 4, 6, 7],
    [5, 4, 3, 1, 8, 2, 6, 7],
    [1, 7, 5, 4, 8, 2, 3, 6],
    [6, 4, 2.5, 2.5, 8, 1, 7, 5],
    [8, 2, 4, 6, 3, 5, 1, 7],
    [5, 6, 4, 3, 2, 1, 7, 8],
    [6, 1, 2, 3, 5, 4, 8, 7],
    [5, 1, 3, 2, 7, 4, 6, 8],
    [6, 1, 3, 2, 5, 4, 7, 8],
    [5, 3, 2, 1, 8, 4, 6, 7],
    [7, 1, 3, 2, 6, 4, 5, 8],
    [1, 6, 5, 3, 8, 4, 2, 7]
])
n, m = M.shape

Here is how we find average ranking.


In [3]:
average_rank = rankdata(np.average(M, axis=0))
average_rank


Out[3]:
array([ 5. ,  3.5,  2. ,  1. ,  7. ,  3.5,  6. ,  8. ])

And this way we can get median ranking.


In [4]:
median_rank = rankdata(np.median(M, axis=0))
median_rank


Out[4]:
array([ 5. ,  2.5,  2.5,  1. ,  8. ,  4. ,  6. ,  7. ])

Next we need to compute kernel of disagreement.


In [5]:
adj = np.zeros((m, m), dtype=np.bool)
kernel = []
for i in range(m):
    for j in range(i + 1, m):
        if (average_rank[i] - average_rank[j])*(median_rank[i] - median_rank[j]) < 0:
            kernel.append([i, j])
            adj[i][j] = adj[j][i] = True
kernel


Out[5]:
[[4, 7]]

Now that we have a graph of the disagreement, we can easily find a full component via Depth First Search.


In [6]:
def dfs(i, used):
    if i in used:
        return []
    used.add(i)
    
    res = [i]
    for j in range(m):
        if adj[i][j]:
            res += dfs(j, used)
    return res

Last thing to do, is to iterate in the correct order, and don't forget to print a whole cluster when needed.


In [7]:
order = sorted(range(m), key=lambda i: (average_rank[i], median_rank[i]))
order


Out[7]:
[3, 2, 1, 5, 0, 6, 4, 7]

In [8]:
result = []
used = set()
for i in order:
    cluster = dfs(i, used)
    if len(cluster) > 0:
        result.append(cluster)
result


Out[8]:
[[3], [2], [1], [5], [0], [6], [4, 7]]

Kemeny distance


In [9]:
rankings = np.array([
    [[1], [2, 3], [4], [5], [6, 7]],
    [[1, 3], [4], [2], [5], [7], [6]],
    [[1], [4], [2], [3], [6], [5], [7]],
    [[1], [2, 4], [3], [5], [7], [6]],
    [[2], [3], [4], [5], [1], [6], [7]],
    [[1], [3], [2], [5], [6], [7], [4]],
    [[1], [5], [3], [4], [2], [6], [7]]
])
n = rankings.shape[0]

We need to be able to build relation matrix out of the ranking.


In [10]:
def build(x):
    n = sum(map(lambda r: len(r), x)) # Total amount of objects
    m = np.zeros((n, n), dtype=np.bool)
    for r in x:
        for i in r:
            for j in range(n):
                if not m[j][i - 1] or j + 1 in r:
                    m[i - 1][j] = True
    return m

Now we can calculate Kemedy distances between each two rankings.


In [11]:
dist = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        dist[i][j] = np.sum(build(rankings[i]) ^ build(rankings[j]))
dist


Out[11]:
array([[  0.,   5.,   8.,   5.,  10.,   8.,  10.],
       [  5.,   0.,   9.,   6.,  13.,  11.,   9.],
       [  8.,   9.,   0.,   5.,  14.,  14.,  12.],
       [  5.,   6.,   5.,   0.,  13.,  13.,  13.],
       [ 10.,  13.,  14.,  13.,   0.,  16.,  18.],
       [  8.,  11.,  14.,  13.,  16.,   0.,  10.],
       [ 10.,   9.,  12.,  13.,  18.,  10.,   0.]])

Let's find Kemeny median for the ranks.


In [12]:
median = np.argmin(np.sum(dist, axis=1))
rankings[median]


Out[12]:
[[1], [2, 3], [4], [5], [6, 7]]