In [1]:
import numpy as np
from scipy.stats import rankdata
In [2]:
M = np.array([
[5, 3, 1, 2, 8, 4, 6, 7],
[5, 4, 3, 1, 8, 2, 6, 7],
[1, 7, 5, 4, 8, 2, 3, 6],
[6, 4, 2.5, 2.5, 8, 1, 7, 5],
[8, 2, 4, 6, 3, 5, 1, 7],
[5, 6, 4, 3, 2, 1, 7, 8],
[6, 1, 2, 3, 5, 4, 8, 7],
[5, 1, 3, 2, 7, 4, 6, 8],
[6, 1, 3, 2, 5, 4, 7, 8],
[5, 3, 2, 1, 8, 4, 6, 7],
[7, 1, 3, 2, 6, 4, 5, 8],
[1, 6, 5, 3, 8, 4, 2, 7]
])
n, m = M.shape
Here is how we find average ranking.
In [3]:
average_rank = rankdata(np.average(M, axis=0))
average_rank
Out[3]:
And this way we can get median ranking.
In [4]:
median_rank = rankdata(np.median(M, axis=0))
median_rank
Out[4]:
Next we need to compute kernel of disagreement.
In [5]:
adj = np.zeros((m, m), dtype=np.bool)
kernel = []
for i in range(m):
for j in range(i + 1, m):
if (average_rank[i] - average_rank[j])*(median_rank[i] - median_rank[j]) < 0:
kernel.append([i, j])
adj[i][j] = adj[j][i] = True
kernel
Out[5]:
Now that we have a graph of the disagreement, we can easily find a full component via Depth First Search.
In [6]:
def dfs(i, used):
if i in used:
return []
used.add(i)
res = [i]
for j in range(m):
if adj[i][j]:
res += dfs(j, used)
return res
Last thing to do, is to iterate in the correct order, and don't forget to print a whole cluster when needed.
In [7]:
order = sorted(range(m), key=lambda i: (average_rank[i], median_rank[i]))
order
Out[7]:
In [8]:
result = []
used = set()
for i in order:
cluster = dfs(i, used)
if len(cluster) > 0:
result.append(cluster)
result
Out[8]:
In [9]:
rankings = np.array([
[[1], [2, 3], [4], [5], [6, 7]],
[[1, 3], [4], [2], [5], [7], [6]],
[[1], [4], [2], [3], [6], [5], [7]],
[[1], [2, 4], [3], [5], [7], [6]],
[[2], [3], [4], [5], [1], [6], [7]],
[[1], [3], [2], [5], [6], [7], [4]],
[[1], [5], [3], [4], [2], [6], [7]]
])
n = rankings.shape[0]
We need to be able to build relation matrix out of the ranking.
In [10]:
def build(x):
n = sum(map(lambda r: len(r), x)) # Total amount of objects
m = np.zeros((n, n), dtype=np.bool)
for r in x:
for i in r:
for j in range(n):
if not m[j][i - 1] or j + 1 in r:
m[i - 1][j] = True
return m
Now we can calculate Kemedy distances between each two rankings.
In [11]:
dist = np.zeros((n, n))
for i in range(n):
for j in range(n):
dist[i][j] = np.sum(build(rankings[i]) ^ build(rankings[j]))
dist
Out[11]:
Let's find Kemeny median for the ranks.
In [12]:
median = np.argmin(np.sum(dist, axis=1))
rankings[median]
Out[12]: