Références:
Plus nombreux sont les candidats à se présenter à l'élection, et plus l'on peut s'attendre à un meilleur résultat. Dans la pratique, les systèmes de vote actuels ne le permettent pas, car plusieurs filtres (médias, notoriété, biais cognitifs) ne permettent qu'à une poignée de candidats d'émerger.
Le vote à lots aléatoires résoud cette problématique, en ne demandant aux électeurs de ne voter que un lot de candidats. Les lots sont tirés aléatoirement pour chaque candidat, puis les votes sont agrégés.
Pour que le résultat ne soit pas biaisé, on veut que le vote respecte trois critères :
On pose $\epsilon_i = 0.1$.
In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.unicode'] = True
rc('font',**{'family':'serif','serif':['Times']})
# %matplotlib inline
%matplotlib osx
import scipy.stats
import sys
L'algorithme pour construire les lots dépend de $\alpha$ :
In [2]:
# this function builds a random subset
# occurrences must an np.array with dtype=float
def subset(Ncandidates, Nsubset, occurrences, alpha = 1.0):
proba_candidat = np.array([1/(j**alpha) if j != 0 else 1 for j in occurrences])
proba_candidat = proba_candidat / float(sum(proba_candidat))
lot = np.random.choice(Ncandidates, size=Nsubset, replace=False, p=proba_candidat)
occurrences[lot] += 1
return lot
In [ ]:
Ncandidates= 100
Nvoters = 50000
q = 1000
alpha = [1, 2, 3, 10]
occurrences = np.zeros(Ncandidates, dtype=float)
rsm = np.zeros((int(Nvoters/q), len(alpha)))
Nsubset = 5
for j in range(len(alpha)):
for i in range(Nvoters):
lot = subset(Ncandidates, Nsubset, occurrences, alpha[j])
if i % q == 0:
rsm[int(i/q),j] = scipy.stats.variation(occurrences)
for j in range(len(alpha)):
plt.loglog(range(q,Nvoters, q), rsm[1:, j],label="$\\alpha=%d$" % alpha[j])
plt.xlabel("Number of voters")
plt.ylabel("Coefficient of variation")
plt.legend()
plt.show()
Le coefficient de variation dépend également du nombre de candidats :
In [ ]:
candidates= range(10,100, 2)
q = 10
voters = range(0,1001, q)
Nvoters = max(voters)
rsm = np.zeros((len(candidates), len(voters)))
Nsubset = 5
epsilon_1 = 0.1
for j in range(len(candidates)):
c = candidates[j]
occurrences = np.zeros(candidates[j])
for i in range(Nvoters+1):
lot = subset(candidates[j], Nsubset, occurrences)
if i % q == 0:
rsm[j, int(i/q)] = min(scipy.stats.variation(occurrences), 0.1)
In [ ]:
from matplotlib import cm
fig = plt.figure()
plt.pcolor(rsm[:,1:], cmap=cm.plasma)
plt.colorbar()
plt.ylabel('Candidates')
plt.xlabel('Voters')
# plt.yticks(range(len(candidates)), candidates)
# plt.xticks(range(len(voters)), voters[1:], rotation='vertical')
plt.show()
Il faut donc aussi regarder comment varie le coefficient de variation avec le nombre de candidats et $\alpha$.
In [ ]:
candidates= range(10,100, 20)
Ncandidates = max(candidates)
alphas = [1, 3, 10]
Nvoters = 10000
cv_alpha_c_1 = np.zeros((len(candidates), len(alphas)))
Nsubset = 5
for l in range(len(alphas)):
for j in range(len(candidates)):
occurrences = np.zeros(candidates[j])
for i in range(Nvoters+1):
lot = subset(candidates[j], Nsubset, occurrences, alphas[l])
cv_alpha_c_1[j, l] = scipy.stats.variation(occurrences)
In [ ]:
for j in range(len(alphas)):
plt.loglog(candidates, cv_alpha_c_1[:, j],label="$\\alpha=%d$" % alphas[j])
plt.xlabel("Number of candidates")
plt.ylabel("Coefficient of variation")
plt.legend()
plt.show()
In [ ]:
Nvoters = 50000
Ncandidates = 100
Nsubset = 5
alpha = 3
corr = np.zeros((Ncandidates,Ncandidates))
occurrences = np.zeros(Ncandidates)
for i in range(Nvoters):
lot = subset(Ncandidates, Nsubset, occurrences, alpha)
for j in lot:
corr[j,lot] += 1
In [ ]:
plt.pcolor(corr)
plt.colorbar()
plt.ylabel('Candidates')
plt.xlabel('Candidates')
plt.yticks(np.arange(0.5,Ncandidates + .5, 10),range(0,Ncandidates, 10))
plt.xticks(np.arange(0.5,Ncandidates + .5, 10),range(0,Ncandidates, 10))
plt.show()
La diagonale a les points les plus forts, car un candidat est sur de se rencontrer avec lui-même ! En revanche, le reste de la matrice semble bien uniforme. Regardons comment varie le coefficient de variation entre deux candidats pour différents nombres d'électeurs et de candidats.
In [ ]:
candidates= range(10,100, 2)
Ncandidates = max(candidates)
q = 1000
Nvoters = 1000
alphas = range(1, 25)
rsm = np.zeros((len(candidates), len(alphas)))
Nsubset = 5
for j in range(len(candidates)):
occurrences = np.zeros(candidates[j])
corr = np.zeros((candidates[j],candidates[j]))
for i in range(len(alphas)):
for _ in range(Nvoters):
lot = subset(candidates[j], Nsubset, occurrences, alpha=alphas[i])
for k in lot:
corr[k,lot] += 1
tri = corr[np.triu_indices(candidates[j], 1)]
rsm[j, i] = scipy.stats.variation(tri)
In [ ]:
from matplotlib import cm
fig = plt.figure()
plt.pcolor(rsm[:,:], cmap=cm.plasma)
plt.colorbar()
plt.ylabel('Candidates')
plt.xlabel('$\\alpha$')
plt.yticks(range(len(candidates)), candidates)
plt.xticks(range(1,len(alphas),2), alphas[::2])
plt.show()
Coefficient de variation selon Ncandidates et Nvoters
In [ ]:
candidates= range(10,41, 1)
Ncandidates = max(candidates)
q = 100
Nvoters = 6000
alpha = 2
req2_cv = np.zeros((int(Nvoters/q)+1, len(candidates)))
Nsubset = 5
eps_2 = 0.1
for j in range(len(candidates)):
occurrences = np.zeros(candidates[j])
corr = np.zeros((candidates[j],candidates[j]))
for i in range(Nvoters+1):
lot = subset(candidates[j], Nsubset, occurrences, alpha=alpha)
for k in lot:
corr[k,lot] += 1
if i % q == 0:
tri = corr[np.triu_indices(candidates[j], 1)]
req2_cv[int(i/q),j] = min(scipy.stats.variation(tri), eps_2)
In [ ]:
from matplotlib import cm
fig = plt.figure()
plt.pcolor(np.transpose(req2_cv), cmap=cm.plasma)
plt.colorbar()
plt.xlabel('Voters')
plt.ylabel('Candidates')
plt.xlim((0,60))
plt.ylim((0,31))
plt.yticks(range(1,len(candidates)+1,3), candidates[::3])
voters = range(0,Nvoters+1,q)
plt.xticks(range(1,len(voters)+1,6), voters[::6], rotation='vertical')
plt.show()
In [ ]:
print len(range(1,int(Nvoters/q)))
print len(alphas[::q])
On remarque que les variations du nombre d'électeurs sont négligeables devant les variations du nombre de candidats aux ordres de grandeurs choisis. Quel est l'impact de $\alpha$?
In [ ]:
candidates= range(10,100, 10)
Ncandidates = max(candidates)
q = 1000
alphas = [1, 3, 10]
Nvoters = 30000
rsm = np.zeros((len(candidates), len(alphas)))
Nsubset = 5
for l in range(len(alphas)):
for j in range(len(candidates)):
occurrences = np.zeros(candidates[j])
corr = np.zeros((candidates[j],candidates[j]))
for i in range(Nvoters+1):
lot = subset(candidates[j], Nsubset, occurrences, alphas[l])
for k in lot:
corr[k,lot] += 1
tri = corr[np.triu_indices(candidates[j], -1)]
rsm[j, l] = scipy.stats.variation(tri)
In [ ]:
for j in range(len(alphas)):
plt.loglog(candidates, rsm[:, j],label="$\\alpha=%d$" % alphas[j])
plt.xlabel("Number of candidates")
plt.ylabel("Coefficient of variation")
plt.legend()
plt.show()
In [ ]:
Ncandidates = 50
q = 1000
alphas = [1, 3, 10]
Nvoters = 30000
cv2_av = np.zeros((int(Nvoters/q)+1, len(alphas)))
Nsubset = 5
for l in range(len(alphas)):
occurrences = np.zeros(Ncandidates)
corr = np.zeros((Ncandidates,Ncandidates))
for i in range(Nvoters+1):
lot = subset(Ncandidates, Nsubset, occurrences, alphas[l])
for k in lot:
corr[k,lot] += 1
if i % q == 0:
tri = corr[np.triu_indices(Ncandidates, -1)]
cv2_av[int(i/q),l] = scipy.stats.variation(tri)
In [ ]:
for j in range(len(alphas)):
plt.loglog(range(q, Nvoters+1, q), cv2_av[1:, j],label="$\\alpha=%d$" % alphas[j])
plt.xlabel("Number of voters")
plt.ylabel("Coefficient of variation")
plt.legend()
plt.show()
On retrouve une faible variation du coefficient de variation lorsque le nombre de candidats, et ce pour différentes valeurs de $\alpha$.
Finalement, on peut donc choisir $\alpha$ de sorte à respecter $\epsilon_1$ et $\epsilon_2$ selon un nombre fixé de candidats et d'électeurs. On veut également avoir $\alpha$ aussi faible que possible pour avoir la distribution la plus aléatoire possible. La fonction findMinAlpha calcule itérativement $\alpha$.
In [3]:
def computeAlpha(Ncandidates, Nvoters, Ntests, Nsubset, alpha):
err_samples = np.zeros(Ntests, dtype=int)
for t in range(Ntests):
# sys.stdout.write("\rTest: %i/%i (%i %%)" % (t+1, Ntests, float(t)/float(Ntests)*100.0))
occurrences = np.zeros(Ncandidates)
cv2_samples = np.zeros(Ntests)
cv1_samples = np.zeros(Ntests)
corr = np.zeros((Ncandidates,Ncandidates))
for i in range(Nvoters+1):
lot = subset(Ncandidates, Nsubset, occurrences, alpha)
for k in lot:
corr[k,lot] += 1
tri = corr[np.triu_indices(Ncandidates, -1)]
cv2_samples[t] = scipy.stats.variation(tri)
cv1_samples[t] = scipy.stats.variation(occurrences)
req1 = np.mean(cv1_samples, axis=0)
req2 = np.mean(cv1_samples, axis=0)
return (req1, req2)
def findMinAlpha(Ncandidates, Nvoters, Ntests = 100, Nsubset = 5, q = 1, alphaMin = 1, alphaMax = 100, epsilon1=0.1, epsilon2=0.1):
alpha = alphaMin
alpha_old = alpha
error1 = epsilon1 + 1
error2 = epsilon2 + 1
fw = 5
firstValid = 0
while True:
[req1, req2] = computeAlpha(Ncandidates, Nvoters, Ntests, Nsubset, alpha)
if req1 > epsilon1 or req2 > epsilon2:
alpha += 5 if firstValid == 0 else 1
if alpha > alphaMax:
raise Exception("Not converge.")
elif firstValid == 0:
firstValid = alpha
sys.stdout.write("First valid: %i" % (firstValid))
sys.stdout.flush()
alpha -= 4
if alpha < 0:
return alpha + 4
else:
return alpha
sys.stdout.write("\rC=%i, V=%i, $\\alpha$ = %i, req1 = %.4f, req2 = %.4f. Try with alpha=%i." % (Ncandidates, Nvoters,alpha_old, error1, error2, alpha))
sys.stdout.flush()
alpha_old = alpha
# sys.stdout.write("\n")
return alpha
In [39]:
alpha = findMinAlpha(100, 100, Ntests = 2)
# 200, 200: 8
# 400, 400:10
print("alpha = %i" % alpha)
In [40]:
import itertools
candidates = range(10,110, 3)
# Nvoters = 10000
# q = 3000
voters = range(100, 1000, 30)
minAlpha = np.zeros((len(candidates), len(voters)))
for (i,j) in itertools.product(range(len(candidates)), range(len(voters))):
minAlpha[i,j] = findMinAlpha(candidates[i], voters[j], Ntests = 1, Nsubset = 5, q = 1, alphaMin = 1, epsilon1=0.1, epsilon2=0.1)
In [44]:
from matplotlib import cm
fig = plt.figure()
plt.pcolor(minAlpha[:30,:], cmap=cm.plasma)
plt.colorbar()
plt.xlabel('Voters')
plt.ylabel('Candidates')
plt.yticks(range(1,len(candidates)+1,3), candidates[::3])
# voters = range(0,Nvoters+1,q)
plt.xticks(range(1,len(voters)+1,1), voters[::], rotation='vertical')
plt.show()
On compare les résultats du vote avec un vote jugé idéal, ie. sans triches, ou biais.
Pour simuler un vote idéal, on démarre avec les résultats du jugement majoritaire d'OpinionWay/Terra Nova. En mesurant l'écart-type et la moyenne de ces résultats, on peut tirer les résultats de nouveaux candidats selon une loi gaussienne.
In [4]:
def vote(lot, proba, Nsubset, Ngrades):
votes = np.zeros(Nsubset, dtype=int)
for i in range(Nsubset):
votes[i] = np.random.choice(range(Ngrades), size=1, replace=True, p=proba[i])
return votes
def normalize(v, ax=1):
n = np.sum(v, axis=ax)
b = np.transpose(v)
c = np.divide(b,n)
return np.transpose(c)
def tieBreaking(A, B):
#print str(A) + " " + str(B)
Ac = np.copy(A)
Bc = np.copy(B)
medA = argMedian(Ac)
medB = argMedian(Bc)
while medA == medB:
Ac[medA] -= 1
Bc[medB] -= 1
if not any(Ac):
return -1
if not any(Bc):
return 1
medA = argMedian(Ac)
medB = argMedian(Bc)
return -1 if (medA < medB) else 1
def majorityJudgment(results):
return sorted(range(len(results)), cmp=tieBreaking, key=results.__getitem__)
def probaCandidates(Ncandidates, Ngrades, inFile):
"""Read inFile. If there is not enough candidates, interpolate other. Save in outFile """
inCandidates = np.genfromtxt(inFile, delimiter = " ", dtype=float)
inCandidates = inCandidates[:,:Ngrades]
Nc = len(inCandidates)
N = min(Nc, Ncandidates)
param = np.zeros((Ngrades,2))
param[:,0] = np.mean(inCandidates, axis=0)
param[:,1] = np.std(inCandidates, axis=0)
np.random.shuffle(inCandidates)
outCandidates = np.zeros((Ncandidates,Ngrades))
outCandidates[:N] = inCandidates[:N,:]
if Ncandidates > Nc:
for i in range(Ngrades):
outCandidates[N:,i] = np.random.normal(param[i,0], param[i,1], Ncandidates-Nc)
return normalize(np.absolute(outCandidates))
def argMedian(A):
Ngrades = len(A)
s = np.array([sum(A[:i+1]) for i in range(Ngrades)])
mid = float(s[Ngrades-1])/2
return np.argwhere(mid < s)[0][0]
def rankError(rk_priori, rk_post, N):
rk = np.concatenate((rk_priori[:N], rk_post[:N]))
return len(set(rk)) - N
Comme avec findMinAlpha, findMinNvoters cherche itérativement le nombre minimal d'électeurs qui permenttent d'atteindre $\epsilon_3$.
In [5]:
def findMinNvoters(Ncandidates, maxError = 0.1, Ntests = 100, Nwinners = 10, Nsubset = 5, Ngrades = 5, q = 1000, alpha = 1, real_results = "terranova.txt", epsilon=0.0):
if epsilon == 0.0:
epsilon = maxError/10 # not implemented yet
maxTests = Ntests # max number of tests
Nvoters = 0
Nvoters_old = 0
Nwinners = min(Nwinners, Ncandidates)
errors = (maxError + 1)*np.ones(Nwinners)
minNvoters = np.zeros(Nwinners)
# perfect election
pr_priori = probaCandidates(Ncandidates, Ngrades, real_results)
res_priori = np.trunc(pr_priori*1000)
rk_priori = majorityJudgment(res_priori)
# election with random sets
raw = np.zeros((Ntests, Ncandidates,Ngrades))
occurrence = np.zeros((Ntests, Ncandidates))
while np.any(minNvoters == 0):
Nvoters += q
sys.stdout.write("\r%i voters is too low (%.4f > %.4f). Try with %i voters." % (Nvoters_old, error, maxError, Nvoters))
sys.stdout.flush()
err_samples = np.zeros((Ntests, Nwinners), dtype=int)
for t in range(Ntests):
# sys.stdout.write("\rTest: %i/%i (%i %%)" % (t+1, Ntests, float(t)/float(Ntests)*100.0))
for i in range(Nvoters_old, Nvoters+1):
lot = subset(Ncandidates, Nsubset, occurrence[t], alpha)
votes = vote(lot, pr_priori[lot,:], Nsubset, Ngrades)
raw[t,lot,votes] += 1
results = normalize(raw[t])
rk = majorityJudgment(raw[t])
err_samples[t,:] = [np.array(rankError(rk_priori, rk, Nwinner),dtype=float) for Nwinner in range(1,Nwinners+1)]
errors = np.mean(err_samples, axis=0)
minNvoters[(errors < maxError) & (minNvoters == 0)] = Nvoters# if error_test[i] and minNvoters == 0 else 0]
Nvoters_old = Nvoters
return minNvoters
In [95]:
import itertools
candidates = range(10,110, 10)
Nwinners = 10
minNvoters = np.zeros((len(candidates), Nwinners))
for i in range(len(candidates)):
print "C %i " % candidates[i]
minNvoters[i,:] = findMinNvoters(candidates[i], q =1000, Nwinners = Nwinners, Ntests = 10)
print minNvoters[i,:]
print minNvoters
In [94]:
for j in range(len(candidates)):
plt.plot(range(1,Nwinners+1), minNvoters[j,:],label="Nc = %i" % candidates[j])
plt.xlabel("Number of winners")
plt.ylabel("Minimal number of voters")
plt.legend()
plt.show()
In [68]:
Nwinners = [1]#range(1,5)
minNvoters_Nwinners = [findMinNvoters(1000, q =1000, Nwinner = i, Ntests = 2) for i in Nwinners]
print minNvoters_Nwinners
Ou, selon les paramètres du vote, on peut calculer l'erreur moyenne du vote avec computeError
In [6]:
def computeReq3(Ncandidates, Nvoters, maxError = 0.1, Nwinner = 1, Nsubset = 5, Ngrades = 5, alpha = 1, real_results = "terranova.txt", epsilon=0.0):
if epsilon == 0.0:
epsilon = maxError/10 # not implemented yet
maxTests = 200 # max number of tests
# perfect election
pr_priori = probaCandidates(Ncandidates, Ngrades, real_results)
res_priori = np.trunc(pr_priori*1000)
rk_priori = majorityJudgment(res_priori)
# election with random sets
raw = np.zeros((maxTests, Ncandidates,Ngrades))
occurrence = np.zeros((maxTests, Ncandidates))
err_samples = np.zeros(maxTests, dtype=int)
for t in range(maxTests):
for i in range(Nvoters+1):
lot = subset(Ncandidates, Nsubset, occurrence[t], alpha)
votes = vote(lot, pr_priori[lot,:], Nsubset, Ngrades)
raw[t,lot,votes] += 1
results = normalize(raw[t])
rk = majorityJudgment(raw[t])
err_samples[t] = rankError(rk_priori, rk, Nwinner)
return np.mean(err_samples, axis=0)
In [ ]:
print computeError(12, 10000, Nwinner = 5) #LaPrimaire.org
In [9]:
median
Out[9]:
In [42]:
results = np.array([[0.14, 0.45, 0.13, 0.06, 0.22],\
[0.24, 0.35, 0.13, 0.06, 0.22]])
Ncandidates = len(results)
acc = np.cumsum(results, axis=1)
print acc
median = np.zeros(Ncandidates)
gauge = np.zeros(Ncandidates)
for i in range(Ncandidates):
median[i] = 5 - len(acc[i, acc[i] > 0.5])
for i in range(Ncandidates):
if median[i] == 0:
gauge[i] = 1 - A[0]
else:
gauge[i] = max(acc[i, median[i]-1], 1 - acc[i, median[i]])
print median
print gauge
In [ ]:
In [31]:
def computeGauge(A):
acc = np.cumsum(A)
median = 5 - len(A[acc > 0.5])
if median == 0:
return [median, 1 - A[0]]
else:
return [median, max(acc[median-1], 1 - acc[median])]
def majorityJudgment(results):
Ncandidates = len(results)
acc = np.cumsum(results, axis=1)
print acc
median = 5 - len(results[acc > 0.5])
print median
gauge = np.zeros(Ncandidates)
print gauge
for i in range(Ncandidates):
if median[i] == 0:
gauge[i] = 1 - A[0]
else:
gauge[i] = max(acc[median-1], 1 - acc[median])
# return sorted(range(len(results)), cmp=tieBreaking, key=results.__getitem__)
return sorted(range(Ncandidates), key = lambda x: (x[1], x[2]))
In [ ]: