A weighted voting game is an $n$-player game in which there are $k$ available choices to vote for and a weight is assigned to each voter.
To maximize the performance of the ensemble, we need to give more weight to the member that has the highest accuracy in the neighbourhood of the test point.
In the case of a 2-class classification problem, if $X_{val}$ is the validation set, then the local weight of classifier $i$ is given by
$$ w_i = \log\left(\frac{p_i}{1-p_i}\right) $$
where $p_i$ is the estimated probability of correct classification, measured on the validation set.
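For instance, a member that is locally correct 90% of the time receives weight $\log(0.9/0.1) \approx 2.20$, while one barely above chance at 60% receives only $\log(0.6/0.4) \approx 0.41$, and a member at exactly 50% contributes nothing since $\log(1) = 0$.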
This requires the classifiers to be independent, which can be encouraged by bootstrap aggregation (training each member on a different bootstrap sample); alternatively, the classifiers can simply be assumed to be independent.
In this notebook, we check the performance of this algorithm on a 2-class dataset. Since the dataset is simple, very few training examples are needed, so we deliberately train the individual classifiers on very few points to show how well the ensemble performs.
In [1]:
import pandas as pd
import numpy as np
columns = ['code_num','thickness','uofcsize','uofcshape','adhesion','secsize','bnuclei','chromatinb','nnucleoi','mitoses','output']
data = pd.read_csv('breast-cancer-wisconsin.data',names=columns)
In [2]:
data.drop(columns=['code_num'], inplace=True)  # the sample code number is not a predictive feature
data.replace('?', -99999, inplace=True)        # mark missing values with an extreme outlier
data = data.astype(int)
In [3]:
X = np.array(data.drop(columns=['output']))
y = np.array(data['output'])
In [4]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
# hold out 95% of the data as the test set (stratified to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, stratify=y)
# split the remaining 5% again: 20% for training the members, 80% as the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.80)
# four heterogeneous ensemble members
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC()
clf3 = LogisticRegression()
clf4 = tree.DecisionTreeClassifier()
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)
clf4.fit(X_train,y_train)
accuracy1 = clf1.score(X_test, y_test)
accuracy2 = clf2.score(X_test, y_test)
accuracy3 = clf3.score(X_test,y_test)
accuracy4 = clf4.score(X_test,y_test)
print(accuracy1,accuracy2,accuracy3,accuracy4)
In [5]:
def get_weights(p):
    p[p == 1.0] = 0.99  # cap perfect scores to avoid log of infinity
    p[p == 0.0] = 0.01  # likewise cap zero scores to avoid log(0) = -inf
    odds = p/(1 - p)
    return np.log(odds)
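As a quick sanity check (a hypothetical call, assuming the cell above has run), the weights grow with accuracy and vanish at chance level:

    print(get_weights(np.array([0.9, 0.6, 0.5])))  # -> approx. [2.197  0.405  0.   ]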
In [6]:
global_acc_vector = np.array([accuracy1, accuracy2, accuracy3, accuracy4])
weights = get_weights(global_acc_vector)
In [7]:
from sklearn.neighbors import NearestNeighbors
# index the validation set so we can find each test point's local neighbourhood
neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(X_val)
Out[7]:
In [8]:
def get_local_weights(test_point, n_neigh):
    # find the n_neigh validation points closest to the test point
    nearest_indices = neigh.kneighbors([test_point], n_neighbors=n_neigh, return_distance=False)[0]
    X_verify = X_val[nearest_indices]
    y_verify = y_val[nearest_indices]
    # score every member on this local neighbourhood
    score_pred1 = clf1.score(X_verify, y_verify)
    score_pred2 = clf2.score(X_verify, y_verify)
    score_pred3 = clf3.score(X_verify, y_verify)
    score_pred4 = clf4.score(X_verify, y_verify)
    acc_vector = np.array([score_pred1, score_pred2, score_pred3, score_pred4])
    weights = get_weights(acc_vector)
    return weights
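For instance (a hypothetical call, assuming the cells above have run), the local weights of all four members around the first test point can be inspected with:

    print(get_local_weights(X_test[0], 4))  # one log-odds weight per member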
In [9]:
def get_weighted_prediction(sample_point):
    weights = get_local_weights(sample_point, 4)
    prediction = np.array([clf1.predict([sample_point]),
                           clf2.predict([sample_point]),
                           clf3.predict([sample_point]),
                           clf4.predict([sample_point])])
    # sum the weights of the members voting for class 4
    quota_weight = 0.0
    for i in range(len(prediction)):
        if prediction[i] == 4:
            quota_weight = quota_weight + weights[i]
    # class 4 wins if its supporters carry at least the average weight
    if quota_weight >= np.average(weights):
        return 4
    else:
        return 2
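Here a prediction of class 4 (malignant in the Wisconsin encoding) counts as a "yes" vote, and the quota is set to the average of the local weights: class 4 is returned when its supporters together carry at least the average member weight, and class 2 (benign) otherwise.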
In [10]:
import warnings
warnings.filterwarnings('ignore')
ensemble_pred = []
for i in range(len(X_test)):
    ensemble_pred.append(get_weighted_prediction(X_test[i]))
In [11]:
ensemble_pred=np.array(ensemble_pred).reshape(y_test.shape)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,ensemble_pred))
In [12]:
def disagreement_measure(clf1, clf2, data):
    # fraction of points on which the two classifiers predict different labels
    output_clf1 = clf1.predict(data)
    output_clf2 = clf2.predict(data)
    return 1 - accuracy_score(output_clf1, output_clf2)
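For example, if two members disagree on 2 of 10 validation points, accuracy_score between their outputs is 0.8 and the disagreement measure is 0.2; identical members score 0, and members that never agree score 1.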
In [13]:
def diversity_measure(ensemble_list, data, i):
    # average pairwise disagreement between member i and every other member
    ensemble_len = len(ensemble_list)
    diversity = 0
    for j in range(0, ensemble_len):
        if j == i:
            continue
        diversity = diversity + disagreement_measure(ensemble_list[i], ensemble_list[j], data)
    return float(diversity)/float(ensemble_len - 1)
In [14]:
diversity_values = []
for i in range(0, 4):
    diversity_values.append(diversity_measure([clf1, clf2, clf3, clf4], X_val, i))
# convert diversities to integer voting weights by rank:
# weights[i] counts how many members classifier i is at least as diverse as
weights = [0, 0, 0, 0]
for i in range(0, 4):
    for j in range(0, 4):
        if j == i:
            continue
        if diversity_values[i] >= diversity_values[j]:
            weights[i] = weights[i] + 1
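Each voting weight is therefore an integer between 0 and 3 (ties counted), so the most diverse members receive the largest weights in the voting game used for pruning below.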
In [15]:
weights
Out[15]:
In [16]:
def banzhaf(weight, quota):
    max_order = sum(weight)
    # polynomial coefficients: coefficient j counts the coalitions of total weight j
    polynomial = [1] + max_order*[0]
    current_order = 0
    aux_polynomial = polynomial[:]
    for i in range(len(weight)):
        current_order = current_order + weight[i]
        offset_polynomial = weight[i]*[0] + polynomial
        for j in range(current_order + 1):
            aux_polynomial[j] = polynomial[j] + offset_polynomial[j]
        polynomial = aux_polynomial[:]
    banzhaf_power = len(weight)*[0]  # Banzhaf power for each voter
    swings = quota*[0]               # swings for each voter
    for i in range(len(weight)):
        # swings[j]: coalitions of weight j that do not contain voter i
        for j in range(quota):
            if j < weight[i]:
                swings[j] = polynomial[j]
            else:
                swings[j] = polynomial[j] - swings[j - weight[i]]
        # voter i is critical in coalitions within weight[i] of the quota
        for k in range(weight[i]):
            banzhaf_power[i] = banzhaf_power[i] + swings[quota - 1 - k]
    # normalize to the Banzhaf index
    total_power = float(sum(banzhaf_power))
    banzhaf_index = map(lambda x: x/total_power, banzhaf_power)
    return np.array(list(banzhaf_index))
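As a quick sanity check (a hypothetical call, not part of the pruning analysis), the classic three-voter game with weights (3, 2, 1) and quota 4 recovers the well-known Banzhaf index of (0.6, 0.2, 0.2):

    print(banzhaf([3, 2, 1], 4))  # -> [0.6 0.2 0.2]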
In [17]:
# double Banzhaf: power gained between quota 4 and quota 6,
# i.e. how pivotal each member is within the target weight band
double_banzhaf = banzhaf(weights, 4) - banzhaf(weights, 6)
print(double_banzhaf)
# greedily keep the most pivotal members while the total weight stays within the band
pruned_ensemble = []
pruned_weights = []
while sum(pruned_weights) <= 3:
    h = np.argmax(double_banzhaf)
    if sum(pruned_weights) + weights[h] > 6:
        break
    pruned_ensemble.append(h)
    pruned_weights.append(weights[h])
    double_banzhaf[h] = -144  # sentinel: exclude h from further selection
print(pruned_ensemble)  # the pruned (reduced) ensemble, e.g. members 1, 2 and 3
In [18]:
def get_weighted_prediction_ensemble(pruned_ensemble, sample_point):
    clf = {0: clf1, 1: clf2, 2: clf3, 3: clf4}
    # local weights for all four members, keeping only the pruned ensemble's
    weights = get_local_weights(sample_point, 3)[pruned_ensemble]
    prediction = np.array([clf[i].predict([sample_point]) for i in pruned_ensemble])
    quota_weight = 0.0
    for i in range(len(prediction)):
        if prediction[i] == 4:
            quota_weight = quota_weight + weights[i]
    if quota_weight >= np.average(weights):
        return 4
    else:
        return 2
In [19]:
ensemble_pred = []
for i in range(len(X_test)):
    ensemble_pred.append(get_weighted_prediction_ensemble(pruned_ensemble, X_test[i]))
ensemble_pred = np.array(ensemble_pred).reshape(y_test.shape)
print(accuracy_score(y_test, ensemble_pred))