Weighted Majority Game with Local Accuracy Estimates

A weighted voting game is a $n$ player game where there are $k$ available choices to vote and a weight is assigned to each voter.

To maximize the performance of the ensemble, we need to give more weight to that member which has highest accuracy in the neighbourhood of the test point.

In case of 2 class classification problem, if $X_{val}$ is the validation set, then the local weight of the classifiers is given by

$$ w_i = log(\frac{p_i}{1-p_i}) $$

where p_i is the estimated probability of correct classification measured in the validation set.

This requires the classifiers to be independent which can be assured by performing bootstrap aggregation or the classifier can simply be assumed as independent.

In this notebook, we check the performance of this algorithm on a 2-class dataset. Since the dataset is simple, very few training examples are needed. We knowingly train the individual classifiers on very few points to show how ensembles perform well.


In [1]:
import pandas as pd
import numpy as np
columns = ['code_num','thickness','uofcsize','uofcshape','adhesion','secsize','bnuclei','chromatinb','nnucleoi','mitoses','output']
data = pd.read_csv('breast-cancer-wisconsin.data',names=columns)

In [2]:
data.drop(['code_num'],1,inplace=True)
data.replace('?',-99999, inplace=True)
data = data.astype(int)

In [3]:
X = np.array(data.drop(['output'], 1))
y = np.array(data['output'])

In [4]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95,stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.80)

clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC()
clf3 = LogisticRegression()
clf4 = tree.DecisionTreeClassifier()

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)
clf4.fit(X_train,y_train)

accuracy1 = clf1.score(X_test, y_test)
accuracy2 = clf2.score(X_test, y_test)
accuracy3 = clf3.score(X_test,y_test)
accuracy4 = clf4.score(X_test,y_test)

print(accuracy1,accuracy2,accuracy3,accuracy4)


0.655639097744 0.655639097744 0.681203007519 0.873684210526

In [5]:
def get_weights(p):
    p[p==1.0] = 0.99 #avoid inf error
    odds = (p)/(1-p)
    return np.log(odds)

In [6]:
global_acc_vector=np.array([accuracy1,accuracy2,accuracy3])
weights = get_weights(global_acc_vector)

In [7]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(X_val)


Out[7]:
NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [8]:
def get_local_weights(test_point,n_neigh):
    nearest_indices = neigh.kneighbors(test_point,n_neighbors=n_neigh,return_distance=False)[0]
    X_verify = X_val[nearest_indices]
    y_verify = y_val[nearest_indices]
    score_pred1 = clf1.score(X_verify,y_verify)
    score_pred2 = clf2.score(X_verify,y_verify)
    score_pred3 = clf3.score(X_verify,y_verify)
    score_pred4 = clf4.score(X_verify,y_verify)
    acc_vector = np.array([score_pred1,score_pred2,score_pred3,score_pred4])
    weights=get_weights(acc_vector)
    return weights

In [9]:
def get_weighted_prediction(sample_point):
    weights=get_local_weights(sample_point,4)
    prediction=np.array([clf1.predict([sample_point]),clf2.predict([sample_point]),clf3.predict([sample_point]),clf2.predict([sample_point])])
    quota_weight = 0.0
    for _ in range(len(prediction)):
        if prediction[_] == 4:
            quota_weight = quota_weight + weights[_]
    if quota_weight >= np.average(weights):
        return 4
    else:
        return 2

In [10]:
import warnings
warnings.filterwarnings('ignore')
ensemble_pred=[]
for _ in range(len(X_test)):
    ensemble_pred.append(get_weighted_prediction(X_test[_]))

In [11]:
ensemble_pred=np.array(ensemble_pred).reshape(y_test.shape)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,ensemble_pred))


0.935338345865

In [12]:
def disagreement_measure(clf1,clf2,data):
    output_clf1 = clf1.predict(data)
    output_clf2 = clf2.predict(data)
    return 1- accuracy_score(output_clf1,output_clf2)

In [13]:
def diversity_measure(ensemble_list,data,i):
    ensemble_len = len(ensemble_list)
    diversity = 0
    for j in range(0,ensemble_len):
        if j == i:
            continue
        diversity = diversity + disagreement_measure(ensemble_list[i],ensemble_list[j],data)
    return float(diversity)/float(ensemble_len-1)

In [14]:
diversity_values = []
for i in range(0,4):
    diversity_values.append(diversity_measure([clf1,clf2,clf3,clf4],X_val,i))
weights = [0,0,0,0]
for i in range(0,4):
    for j in range(0,4):
        if j == i:
            continue
        if diversity_values[i] >= diversity_values[j]:
            weights[i] = weights[i] + 1

In [15]:
weights


Out[15]:
[1, 1, 2, 3]

In [16]:
def banzhaf(weight, quota):

    max_order = sum(weight)

    polynomial = [1] + max_order*[0]               # create a list to hold the polynomial coefficients

    current_order = 0                              # compute the polynomial coefficients
    aux_polynomial = polynomial[:]
    for i in range(len(weight)):
        current_order = current_order + weight[i]
        offset_polynomial = weight[i]*[0]+polynomial
        for j in range(current_order+1):
            aux_polynomial[j] = polynomial[j] + offset_polynomial[j]
        polynomial = aux_polynomial[:]

    banzhaf_power = len(weight)*[0]                                 # create a list to hold the Banzhaf Power for each voter
    swings = quota*[0]                                              # create a list to compute the swings for each voter

    for i in range(len(weight)):                                    # compute the Banzhaf Power
        for j in range(quota):                                      # fill the swings list
            if (j<weight[i]):
                swings[j] = polynomial[j]
            else:
                swings[j] = polynomial[j] - swings[j-weight[i]]
        for k in range(weight[i]):                                  # fill the Banzhaf Power vector
            banzhaf_power[i] = banzhaf_power[i] + swings[quota-1-k]

    # Normalize Index
    total_power = float(sum(banzhaf_power))
    banzhaf_index = map(lambda x: x / total_power, banzhaf_power)

    return np.array(list(banzhaf_index))

In [17]:
# weight threshold is [3,5]
double_banzhaf = banzhaf(weights,4) - banzhaf(weights,6)
print(double_banzhaf)
pruned_ensemble = []
pruned_weights = []

while sum(pruned_weights) <= 3:
    h = np.argmax(double_banzhaf)
    if sum(pruned_weights) + weights[h] >6:
            break
    pruned_ensemble.append(h)
    pruned_weights.append(weights[h])
    double_banzhaf[h] = -144
print(pruned_ensemble)     #ensemble of 1 2 and 3 is pruned


[ 0.04166667  0.04166667 -0.20833333  0.125     ]
[3, 0]

In [18]:
def get_weighted_prediction_ensemble(pruned_ensemble,sample_point):
    clf = {0: clf1, 1: clf2, 2: clf3, 3: clf4}
    weights=get_local_weights(sample_point,3)
    prediction=np.array([clf[i].predict([sample_point]) for i in pruned_ensemble] )
    quota_weight = 0.0
    for _ in range(len(prediction)):
        if prediction[_] == 4:
            quota_weight = quota_weight + weights[_]
    if quota_weight >= np.average(weights):
        return 4
    else:
        return 2

In [19]:
import warnings
warnings.filterwarnings('ignore')
ensemble_pred=[]
for _ in range(len(X_test)):
    ensemble_pred.append(get_weighted_prediction_ensemble(pruned_ensemble,X_test[_]))
ensemble_pred=np.array(ensemble_pred).reshape(y_test.shape)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,ensemble_pred))


0.95037593985