Our goal is, given prediction results from m experts (classifiers), to combine them with weights in such a way that the ensemble gets as close as possible to the best single classifier. We do this by initializing all the weights uniformly (to 1/m), then iterating through the examples; whenever the weighted sum of the classifiers' predictions disagrees with the ground-truth label, we shrink the weights of the classifiers that caused the mistake with a multiplicative update:
$w_i \leftarrow w_i \cdot e^{-\eta}$
In the code below, classifiers that voted correctly on a mistaken round are additionally boosted by a factor $e^{\eta'}$ (parameters `lr1` and `lr2`), and the weights are renormalized after every update.
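As a sanity check before the full pipeline, here is a minimal, self-contained sketch of this multiplicative update on synthetic data; the number of experts, their accuracies, and the learning rate below are made up for illustration. The weights should concentrate on the most accurate expert.
In [ ]:
import numpy as np

rng = np.random.default_rng(0)
m, T, eta = 5, 2000, 0.02
# Expert j predicts the true +/-1 label correctly with probability acc[j].
acc = np.array([0.55, 0.6, 0.7, 0.8, 0.95])
w = np.ones(m) / m

for _ in range(T):
    y = rng.choice([-1, 1])
    preds = np.where(rng.random(m) < acc, y, -y)  # each expert votes +/-1
    if np.sign(w @ preds) != y:                   # weighted vote was wrong
        w[preds != y] *= np.exp(-eta)             # shrink the culprits
        w /= w.sum()                              # renormalize

print(np.round(w, 3))  # the most accurate expert should dominate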
In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
### Function to plot the classifier weights
def plot_weights(weights):
    ncol = weights.shape[0]
    yl = np.arange(ncol)
    fig1 = plt.figure(1, figsize=(12, 6))
    ax = fig1.add_subplot(1, 1, 1)
    line1 = ax.plot(yl, weights, color='r')
    plt.setp(line1, color='b', linewidth=3, marker='^', markersize=8)
    plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
    plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)
    plt.xlabel('Classifier Index', size=20)
    plt.ylabel('Weight', size=20)
    plt.title('Weighted Ensemble Learning', size=20)
    plt.show()
In [3]:
## Multiplicative-weights update over the cross-validation predictions
def update_weights(ycv, fname, ncol, chunks=10000, lr1=-0.02, lr2=1):
    weights = np.ones(shape=ncol, dtype=float) / float(ncol)  # start uniform
    learn_rate_1 = np.exp(lr1)  # shrink factor for classifiers that erred
    learn_rate_2 = np.exp(lr2)  # growth factor for classifiers that were right
    n = 0
    num_mistakes, num_mistakes_simpleVotes = 0, 0
    for df in pandas.read_table(fname, header=None, sep=' ', iterator=True, chunksize=chunks):
        sys.stdout.write("%6d %5d,%d ==> " % (n, df.shape[0], df.shape[1]))
        num_mistake_part, num_mistake_part_simpleVotes = 0, 0
        for i in range(df.shape[0]):
            row = df.iloc[i, :]
            wsum = np.sum(weights * row)
            yi = ycv.iloc[n, 0]
            if np.sum(row) * yi <= 0:  # unweighted (simple) majority vote, for comparison
                num_mistake_part_simpleVotes += 1
            if wsum * yi <= 0:         # weighted vote is wrong: update the weights
                num_mistake_part += 1
                for j, vj in enumerate(row):
                    if (vj * yi) < 0:
                        weights[j] *= learn_rate_1
                    elif (vj * yi) > 0:
                        weights[j] *= learn_rate_2
                weights /= np.sum(weights)  # renormalize after each update
            n += 1
        print("Num. of Mistakes: %d \tWithout-updating: %d" % (num_mistake_part, num_mistake_part_simpleVotes))
        num_mistakes += num_mistake_part
        num_mistakes_simpleVotes += num_mistake_part_simpleVotes
    print('Total number of mistakes: %d, without updating: %d' % (num_mistakes, num_mistakes_simpleVotes))
    return weights
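Before pointing update_weights at the real cross-validation files, a quick smoke test on synthetic data helps confirm that the weights drift toward the stronger classifiers. Everything below (the /tmp path, the five simulated accuracies, the chunk size) is illustrative, not part of the pipeline; the file is written in the same space-separated ±1 format as res_cv.dat.
In [ ]:
rng = np.random.default_rng(1)
n_rows, accs = 5000, [0.55, 0.6, 0.7, 0.8, 0.95]
y = rng.choice([-1, 1], size=n_rows)
# Simulated classifiers: classifier j agrees with the truth with probability accs[j]
preds = np.column_stack([np.where(rng.random(n_rows) < a, y, -y) for a in accs])
np.savetxt('/tmp/res_cv_toy.dat', preds, fmt='%d')
ycv_toy = pandas.DataFrame({0: y})
w_toy = update_weights(ycv_toy, '/tmp/res_cv_toy.dat', ncol=len(accs), chunks=1000)
plot_weights(w_toy)  # the last (most accurate) classifier should carry the most weight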
In [4]:
### Function to predict test data with the given weights
### Note: relies on the global `nlines` (number of test rows), which must be
### set in an earlier-executed cell before this function is called.
def predict_test(weights, fname, outname):
    pred = np.zeros(shape=nlines, dtype=int)
    sum_pred = np.zeros(shape=nlines, dtype=float)
    chunks = 10000
    n = 0
    num_neg, num_pos = 0, 0
    for df in pandas.read_table(fname, header=None, sep=' ', iterator=True, chunksize=chunks):
        sys.stdout.write("%6d shape: %5d,%d ==> " % (n, df.shape[0], df.shape[1]))
        for i in range(df.shape[0]):
            row = df.iloc[i, :]
            wsum = np.sum(weights * row)
            sum_pred[n] = wsum
            if wsum < 0:
                num_neg += 1
                pred[n] = -1
            else:
                num_pos += 1
                pred[n] = 1
            n += 1
        sys.stdout.write('Num. Positive %d  Num. Negative %d\n' % (num_pos, num_neg))
    df = pandas.DataFrame({'V1': pred, 'V2': sum_pred})
    df.to_csv(outname, sep=' ')
In [20]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
## Level 1 of the hierarchy: binarize the full label range at threshold 157
ycv.iloc[np.where(ycv[0] < 157)[0], 0] = -1
ycv.iloc[np.where(ycv[0] >= 157)[0], 0] = 1
print(ycv.shape)
ycv.head()
Out[20]:
In [21]:
ncol = pandas.read_table('../results/hf1/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
pandas.read_table('../results/hf1/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[21]:
In [24]:
w1 = update_weights(ycv, '../results/hf1/res_cv.dat', ncol, lr1=-0.02, lr2=0.02)
plot_weights(w1)
In [7]:
nlines = pandas.read_table('../results/hf1/res_test.dat', header=None, sep=' ', usecols=[0]).shape[0]
print(nlines)
In [25]:
predict_test(w1, '../results/hf1/res_test.dat', '../results/pred.hf1.dat')
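The prediction file written above is space-separated with a pandas index column, the ±1 vote in V1, and the raw weighted sum in V2. A quick sketch for reading one back and inspecting it (using the path just written; this cell is for inspection only):
In [ ]:
pred_df = pandas.read_csv('../results/pred.hf1.dat', sep=' ', index_col=0)
print(pred_df['V1'].value_counts())   # +/-1 class balance
print(pred_df['V2'].describe())       # distribution of the weighted sums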
In [26]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where(ycv[0] >= 157)[0], :].copy()   # right branch of hf1
ycv.iloc[np.where(ycv[0] < 162)[0], 0] = -1            # split this branch at 162
ycv.iloc[np.where(ycv[0] >= 162)[0], 0] = 1
print(ycv.shape)
ycv.head()
Out[26]:
In [27]:
ncol = pandas.read_table('../results/hf2/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf2/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[27]:
In [28]:
w2 = update_weights(ycv, '../results/hf2/res_cv.dat', ncol, lr1=-0.01, lr2=0.2)
plot_weights(w2)
In [29]:
predict_test(w2, '../results/hf2/res_test.dat', '../results/pred.hf2.dat')
In [5]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where(ycv[0] >= 162)[0], :].copy()   # right branch of hf2
nremain_cv = pandas.read_table('../results/hf3/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 164)[0], 0] = -1            # split this branch at 164
ycv.iloc[np.where(ycv[0] >= 164)[0], 0] = 1
print(ycv.shape)
ycv.head()
Out[5]:
In [6]:
ncol = pandas.read_table('../results/hf3/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf3/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[6]:
In [7]:
w3 = update_weights(ycv, '../results/hf3/res_cv.dat', ncol, lr1=-0.02, lr2=0.1)
plot_weights(w3)
In [10]:
nlines = pandas.read_table('../results/hf3/res_test.dat', header=None, sep=' ', usecols=[0]).shape[0]
print(nlines)
In [10]:
predict_test(w3, '../results/hf3/res_test.dat', '../results/pred.hf3.dat')
In [15]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where((ycv[0] >= 162) & (ycv[0] < 164))[0], :].copy()   # left branch of hf3
print(ycv.shape)
nremain_cv = pandas.read_table('../results/hf4/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 163)[0], 0] = -1            # split this branch at 163
ycv.iloc[np.where(ycv[0] >= 163)[0], 0] = 1
ycv.head()
Out[15]:
In [16]:
ncol = pandas.read_table('../results/hf4/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf4/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[16]:
In [17]:
w4 = update_weights(ycv, '../results/hf4/res_cv.dat', ncol, lr1=-0.02, lr2=0.1, chunks=2000)
plot_weights(w4)
In [18]:
predict_test(w4, '../results/hf4/res_test.dat', '../results/pred.hf4.dat')
In [19]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where((ycv[0] >= 157) & (ycv[0] < 162))[0], :].copy()   # left branch of hf2
print(ycv.shape)
nremain_cv = pandas.read_table('../results/hf5/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 160)[0], 0] = -1            # split this branch at 160
ycv.iloc[np.where(ycv[0] >= 160)[0], 0] = 1
ycv.head()
Out[19]:
In [20]:
ncol = pandas.read_table('../results/hf5/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf5/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[20]:
In [21]:
w5 = update_weights(ycv, '../results/hf5/res_cv.dat', ncol, lr1=-0.02, lr2=0.1, chunks=4000)
plot_weights(w5)
In [22]:
predict_test(w5, '../results/hf5/res_test.dat', '../results/pred.hf5.dat')
In [6]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where((ycv[0] >= 160) & (ycv[0] < 162))[0], :].copy()   # right branch of hf5
print(ycv.shape)
nremain_cv = pandas.read_table('../results/hf6/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 161)[0], 0] = -1            # split this branch at 161
ycv.iloc[np.where(ycv[0] >= 161)[0], 0] = 1
print(np.sum(ycv[0] == -1), np.sum(ycv[0] == 1))
ycv.head()
Out[6]:
In [7]:
ncol = pandas.read_table('../results/hf6/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf6/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[7]:
In [8]:
w6 = update_weights(ycv, '../results/hf6/res_cv.dat', ncol, lr1=-0.02, lr2=0.1, chunks=4000)
plot_weights(w6)
In [11]:
predict_test(w6, '../results/hf6/res_test.dat', '../results/pred.hf6.dat')
In [12]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where((ycv[0] >= 157) & (ycv[0] < 160))[0], :].copy()   # left branch of hf5
print(ycv.shape)
nremain_cv = pandas.read_table('../results/hf7/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 159)[0], 0] = -1            # split this branch at 159
ycv.iloc[np.where(ycv[0] >= 159)[0], 0] = 1
print(np.sum(ycv[0] == -1), np.sum(ycv[0] == 1))
ycv.head()
Out[12]:
In [13]:
ncol = pandas.read_table('../results/hf7/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf7/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[13]:
In [14]:
w7 = update_weights(ycv, '../results/hf7/res_cv.dat', ncol, lr1=-0.02, lr2=0.1, chunks=4000)
plot_weights(w7)
In [15]:
predict_test(w7, '../results/hf7/res_test.dat', '../results/pred.hf7.dat')
In [16]:
ycv = pandas.read_table('../data/label_cv.txt', header=None, sep=' ')
ycv = ycv.iloc[np.where((ycv[0] >= 157) & (ycv[0] < 159))[0], :].copy()   # left branch of hf7
print(ycv.shape)
nremain_cv = pandas.read_table('../results/hf8/res_cv.dat', usecols=[0], header=None, sep=' ').shape[0]
assert ycv.shape[0] == nremain_cv
ycv.iloc[np.where(ycv[0] < 158)[0], 0] = -1            # split this branch at 158
ycv.iloc[np.where(ycv[0] >= 158)[0], 0] = 1
print(np.sum(ycv[0] == -1), np.sum(ycv[0] == 1))
ycv.head()
Out[16]:
In [17]:
ncol = pandas.read_table('../results/hf8/res_cv.dat', header=None, sep=' ', nrows=3).shape[1]
print(ncol)
pandas.read_table('../results/hf8/res_cv.dat', header=None, sep=' ', nrows=3).head()
Out[17]:
In [18]:
w8 = update_weights(ycv, '../results/hf8/res_cv.dat', ncol, lr1=-0.02, lr2=0.1, chunks=1000)
plot_weights(w8)
In [19]:
predict_test(w8, '../results/hf8/res_test.dat', '../results/pred.hf8.dat')
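The eight binary tasks above form a decision tree over the label thresholds (157, 158, ..., 164), but the notebook stops at the per-node prediction files. Below is a hypothetical sketch of how a single test sample's ±1 decisions would be routed down that tree to a final label interval. The TREE table and the route helper are my own reconstruction from the thresholds used above, not code from this pipeline, and they assume each sample gets a decision at every node it visits.
In [ ]:
## Reconstructed threshold tree: node -> (threshold t,
##   child if vote = -1 (label < t), child if vote = +1 (label >= t));
## None means that branch is a leaf.
TREE = {
    'hf1': (157, None, 'hf2'),
    'hf2': (162, 'hf5', 'hf3'),
    'hf3': (164, 'hf4', None),
    'hf4': (163, None, None),
    'hf5': (160, 'hf7', 'hf6'),
    'hf6': (161, None, None),
    'hf7': (159, 'hf8', None),
    'hf8': (158, None, None),
}

def route(decide, node='hf1', lo=float('-inf'), hi=float('inf')):
    """Follow one sample's +/-1 votes down the tree; return its label interval [lo, hi)."""
    t, left, right = TREE[node]
    if decide(node) < 0:
        hi = min(hi, t)
        return route(decide, left, lo, hi) if left else (lo, hi)
    lo = max(lo, t)
    return route(decide, right, lo, hi) if right else (lo, hi)

## Example: votes +1 at hf1, -1 at hf2, -1 at hf5, +1 at hf7 land in [159, 160)
votes = {'hf1': 1, 'hf2': -1, 'hf5': -1, 'hf7': 1}
print(route(votes.get))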