In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from time import time
from mclearn.experiment import ActiveExperiment, load_results, save_results
from mclearn.tools import log
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn releases
from matplotlib.ticker import FuncFormatter
%matplotlib inline
sns.set_style('white')
Experiment Setup: run each query strategy and bandit policy on the eleven UCI datasets and on the SDSS dataset, then record the asymptotic and maximum performance reached on each.
In [3]:
RUN_EXPERIMENTS = False
uci_sets = ['glass', 'ionosphere', 'iris', 'magic', 'miniboone',
            'pageblocks', 'pima', 'sonar', 'vehicle', 'wine', 'wpbc']
datasets = sorted(uci_sets + ['sdss'])
methods = ['passive', 'margin', 'w-margin', 'confidence',
           'w-confidence', 'entropy', 'w-entropy',
           'qbb-margin', 'qbb-kl', 'thompson', 'ocucb', 'klucb',
           'exp++', 'borda', 'geometric', 'schulze']
In [4]:
def run_expt(X, y, dataset, scale=True):
    log(dataset, end='')
    # run every policy in `methods` on this dataset
    for method in methods:
        log('.', end='')
        expt = ActiveExperiment(X, y, dataset, method, scale)
        expt.run_policies()
    # estimate the asymptotic performance once per dataset
    expt = ActiveExperiment(X, y, dataset, None, scale)
    expt.run_asymptote()
    log('')
In [5]:
if RUN_EXPERIMENTS:
    # UCI datasets: label in the 'target' column (assumed first), remaining columns are features
    for dataset in uci_sets:
        data_path = os.path.join('data', dataset + '.csv')
        data = pd.read_csv(data_path)
        X, y = data.iloc[:, 1:], data['target']
        run_expt(X, y, dataset)

    # SDSS dataset: features follow the 'class' column; feature scaling is turned off
    data_path = os.path.join('data', 'sdss.h5')
    data = pd.read_hdf(data_path, 'sdss')
    class_idx = data.columns.get_loc('class')
    X, y = data.iloc[:, (class_idx+1):], data['class']
    run_expt(X, y, 'sdss', False)
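The loop above assumes each UCI CSV in data/ stores the label in a column named 'target' (the first column), with the remaining columns as features. A minimal sanity check of that assumed layout, not part of the original experiments:
In [ ]:
# Hypothetical layout check: label column first, at least one feature column after it.
for dataset in uci_sets:
    df = pd.read_csv(os.path.join('data', dataset + '.csv'))
    assert df.columns[0] == 'target', dataset
    assert df.shape[1] > 1, dataset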
In [6]:
if RUN_EXPERIMENTS:
    for dataset in datasets:
        maximum = {}
        measures = ['f1', 'accuracy', 'mpba']
        for measure in measures:
            asymptote_measure = 'asymptote_' + measure
            max_measure = 'max_' + measure
            results = {}
            for method in methods:
                results[method] = load_results(dataset, method, measure, True)
            results['asymptote'] = load_results(dataset, 'asymptote', asymptote_measure, True)
            # the maximum is the best value reached by the asymptote or any policy
            maximum[max_measure] = results['asymptote']
            for method in methods:
                maximum[max_measure] = max(maximum[max_measure], max(results[method]))
        save_results(dataset, 'max', maximum)
In [ ]:
# redefinition: evaluate only the listed policies, skipping the passive baseline and the asymptote run
def run_expt(X, y, dataset, scale=True):
    log(dataset, end='')
    for method in methods:
        log('.', end='')
        expt = ActiveExperiment(X, y, dataset, method, scale=scale, passive=False)
        expt.run_policies()
In [ ]:
methods = ['thompson', 'ocucb', 'klucb',
           'exp++', 'borda', 'geometric', 'schulze']
In [ ]:
if RUN_EXPERIMENTS:
    for dataset in uci_sets:
        data_path = os.path.join('data', dataset + '.csv')
        data = pd.read_csv(data_path)
        X, y = data.iloc[:, 1:], data['target']
        run_expt(X, y, dataset)

    data_path = os.path.join('data', 'sdss.h5')
    data = pd.read_hdf(data_path, 'sdss')
    class_idx = data.columns.get_loc('class')
    X, y = data.iloc[:, (class_idx+1):], data['class']
    run_expt(X, y, 'sdss', False)
In [26]:
def calculate_strength(asymptote, passive, policy):
    # deficiency: area between the asymptote and the policy's learning curve,
    # normalised by the area between the asymptote and passive learning
    deficiency = np.sum(asymptote - policy) / np.sum(asymptote - passive)
    strength = 1 - deficiency
    return strength
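For reference, the quantity computed above, with $a_t$ the asymptote (in the plots below, the maximum MPBA, a constant) and $p_t$, $q_t$ the passive and policy learning curves at step $t$:

$$\text{deficiency}(q) = \frac{\sum_t (a_t - q_t)}{\sum_t (a_t - p_t)}, \qquad \text{strength}(q) = 1 - \text{deficiency}(q).$$

Passive learning therefore has strength 0 by construction, a policy that matches the asymptote everywhere has strength 1, and a negative strength indicates a policy that does worse than passive sampling.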
In [27]:
def plot_mpba_strength():
    fig = plt.figure(figsize=(15, 20))
    fig.subplots_adjust(hspace=.6)
    for (i, dataset) in enumerate(datasets):
        results = {}
        for method in methods:
            results[method] = load_results(dataset, method, 'mpba', True)
        results['max'] = load_results(dataset, 'max', 'max_mpba')
        strength_dict = {}
        for method in methods:
            s = calculate_strength(results['max'], results['passive'], results[method])
            strength_dict[method] = [s]
        strength_df = pd.DataFrame(strength_dict)
        # order the policies by strength, strongest first
        strength_df = strength_df[strength_df.iloc[0].sort_values(ascending=False).index]
        ax = fig.add_subplot(6, 2, i + 1)
        # set bar colours: highlight policies that beat passive learning
        palette = {}
        for method in methods:
            if strength_df.loc[0, method] > strength_df.loc[0, 'passive']:
                palette[method] = sns.color_palette()[0]
            else:
                palette[method] = sns.color_palette()[2]
        sns.barplot(data=strength_df, ax=ax, palette=palette)
        ax.set_title(dataset)
        ax.set_ylim(-0.3, 0.8)
        ax.set_xticklabels(strength_df.columns, rotation=45, rotation_mode='anchor', ha='right')
        # narrow the bars while keeping each one centred on its tick
        new_width = 0.5
        for bar in ax.patches:
            x = bar.get_x()
            width = bar.get_width()
            centre = x + width / 2.
            bar.set_x(centre - new_width / 2.)
            bar.set_width(new_width)
    # fig.savefig('strengths.pdf', bbox_inches='tight')
    plt.show()
In [28]:
# methods = ['thompson', 'ocucb', 'klucb',
# 'exp++', 'borda', 'geometric', 'schulze']
# methods += [method + '-wop' for method in methods]
# methods += ['passive']
In [9]:
plot_mpba_strength()
In [10]:
def plot_learning_curves():
    selected_methods = ['passive', 'confidence', 'borda', 'exp++']
    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)
    fig = plt.figure(figsize=(15, 20))
    for (i, dataset) in enumerate(datasets):
        learning_curves = {}
        for method in selected_methods:
            learning_curves[method] = load_results(dataset, method, 'mpba', True)
        maximum = load_results(dataset, 'max', 'max_mpba')
        # curves start after 50 initial labelled examples
        sample_size = learning_curves['passive'].shape[0] + 49
        ax = fig.add_subplot(4, 3, i + 1)
        for method in selected_methods:
            xticks = np.arange(50, 50 + len(learning_curves[method]))
            ax.plot(xticks, learning_curves[method], label=method, linewidth=1)
        ax.legend(loc='lower right', frameon=True)
        ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
        ax.set_title(dataset)
        ax.tick_params(top=False)
        # dashed horizontal line at the best MPBA achieved on this dataset
        ax.plot([50, sample_size], [maximum, maximum], ls='--', color='#377eb8')
        ax.set_xlim(50, sample_size)
    fig.savefig('learning_curves.pdf', bbox_inches='tight')
    plt.show()
In [11]:
plot_learning_curves()