notebook.community

Edit and run



In [2]:

    
%load_ext autoreload
%autoreload 2



In [3]:

    
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from time import time
from mclearn.experiment import ActiveExperiment, load_results, save_results
from mclearn.tools import log
from matplotlib.ticker import FuncFormatter
%matplotlib inline
sns.set_style('white')
warnings.filterwarnings('ignore')  # Ignore annoying numpy warnings



In [4]:

    
# Names of the datasets swept by the experiment cells below; each one is read
# from data/<name>.csv and is also iterated over by the learning-curve plots.
uci_sets = ['glass', 'ionosphere']

Thompson Sampling: testing effect of mu and tau/sigma



In [ ]:

    
# Thompson-sampling parameter sweep: for every dataset, run one active-learning
# experiment per (mu, var) pair. `var` is passed as both ts_sigma and ts_tau.
for dataset in uci_sets:
    data_path = os.path.join('data', dataset + '.csv')
    data = pd.read_csv(data_path)
    # Features are every column except the first; labels come from the 'target' column.
    # NOTE(review): presumably 'target' is the first column, otherwise it is also part of X - confirm.
    X, y = data.iloc[:, 1:], data['target']
    for mu in [0.001, 0.005, 0.05, 0.5]:
        for var in [0.001, 0.005, 0.01, 0.02]:
            print(dataset, mu, var)
            # The plotting cells rebuild this exact 'thompson-<mu>-<var>' name to load the results.
            save_name = 'thompson-{}-{}'.format(mu, var)
            expt = ActiveExperiment(
                X, y, dataset, 'thompson', scale=True,
                n_splits=10, n_jobs=10, ts_sigma=var,
                ts_tau=var, ts_mu=mu, save_name=save_name)
            expt.run_policies()

Testing effect of information density



In [ ]:

    
# Information-density sweep: for every dataset, run the two weighted policies
# once per gamma value, passing gamma as `gamma_percentile`.
for dataset in uci_sets:
    data_path = os.path.join('data', dataset + '.csv')
    data = pd.read_csv(data_path)
    # Features are every column except the first; labels come from the 'target' column.
    # NOTE(review): presumably 'target' is the first column, otherwise it is also part of X - confirm.
    X, y = data.iloc[:, 1:], data['target']
    for policy in ['w-confidence', 'w-margin']:
        for gamma in [50, 60, 70, 90, 95, 99]:
            print(dataset, policy, gamma)
            # The plotting cells rebuild this exact '<policy>-gamma-<gamma>' name to load the results.
            save_name = '{}-gamma-{}'.format(policy, gamma)
            expt = ActiveExperiment(
                X, y, dataset, policy, scale=True,
                n_splits=10, n_jobs=10, gamma_percentile=gamma,
                save_name=save_name)
            expt.run_policies()

Results



In [1]:

    
# Display text for each performance measure; the plotting functions use it to
# build their y-axis labels.
titles = dict(f1='F1', accuracy='Accuracy', mpba='MPBA')



In [5]:

    
def plot_learning_curves(measure):
    """Plot the learning curves of the Thompson-sampling parameter sweep.

    One subplot is drawn per dataset in the global ``uci_sets``. Each subplot
    holds one curve per ``'thompson-<mu>-<var>'`` configuration plus a dashed
    horizontal line at the value loaded for the ``'asymptote'`` method.
    The figure is only displayed; it is not written to disk.

    Parameters
    ----------
    measure : str
        Measure to load with ``load_results``. It must also be a key of the
        global ``titles`` dict, which supplies the y-axis label.

    Notes
    -----
    Other cells of this notebook define functions with this same name, so the
    definition that was executed last is the one in effect.
    """
    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)
    fig = plt.figure(figsize=(15, 20))

    # Same names the experiment cell used as `save_name`.
    selected_methods = ['thompson-{}-{}'.format(mu, var)
                        for mu in [0.001, 0.005, 0.05, 0.5]
                        for var in [0.001, 0.005, 0.01, 0.02]]

    for (i, dataset) in enumerate(uci_sets):
        initial_n = 10  # x value given to the first point of every curve

        learning_curves = {
            method: load_results(dataset, method, measure, True)
            for method in selected_methods
        }
        maximum = load_results(dataset, 'asymptote', 'asymptote_{}'.format(measure), True)
        # Right edge of the x-axis, taken from the length of one reference curve.
        sample_size = learning_curves['thompson-0.05-0.02'].shape[0] + 9

        ax = fig.add_subplot(4, 3, i + 1)
        for method in selected_methods:
            xticks = np.arange(initial_n, initial_n + len(learning_curves[method]))
            ax.plot(xticks, learning_curves[method], label=method, linewidth=1)

        ax.legend(loc='lower right', frameon=True)
        ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
        ax.set_title(dataset)
        # tick_params expects a bool; the string 'off' is truthy and would
        # switch the top ticks on instead of off.
        ax.tick_params(top=False)
        ax.set_ylabel(titles[measure])

        ax.plot([initial_n, sample_size], [maximum, maximum], ls='--', color='#377eb8')
        ax.set_xlim(initial_n, sample_size)
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)



In [11]:

    
# Draw the learning curves for the 'mpba' measure with whichever
# plot_learning_curves definition was executed most recently.
plot_learning_curves('mpba')



In [6]:

    
def calculate_strength(asymptote, passive, policy):
    """Return the per-trial strength of ``policy`` relative to ``passive``.

    For every row the deficiency is
    ``sum(asymptote - policy) / sum(asymptote - passive)`` (sums taken along
    axis 1) and the strength is ``1 - deficiency``.

    Parameters
    ----------
    asymptote : scalar or array-like
        Reference value: either a single number shared by all rows, or one
        value per row of ``passive``.
    passive : ndarray of shape (n_trials, n_samples)
        Values of the baseline the policy is compared against.
    policy : ndarray
        Values of the policy; must broadcast against ``passive``'s shape.

    Returns
    -------
    ndarray of shape (n_trials,)
        One strength value per row.

    Raises
    ------
    ValueError
        If ``asymptote`` holds neither one value nor one value per row.
    """
    n_trials, _ = passive.shape
    asymptote = np.asarray(asymptote)
    if asymptote.size not in (1, n_trials):
        raise ValueError(
            'asymptote must hold 1 or {} values, got {}'.format(n_trials, asymptote.size))
    # A column vector broadcasts across the sample axis, which gives the same
    # matrix as repeating each value n_samples times and also admits a scalar.
    asymptote = asymptote.reshape(-1, 1)
    deficiency = np.sum(asymptote - policy, axis=1) / np.sum(asymptote - passive, axis=1)
    return 1 - deficiency



In [15]:

    
def plot_strength(measure='mpba', data='small'):
    """Draw box plots of the strength of every Thompson-sampling configuration.

    One panel is drawn for 'glass' and one for 'ionosphere'. In each panel the
    configurations are ordered by decreasing median strength, where strength is
    computed by ``calculate_strength`` from the stored 'max', 'passive' and
    per-configuration results. The figure is saved to
    'figures/strengths-thompson-params.pdf'.

    Parameters
    ----------
    measure : str, optional
        Measure to load; must be a key of the global ``titles``. It is not
        part of the output file name, so every measure writes the same file.
    data : str, optional
        Unused; kept so existing calls keep working.

    Notes
    -----
    A later cell defines another ``plot_strength``; the definition executed
    last is the one in effect.
    """
    from matplotlib.patches import Rectangle

    fig = plt.figure(figsize=(10, 8))
    fig.subplots_adjust(hspace=.6)

    methods = []
    method_names = []
    for mu in [0.001, 0.005, 0.05, 0.5]:
        for var in [0.001, 0.005, 0.01, 0.02]:
            methods.append('thompson-{}-{}'.format(mu, var))
            method_names.append('μ={}, σ=τ={}'.format(mu, var))
    # Single source of truth for the stored-name -> axis-label mapping.
    display_names = dict(zip(methods, method_names))

    for i, (dataset, part) in enumerate(zip(['glass', 'ionosphere'], ('A', 'B'))):
        results = {
            method: load_results(dataset, method, measure, mean=False)
            for method in methods + ['passive']
        }
        results['max'] = load_results(dataset, 'max', 'max_' + measure, False)

        strength_df = pd.DataFrame({
            method: calculate_strength(results['max'], results['passive'], results[method])
            for method in methods
        }).rename(columns=display_names)
        # Negate so that an ascending sort puts the highest median first.
        sorted_cols = list((-strength_df.median()).sort_values().index)
        strength_df = strength_df[sorted_cols]

        # Long format: one row per (trial, configuration).
        strength_df.index.name = 'trial'
        strength_df = strength_df.reset_index()
        strength_df = strength_df.melt(id_vars=['trial'], value_vars=method_names)

        ax = fig.add_subplot(2, 1, i + 1)
        # We could use hue here, but I think there is a bug in seaborn that squishes
        # the boxplot
        sns.boxplot(data=strength_df, x='variable', y='value', order=sorted_cols,
                    width=0.4, linewidth=1, fliersize=3,
                    color='#3498db', ax=ax)
        ax.set_title('({}) {}'.format(part, dataset))
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, rotation_mode='anchor', ha='right')
        ax.xaxis.set_visible(True)
        ax.set_ylabel(titles[measure] + ' Strength')
        ax.set_xlabel('')
        ax.axhline(linewidth=1)
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)

        # set bar width
        new_width = 0.5
        for bar in ax.patches:
            # Only rectangles have an x/width to adjust. Box-plot boxes can show
            # up in ax.patches as PathPatch objects, which lack get_x().
            if not isinstance(bar, Rectangle):
                continue
            centre = bar.get_x() + bar.get_width() / 2.
            bar.set_x(centre - new_width / 2.)
            bar.set_width(new_width)

    fig.savefig('figures/strengths-thompson-params.pdf', bbox_inches='tight')



In [16]:

    
# Draw and save the strength box plots for the 'mpba' measure with whichever
# plot_strength definition was executed most recently.
plot_strength('mpba')



In [51]:

    
# Line colour per arm; plot_selections looks it up as colors[label].
colors = dict([
    ('passive', '#9b59b6'),
    ('entropy', '#3498db'),
    ('margin', '#95a5a6'),
    ('qbb-margin', '#e74c3c'),
    ('qbb-kl', '#34495e'),
    ('confidence', '#2ecc71'),
])

# Line style per arm; plot_selections looks it up as lc_line[label].
# 'borda' has a style here but no entry in `colors`.
lc_line = dict([
    ('passive', ':'),
    ('entropy', ':'),
    ('margin', '-'),
    ('borda', '-'),
    ('qbb-margin', '-.'),
    ('qbb-kl', '--'),
    ('confidence', '--'),
])



In [58]:

    
def plot_selections(datasets):
    """Plot how often each arm was selected for every Thompson configuration.

    The figure has four panel rows per dataset (one per ``mu``) and four
    columns (one per ``var``). Each panel shows, for every arm, the share of
    selections it received so far, averaged over the stored runs. The figure
    is saved to 'figures/selection-thompson-params-<datasets joined by ->.pdf'.

    Parameters
    ----------
    datasets : sequence of str
        Dataset names to load results for. Any number of datasets is laid out
        correctly; two datasets give the original 8x4 grid.
    """
    mus = [0.001, 0.005, 0.05, 0.5]
    variances = [0.001, 0.005, 0.01, 0.02]
    arms = ['passive', 'margin', 'confidence', 'entropy', 'qbb-margin', 'qbb-kl']
    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)

    n_cols = len(variances)
    n_rows = len(mus) * len(datasets)
    panels_per_dataset = len(mus) * n_cols
    first_bottom_panel = (n_rows - 1) * n_cols
    # 2.5 inches per row reproduces the original 15x20 figure for two datasets.
    fig = plt.figure(figsize=(15, 2.5 * n_rows))

    for k, dataset in enumerate(datasets):
        for i, mu in enumerate(mus):
            for j, var in enumerate(variances):
                method = 'thompson-{}-{}'.format(mu, var)
                order = k * panels_per_dataset + i * n_cols + j
                ax = fig.add_subplot(n_rows, n_cols, order + 1)
                result = load_results(dataset, method)

                # Total number of selections, read from the last row of the first run.
                total_n = sum(result['T'][0][-1])
                sample_sizes = np.arange(10, total_n + 10)
                trials = np.arange(1, total_n + 1)
                # Mean counts over runs (first row dropped), divided by the
                # number of selections made so far; the column vector
                # broadcasts over the arms.
                props = np.mean(result['T'], axis=0)[1:] / trials.reshape(-1, 1)
                df = pd.DataFrame(props, columns=arms)

                # Plot the most frequently selected arm (at the end) first.
                ordered_labels = df.iloc[-1].sort_values(ascending=False).index
                for label in ordered_labels:
                    ax.plot(sample_sizes, df[label], label=label, color=colors[label],
                            ls=lc_line[label], linewidth=1)

                # Only the bottom row carries x labels ...
                if order >= first_bottom_panel:
                    ax.set_xlabel('Training Size')
                else:
                    ax.xaxis.set_major_formatter(plt.NullFormatter())

                # ... and only the first column carries y labels.
                if order % n_cols == 0:
                    ax.set_ylabel('Frequency of Selections')
                    ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
                else:
                    ax.yaxis.set_major_formatter(plt.NullFormatter())

                if order == 0:
                    ax.legend(loc='upper right', frameon=True)
                ax.set_title('{}, μ={}, σ=τ={}'.format(dataset, mu, var))
                ax.set_ylim((0, 0.3))
                # Every panel shares one fixed x-range.
                # NOTE(review): 235 is hard-coded; confirm it suits all datasets.
                ax.set_xlim((10, 235 + 10))
                for spine in ax.spines.values():
                    spine.set_linewidth(0.5)

    fig.savefig('figures/selection-thompson-params-{}.pdf'.format('-'.join(datasets)), bbox_inches='tight')



In [59]:

    
# Draw and save the arm-selection frequency panels for both datasets.
plot_selections(['glass', 'ionosphere'])

Information Density



In [104]:

    
def plot_learning_curves(measure):
    """Plot the learning curves of the information-density (gamma) sweep.

    One subplot is drawn per dataset in the global ``uci_sets``. Each subplot
    holds one curve per ``'<policy>-gamma-<gamma>'`` configuration plus a
    dashed horizontal line at the value loaded for the ``'asymptote'`` method.
    The figure is only displayed; it is not written to disk.

    Parameters
    ----------
    measure : str
        Measure to load with ``load_results``. It must also be a key of the
        global ``titles`` dict, which supplies the y-axis label.

    Notes
    -----
    Other cells of this notebook define functions with this same name, so the
    definition that was executed last is the one in effect.
    """
    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)
    fig = plt.figure(figsize=(15, 20))

    # Same names the experiment cell used as `save_name`.
    selected_methods = ['{}-gamma-{}'.format(policy, gamma)
                        for policy in ['w-confidence', 'w-margin']
                        for gamma in [50, 60, 70, 90, 95, 99]]

    for (i, dataset) in enumerate(uci_sets):
        initial_n = 10  # x value given to the first point of every curve

        learning_curves = {
            method: load_results(dataset, method, measure, True)
            for method in selected_methods
        }
        maximum = load_results(dataset, 'asymptote', 'asymptote_{}'.format(measure), True)
        # Right edge of the x-axis, taken from the length of one reference curve.
        sample_size = learning_curves['w-margin-gamma-50'].shape[0] + 9

        ax = fig.add_subplot(4, 3, i + 1)
        for method in selected_methods:
            xticks = np.arange(initial_n, initial_n + len(learning_curves[method]))
            ax.plot(xticks, learning_curves[method], label=method, linewidth=1)

        ax.legend(loc='lower right', frameon=True)
        ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
        ax.set_title(dataset)
        # tick_params expects a bool; the string 'off' is truthy and would
        # switch the top ticks on instead of off.
        ax.tick_params(top=False)
        ax.set_ylabel(titles[measure])

        ax.plot([initial_n, sample_size], [maximum, maximum], ls='--', color='#377eb8')
        ax.set_xlim(initial_n, sample_size)
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)



In [105]:

    
# Draw the learning curves for the 'mpba' measure with whichever
# plot_learning_curves definition was executed most recently.
plot_learning_curves('mpba')



In [18]:

    
def plot_strength(measure='mpba', data='small'):
    """Draw box plots of the strength of every information-density setting.

    One panel is drawn for 'glass' and one for 'ionosphere'. In each panel the
    ``'<policy>-gamma-<gamma>'`` configurations are ordered by decreasing
    median strength, where strength is computed by ``calculate_strength`` from
    the stored 'max', 'passive' and per-configuration results. The figure is
    saved to 'figures/strengths-info_density-params.pdf'.

    Parameters
    ----------
    measure : str, optional
        Measure to load; must be a key of the global ``titles``. It is not
        part of the output file name, so every measure writes the same file.
    data : str, optional
        Unused; kept so existing calls keep working.

    Notes
    -----
    An earlier cell defines another ``plot_strength``; the definition executed
    last is the one in effect.
    """
    from matplotlib.patches import Rectangle

    fig = plt.figure(figsize=(10, 8))
    fig.subplots_adjust(hspace=.6)
    letters = 'AB'

    methods = []
    method_names = []
    for policy in ['w-confidence', 'w-margin']:
        for gamma in [50, 60, 70, 90, 95, 99]:
            methods.append('{}-gamma-{}'.format(policy, gamma))
            method_names.append('{}, {}th'.format(policy, gamma))
    # Single source of truth for the stored-name -> axis-label mapping.
    display_names = dict(zip(methods, method_names))

    for (i, dataset) in enumerate(['glass', 'ionosphere']):
        results = {
            method: load_results(dataset, method, measure, mean=False)
            for method in methods + ['passive']
        }
        results['max'] = load_results(dataset, 'max', 'max_' + measure, False)

        strength_df = pd.DataFrame({
            method: calculate_strength(results['max'], results['passive'], results[method])
            for method in methods
        }).rename(columns=display_names)
        # Negate so that an ascending sort puts the highest median first.
        sorted_cols = list((-strength_df.median()).sort_values().index)
        strength_df = strength_df[sorted_cols]

        # Long format: one row per (trial, configuration).
        strength_df.index.name = 'trial'
        strength_df = strength_df.reset_index()
        strength_df = strength_df.melt(id_vars=['trial'], value_vars=method_names)

        ax = fig.add_subplot(2, 1, i + 1)
        # We could use hue here, but I think there is a bug in seaborn that squishes
        # the boxplot
        sns.boxplot(data=strength_df, x='variable', y='value', order=sorted_cols,
                    width=0.4, linewidth=1, fliersize=3,
                    color='#3498db', ax=ax)
        ax.set_title('({}) {}'.format(letters[i], dataset))
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, rotation_mode='anchor', ha='right')
        ax.xaxis.set_visible(True)
        ax.set_ylabel(titles[measure] + ' Strength')
        ax.set_xlabel('')
        ax.axhline(linewidth=1)
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)

        # set bar width
        new_width = 0.5
        for bar in ax.patches:
            # Only rectangles have an x/width to adjust. Box-plot boxes can show
            # up in ax.patches as PathPatch objects, which lack get_x().
            if not isinstance(bar, Rectangle):
                continue
            centre = bar.get_x() + bar.get_width() / 2.
            bar.set_x(centre - new_width / 2.)
            bar.set_width(new_width)

    fig.savefig('figures/strengths-info_density-params.pdf', bbox_inches='tight')



In [19]:

    
# Draw and save the strength box plots with the default measure ('mpba'),
# using whichever plot_strength definition was executed most recently.
plot_strength()

Smaller Pool



In [6]:

    
# Method names grouped the way plot_mpba_strength labels them:
# methods_al -> 'single', methods_bandits -> 'bandit', methods_rank -> 'rank'.
methods_al = [
    'baseline', 'margin', 'w-margin', 'confidence', 'w-confidence',
    'entropy', 'w-entropy', 'qbb-margin', 'qbb-kl',
]
methods_bandits = ['thompson', 'ocucb', 'klucb', 'exp++']
methods_rank = ['borda', 'geometric', 'schulze']

# Every method except the passive one, then the full list with 'passive' first.
methods_no_passive = [*methods_al, *methods_bandits, *methods_rank]
methods = ['passive', *methods_no_passive]

# Measures for which results are loaded.
measures = ['f1', 'accuracy', 'mpba']



In [17]:

    
# For each of the four small-train / small-pool datasets, derive one
# 'max_<measure>' reference per measure and store all of them under the method
# name 'max' (plot_mpba_strength later loads them with that name).
for (i, dataset) in enumerate(['sdss-small-train', 'pageblocks-small-train', 'sdss-small-pool', 'pageblocks-small-pool']):
    maximum = {}
    for measure in measures:
        asymptote_measure = 'asymptote_' + measure
        max_measure = 'max_' + measure
        results = {}
        for method in methods:
            results[method] = load_results(dataset, method, measure, False)
            # Keep only the largest value along axis 1
            # (presumably one row per trial - TODO confirm).
            results[method] = np.max(results[method], axis=1)
        results['asymptote'] = load_results(dataset, 'asymptote', asymptote_measure, False)
        # Start from the stored asymptote and only ever raise it below.
        maximum[max_measure] = results['asymptote']
        for method in methods:
            # The builtin max() collapses the method's row maxima to one number, so
            # every entry of the reference is raised to at least the best value
            # found in any row.
            # NOTE(review): confirm a global (not per-row) maximum is intended here.
            maximum[max_measure] = np.maximum(maximum[max_measure], max(results[method]))
    save_results(dataset, 'max', maximum)



In [8]:

    
def calculate_strength(asymptote, passive, policy):
    """Return the per-trial strength of ``policy`` relative to ``passive``.

    For every row the deficiency is
    ``sum(asymptote - policy) / sum(asymptote - passive)`` (sums taken along
    axis 1) and the strength is ``1 - deficiency``.

    Parameters
    ----------
    asymptote : scalar or array-like
        Reference value: either a single number shared by all rows, or one
        value per row of ``passive``.
    passive : ndarray of shape (n_trials, n_samples)
        Values of the baseline the policy is compared against.
    policy : ndarray
        Values of the policy; must broadcast against ``passive``'s shape.

    Returns
    -------
    ndarray of shape (n_trials,)
        One strength value per row.

    Raises
    ------
    ValueError
        If ``asymptote`` holds neither one value nor one value per row.
    """
    n_trials, _ = passive.shape
    asymptote = np.asarray(asymptote)
    if asymptote.size not in (1, n_trials):
        raise ValueError(
            'asymptote must hold 1 or {} values, got {}'.format(n_trials, asymptote.size))
    # A column vector broadcasts across the sample axis, which gives the same
    # matrix as repeating each value n_samples times and also admits a scalar.
    asymptote = asymptote.reshape(-1, 1)
    deficiency = np.sum(asymptote - policy, axis=1) / np.sum(asymptote - passive, axis=1)
    return 1 - deficiency



In [9]:

    
# Display text for each performance measure; the plotting functions use it to
# build their y-axis labels.
titles = dict(zip(('f1', 'accuracy', 'mpba'),
                  ('F1', 'Accuracy', 'MPBA')))



In [32]:

    
def plot_mpba_strength(measure, datasets):
    """Draw box plots of the strength of every method, one panel per dataset.

    Strength is computed by ``calculate_strength`` from the stored 'max',
    'passive' and per-method results. Within a panel the methods are ordered by
    decreasing median strength and coloured by group. The figure is saved to
    'figures/strengths-pool-<measure>.pdf'.

    Parameters
    ----------
    measure : str
        Measure to load; must be a key of the global ``titles``.
    datasets : sequence of str
        Dataset names. '<name>-small-train' and '<name>-small-pool' get a
        descriptive title, a name without '-' is titled as the full dataset,
        and any other name is used as the title unchanged. Panels are placed
        on a 3x2 grid, so at most six datasets fit.
    """
    from matplotlib.patches import Rectangle

    fig = plt.figure(figsize=(15, 10))
    fig.subplots_adjust(hspace=.6)

    # Box colour per label as it appears on the x-axis (after renaming).
    base_palette = sns.color_palette()
    palette_map = {
        **{m: base_palette[0] for m in methods_al},
        **{m: base_palette[2] for m in ['thompson', 'ocucb', 'klucb', 'exp3++', 'explore']},
        **{m: base_palette[1] for m in methods_rank},
    }
    variant_titles = {
        'small-train': '{} (small training set)',
        'small-pool': '{} (small training and test sets)',
        '': '{} (full dataset)',
    }

    for (i, dataset) in enumerate(datasets):
        results = {
            method: load_results(dataset, method, measure, mean=False)
            for method in methods
        }
        results['max'] = load_results(dataset, 'max', 'max_' + measure, False)

        strength_df = pd.DataFrame({
            method: calculate_strength(results['max'], results['passive'], results[method])
            for method in methods_no_passive
        })
        # Negate so that an ascending sort puts the highest median first.
        sorted_cols = list((-strength_df.median()).sort_values().index)
        strength_df = strength_df[sorted_cols]

        # Long format: one row per (trial, method).
        strength_df.index.name = 'trial'
        strength_df = strength_df.reset_index()
        strength_df = strength_df.melt(id_vars=['trial'], value_vars=methods_no_passive)
        strength_df.loc[strength_df['variable'].isin(methods_al), 'type'] = 'single'
        strength_df.loc[strength_df['variable'].isin(methods_bandits), 'type'] = 'bandit'
        strength_df.loc[strength_df['variable'].isin(methods_rank), 'type'] = 'rank'
        # Two methods are shown under a different name than they are stored under.
        strength_df.loc[strength_df['variable'] == 'baseline', 'variable'] = 'explore'
        strength_df.loc[strength_df['variable'] == 'exp++', 'variable'] = 'exp3++'
        sorted_cols[sorted_cols.index('baseline')] = 'explore'
        sorted_cols[sorted_cols.index('exp++')] = 'exp3++'

        ax = fig.add_subplot(3, 2, i + 1)
        # We could use hue here, but I think there is a bug in seaborn that squishes
        # the boxplot
        sns.boxplot(data=strength_df, x='variable', y='value', order=sorted_cols,
                    width=0.4, linewidth=1, palette=palette_map, fliersize=3, ax=ax)

        # Unknown suffixes fall back to the raw dataset name instead of leaving
        # `title` unbound (or carrying over the previous panel's title).
        base_name, _, variant = dataset.partition('-')
        if variant in variant_titles:
            title = variant_titles[variant].format(base_name)
        else:
            title = dataset
        ax.set_title(title)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, rotation_mode='anchor', ha='right')
        ax.xaxis.set_visible(True)
        ax.set_ylabel(titles[measure] + ' Strength')
        ax.set_xlabel('')
        ax.axhline(linewidth=1)
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)

        # set bar width
        new_width = 0.5
        for bar in ax.patches:
            # Only rectangles have an x/width to adjust. Box-plot boxes can show
            # up in ax.patches as PathPatch objects, which lack get_x().
            if not isinstance(bar, Rectangle):
                continue
            centre = bar.get_x() + bar.get_width() / 2.
            bar.set_x(centre - new_width / 2.)
            bar.set_width(new_width)

    fig.savefig('figures/strengths-pool-{}.pdf'.format(measure), bbox_inches='tight')



In [33]:

    
# Draw and save the strength box plots for the full, small-train and small-pool
# variants of both datasets (six panels).
plot_mpba_strength('mpba', ['sdss', 'pageblocks', 'sdss-small-train' ,'pageblocks-small-train', 'sdss-small-pool' ,'pageblocks-small-pool'])



In [68]:

    
# Line colour for each of the four methods drawn in the pool learning curves.
lc_colors = dict([
    ('passive', '#9b59b6'),
    ('borda', '#3498db'),
    ('exp++', '#95a5a6'),
    ('confidence', '#e74c3c'),
])

# Matching line styles. This rebinds `lc_line`, replacing the mapping that an
# earlier cell defined under the same name.
lc_line = dict([
    ('passive', ':'),
    ('borda', '-'),
    ('exp++', '-.'),
    ('confidence', '--'),
])



In [69]:

    
def plot_learning_curves(measure, datasets):
    """Plot learning curves of four selected methods, one panel per dataset.

    Each panel shows the 'passive', 'confidence', 'borda' and 'exp++' curves
    (the last one labelled 'exp3++') and a dotted horizontal line at the value
    loaded for the 'asymptote' method. The figure is saved to
    'figures/learning_curves-pool-<measure>.pdf'.

    Parameters
    ----------
    measure : str
        Measure to load; must be a key of the global ``titles``.
    datasets : sequence of str
        Dataset names. '<name>-small-train' and '<name>-small-pool' get a
        descriptive title, a name without '-' is titled as the full dataset,
        and any other name is used as the title unchanged. Panels are placed
        on a 2x3 grid and lettered A-F, so at most six datasets fit.

    Notes
    -----
    Earlier cells define one-argument functions with this same name; the
    definition executed last is the one in effect.
    """
    letters = 'ABCDEF'
    selected_methods = ['passive', 'confidence', 'borda', 'exp++']
    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)
    variant_titles = {
        'small-train': '({}) {} (small training set)',
        'small-pool': '({}) {} (small training and test sets)',
        '': '({}) {} (full dataset)',
    }

    fig = plt.figure(figsize=(15, 10))
    for (i, dataset) in enumerate(datasets):
        initial_n = 10  # x value given to the first point of every curve

        learning_curves = {
            method: load_results(dataset, method, measure, True)
            for method in selected_methods
        }
        maximum = load_results(dataset, 'asymptote', 'asymptote_{}'.format(measure), True)
        sample_size = learning_curves['passive'].shape[0] + 9

        ax = fig.add_subplot(2, 3, i + 1)
        for method in selected_methods:
            xticks = np.arange(initial_n, initial_n + len(learning_curves[method]))
            method_label = 'exp3++' if method == 'exp++' else method
            ax.plot(xticks, learning_curves[method], label=method_label, linewidth=1,
                    color=lc_colors[method], ls=lc_line[method])

        ax.legend(loc='lower right', frameon=False)
        ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))

        # Unknown suffixes fall back to the raw dataset name instead of leaving
        # `title` unbound (or carrying over the previous panel's title).
        base_name, _, variant = dataset.partition('-')
        if variant in variant_titles:
            title = variant_titles[variant].format(letters[i], base_name)
        else:
            title = '({}) {}'.format(letters[i], dataset)
        ax.set_title(title)
        # tick_params expects a bool; the string 'off' is truthy and would
        # switch the top ticks on instead of off.
        ax.tick_params(top=False)
        ax.set_ylabel(titles[measure])

        ax.plot([initial_n, sample_size], [maximum, maximum], ls=':', linewidth=1, color='black')
        # Fixed x-range shared by all panels, regardless of curve length.
        ax.set_xlim(initial_n, 201)
        # Fixed y-ranges, one for the sdss panels and one for the rest.
        if 'sdss' in base_name:
            ax.set_ylim(0.63, 0.91)
        else:
            ax.set_ylim(0.45, 0.78)

        # Only the bottom row of the 2x3 grid carries an x label.
        if i in (3, 4, 5):
            ax.set_xlabel('Training Size')
        for spine in ax.spines.values():
            spine.set_linewidth(0.5)
    fig.savefig('figures/learning_curves-pool-{}.pdf'.format(measure), bbox_inches='tight')



In [70]:

    
# Draw and save the pool learning curves: pageblocks variants fill the top row,
# sdss variants the bottom row of the 2x3 grid.
plot_learning_curves('mpba', ['pageblocks', 'pageblocks-small-train', 'pageblocks-small-pool', 'sdss', 'sdss-small-train', 'sdss-small-pool', ])



In [ ]: