In [ ]:
# imports
import collections
import copy
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scipy.stats

# mapping from classifier display name to the directory where its results are stored
classifiers = {
  'random forest': '6969/vanilla', 
  'adaboost':      '6970/vanilla', 
  'svm (rbf)':     '7707/kernel_rbf', 
  'svm (sigmoid)': '7707/kernel_sigmoid'
}

%matplotlib inline

Marginal Contribution


In [ ]:
# helper functions
def rank_dict(dictionary, reverse=False):
    # rank the values of a dict; with reverse=True the largest value receives rank 1
    dictionary = copy.copy(dictionary)
    if reverse:
        for key in dictionary.keys():
            dictionary[key] = 1 - dictionary[key]
    sortdict = collections.OrderedDict(sorted(dictionary.items()))
    ranks = scipy.stats.rankdata(list(sortdict.values()))
    result = {}
    for idx, (key, value) in enumerate(sortdict.items()):
        result[key] = ranks[idx]
    return result


def sum_dict_values(a, b, allow_subsets=False):
    # element-wise sum of two dicts; b's keys must equal a's keys (or be a subset when allow_subsets=True)
    result = {}
    a_total = sum(a.values())
    b_total = sum(b.values())
    a_min_b = set(a.keys()) - set(b.keys())
    b_min_a = set(b.keys()) - set(a.keys())
    if len(b_min_a) > 0:
        raise ValueError('dict b contains keys not present in dict a: %s' % str(b_min_a))
    if not allow_subsets and len(a_min_b):
        raise ValueError('dicts a and b do not have the same keys: %s' % str(a_min_b))
    for idx in a.keys():
        if idx in b:
            result[idx] = a[idx] + b[idx]
        else:
            result[idx] = a[idx]
    if sum(result.values()) != a_total + b_total:
        raise ValueError('sum of result values does not match the sum of the inputs')
    return result


def divide_dict_values(d, denominator):
    result = {}
    for idx in d.keys():
        result[idx] = d[idx] / denominator
    return result


def format_name(name):
    mapping_plain = {
        'strategy': 'imputation',
        'max_features': 'max. features',
        'min_samples_leaf': 'min. samples leaf',
        'min_samples_split': 'min. samples split',
        'criterion': 'split criterion',
        'learning_rate': 'learning rate',
        'max_depth': 'max. depth',
        'n_estimators': 'iterations',
        'algorithm': 'algorithm',
    }
    mapping_short = {
        'strategy': 'imputation',
        'max_features': 'max. feat.',
        'min_samples_leaf': 'samples leaf',
        'min_samples_split': 'samples split',
        'criterion': 'split criterion',
        'learning_rate': 'learning r.',
        'max_depth': 'max. depth',
        'n_estimators': 'iterations',
        'algorithm': 'algo.',
    }

    parts = name.split('__')
    for idx, part in enumerate(parts):
        if part in mapping_plain:
            if len(parts) < 3:
                parts[idx] = mapping_plain[part]
            else:
                parts[idx] = mapping_short[part]

    return ' / '.join(parts)


def marginal_plots(sorted_values, keys, fig_title):
    plt.figure()
    plt.violinplot(list(sorted_values), list(range(len(sorted_values))))
    plt.plot([-0.5, len(sorted_values) - 0.5], [0, 0], 'k-', linestyle='--', lw=1)
    keys = [format_name(key) for key in keys]
    plt.xticks(list(range(len(sorted_values))), list(keys), rotation=45, ha='right')
    plt.ylabel('marginal contribution')
    # plt.title(fig_title)
    print(fig_title)
    plt.show()
    plt.close()


def determine_relevant(data, max_items=None, max_interactions=None):
    from statistics import median
    
    sorted_values = []
    keys = []
    interactions_seen = 0
    for key in sorted(data, key=lambda k: median(data[k]), reverse=True):
        if '__' in key:
            interactions_seen += 1
            if max_interactions is not None and interactions_seen > max_interactions:
                continue

        sorted_values.append(data[key])
        keys.append(key)

    if max_items is not None:
        sorted_values = sorted_values[:max_items]
        keys = keys[:max_items]

    return sorted_values, keys

def obtain_marginal_contributions(result_directory):
    all_ranks = dict()
    all_tasks = list()
    total_ranks = None
    num_tasks = 0
    marginal_contribution = collections.defaultdict(list)

    for task_id in os.listdir(result_directory):
        task_dir = os.path.join(result_directory, task_id)
        if os.path.isdir(task_dir):
            pimp_file = os.path.join(task_dir, 'pimp_values_fanova.json')
            interaction_file = os.path.join(task_dir, 'pimp_values_fanova_interaction.json')

            if os.path.isfile(pimp_file) and os.path.isfile(interaction_file):
                hyperparameters = json.load(open(pimp_file, 'r'))
                hyperparameters.update(json.load(open(interaction_file, 'r')))

                for hyperparameter, value in hyperparameters.items():
                    parts = hyperparameter.split('__')
                    # keep only one ordering of each interaction (e.g. 'a__b', not 'b__a')
                    if sorted(parts) != parts:
                        continue

                    marginal_contribution[hyperparameter].append(value)
                all_tasks.append(task_id)

                all_ranks[task_id] = hyperparameters
                ranks = rank_dict(hyperparameters, reverse=True)
                if total_ranks is None:
                    total_ranks = ranks
                else:
                    total_ranks = sum_dict_values(total_ranks, ranks, allow_subsets=False)
                # count every task so the average rank is divided by the correct number
                num_tasks += 1
    total_ranks = divide_dict_values(total_ranks, num_tasks)
    return total_ranks, marginal_contribution, all_tasks
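
A quick sanity check of the rank helpers: the toy cell below averages the ranks of three hyperparameters over two hypothetical tasks. The importance values are made up purely for illustration.


In [ ]:
# toy example (made-up fANOVA importance values) illustrating how per-task ranks are averaged
toy_task_a = {'max_features': 0.40, 'min_samples_leaf': 0.25, 'criterion': 0.05}
toy_task_b = {'max_features': 0.30, 'min_samples_leaf': 0.45, 'criterion': 0.10}

ranks_a = rank_dict(toy_task_a, reverse=True)  # with reverse=True, rank 1 = most important
ranks_b = rank_dict(toy_task_b, reverse=True)
average_ranks = divide_dict_values(sum_dict_values(ranks_a, ranks_b), 2)
print(average_ranks)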

In [ ]:
for classifier, directory_suffix in classifiers.items():
    total_ranks, marginal_contribution, _ = obtain_marginal_contributions('data/fanova/' + directory_suffix)
    sorted_values, keys = determine_relevant(marginal_contribution, max_interactions=3)
    marginal_plots(sorted_values, keys, classifier)

Most important hyperparameter per dataset


In [ ]:
x_axis_feature = 'NumberOfInstances'
y_axis_feature = 'NumberOfFeatures'

# consistent colors with the other plot
colors = {
    '6969/vanilla': {'min_samples_leaf': '#ff61c3', 'max_features': '#db72fb', 'criterion': '#d19100', 'bootstrap': '#619cff', 'max_features__min_samples_leaf': '#00c19f'},
    '6970/vanilla': {'max_depth': '#ff61cc', 'algorithm': '#7cae00', 'learning_rate': '#c77cff', 'learning_rate__max_depth': '#00a9ff', 'algorithm__max_depth': '#00bfc4'},
    '7707/kernel_rbf': {'gamma': '#ff61cc', 'gamma_tol': '#00bfc4', 'C': '#c77cff', 'C_gamma': '#00a9ff'},
    '7707/kernel_sigmoid': {'gamma': '#ff61c3', 'gamma_tol': '#00c19f', 'tol': '#93aa00', 'coef0':'#00ba38', 'C_gamma':'#db72fb'},
}


for classifier, directory_suffix in classifiers.items():
    x_vals = {}
    y_vals = {}
    area = {}
    
    directory = 'data/fanova/' + directory_suffix
    task_qualities = json.load(open('data/fanova/task_qualities.json', 'r'))
    for task_id in os.listdir(directory):
        task_dir = os.path.join(directory, task_id)
        if not os.path.isdir(task_dir):
            continue
        pimp_file = os.path.join(task_dir, 'pimp_values_fanova.json')
        interaction_file = os.path.join(task_dir, 'pimp_values_fanova_interaction.json')
        
        if not (os.path.isfile(pimp_file) and os.path.isfile(interaction_file)):
            continue
        
        hyperparameters = json.load(open(pimp_file, 'r'))
        hyperparameters.update(json.load(open(interaction_file, 'r')))
        
        most_important = max(hyperparameters, key=hyperparameters.get) 
        value = hyperparameters[most_important]
        
        if most_important not in x_vals:
            x_vals[most_important] = []
            y_vals[most_important] = []
            area[most_important] = []
        x_vals[most_important].append(float(task_qualities[task_id][x_axis_feature]))
        y_vals[most_important].append(float(task_qualities[task_id][y_axis_feature]))
        area[most_important].append(float(value) * 50)
    
    plt.figure(figsize=(8, 6))
    plotted_items = []
    legend_keys = []
    for param in x_vals.keys():
        if param in colors[directory_suffix]:
            current = plt.scatter(x_vals[param], y_vals[param], c=colors[directory_suffix][param], s=area[param], alpha=0.9)
        else:
            current = plt.scatter(x_vals[param], y_vals[param], s=area[param], alpha=1.0)
        plotted_items.append(current)
        legend_keys.append(format_name(param))
    

    legend = plt.legend(plotted_items, legend_keys, scatterpoints=1, loc='upper right')
    for idx in range(len(plotted_items)):
        legend.legendHandles[idx]._sizes = [50]
    
    print(classifier)
    # dimensions of the datasets
    plt.axis((450,100000,3,2100))
    plt.xscale("log")
    plt.yscale("log")

    plt.xlabel(x_axis_feature, fontsize='xx-large')
    plt.ylabel(y_axis_feature, fontsize='xx-large')
    plt.show()
    # plt.savefig('result_' + classifier + '.pdf', bbox_inches='tight')
    plt.close()

Verification

The plots in the submission were generated with an external plotting library. To keep this notebook self-contained and easy to follow, we simply display the dataframes with the results.


In [ ]:
for classifier, directory_suffix in classifiers.items():
    
    file = 'data/verification/' + directory_suffix + '/random_search.pkl'
    if os.path.isfile(file):
        dataframe = pickle.load(open(file, 'rb'))
        print(classifier)
        print(dataframe)

Prior experiments


In [ ]:
def plot_violin(results):
    data = []
    rows = []

    for task_id in results:
        task_results = results[task_id]
        # only consider tasks that have scores for both the kde and the uniform prior
        if len(task_results) == 2 and 'kde' in task_results and 'uniform' in task_results \
                and len(task_results['kde']) > 0 and len(task_results['uniform']) > 0:
            scores_kde = sum(task_results['kde'].values()) / len(task_results['kde'])
            scores_uniform = sum(task_results['uniform'].values()) / len(task_results['uniform'])
            current_difference = scores_kde - scores_uniform
            data.append(current_difference)
            rows.append({'task_id': task_id, 'mean_result_kde': scores_kde,
                         'mean_result_uniform': scores_uniform, 'difference': current_difference})
    dataframe = pd.DataFrame(rows, columns=['task_id', 'mean_result_kde', 'mean_result_uniform', 'difference'])
    dataframe = dataframe.set_index('task_id')

    plt.figure(figsize=(2, 6))
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    plt.plot([0.5, 1.5], [0, 0], 'k-', linestyle='--', lw=1)
    plt.violinplot(data)
    plt.show()
    plt.close()
    return dataframe


for classifier, directory_suffix in classifiers.items():
    directory = 'data/priors/' + directory_suffix
    cache_file = directory + '/cache_test.pkl'
    if not os.path.isfile(cache_file):
        raise ValueError('Could not find cache file: %s' % cache_file)
    cache_results_test = pickle.load(open(cache_file, 'rb'))
    print(classifier)
    plot_violin(cache_results_test)
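
For reference, a minimal hypothetical example of the nested structure that plot_violin expects: one entry per task, with per-run scores under the keys 'kde' and 'uniform'. The task ids and scores below are made up.


In [ ]:
# hypothetical toy input illustrating the structure plot_violin expects
toy_results = {
    'task_3': {'kde': {0: 0.84, 1: 0.86}, 'uniform': {0: 0.82, 1: 0.83}},
    'task_6': {'kde': {0: 0.71, 1: 0.70}, 'uniform': {0: 0.74, 1: 0.73}},
}
plot_violin(toy_results)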

In [ ]: