Decision-tree-based parameter ranges


In [1]:
%matplotlib inline

import pandas as pd
from matplotlib import pyplot as plt

from atntools.trees import *
from atntools.tree_ranges import * 

set_number = 47

tree_file = '../../data/set{}/weka-output-j48.txt'.format(set_number)
feature_file = '../../data/set{0}/features.set{0}.labelled.csv'.format(set_number)

tree = parse_weka_j48_output_file(tree_file)
instances = pd.read_csv(feature_file)

# for plotting parameter values of best instances; does not affect other computations
instances.sort_values('environmentScoreSlope200', ascending=False, inplace=True)

distributions = get_distributions(tree, instances)
range_weights = get_range_weights(distributions)

min_weight = 1.0
max_weight = -1.0
for segments in range_weights.values():
    for low, high, weight in segments:
        min_weight = min(weight, min_weight)
        max_weight = max(weight, max_weight)

legend_displayed = False
num_params = len(distributions)
plt.figure(figsize=(9, 3 * num_params))

for i, (param, distribution) in enumerate(sorted(distributions.items())):
    
    ax = plt.subplot(num_params, 2, i * 2 + 1)
    plt.title(param + " - simulation outcomes")
    plt.xlabel(param + " parameter value")
    plt.ylabel("number of simulations")
    left = []; width = []
    bottom_unlabeled = []; bottom_good = []
    height_bad = []; height_unlabeled = []; height_good = []
    #max_instances = 0
    for low, high, good, bad, unlabeled in distribution:
        left.append(low)
        width.append(high - low)
        bottom_unlabeled.append(bad)
        bottom_good.append(bad + unlabeled)
        height_bad.append(bad)
        height_unlabeled.append(unlabeled)
        height_good.append(good)
        #max_instances = max(max_instances, good + bad + unlabeled)
    #ax.set_ylim(0, max_instances)
    plt.bar(left=left, bottom=bottom_good, height=height_good, width=width, color='green', label='good')
    plt.bar(left=left, height=height_bad, width=width, color='orange', label='bad')
    
    plt.bar(left=left, bottom=bottom_unlabeled, height=height_unlabeled, width=width, color='lightgray', label='unlabeled')
    #plt.bar(left=left, bottom=bottom_unlabeled, height=height_unlabeled, width=width, color='white', hatch='/', label='unlabeled')
    
    if not legend_displayed:
        plt.legend()
        legend_displayed = True
    
    ranges = range_weights[param]
    ax = plt.subplot(num_params, 2, i * 2 + 2)
    plt.title(param + " - parameter range scores")
    plt.xlabel(param + " parameter value")
    plt.ylabel("P(good) - P(bad)")
    ax.set_ylim(min_weight - 0.1, max_weight + 0.1)
    left = []; height = []; width = []; color = []

    for low, high, weight in ranges:
        left.append(low)
        height.append(weight)
        width.append(high - low)
        color.append('green' if weight > 0 else 'orange')
    plt.bar(left=left, height=height, width=width, color=color)
    
    
    # Plot best instances
    for i in range(5):
        plt.plot([instances[param].iloc[i]], [0], 'co', markersize=(16-3*i))
        
    #plt.savefig('tree-range_set-{}_{}'.format(set_number, param), dpi=600)
    
plt.tight_layout()



In [ ]: