Evaluation results

Overall association statistics


In [ ]:
markdown_str = ("The tables in this section show the standard association metrics between "
                "*observed* human scores and different types of machine scores. "
                "These results are computed on the evaluation set. `raw_trim` scores "
                "are truncated to [{}, {}]. `raw_trim_round` scores are computed by first truncating "
                "and then rounding the predicted score. Scaled scores are computed by re-scaling "
                "the predicted scores using mean and standard deviation of human scores as observed "
                "on the training data and mean and standard deviation of machine scores as predicted "
                "for the training set.".format(min_score, max_score))
display(Markdown(markdown_str))
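
For reference, a minimal sketch of these three post-processing steps, written as a stand-alone function (the function and its arguments are illustrative names, not part of this notebook; the report itself reads precomputed scores from the experiment output files):

In [ ]:
import pandas as pd

def postprocess_predictions(raw_pred, min_score, max_score,
                            h_mean_train, h_sd_train,
                            sys_mean_train, sys_sd_train):
    # Hedged sketch of the post-processing described above; all arguments
    # are illustrative names for training-set statistics and raw predictions.
    raw_trim = raw_pred.clip(lower=min_score, upper=max_score)
    raw_trim_round = raw_trim.round()
    # z-score the raw predictions with the machine-score statistics from the
    # training set, then map onto the human-score scale from the training set
    scaled = (raw_pred - sys_mean_train) / sys_sd_train * h_sd_train + h_mean_train
    return pd.DataFrame({'raw_trim': raw_trim,
                         'raw_trim_round': raw_trim_round,
                         'scale': scaled})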

Descriptive holistic score statistics

The table shows distributional properties of human and system scores. SMD values lower than -0.15 or higher than 0.15 are highlighted.

Please note that for raw scores, SMD values are likely to be affected by possible differences in scale.
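
For orientation, a hedged sketch of one common "pooled" formulation of the standardized mean difference; RSMTool supports several denominators, so the exact SMD reported in the table may be computed differently:

In [ ]:
import numpy as np

def smd_pooled(human, system):
    # Hedged sketch: difference of means divided by the pooled standard
    # deviation; the report may use a different denominator for its SMD.
    pooled_sd = np.sqrt((np.var(human, ddof=1) + np.var(system, ddof=1)) / 2)
    return (np.mean(system) - np.mean(human)) / pooled_sd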


In [ ]:
raw_or_scaled = "scaled" if use_scaled_predictions else "raw"
eval_file = join(output_dir, '{}_eval.{}'.format(experiment_id, file_format))
df_eval = DataReader.read_from_file(eval_file, index_col=0)
distribution_columns = ['N', 'h_mean', 'sys_mean', 'h_sd',  'sys_sd', 'h_min', 'sys_min', 'h_max', 'sys_max', 'SMD']
association_columns = ['N'] + [column for column in df_eval.columns if column not in distribution_columns]
df_distribution = df_eval[distribution_columns]
df_association = df_eval[association_columns]

In [ ]:
pd.options.display.width=10
formatter = partial(color_highlighter, low=-0.15, high=0.15)
HTML('<span style="font-size:95%">'+ df_distribution.to_html(classes=['sortable'], 
                                                             escape=False,
                                                             formatters={'SMD': formatter},
                                                             float_format=float_format_func) + '</span>')

Association statistics


In [ ]:
markdown_str = ['The table shows the standard association metrics between human scores and machine scores.']
if continuous_human_score:
    markdown_str.append("Note that for computation of `kappa` both human and machine scores are rounded.")
else:
    markdown_str.append("Note that for computation of `kappa` all machine scores are rounded.")

Markdown('\n'.join(markdown_str))
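
As an illustration of the rounding step, a minimal sketch using scikit-learn's `cohen_kappa_score` (the function and argument names are hypothetical; the metrics in the table above are precomputed elsewhere in the pipeline):

In [ ]:
import numpy as np
from sklearn.metrics import cohen_kappa_score

def rounded_kappa(human, machine, continuous_human_score=False):
    # Hedged sketch: kappa expects discrete labels, so machine scores (and,
    # if continuous, human scores) are rounded before computing agreement.
    machine_rounded = np.rint(np.asarray(machine)).astype(int)
    human_scores = np.asarray(human)
    if continuous_human_score:
        human_scores = np.rint(human_scores).astype(int)
    return cohen_kappa_score(human_scores, machine_rounded)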

In [ ]:
pd.options.display.width=10
HTML('<span style="font-size:95%">'+ df_association.to_html(classes=['sortable'], 
                                                            escape=False,
                                                            float_format=float_format_func) + '</span>')

Confusion matrix


In [ ]:
markdown_str = ["Confusion matrix using {}, trimmed, and rounded scores and human scores (rows=system, columns=human).".format(raw_or_scaled)]

if continuous_human_score:
    markdown_str.append("Note: Human scores have beeen rounded to the nearest integer.")
            
Markdown('\n'.join(markdown_str))

In [ ]:
confmat_file = join(output_dir, '{}_confMatrix.{}'.format(experiment_id, file_format))
df_confmat = DataReader.read_from_file(confmat_file, index_col=0)
df_confmat
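
For reference, a hedged sketch of how a matrix with this orientation could be rebuilt from raw score vectors using `pd.crosstab` (hypothetical inputs; the report displays the precomputed matrix above):

In [ ]:
import numpy as np
import pandas as pd

def confusion_matrix_sketch(human, machine_trim_round):
    # Hedged sketch: rows = system scores, columns = human scores,
    # matching the orientation of the stored matrix above.
    system_scores = pd.Series(np.asarray(machine_trim_round).astype(int), name='system')
    human_scores = pd.Series(np.rint(np.asarray(human)).astype(int), name='human')
    return pd.crosstab(system_scores, human_scores)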

Distribution of human and machine scores


In [ ]:
markdown_strs = ["The histogram and the table below show the distibution of "
                 "human scores and {}, trimmed, and rounded machine scores "
                 "(as % of all responses).".format(raw_or_scaled)]
markdown_strs.append("Differences in the table between human and machine distributions "
                     "larger than 5 percentage points are <span class='highlight_color'>highlighted</span>.")
if continuous_human_score:
    markdown_strs.append("Note: Human scores have beeen rounded to the nearest integer.")
    
display(Markdown('\n'.join(markdown_strs)))
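
As a rough illustration, a minimal sketch of how such a percentage table could be derived from score vectors (hypothetical inputs; the report reads the precomputed distribution file below):

In [ ]:
import pandas as pd

def score_distribution_sketch(human_rounded, machine_trim_round):
    # Hedged sketch: % of responses at each score point for human and
    # machine scores, plus the difference that drives the highlighting.
    dist_human = pd.Series(human_rounded).value_counts(normalize=True).sort_index() * 100
    dist_machine = pd.Series(machine_trim_round).value_counts(normalize=True).sort_index() * 100
    df_dist = pd.DataFrame({'human': dist_human,
                            'sys_trim_round': dist_machine}).fillna(0)
    df_dist['difference'] = df_dist['sys_trim_round'] - df_dist['human']
    df_dist.index.name = 'score'
    return df_dist.reset_index()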

In [ ]:
scoredist_file = join(output_dir, '{}_score_dist.{}'.format(experiment_id, file_format))
df_scoredist = DataReader.read_from_file(scoredist_file, index_col=0)
df_scoredist_melted = pd.melt(df_scoredist, id_vars=['score'])
df_scoredist_melted = df_scoredist_melted[df_scoredist_melted['variable'] != 'difference']

# get the colors for the plot
colors = sns.color_palette("Greys", 2)

with sns.axes_style('whitegrid'):

    # make a barplot without a legend since we will 
    # add one manually later
    p = sns.catplot("score", "value", "variable", kind="bar",
                    palette=colors, data=df_scoredist_melted, 
                    height=3, aspect=2, legend=False)
    p.set_axis_labels('score', '% of responses')
    
    # add a legend with the right colors
    axis = p.axes[0][0]
    legend = axis.legend(labels=('Human', 'Machine'), title='', frameon=True, fancybox=True)
    legend.legendHandles[0].set_color(colors[0])
    legend.legendHandles[1].set_color(colors[1])

    imgfile = join(figure_dir, '{}_score_dist.svg'.format(experiment_id))
    plt.savefig(imgfile)

    if use_thumbnails:
        show_thumbnail(imgfile, next(id_generator))
    else:
        plt.show()

In [ ]:
formatter = partial(color_highlighter, low=0, high=5, absolute=True)
df_html = df_scoredist.to_html(classes=['sortable'], index=False, 
                               escape=False, formatters={'difference': formatter})
display(HTML(df_html))