True score evaluations

The tables in this section show how well system scores can predict true scores. According to test theory, a true score is the score that would have been obtained if there were no measurement error. While true scores cannot be observed, the variance of true scores and the error of predicting them with system scores can be estimated from observed human scores when multiple human ratings are available for a subset of responses. In this notebook, this variance and prediction error are estimated using human scores for responses in the evaluation set.
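
To make the estimation concrete, the sketch below computes PRMSE under the simplifying assumption that every response has exactly two human ratings (`h1`, `h2`) plus a system score; the actual rsmtool computation also handles responses with a single rating, so this is an illustration of the quantities reported below rather than the implementation itself.

In [ ]:
import numpy as np

def prmse_sketch(h1, h2, system):
    """Illustrative PRMSE computation for fully double-scored responses."""
    h1, h2, system = map(np.asarray, (h1, h2, system))
    # average human score per response (observed proxy for the true score)
    h_bar = (h1 + h2) / 2
    # variance of rater errors, estimated from the disagreement between the two raters
    var_errors = np.mean((h1 - h2) ** 2) / 2
    # true score variance: variance of the averaged ratings minus the error
    # variance of an average of two ratings
    var_true = np.var(h_bar, ddof=1) - var_errors / 2
    # MSE for predicting the true score with the system score: observed MSE
    # minus the part attributable to rater error in the averaged human score
    mse_true = np.mean((h_bar - system) ** 2) - var_errors / 2
    # proportional reduction in mean squared error
    return 1 - mse_true / var_true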


In [ ]:
prmse_columns = ['N', 'N raters', 'N single', 'N multiple',
                 'Variance of errors', 'True score var',
                 'MSE true', 'PRMSE true']

def read_true_score_evals(model_list, file_format_summarize):
    true_score_evals = []
    for (model_id, model_name, config, csvdir, file_format) in model_list:
        csv_file = os.path.join(csvdir, '{}_true_score_eval.{}'.format(model_id, file_format))
        if os.path.exists(csv_file):
            df_true_score_eval_all = DataReader.read_from_file(csv_file, index_col=0)
            # figure out whether the score was scaled
            prefix = ('scale'
                      if config.get('use_scaled_predictions') or config.get('scale_with') is not None
                      else 'raw')
            # use the line that corresponds to the appropriate score (scaled or raw)
            df_true_score_eval = df_true_score_eval_all.loc[['{}_trim'.format(prefix)]].copy()
            df_true_score_eval['system score type'] = prefix
            df_true_score_eval.index = [model_name]
            true_score_evals.append(df_true_score_eval)          
    if len(true_score_evals) > 0:
        df_true_score_evals = pd.concat(true_score_evals, sort=True)
    else:
        df_true_score_evals = pd.DataFrame()
    return df_true_score_evals

df_true_score_eval = read_true_score_evals(model_list, file_format_summarize)
if not df_true_score_eval.empty:
    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   {'true_score_eval': df_true_score_eval},
                                   index=True,
                                   file_format=file_format_summarize)
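
Note that the double brackets in `.loc[['{}_trim'.format(prefix)]]` keep the selection as a one-row DataFrame rather than collapsing it to a Series, which is what lets the row be relabelled with the model name and concatenated across models. A minimal illustration (the `raw_trim`/`scale_trim` row labels follow the convention assumed above, and the values are arbitrary):

In [ ]:
import pandas as pd

df = pd.DataFrame({'MSE true': [0.25, 0.21]}, index=['raw_trim', 'scale_trim'])
df.loc['scale_trim']     # a Series indexed by the column names
df.loc[['scale_trim']]   # a one-row DataFrame that keeps the row label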

In [ ]:
if not df_true_score_eval.empty:
    markdown_strs = ["#### Proportional reduction in mean squared error (PRMSE)"]
    markdown_strs.append("The table shows the variance of human rater errors, "
                         "the true score variance, the mean squared error (MSE), and "
                         "the proportional reduction in mean squared error (PRMSE) for "
                         "predicting the true score with the system score.")
    display(Markdown('\n'.join(markdown_strs)))
    pd.options.display.width = 10
    df_prmse = df_true_score_eval[prmse_columns].copy()
    df_prmse.replace({np.nan: '-'}, inplace=True)
    display(HTML('<span style="font-size:95%">'+ df_prmse.to_html(classes=['sortable'], 
                                                                  escape=False,
                                                                  float_format=float_format_func) + '</span>'))
else:
    display(Markdown("No information available for any of the models"))
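
For reference, the "PRMSE true" column relates the "MSE true" and "True score var" columns through the usual test-theory definition (stated here in general form; the exact estimates are computed by rsmtool when it writes the `*_true_score_eval` files):

$$\mathrm{PRMSE} = 1 - \frac{\mathrm{MSE}_{\mathrm{true}}}{\sigma^2_{\mathrm{true}}}$$

where $\mathrm{MSE}_{\mathrm{true}}$ is the mean squared error of the system score as a predictor of the true score and $\sigma^2_{\mathrm{true}}$ is the variance of true scores.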