True score evaluations

In [ ]:
# Render the true-score evaluation section only when the corresponding
# data frame actually contains results; otherwise the cell produces no output.
if not out_dfs['true_score_evaluations'].empty:

    # Imported locally so that rendering the explanatory text does not rely on
    # `Markdown` having been brought into scope by another cell.
    from IPython.display import Markdown

    # Explanatory text shown above the table, one list entry per paragraph.
    markdown_strs = []
    markdown_strs.append("The tables in this section show how well system scores can "
                        "predict *true* scores. According to Test theory, a *true* score "
                        "is a score that would have been obtained if there were no errors "
                        "in measurement. While true scores cannot be observed, the variance "
                        "of true scores and the prediction error can be estimated using observed "
                        "human scores when multiple human ratings are available for a subset of "
                        "responses. In this notebook these are estimated using human scores for "
                        "responses in the evaluation set.")
    markdown_strs.append("The table shows variance of human rater errors, "
                         "true score variance, mean squared error (MSE) and "
                         "proportional reduction in mean squared error (PRMSE) for "
                         "predicting a true score with system score.")

    # The strings above were previously built but never displayed. A blank
    # line between entries makes each one render as its own paragraph.
    display(Markdown('\n\n'.join(markdown_strs)))

    # Columns reported in the table, in display order.
    prmse_columns = ['version', 'N', 'N raters', 'N single', 'N multiple',
                     'Variance of errors', 'True score var',
                     'MSE true', 'PRMSE true']

    # Column selection followed by a non-inplace `replace` yields a new frame,
    # so the source frame in `out_dfs` is left untouched. Missing values are
    # shown as '-' in the rendered table.
    df_prmse = out_dfs['true_score_evaluations'][prmse_columns].replace({np.nan: '-'})

    # `float_format_func` is defined elsewhere in the notebook (not visible
    # here); it is passed through to pandas for formatting float cells.
    table_html = df_prmse.to_html(classes=['sortable'],
                                  escape=False,
                                  index=False,
                                  float_format=float_format_func)
    display(HTML('<span style="font-size:95%">' + table_html + '</span>'))