The tables in this section show the standard association metrics between human scores and different types of machine scores. These results are computed on the evaluation set. The scores for each model have been truncated to the values indicated in the `truncation range` column.
When indicated, scaled scores are computed by re-scaling the predicted scores using the mean and standard deviation of human scores as observed on the training data and the mean and standard deviation of machine scores as predicted for the training set.
In [ ]:
def read_evals(model_list, file_format_summarize):
    """
    Collect the short evaluation results of several models in one data frame.

    For every model whose ``<model_id>_eval_short.<file_format>`` file exists
    in its output directory, the file is read, re-indexed by the model name,
    and annotated with two extra columns: ``system score type`` (``'scale'``
    or ``'raw'``) and ``truncation range``. Models without such a file are
    skipped.

    Parameters
    ----------
    model_list : iterable of tuple
        Tuples of ``(model_id, model_name, config, csvdir, file_format)``.
        ``config`` only needs to provide ``.get(key)`` and
        ``.get(key, default)``.
    file_format_summarize : str
        Not used inside this function; kept so existing callers keep working.

    Returns
    -------
    df_evals : pandas.DataFrame
        One row per model that had an evaluation file; an empty data frame
        if no model had one.
    has_missing_trims : bool
        ``True`` if ``trim_min`` and/or ``trim_max`` could not be determined
        for at least one model, in which case the corresponding end of the
        truncation range is shown as ``?``.
    """
    has_missing_trims = False
    evals = []

    for (model_id, model_name, config, csvdir, file_format) in model_list:
        csv_file = os.path.join(csvdir, '{}_eval_short.{}'.format(model_id, file_format))

        # nothing to report for models that have no evaluation file
        if not os.path.exists(csv_file):
            continue

        df_eval = DataReader.read_from_file(csv_file, index_col=0)
        df_eval.index = [model_name]

        # figure out whether the score was scaled; the explicit comparison to
        # `True` is kept on purpose so that only a true-valued flag counts
        uses_scaling = (config.get('use_scaled_predictions') == True  # noqa: E712
                        or config.get('scale_with') is not None)
        df_eval['system score type'] = 'scale' if uses_scaling else 'raw'

        # we want to display the truncation range: first look into the
        # post-processing params file, if there is one ...
        trim_min, trim_max = None, None
        postproc_file = os.path.join(csvdir,
                                     '{}_postprocessing_params.{}'.format(model_id, file_format))
        if os.path.exists(postproc_file):
            df_postproc = DataReader.read_from_file(postproc_file)
            trim_min = df_postproc['trim_min'].values[0]
            trim_max = df_postproc['trim_max'].values[0]

        # ... and fall back to the config for anything still missing. Values
        # coming out of a data frame are usually NaN rather than None when
        # missing, so `pd.isna` is used to catch both cases.
        if pd.isna(trim_min):
            trim_min = config.get('trim_min')
        if pd.isna(trim_max):
            trim_max = config.get('trim_max')

        # finally, compute the min and max scores; if a trim value could not
        # be found anywhere, show `?` and remember that something is missing
        trim_tolerance = config.get('trim_tolerance', 0.4998)
        if pd.isna(trim_min):
            min_score, has_missing_trims = '?', True
        else:
            min_score = float(trim_min) - trim_tolerance
        if pd.isna(trim_max):
            max_score, has_missing_trims = '?', True
        else:
            max_score = float(trim_max) + trim_tolerance
        df_eval['truncation range'] = "[{}, {}]".format(min_score, max_score)

        # rename the columns to remove reference to scale/raw scores
        df_eval.columns = [col.split('.')[0] if 'round' not in col
                           else '{} (rounded)'.format(col.split('.')[0])
                           for col in df_eval.columns]
        evals.append(df_eval)

    df_evals = pd.concat(evals, sort=True) if evals else pd.DataFrame()
    return df_evals, has_missing_trims
# gather the evaluation results for all models in `model_list`
df_eval, has_missing_trims = read_evals(model_list, file_format_summarize)

# let the reader know why some truncation ranges contain `?`
if has_missing_trims:
    display(Markdown('**Note:** The minimum and/or maximum scores after truncation could not '
                     'be computed in some cases. This is because `trim_min` and/or `trim_max` '
                     'could not be found in either the configuration file or the postprocessing '
                     'parameters file. Scores that could not be computed are shown as `?`.'))

# save the combined table as the `eval_short` output of the summary,
# but only if there is something to save
if not df_eval.empty:
    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   {'eval_short': df_eval},
                                   index=True,
                                   file_format=file_format_summarize)
In [ ]:
# narrow pandas display width for this table
pd.options.display.width = 10

# formatter applied to the SMD column: `color_highlighter` with the
# thresholds low=-0.15 and high=0.15 bound in advance
formatter = partial(color_highlighter, low=-0.15, high=0.15)

if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    # columns shown in the descriptive-statistics table, in display order
    descriptive_columns = ['N', 'system score type', "truncation range",
                           'h_mean', 'h_sd', 'sys_mean', 'sys_sd', 'SMD']
    descriptives_html = df_eval[descriptive_columns].to_html(
        index=True,
        classes=['sortable'],
        escape=False,
        formatters={'SMD': formatter},
        float_format=int_or_float_format_func)
    display(HTML(descriptives_html))
In [ ]:
if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    # use the plain weighted-kappa column when the table has one and
    # fall back to the rounded variant otherwise
    wtkappa_col = 'wtkappa' if 'wtkappa' in df_eval else 'wtkappa (rounded)'

    # columns shown in the association-metrics table, in display order
    association_columns = ['N', 'system score type', 'corr', 'R2', 'RMSE',
                           wtkappa_col, 'kappa (rounded)',
                           'exact_agr (rounded)', 'adj_agr (rounded)']
    association_html = df_eval[association_columns].to_html(
        index=True,
        classes=['sortable'],
        escape=False,
        float_format=int_or_float_format_func)
    display(HTML(association_html))