Overall descriptive feature statistics


In [ ]:
# Render the table stored under out_dfs['descriptives'] as HTML;
# when that data frame is empty, show the no_info_str message instead.
if out_dfs['descriptives'].empty:
    display(Markdown(no_info_str))
else:
    display(HTML(out_dfs['descriptives'].to_html(index=True,
                                                 classes=['alternate_colors3_groups'],
                                                 float_format=float_format_func)))

Prevalence of recoded cases


In [ ]:
# Render the table stored under out_dfs['outliers'] as HTML;
# when that data frame is empty, show the no_info_str message instead.
if out_dfs['outliers'].empty:
    display(Markdown(no_info_str))
else:
    display(HTML(out_dfs['outliers'].to_html(index=True,
                                             classes=['alternate_colors3_groups'],
                                             float_format=float_format_func)))

Feature value distribution


In [ ]:
# Render the table stored under out_dfs['percentiles'] as HTML;
# when that data frame is empty, show the no_info_str message instead.
if out_dfs['percentiles'].empty:
    display(Markdown(no_info_str))
else:
    display(HTML(out_dfs['percentiles'].to_html(index=True,
                                                classes=['alternate_colors3_groups'],
                                                float_format=float_format_func)))

Correlations between feature values in the old and new models

The table shows correlations between raw feature values and human scores in the old and new models, as well as the correlations between the feature values of the two models.


In [ ]:
# Compare the training-set feature values of the old and new versions,
# restricted to the response IDs ('spkitemid') that occur in both.
missing_value_warnings = []
train_features_old = outputs_old['df_train_features']
train_features_new = outputs_new['df_train_features']

if train_features_old.empty or train_features_new.empty:
    # At least one version has no training features to compare.
    display(Markdown(no_info_str))
else:
    ids_in_both_sets = list(set(train_features_old['spkitemid']).intersection(train_features_new['spkitemid']))

    if len(ids_in_both_sets) == 0:
        display(Markdown("*WARNING: There were no matching response IDs in the training sets in old and new version*"))
    else:
        # A shared-ID count that differs from a frame's row count means
        # that frame has rows which will be left out of the comparison.
        if len(ids_in_both_sets) != len(train_features_old):
            missing_value_warnings.append("Some responses from the old data set were not present in the new data.")
        if len(ids_in_both_sets) != len(train_features_new):
            missing_value_warnings.append("Some responses from the new data set were not present in the old data.")

        # Keep only the rows whose ID is present in both versions.
        old_rows_shared = train_features_old['spkitemid'].isin(ids_in_both_sets)
        new_rows_shared = train_features_new['spkitemid'].isin(ids_in_both_sets)
        df_selected_old = train_features_old[old_rows_shared]
        df_selected_new = train_features_new[new_rows_shared]

        df_correlations = comparer.compute_correlations_between_versions(df_selected_old, df_selected_new)

        # Any exclusion notice is shown before the table it refers to.
        if missing_value_warnings:
            display(Markdown('*WARNING*: {} These responses were excluded from this analysis.'.format(' '.join(missing_value_warnings))))

        correlations_html = df_correlations[['N', 'human_old', 'human_new', 'old_new']].to_html(
            index=True,
            classes=['sortable'],
            float_format=int_or_float_format_func)
        display(HTML(correlations_html))