Feature distributions and inter-feature correlations

Training set distributions

The following plot shows the distributions of the feature values in the training set after transformation, truncation, and standardization (where applicable). The line shows the kernel density estimate. The human score (sc1) is also included.

Response length (length) is included if you specified length_column in the config file, unless that column had missing values or zero standard deviation.
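The eligibility conditions above are easy to express directly. Below is a minimal sketch; `length_column_usable` is a hypothetical helper written for illustration, not part of the notebook:

```python
import pandas as pd

def length_column_usable(df, length_column):
    """Return True only if the length column exists, has no
    missing values, and has non-zero standard deviation
    (the conditions described above)."""
    if length_column not in df.columns:
        return False
    lengths = df[length_column]
    if lengths.isnull().any():
        return False
    return lengths.std() > 0
```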


In [ ]:
selected_columns = features_used + ['sc1', 'spkitemid']
df_train_preproc_selected_features = df_train_preproc[selected_columns]
# df_train_length is defined only if a usable length column was
# specified in the config file, so the merge may raise a NameError
try:
    df_train_preproc_selected_features = df_train_preproc_selected_features.merge(df_train_length, on='spkitemid')
except NameError:
    column_order = sorted(features_used) + ['sc1']
else:
    column_order = sorted(features_used) + ['sc1', 'length']

df_train_preproc_melted = pd.melt(df_train_preproc_selected_features, id_vars=['spkitemid'])
df_train_preproc_melted = df_train_preproc_melted[['variable', 'value']]

# we need to reduce col_wrap and increase width if the feature names are too long
if longest_feature_name > 10:
    col_wrap = 2
    # adjust height to allow for wrapping really long names;
    # we allow 0.25in per extra line
    height = 2 + math.ceil(longest_feature_name / 30) * 0.25
    aspect = 4 / height
else:
    col_wrap = 3
    # default height for short feature names
    height = 2
    aspect = 1

with sns.axes_style('white'):
    g = sns.FacetGrid(col='variable', data=df_train_preproc_melted, col_wrap=col_wrap, 
                      col_order=column_order, sharex=False, sharey=False, height=height, 
                      aspect=aspect)
    # sns.distplot is deprecated; histplot with a KDE overlay
    # is its replacement
    g.map(sns.histplot, "value", color="grey", kde=True)
    for ax, cname in zip(g.axes.flat, g.col_names):
        ax.set_xlabel('')
        ax.tick_params(axis='x', labelrotation=90)
        plot_title = '\n'.join(wrap(str(cname), 30))
        ax.set_title(plot_title)

    # we want to try to force `tight_layout()`, but if this 
    # raises a warning, we don't want the entire notebook to fail
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        plt.tight_layout(h_pad=1.0)

    imgfile = join(figure_dir, '{}_distrib.svg'.format(experiment_id))
    plt.savefig(imgfile)
    if use_thumbnails:
        show_thumbnail(imgfile, next(id_generator))
    else:
        plt.show()

Inter-feature correlations

The following table shows the Pearson correlations between all the training features after transformation, truncation, and standardization (where applicable). The human score (sc1) is also included.
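The notebook reads these correlations from a precomputed file, but the same Pearson matrix can be obtained with a single pandas call. The frame below is toy data standing in for the processed features:

```python
import pandas as pd

# toy stand-in for the processed training features plus sc1
df = pd.DataFrame({'sc1': [1.0, 2.0, 3.0, 4.0],
                   'feat_a': [1.1, 1.9, 3.2, 3.8],
                   'feat_b': [4.0, 3.0, 2.0, 1.0]})

# full Pearson correlation matrix, features and sc1 included
df_cors = df.corr(method='pearson')
```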

Response length (length) is included if you specified length_column in the config file, unless that column had missing values or zero standard deviation.

The following values are highlighted:

  • inter-feature correlations above 0.7, and
  • sc1-feature correlations lower than 0.1 or higher than 0.7
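The highlighting is done by the color_highlighter formatter imported elsewhere in the notebook. A simplified stand-in (a sketch, not the actual rsmtool implementation; it always uses red, while the report distinguishes the two formatters by color) that wraps out-of-range values in a colored span could look like:

```python
def color_highlighter(value, low=0, high=1):
    """Wrap a correlation in a red <span> when it falls outside
    [low, high]; otherwise return it formatted to three decimals.
    Non-numeric values are passed through unchanged."""
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num < low or num > high:
        return '<span style="color: red">{:.3f}</span>'.format(num)
    return '{:.3f}'.format(num)
```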

In [ ]:
cors_file = join(output_dir, '{}_cors_processed.{}'.format(experiment_id,
                                                           file_format))
df_cors = DataReader.read_from_file(cors_file, index_col=0)
if 'length' in df_cors.columns:
    feature_columns = sorted([c for c in df_cors.columns if c not in ['sc1', 'length']])
    order = ['sc1', 'length'] + feature_columns
else:
    feature_columns = sorted([c for c in df_cors.columns if c != 'sc1'])
    order = ['sc1'] + feature_columns
    
df_cors = df_cors.reindex(index=order, columns=order)

# wrap the column names if the feature names are very long
if longest_feature_name > 10:
    column_names = ['\n'.join(wrap(c, 10)) for c in order]
else:
    column_names = order
    
df_cors.columns = column_names

# apply two different formatters to the columns, based on two
# different thresholds: the first highlights all inter-feature
# correlations > 0.7 (i.e., those not involving sc1) and the
# second highlights all sc1-feature correlations lower than 0.1
# or higher than 0.7. We use red for the first formatter and
# blue for the second.
formatter1 = partial(color_highlighter, low=-1, high=0.7)
formatter2 = partial(color_highlighter, low=0.1, high=0.7)

formatter_dict = {c: formatter1 for c in column_names if c != 'sc1'}
formatter_dict.update({'sc1': formatter2})

HTML(df_cors.to_html(classes=['sortable'], formatters=formatter_dict, escape=False))

Marginal and partial correlations

The plot below shows the correlations between the truncated and standardized (if applicable) values of each feature and the human score. The first bar (Marginal) shows the Pearson correlation. The second bar (Partial - all) shows the partial correlation after controlling for all other features. If you specified length_column in the config file, a third bar (Partial - length) shows the partial correlation of each feature with the human score after controlling for response length only. The dashed lines correspond to r = 0.1 and r = 0.7.
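The partial correlations plotted here are read from precomputed files. For intuition, the standard residual-based definition (regress both variables on the control variables, then correlate the residuals) can be sketched as follows; partial_correlation is an illustrative helper, not part of the notebook:

```python
import numpy as np

def partial_correlation(x, y, controls):
    """Pearson correlation between x and y after removing the
    linear effect of the control variables from both, computed
    via least-squares residuals."""
    x, y = np.asarray(x, float), np.asarray(y, float)
    # design matrix: intercept plus one column per control variable
    Z = np.column_stack([np.ones(len(x))] +
                        [np.asarray(c, float) for c in controls])
    # residualize x and y on the controls
    resid_x = x - Z @ np.linalg.lstsq(Z, x, rcond=None)[0]
    resid_y = y - Z @ np.linalg.lstsq(Z, y, rcond=None)[0]
    return np.corrcoef(resid_x, resid_y)[0, 1]
```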


In [ ]:
# read in and merge the score correlations 
margcor_file = join(output_dir, '{}_margcor_score_all_data.{}'.format(experiment_id, file_format))
pcor_file = join(output_dir, '{}_pcor_score_all_data.{}'.format(experiment_id, file_format))

df_margcor = DataReader.read_from_file(margcor_file, index_col=0)
df_pcor = DataReader.read_from_file(pcor_file, index_col=0)

# check if we have length partial correlations
pcor_no_length_file = join(output_dir, '{}_pcor_score_no_length_all_data.{}'.format(experiment_id,
                                                                                    file_format))
with_length = exists(pcor_no_length_file)
if with_length:
    df_pcor_no_length = DataReader.read_from_file(pcor_no_length_file, index_col=0)
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'], 
                             df_pcor.loc['All data'], 
                             df_pcor_no_length.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all', 'partial_length']
    num_entries = 3
    labels = ('Marginal', 'Partial - all', 'Partial - length')

else:
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'], 
                             df_pcor.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all']
    num_entries = 2
    labels = ('Marginal', 'Partial - all')

df_mpcor.index.name = 'feature'
df_mpcor = df_mpcor.reset_index()
df_mpcor = pd.melt(df_mpcor, id_vars=['feature'])

# we need to change the plot height if the feature names are long
if longest_feature_name > 10:
    height = 3 + math.ceil((longest_feature_name - 10)/10)
else:
    height = 3
        
# we need a higher aspect if we have more than 40 features
aspect = 9/height if len(features_used) > 40 else 6/height


# get the colors for the plot
colors = sns.color_palette("Greys", num_entries)

# extend the axis limits if there are any negative correlations
limits = (-1, 1) if (df_mpcor['value'] < 0).any() else (0, 1)


with sns.axes_style('whitegrid'):

    # generate a bar plot but without the legend since we will
    # manually add one later
    p = sns.catplot(x="feature", y="value", hue="variable", kind="bar",
                    palette=colors, data=df_mpcor, height=height,
                    aspect=aspect, legend=False)
    p.set_axis_labels('', 'Correlation with score')
    p.set_xticklabels(rotation=90)
    p.set(ylim=limits)
    
    # add a line at 0.1 and 0.7
    axis = p.axes[0][0]
    axis.axhline(y=0.1, linestyle='--', linewidth=0.5, color='black');
    axis.axhline(y=0.7, linestyle='--', linewidth=0.5, color='black');

    # create the legend manually with the right colors
    legend = axis.legend(labels=labels, title='', frameon=True, 
                         fancybox=True, ncol=num_entries)
    for i in range(num_entries):
        legend.legendHandles[i].set_color(colors[i]);

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        plt.tight_layout(h_pad=1.0)

    imgfile = join(figure_dir, '{}_cors_score.svg'.format(experiment_id))
    plt.savefig(imgfile)
    if use_thumbnails:
        show_thumbnail(imgfile, next(id_generator))
    else:
        plt.show()

In [ ]:
len_margcor_file = join(output_dir, '{}_margcor_length_all_data.{}'.format(experiment_id,
                                                                           file_format))
len_pcor_file = join(output_dir, '{}_pcor_length_all_data.{}'.format(experiment_id,
                                                                     file_format))
if exists(len_margcor_file) and exists(len_pcor_file):

    if standardize_features:
        display(Markdown("The plot below shows the marginal and partial correlations "
                         "of the truncated and standardized values of each feature "
                         "against response length."))
    else:
        display(Markdown("The plot below shows the marginal and partial correlations "
                         "of the truncated (but not standardized) values of each "
                         "feature against response length."))

    df_margcor = DataReader.read_from_file(len_margcor_file, index_col=0)
    df_pcor = DataReader.read_from_file(len_pcor_file, index_col=0)
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'], df_pcor.loc['All data']]).transpose()
    df_mpcor.index.name = 'feature'
    df_mpcor.columns = ['marginal', 'partial']
    df_mpcor = df_mpcor.reset_index()
    df_mpcor = pd.melt(df_mpcor, id_vars=['feature'])

    # we need to change the plot height if the feature names are long
    if longest_feature_name > 10:
        height = 3 + math.ceil((longest_feature_name - 10)/10)
    else:
        height = 3
        
    # we need a higher aspect if we have more than 40 features
    aspect = 9/height if len(features_used) > 40 else 6/height


    # extend the axis limits if there are any negative correlations
    limits = (-1, 1) if (df_mpcor['value'] < 0).any() else (0, 1)

    # get the colors for the plot
    colors = sns.color_palette("Greys", 2)
        
    with sns.axes_style('whitegrid'):
        
        # create a barplot but without the legend since
        # we will manually add one later
        p = sns.catplot(x="feature", y="value", hue="variable", kind="bar",
                        palette=colors, data=df_mpcor, height=height,
                        aspect=aspect, legend=False)
        p.set_axis_labels('', 'Correlation with length')
        p.set_xticklabels(rotation=90)
        p.set(ylim=limits)

        # create the legend manually with the right colors
        axis = p.axes[0][0]
        legend = axis.legend(labels=('Marginal', 'Partial - all'), title='',
                             frameon=True, fancybox=True, ncol=2)
        legend.legendHandles[0].set_color(colors[0]);
        legend.legendHandles[1].set_color(colors[1]);
        imgfile = join(figure_dir, '{}_cors_length.svg'.format(experiment_id))
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            plt.tight_layout(h_pad=1.0)

        plt.savefig(imgfile) 
        if use_thumbnails:
            show_thumbnail(imgfile, next(id_generator))
        else:
            plt.show()