The following plot shows the distributions of the feature values in
the training set, after transformation (if applicable), truncation,
and standardization (if applicable). The line shows the kernel density estimate.
The human score (`sc1`) is also included.
Response length (`length`) is included if you specified `length_column`
in the config file, unless
the column had missing values or a standard deviation <= 0.
In [ ]:
# Plot the distribution of each preprocessed feature (plus the human
# score and, when available, response length) for the training set.
selected_columns = features_used + ['sc1', 'spkitemid']
df_train_preproc_selected_features = df_train_preproc[selected_columns]
try:
    df_train_preproc_selected_features = df_train_preproc_selected_features.merge(df_train_length, on='spkitemid')
except NameError:
    # `df_train_length` only exists when a usable length column was
    # specified in the config file, so a NameError means "no length".
    column_order = sorted(features_used) + ['sc1']
else:
    column_order = sorted(features_used) + ['sc1', 'length']

# melt to long format so each (variable, value) pair is one row
df_train_preproc_melted = pd.melt(df_train_preproc_selected_features, id_vars=['spkitemid'])
df_train_preproc_melted = df_train_preproc_melted[['variable', 'value']]

# we need to reduce col_wrap and increase width if the feature names are too long
if longest_feature_name > 10:
    col_wrap = 2
    # adjust height to allow for wrapping really long names. We allow 0.25 in per line
    height = 2 + (math.ceil(longest_feature_name / 30) * 0.25)
    aspect = 4 / height
else:
    col_wrap = 3
    # BUG FIX: `height` was previously unassigned on this branch even
    # though FacetGrid below always receives `height=height`; that raised
    # a NameError (or reused a stale global) whenever all feature names
    # were short. Use a fixed facet height of 3 inches with aspect 1.
    height = 3
    aspect = 1

with sns.axes_style('white'):
    g = sns.FacetGrid(col='variable', data=df_train_preproc_melted, col_wrap=col_wrap,
                      col_order=column_order, sharex=False, sharey=False, height=height,
                      aspect=aspect)
    g.map(sns.distplot, "value", color="grey", kde=False)
    for ax, cname in zip(g.axes, g.col_names):
        labels = ax.get_xticks()
        ax.set_xlabel('')
        ax.set_xticklabels(labels, rotation=90)
        # wrap long feature names so subplot titles stay readable
        plot_title = '\n'.join(wrap(str(cname), 30))
        ax.set_title(plot_title)

    # we want to try to force `tight_layout()`, but if this
    # raises a warning, we don't want the entire notebook to fail
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        plt.tight_layout(h_pad=1.0)

    imgfile = join(figure_dir, '{}_distrib.svg'.format(experiment_id))
    plt.savefig(imgfile)

if use_thumbnails:
    show_thumbnail(imgfile, next(id_generator))
else:
    plt.show()
The following table shows the Pearson correlations between all the training features
after transformation (if applicable), truncation, and standardization (if applicable).
The human score (`sc1`) is also included.
Response length (`length`) is included if
you specified `length_column`
in the config file, unless the column had missing
values or a standard deviation <= 0.
The following values are highlighted:
`sc1`-feature correlations lower than 0.1 or higher than 0.7.
In [ ]:
# Display the processed feature/score correlation matrix as a sortable
# HTML table, highlighting problematic correlation values.
cors_file = join(output_dir, '{}_cors_processed.{}'.format(experiment_id,
                                                           file_format))
df_cors = DataReader.read_from_file(cors_file, index_col=0)

# order the rows/columns: sc1 first (then length, if present),
# followed by the features sorted alphabetically
if 'length' in df_cors.columns:
    feature_columns = sorted([c for c in df_cors.columns if c not in ['sc1', 'length']])
    order = ['sc1', 'length'] + feature_columns
else:
    feature_columns = sorted([c for c in df_cors.columns if c != 'sc1'])
    order = ['sc1'] + feature_columns
df_cors = df_cors.reindex(index=order, columns=order)

# wrap the column names if the feature names are very long
if longest_feature_name > 10:
    column_names = ['\n'.join(wrap(c, 10)) for c in order]
else:
    column_names = order
df_cors.columns = column_names

# apply two different formattings to the columns according
# to two different thresholds. The first one highlights all
# inter-feature correlations > 0.7 (so, not including sc1)
# and the second highlights all sc1-X correlations lower
# than 0.1 and higher than 0.7. We will use red for the
# first formatting and blue for the second one.
formatter1 = partial(color_highlighter, low=-1, high=0.7)
formatter2 = partial(color_highlighter, low=0.1, high=0.7)
# NOTE: wrapping never changes 'sc1' itself since it is shorter
# than the 10-character wrap width.
formatter_dict = {c: formatter1 for c in column_names if c != 'sc1'}
formatter_dict.update({'sc1': formatter2})
HTML(df_cors.to_html(classes=['sortable'], formatters=formatter_dict, escape=False))
The plot below shows correlations between truncated and standardized (if applicable)
values of each feature against human score. The first bar (`Marginal`) in each case
shows Pearson's correlation. The second bar (`Partial - all`) shows partial
correlations after controlling for all other variables. If you specified
`length_column` in the config file, a third bar (`Partial - length`) will show
partial correlations of each feature against the human score after controlling for
length. The dotted lines correspond to r = 0.1 and r = 0.7.
In [ ]:
# Bar plot of marginal and partial correlations of each feature with
# the human score, with reference lines at r = 0.1 and r = 0.7.

# read in and merge the score correlations
margcor_file = join(output_dir, '{}_margcor_score_all_data.{}'.format(experiment_id, file_format))
pcor_file = join(output_dir, '{}_pcor_score_all_data.{}'.format(experiment_id, file_format))
df_margcor = DataReader.read_from_file(margcor_file, index_col=0)
df_pcor = DataReader.read_from_file(pcor_file, index_col=0)

# check if we have length partial correlations
pcor_no_length_file = join(output_dir,
                           '{}_pcor_score_no_length_all_data.{}'.format(experiment_id,
                                                                        file_format))
with_length = exists(pcor_no_length_file)

if with_length:
    df_pcor_no_length = DataReader.read_from_file(pcor_no_length_file, index_col=0)
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'],
                             df_pcor.loc['All data'],
                             df_pcor_no_length.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all', 'partial_length']
    num_entries = 3
    labels = ('Marginal', 'Partial - all', 'Partial - length')
else:
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'],
                             df_pcor.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all']
    num_entries = 2
    # CONSISTENCY FIX: this label previously read 'Partial (all)', which
    # did not match the 'Partial - all' style used in the three-bar
    # branch above and in the report text describing this plot.
    labels = ('Marginal', 'Partial - all')

df_mpcor.index.name = 'feature'
df_mpcor = df_mpcor.reset_index()
df_mpcor = pd.melt(df_mpcor, id_vars=['feature'])

# we need to change the plot height if the feature names are long
if longest_feature_name > 10:
    height = 3 + math.ceil((longest_feature_name - 10) / 10)
else:
    height = 3

# we need a higher aspect if we have more than 40 features
aspect = 9 / height if len(features_used) > 40 else 6 / height

# get the colors for the plot
colors = sns.color_palette("Greys", num_entries)

# extend the y-axis below zero only if there are negative correlations
limits = (-1, 1) if (df_mpcor.value < 0).any() else (0, 1)

with sns.axes_style('whitegrid'):
    # generate a bar plot but without the legend since we will
    # manually add one later
    p = sns.catplot("feature", "value", "variable", kind="bar",
                    palette=colors, data=df_mpcor, height=height,
                    aspect=aspect, legend=False)
    p.set_axis_labels('', 'Correlation with score')
    p.set_xticklabels(rotation=90)
    p.set(ylim=limits)

    # add a line at 0.1 and 0.7
    axis = p.axes[0][0]
    axis.axhline(y=0.1, linestyle='--', linewidth=0.5, color='black')
    axis.axhline(y=0.7, linestyle='--', linewidth=0.5, color='black')

    # create the legend manually with the right colors
    legend = axis.legend(labels=labels, title='', frameon=True,
                         fancybox=True, ncol=num_entries)
    for i in range(num_entries):
        legend.legendHandles[i].set_color(colors[i])

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        plt.tight_layout(h_pad=1.0)

    imgfile = join(figure_dir, '{}_cors_score.svg'.format(experiment_id))
    plt.savefig(imgfile)

if use_thumbnails:
    show_thumbnail(imgfile, next(id_generator))
else:
    plt.show()
In [ ]:
# Bar plot of marginal and partial correlations of each feature with
# response length; only produced when the length-correlation output
# files exist (i.e. a length column was specified in the config file).
len_margcor_file = join(output_dir,
                        '{}_margcor_length_all_data.{}'.format(experiment_id,
                                                               file_format))
len_pcor_file = join(output_dir,
                     '{}_pcor_length_all_data.{}'.format(experiment_id,
                                                         file_format))

if exists(len_margcor_file) and exists(len_pcor_file):
    if standardize_features:
        display(Markdown("The plot below shows the same correlations between truncated and "
                         "standardized values of each feature against length."))
    else:
        display(Markdown("The plot below shows the same correlations between truncated and "
                         "un-standardized values of each feature against length."))

    df_margcor = DataReader.read_from_file(len_margcor_file, index_col=0)
    df_pcor = DataReader.read_from_file(len_pcor_file, index_col=0)

    # combine the marginal and partial correlations into a single
    # long-format frame for plotting
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'],
                             df_pcor.loc['All data']]).transpose()
    df_mpcor.index.name = 'feature'
    df_mpcor.columns = ['marginal', 'partial']
    df_mpcor = pd.melt(df_mpcor.reset_index(), id_vars=['feature'])

    # taller plot when the feature names are long
    height = 3 + math.ceil((longest_feature_name - 10) / 10) if longest_feature_name > 10 else 3

    # wider plot when there are more than 40 features
    aspect = (9 if len(features_used) > 40 else 6) / height

    # extend the y-axis below zero only when needed
    limits = (0, 1)
    if len(df_mpcor[df_mpcor.value < 0]):
        limits = (-1, 1)

    # two grey shades: one per bar type
    colors = sns.color_palette("Greys", 2)
    with sns.axes_style('whitegrid'):
        # draw the bars without a legend; we build one by hand below
        p = sns.catplot("feature", "value", "variable", kind="bar",
                        palette=colors, data=df_mpcor, height=height,
                        aspect=aspect, legend=False)
        p.set_axis_labels('', 'Correlation with length')
        p.set_xticklabels(rotation=90)
        p.set(ylim=limits)

        # hand-made legend so the swatches use our grey palette
        axis = p.axes[0][0]
        legend = axis.legend(labels=('Marginal', 'Partial - all'), title='',
                             frameon=True, fancybox=True, ncol=2)
        for handle, color in zip(legend.legendHandles, colors):
            handle.set_color(color)

        imgfile = join(figure_dir, '{}_cors_length.svg'.format(experiment_id))

        # force tight_layout, but keep any warning it raises from
        # failing the notebook
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            plt.tight_layout(h_pad=1.0)
        plt.savefig(imgfile)

    if use_thumbnails:
        show_thumbnail(imgfile, next(id_generator))
    else:
        plt.show()