Flags and settings
In [1]:
SAVE_FIGURES = False
PAPER_FEATURES_V1 = ['frequency', 'aoa', 'clustering', 'phonemes_count', 'synonyms_count']
PAPER_FEATURES_V2 = ['frequency', 'aoa', 'clustering', 'letters_count', 'synonyms_count',
                     'orthographic_density']
Imports
In [2]:
import pandas as pd
import seaborn as sb
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from progressbar import ProgressBar
%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.features import SubstitutionFeaturesMixin
PAPER_FEATURE_NAMES_V1 = [SubstitutionFeaturesMixin._transformed_feature(feature).__doc__
                          for feature in PAPER_FEATURES_V1]
PAPER_FEATURE_NAMES_V2 = [SubstitutionFeaturesMixin._transformed_feature(feature).__doc__
                          for feature in PAPER_FEATURES_V2]
First build our data
In [3]:
words = set()
for feature in SubstitutionFeaturesMixin.__features__:
    words.update(getattr(SubstitutionFeaturesMixin, '_' + feature)())
words = sorted(words)
data = dict((feature, np.zeros(len(words)))
            for feature in SubstitutionFeaturesMixin.__features__)
tdata = dict((SubstitutionFeaturesMixin._transformed_feature(feature).__doc__,
              np.zeros(len(words)))
             for feature in SubstitutionFeaturesMixin.__features__)
for i, word in enumerate(ProgressBar(term_width=80)(words)):
    for feature in SubstitutionFeaturesMixin.__features__:
        data[feature][i] = getattr(SubstitutionFeaturesMixin, '_' + feature)(word)
        tfeature = SubstitutionFeaturesMixin._transformed_feature(feature)
        tdata[tfeature.__doc__][i] = tfeature(word)
data['word'] = words
tdata['word'] = words
features = pd.DataFrame(data)
tfeatures = pd.DataFrame(tdata)
del data, tdata, words
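A quick added peek at the resulting frames (a sketch, not one of the original cells):
# Added: first rows of the raw feature values; `tfeatures` holds the
# transformed counterparts, with the same 'word' column.
features.head()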
We compute the distribution of values for each feature. When a distribution is very skewed, it's a good idea to move it to a log scale.
In [4]:
g = sb.FacetGrid(pd.melt(features, id_vars='word', var_name='feature'),
                 col='feature', col_wrap=4, sharex=False, sharey=False)
g.map(sb.distplot, 'value')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_distributions-raw'),
                  bbox_inches='tight', dpi=300)
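Before drawing conclusions, here is an added quantitative companion to the visual inspection (a sketch, not an original cell): the skewness of each raw feature.
# Added check: per-feature skewness of the raw values; strongly skewed
# features are the ones that benefit from a log transform.
features.select_dtypes(include=[np.number]).skew().sort_values()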
Conclusion: all features except aoa, letters_count, phonemes_count, syllables_count, and synonyms_count should be log-transformed so that the shape of their distributions becomes visible.
The result is the following.
In [5]:
g = sb.FacetGrid(pd.melt(tfeatures, id_vars='word', var_name='feature'),
                 col='feature', col_wrap=4, sharex=False, sharey=False)
g.map(sb.distplot, 'value')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_distributions-transformed'),
                  bbox_inches='tight', dpi=300)
Which is quite a bit more readable.
Now reducing to the features we expose in the paper
In [6]:
g = sb.FacetGrid(pd.melt(tfeatures[PAPER_FEATURE_NAMES_V2 + ['word']],
                         id_vars='word', var_name='feature'),
                 col='feature', col_wrap=3, sharex=False, sharey=False)
g.map(sb.distplot, 'value')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_distributions-transformed'),
                  bbox_inches='tight', dpi=300)
First, the raw pairwise interactions between all features.
CAVEAT: because of seaborn#407, the interactions are computed only on the set of words where all features are defined (i.e. we drop all rows where any feature is NaN). So this graph is a preview, not something publishable.
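A small added check (not an original cell) of how much data that dropna actually discards:
# Added check: dataset size before and after dropping rows with any NaN.
print('{} words in total, {} with all features defined'
      .format(len(tfeatures), len(tfeatures.dropna())))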
In [7]:
g = sb.pairplot(tfeatures.dropna(), kind='reg', markers="+",
                diag_kind="kde", diag_kws=dict(shade=True))
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_interactions'),
                  bbox_inches='tight', dpi=300)
Now reducing to the features we expose in the paper
In [8]:
g = sb.pairplot(tfeatures[PAPER_FEATURE_NAMES_V2].dropna(), kind='reg', markers="+",
                diag_kind="kde", diag_kws=dict(shade=True))
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_interactions'),
                  bbox_inches='tight', dpi=300)
In [9]:
# Imports
from matplotlib import cm
from matplotlib.patches import Ellipse
def pretty_plot_correlations(data, scaling=1.0, cor_fontsize=30.0, label_fontsize=35.0):
    """Pretty-plot a correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the variables for which to plot the correlation matrix.
    scaling : float, optional
        Scaling factor for the whole figure; for instance `2.0` would
        create a figure twice as big (for better resolution when saving);
        defaults to 1.0.
    cor_fontsize : float, optional
        Font size for the correlation values; defaults to 30.0.
    label_fontsize : float, optional
        Font size for the labels; defaults to 35.0.

    """

    # Some layout definitions.
    ellipse_lwidth = 1.5
    sep_crop = 0.2

    # Precompute useful values.
    correlations = data.corr()
    n_features = len(correlations)
    lims = np.linspace(0, 1, n_features + 1)
    step = 1 / n_features
    xs = lims[:-1] + step / 2
    esize = 0.98 * step

    # Create our figure.
    fig, ax = plt.subplots(figsize=(4 * scaling, 4 * scaling))

    # For all pairs...
    for i in range(n_features):
        for j in range(n_features):
            # ...compute coordinates...
            x = xs[j]
            y = 1 - xs[i]
            cor = correlations.iloc[i, j]
            if i < j:
                # ...plot an Ellipse...
                color = cm.bwr((1 - cor) / 2)
                ax.add_patch(Ellipse([x, y], esize, esize * (1 - abs(cor)),
                                     angle=np.sign(cor) * 45, ls='solid', lw=ellipse_lwidth,
                                     ec='black', fc=color, alpha=0.8))
            elif i > j:
                # ...or show the correlation value.
                ax.text(x, y, "{:.2f}".format(cor).rstrip("0"),
                        fontsize=cor_fontsize,
                        horizontalalignment='center',
                        verticalalignment='center')

    # Add the line separators and labels.
    for i in range(n_features):
        # Line separator
        ax.plot([(i + sep_crop) * step, (i + 1 - sep_crop) * step],
                [1 - (i + 1 - sep_crop) * step, 1 - (i + sep_crop) * step],
                'k-', lw=ellipse_lwidth)
        # Horizontal label
        ax.text(- step / 2, 1 - step / 2 - i * step,
                correlations.columns[i], fontsize=label_fontsize,
                horizontalalignment='right', verticalalignment='center')
        # Vertical label
        ax.text(step / 2 + i * step, 1 + step / 2,
                correlations.columns[i], fontsize=label_fontsize, rotation='vertical',
                horizontalalignment='center', verticalalignment='bottom')

    # Finally, set the axis limits and hide the axes themselves.
    ax.set_xlim(- 0.1, 1.1)
    ax.set_ylim(- 0.1, 1.1)
    ax.axis('off')

    return fig
Now the real correlation matrix
In [10]:
fig = pretty_plot_correlations(tfeatures, scaling=2.5, cor_fontsize=10, label_fontsize=15)
if SAVE_FIGURES:
    fig.savefig(settings.FIGURE.format('all-feature_correlations'),
                bbox_inches='tight', dpi=300)
Now reducing to the features we expose in the first version of the paper
In [11]:
fig = pretty_plot_correlations(tfeatures[PAPER_FEATURE_NAMES_V1],
                               scaling=2.5 * len(PAPER_FEATURE_NAMES_V1)
                               / len(SubstitutionFeaturesMixin.__features__),
                               cor_fontsize=10, label_fontsize=15)
if SAVE_FIGURES:
    fig.savefig(settings.FIGURE.format('paper-feature_correlations-v1'),
                bbox_inches='tight', dpi=300)
Compute PCA on word features directly. This is just exploratory since we really want to compute PCA of feature variations upon substitution (see the 'variation' notebook).
In [12]:
from sklearn.decomposition import PCA
ntfeatures = tfeatures.select_dtypes(include=[np.number]).dropna()
pca = PCA(n_components='mle')
pca.fit(ntfeatures)
pcafeatures = pca.transform(ntfeatures)
print('Variance explained with first {} components (mle-estimated):'
      .format(pca.n_components_))
print(pca.explained_variance_ratio_)
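An added sketch (not an original cell): plotting the cumulative explained variance makes it easier to see how many components carry most of the variance.
# Added: cumulative explained variance ratio of the PCA components.
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')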
In [13]:
pca.components_
Out[13]:
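The raw components array is hard to read on its own; an added sketch (not an original cell) labels the loadings with the feature names.
# Added: PCA loadings as a DataFrame, one column per (numeric, transformed) feature.
pd.DataFrame(pca.components_, columns=ntfeatures.columns)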
A quick look at the first two components
In [14]:
plt.scatter(pcafeatures[:,0], pcafeatures[:,1])
Out[14]:
Compute inverted, centered, and normalised feature values so that the clustering works better: features negatively correlated with #letters are sign-flipped so that all features point in the same direction.
In [15]:
numeric_features = tfeatures.select_dtypes(include=[np.number])
inverted_features = numeric_features.copy()
for feature in numeric_features.columns:
    if feature == '#letters':
        continue
    if numeric_features[['#letters', feature]].corr().loc['#letters', feature] < 0:
        inverted_features[feature] = - inverted_features[feature]
pretty_plot_correlations(inverted_features, scaling=2.5, cor_fontsize=10, label_fontsize=15)
# Center and normalise the features.
centered_features = (inverted_features - inverted_features.mean()) / inverted_features.std()
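A quick added sanity check (not an original cell) that the centering and normalisation did what we expect:
# Added: the centered features should have mean ~0 and standard deviation ~1.
centered_features.describe().loc[['mean', 'std']]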
Compute a clustering of the features. Tinkering with different linkage and n_clusters values shows this result is quite robust.
In [16]:
from sklearn import cluster
for n_clusters in [4, 5, 6, 7, 8]:
    agglo = cluster.FeatureAgglomeration(n_clusters=n_clusters)
    agglo.fit(centered_features.dropna())
    groups = [[] for n in set(agglo.labels_)]
    for i, feature in enumerate(centered_features.columns):
        groups[agglo.labels_[i]].append(feature)
    title = ' {} groups'.format(n_clusters)
    print('-' * 70 + title)
    print()
    for i, group in enumerate(groups):
        print('{}: {}'.format(i + 1, ', '.join(group)))
    print()
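The cell above only varies n_clusters; here is a minimal added sketch (not an original cell) of the linkage check mentioned earlier.
# Added: refit the 6-group clustering with different linkage criteria to
# check that the partition is stable.
for linkage in ['ward', 'complete', 'average']:
    agglo = cluster.FeatureAgglomeration(n_clusters=6, linkage=linkage)
    agglo.fit(centered_features.dropna())
    groups = [[] for _ in set(agglo.labels_)]
    for i, feature in enumerate(centered_features.columns):
        groups[agglo.labels_[i]].append(feature)
    print(linkage, '->', ' | '.join(', '.join(group) for group in groups))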
We're going for 6 groups because:
So in our original selection, we had the right groups except that:
Here's the final set:
In [17]:
fig = pretty_plot_correlations(tfeatures[PAPER_FEATURE_NAMES_V2],
                               scaling=2.5 * len(PAPER_FEATURE_NAMES_V2)
                               / len(SubstitutionFeaturesMixin.__features__),
                               cor_fontsize=10, label_fontsize=15)
if SAVE_FIGURES:
    fig.savefig(settings.FIGURE.format('paper-feature_correlations-v2'),
                bbox_inches='tight', dpi=300)