Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers below also write each figure to the path
# built from `settings.FIGURE`.
SAVE_FIGURES = False

# Features shown in the "paper" variants of the figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not used in the cells visible here; presumably the number of
# PCA components used further down (PCA is imported) -- confirm.
N_COMPONENTS = 3

# Number of bins first attempted when binning source feature values; the
# plotting helpers fall back to fewer bins if binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed in this notebook.
model = Model(time=Time.discrete, source=Source.majority, past=Past.all,
              durl=Durl.exclude_past, max_distance=1)

# Fetch only the ids first, so that each substitution can then be loaded in
# its own short-lived session below.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# One record per (substitution, feature) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Absolute, then sentence-relative (source, destination) values.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis references: overall feature average (h0)
                # and average over synonyms of the source word (h0n).
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                    feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                    feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            }
            data.append(record)

original_variations = pd.DataFrame(data)
# The list of dicts is no longer needed once the frame is built.
del data


Got 7196 substitutions for model Model(time=Time.discrete, source=Source.majority, past=Past.all, durl=Durl.exclude_past, max_distance=1)
100% (7196 of 7196) |######################| Elapsed Time: 0:02:02 Time: 0:02:02

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding the feature values that get averaged and, further down,
# masked. The grouping keys are re-inserted as columns by `as_index=False`,
# so they must not be selected here (taking the mean of the string column
# `feature` is an error in recent pandas).
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-stage averaging:
# 1. average the rows sharing the same destination, occurrence, position
#    and feature;
# 2. average those results per cluster and feature.
# Columns are selected with a list: tuple-style selection on a GroupBy
# (`gb['a', 'b']`) was removed in pandas 1.0.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# NOTE(review): the `variation` column computed above is left untouched for
# these rows -- confirm this is intended.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
def _feature_doc(feature):
    """Sort key: docstring of the object `_transformed_feature` returns."""
    return Substitution._transformed_feature(feature).__doc__


# Features ordered by the docstring that is also used as their plot title.
ordered_features = sorted(Substitution.__features__, key=_feature_doc)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared with the mean of `h0`
    (row ``H_0``) and with the mean of `h0n` (row ``H_00``). A cell shows
    ``*``, ``**`` or ``***`` when the null mean lies outside the Student-t
    confidence interval around the bin mean at the 5%, 1% or 0.1% level
    respectively, and ``ns.`` otherwise (including when the bin holds fewer
    than two values, so that no interval can be computed).

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        0-based bin index of each observation. NaN entries (observations
        that fell in no bin) are ignored.
    h0, h0n, values : pandas.Series
        Null values and measured values, aligned with `bins`.
    """
    # `bins` is float-typed as soon as it contains a NaN, so cast before
    # using the count in `range()`.
    bin_count = int(bins.max()) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    alphas = [.05, .01, .001]
    masks = [bins == i for i in range(bin_count)]

    # Bin means and confidence half-widths depend only on `values`, so they
    # are computed once and shared by both null rows.
    bin_values = np.zeros(bin_count)
    cis = np.full((bin_count, len(alphas)), np.nan)
    for i, mask in enumerate(masks):
        in_bin = values[mask]
        bin_values[i] = in_bin.mean()
        # Count only the values that actually enter the mean and the std.
        n = int(pd.notnull(in_bin).sum())
        if n > 1:
            # Standard error of the mean: the ddof=1 standard deviation
            # divided by sqrt(n). (Dividing by sqrt(n - 1) would apply
            # Bessel's correction twice.)
            sem = in_bin.std(ddof=1) / np.sqrt(n)
            cis[i] = [stats.t.ppf(1 - alpha / 2, n - 1) * sem
                      for alpha in alphas]

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[mask].mean() for mask in masks])

        print(null_name + ' |', end='')
        # Comparisons against a NaN half-width are False, i.e. 'ns.'.
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level passed gives the star count.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot binned destination vs. source feature values for one feature.

    Draws, on the current matplotlib axes, the mean destination value per
    source bin with a 95% confidence band, the two null references (`h0`,
    `h0n`) and the `y = x` line, then prints the significance table for the
    same bins via `print_significance`.

    The keyword-only interface matches how `plot_grid` invokes it through
    `FacetGrid.map_dataframe`.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows to plot, with columns `source`, `destination`, `h0`, `h0n` (or
        their `_rel` counterparts) and the `feature_field` column.
    color : optional
        Line color (default ``'blue'``).
    relative : bool, optional
        Use the `_rel` columns instead of the absolute ones.
    quantiles : bool, optional
        Bin with `pd.qcut` (quantile bins) instead of `pd.cut` (fixed-width
        bins).
    feature_field : str, optional
        Column whose first value names the significance table (default
        ``'feature'``).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message rather than an unbound-variable
        # error further down.
        raise ValueError("Could not bin 'source{}' values, even into a "
                         "single bin".format(rel))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values. Half-widths stay NaN for bins with fewer than two
    # values, for which no interval can be computed.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.full(bin_count, np.nan)
    for i in range(bin_count):
        indices = x_bins == i
        y_bin = y[indices]
        # Count only the values that enter the mean and the std.
        n = y_bin.count()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y_bin.mean()
        if n > 1:
            # Standard error of the mean is the ddof=1 standard deviation
            # over sqrt(n), not sqrt(n - 1).
            cis[i] = (stats.t.ppf(.975, n - 1) * y_bin.std(ddof=1)
                      / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    faded = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=faded, alpha=0.2)
    plt.plot(middles, h0s, '--', color=faded,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=faded,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=faded,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. The two sides are checked
    # independently so that nulls overflowing on both sides stay visible.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one facet per feature with `plot_function`, 3 facets per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to plot; only rows whose `feature_field` is in `features` are
        kept.
    features : list
        Feature names, in facet (and hue) order.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Called through `FacetGrid.map_dataframe` for each facet.
    xlabel, ylabel : str
        Axis labels applied to the facets.
    feature_field : str
        Column used to split the facets.
    plot_kws : dict
        Extra keyword arguments forwarded to `plot_function`.
    """
    is_selected = data[feature_field].map(lambda f: f in features)
    grid = sb.FacetGrid(data=data[is_selected],
                        col=feature_field, col_order=features,
                        hue=feature_field, hue_order=features,
                        col_wrap=3, aspect=1.5, size=3,
                        sharex=False, sharey=False)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if legend:
            # Only axes that got something plotted are restyled/retitled.
            legend_frame = legend.get_frame()
            legend_frame.set_facecolor('#f2f2f2')
            legend_frame.set_edgecolor('#000000')
            # Swap the raw feature name for the feature's docstring.
            transformed = Substitution._transformed_feature(ax.get_title())
            ax.set_title(transformed.__doc__)

    if not SAVE_FIGURES:
        return
    grid.fig.savefig(settings.FIGURE.format(filename),
                     bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (bin mean minus `h0n` mean) for one feature.

    The bias is divided by the absolute mean of the per-bin `h0` means, and
    the bins are spread evenly over [0, 1] on the x axis, so that several
    features can share one set of axes.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature, with columns `feature`, `source`,
        `destination`, `h0`, `h0n` (or their `_rel` counterparts).
    color
        Line color.
    ci : bool
        Also draw the 95% confidence band.
    relative : bool
        Use the `_rel` columns instead of the absolute ones.
    quantiles : bool
        Bin with `pd.qcut` (quantile bins) instead of `pd.cut` (fixed-width
        bins).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message rather than an unbound-variable
        # error further down.
        raise ValueError("Could not bin 'source{}' values of feature {!r}, "
                         "even into a single bin".format(rel, feature))

    # Compute bin values. Half-widths stay NaN for bins with fewer than two
    # values, for which no interval can be computed.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.full(bin_count, np.nan)
    for i in range(bin_count):
        indices = x_bins == i
        y_bin = y[indices]
        # Count only the values that enter the mean and the std.
        n = y_bin.count()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y_bin.mean()
        if n > 1:
            # Standard error of the mean is the ddof=1 standard deviation
            # over sqrt(n), not sqrt(n - 1).
            cis[i] = (stats.t.ppf(.975, n - 1) * y_bin.std(ddof=1)
                      / np.sqrt(n))

    # Plot.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions, (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one `plot_function` curve per feature on a single figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Data with a `feature` column; rows with missing values are dropped
        per feature before plotting.
    features : list
        Feature names to plot, one palette color each.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Seaborn palette from which the colors are drawn.
    plot_function : callable
        Called as ``plot_function(ax, feature_data, color=..., **plot_kws)``.
    title, xlabel, ylabel : str
        Axes title and labels.
    plot_kws : dict
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes that were drawn on.
    """
    colors = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))

    for feature, feature_color in zip(features, colors):
        feature_data = data[data.feature == feature].dropna()
        plot_function(ax, feature_data, color=feature_color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        fig.savefig(target, bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *   | ns. |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | **  | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | **  | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# All features, absolute values, fixed-width bins; no confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False});



In [13]:
# Paper features only, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | **  | ns. |

In [14]:
# Paper features only, absolute values, fixed-width bins, with a fixed
# y range.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'))
bias_ax.set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, absolute values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | ns. |
H_00 | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | **  | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *   | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [16]:
# All features, absolute values, quantile bins; no confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# Paper features only, absolute values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *   | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | **  |

In [18]:
# Paper features only, absolute values, quantile bins, with a fixed y range.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True})
bias_ax.set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | *   | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | ns. | ns. |
H_00 | ns. | *** | *** | **  |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | ns. | **  | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *   | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
# All features, sentence-relative values, fixed-width bins; no confidence
# bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True});



In [21]:
# Paper features only, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *   | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | *   | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | ns. | ns. |
H_00 | ns. | *** | *** | **  |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [22]:
# Paper features only, sentence-relative values, fixed-width bins, with a
# fixed y range.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True})
bias_ax.set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | *** | *** |
H_00 | **  | ns. | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [24]:
# All features, sentence-relative values, quantile bins; no confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features only, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [26]:
# Paper features only, sentence-relative values, quantile bins.
# Uses the 'deep' palette like the other paper-feature overlays, so that each
# feature keeps the same color across the paper figures.
# NOTE(review): unlike the other paper overlays, no fixed y range is set
# here -- confirm whether one is wanted.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) feature space.

    Written to be passed to `FacetGrid.map_dataframe`, which supplies the
    facet's rows as `data` and the hue colour as `color`.

    Source words are binned on a regular grid over their absolute feature
    value (x axis) and their sentence-relative feature value (y axis). In
    each grid cell, the arrow is the mean source -> destination displacement
    and the colour is the mean length of the individual displacements.
    Two reference lines are added: y = 0 (the sentence average) and
    x = h0 (taken from the first row of the 'h0' column).

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : optional
        Base colour of the reference lines (defaults to 'blue').
    bin_count : int, optional
        Number of bins along each axis (defaults to 4).
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning. Bin centres are used as the streamplot grid.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Per-substitution displacement and its length, computed once for all
    # rows instead of once per grid cell.
    delta_x = dest - source
    delta_y = dest_rel - source_rel
    lengths = np.sqrt(delta_x ** 2 + delta_y ** 2)

    # Compute bin values. Arrays are indexed [y, x], as streamplot expects.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            in_cell = (x_bins == x) & (y_bins == y)
            # An empty cell yields NaN, which streamplot leaves blank.
            u_values[y, x] = delta_x[in_cell].mean()
            v_values[y, x] = delta_y[in_cell].mean()
            strength[y, x] = lengths[in_cell].mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    # Restrict the view to the span of the bin centres.
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature, for every feature.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features, col_wrap=3,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only axes on which something was plotted get styled.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title currently holds the raw feature name (from set_titles);
        # swap it for the docstring of the matching transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', col_order=PAPER_FEATURES, col_wrap=3,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only axes on which something was plotted get styled.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title currently holds the raw feature name (from set_titles);
        # swap it for the docstring of the matching transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on per-cluster feature variations: one row per cluster, one
# column per feature, keeping only rows where every feature is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report how many components were estimated and what they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Display the loadings of the components we go on to plot.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=list(map('Component-{}'.format, range(N_COMPONENTS))))


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.54034524  0.16133351  0.08746155  0.07136385  0.03546255  0.03039217
  0.01970142  0.017005    0.01617424  0.00882725  0.00711032]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.424365 0.322792 -0.086371 0.253731 0.247127 -0.425529 0.232656 0.301246 -0.392979 0.282465 -0.150217 -0.008460
Component-1 0.295493 -0.378478 0.119609 -0.279861 -0.279761 -0.415574 0.159434 -0.309788 -0.459896 0.251741 -0.171270 0.015565
Component-2 -0.687100 -0.087009 0.120914 -0.048858 -0.698171 0.077989 0.020314 -0.025285 0.047699 -0.049860 -0.008755 0.062457

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Collect, for every substitution and each of the first N_COMPONENTS
# components, the source/destination component values and both null values.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identifying fields, shared by all components of this substitution.
        shared = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(component, pca, pcafeatures)
            h0n = substitution.component_average(component, pca, pcafeatures,
                                                 source_synonyms=True)
            data.append({**shared,
                         'component': component,
                         'source': source,
                         'destination': destination,
                         'h0': h0,
                         'h0n': h0n})

original_component_variations = pd.DataFrame(data)
del data, shared, h0, h0n


100% (7196 of 7196) |######################| Elapsed Time: 0:01:46 Time: 0:01:46

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two steps so that each cluster weighs the same: first over the
# substitutions sharing a destination, occurrence, position and component,
# then over everything left in each (cluster, component) pair.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas and rejected by recent versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per principal component: destination value against source value.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Only axes on which something was plotted get a styled legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *   | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | ns. | *   | ns. |

4.2 On a subset of relevant features


In [34]:
# Features the reduced PCA of this section is computed on.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
# Fit the PCA on the variations of the relevant features only: one row per
# cluster, one column per feature, dropping rows with any undefined feature.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(list(pcafeatures))]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report how many components were estimated and what they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Display the loadings of every estimated component.
pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=list(map('Component-{}'.format,
                            range(pca.n_components_))))


MLE estimates there are 1 components.

Those explain the following variance:
[ 0.67099834]

Out[35]:
aoa frequency letters_count
Component-0 -0.73038 0.351168 -0.585855

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Collect, for every substitution and every estimated component, the
# source/destination component values and both null values.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identifying fields, shared by all components of this substitution.
        shared = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(component, pca, pcafeatures)
            h0n = substitution.component_average(component, pca, pcafeatures,
                                                 source_synonyms=True)
            data.append({**shared,
                         'component': component,
                         'source': source,
                         'destination': destination,
                         'h0': h0,
                         'h0n': h0n})

original_component_variations = pd.DataFrame(data)
del data, shared, h0, h0n


100% (7196 of 7196) |######################| Elapsed Time: 0:00:40 Time: 0:00:40

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two steps so that each cluster weighs the same: first over the
# substitutions sharing a destination, occurrence, position and component,
# then over everything left in each (cluster, component) pair.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas and rejected by recent versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One panel per principal component: destination value against source value.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only axes on which something was plotted get a styled legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up each feature's transformed accessor once. Called without argument
# it gives the words the feature is based on; called with a word it gives
# that word's feature value.
tfeatures = dict((feature, Substitution._transformed_feature(feature))
                 for feature in relevant_features)

for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeatures[feature]())))

# Gather every word known to at least one of the relevant features...
words = set()
for feature in relevant_features:
    words.update(tfeatures[feature]())

# ... and tabulate each feature's value for each of those words. Dropping
# incomplete rows then leaves the words for which all of relevant_features
# are defined.
words_list = list(words)
data = dict((feature, [tfeatures[feature](word) for word in words_list])
            for feature in relevant_features)
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 876 (cluster-unique) substitutions, but the PCA is in fact computed on 707 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to run, as (label, binning function) pairs.
# Quantile binning is currently switched off; add ('quantiles', pd.qcut)
# to the list to run it as well.
cuts = [('fixed bins', pd.cut)]
# Which flavour of a feature value to use, as (label, column suffix) pairs:
# the suffix is appended to 'source' / 'destination' to pick the column.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since it compares below
    nothing).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles; only fixed-width binning is enabled in `cuts` at the moment). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# Pivot source and destination values once per column suffix: the tables
# depend only on the suffix, not on the pair of features being tested.
source_tables = dict(
    (suffix, variations.pivot(index='cluster_id', columns='feature',
                              values='source' + suffix))
    for (label, suffix) in rels)
destination_tables = dict(
    (suffix, variations.pivot(index='cluster_id', columns='feature',
                              values='destination' + suffix))
    for (label, suffix) in rels)

# One-way Anova of feature2's destination value across bins of feature1's
# source value, for every pair of features and every combination of
# global / sentence-relative values.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = source_tables[rel1][feature1]
                destination = destination_tables[rel2][feature2]

                # Compute binning, falling back to fewer bins whenever the
                # binning function rejects the current count.
                source_bins = None
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        continue
                if source_bins is None:
                    # No bin count worked: say so, rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot source and destination values once per column suffix: the tables
# depend only on the suffix, not on the pair of features being tested.
source_tables = dict(
    (suffix, variations.pivot(index='cluster_id', columns='feature',
                              values='source' + suffix))
    for (label, suffix) in rels)
destination_tables = dict(
    (suffix, variations.pivot(index='cluster_id', columns='feature',
                              values='destination' + suffix))
    for (label, suffix) in rels)

# One-way Anova of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of features and every
# combination of global / sentence-relative values.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = source_tables[rel1][feature1]
                # Despite its name, this holds feature2's variation.
                destination = destination_tables[rel2][feature2]\
                    - source_tables[rel2][feature2]

                # Compute binning, falling back to fewer bins whenever the
                # binning function rejects the current count.
                source_bins = None
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        continue
                if source_bins is None:
                    # No bin count worked: say so, rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" flag to a (name prefix, column suffix) pair: the prefix
# goes into regressor names, the suffix is appended to 'source' /
# 'destination' to pick the column.
# NOTE: this rebinds `rels`, which the Anova section defined as a list of
# (label, suffix) tuples; re-running those earlier cells after this one
# fails until that list is restored.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# Run every regression variant for each paper feature: all combinations of
# source relativeness (global / sentence-relative / both) and destination
# relativeness, each first without and then with interaction terms.
SOURCE_REL_OPTIONS = [False, True, 'both']
BOOL_OPTIONS = [False, True]

for target in PAPER_FEATURES:
    print('-' * 70)
    # `product` varies its last argument fastest, so for each
    # (source_rel, dest_rel) pair the no-interactions fit is printed first.
    for source_rel, dest_rel, with_interactions in product(
            SOURCE_REL_OPTIONS, BOOL_OPTIONS, BOOL_OPTIONS):
        regress(variations, PAPER_FEATURES, target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=with_interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.058068356393213394

intercept                      5.826721
global_aoa                    -0.020925
global_clustering             -0.109416
global_frequency               0.291589
global_letters_count          -0.036725
global_orthographic_density   -0.134025
global_synonyms_count         -0.069246
dtype: float64

Regressing global frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07667410138513642

intercept                                              13.111822
global_aoa                                             -0.228358
global_clustering                                       0.860670
global_frequency                                        0.057987
global_letters_count                                   -1.035279
global_orthographic_density                            -0.399500
global_synonyms_count                                   1.855909
global_aoa * global_clustering                         -0.009022
global_aoa * global_frequency                           0.017645
global_aoa * global_letters_count                       0.007141
global_aoa * global_orthographic_density               -0.038266
global_aoa * global_synonyms_count                     -0.046001
global_clustering * global_frequency                   -0.017752
global_clustering * global_letters_count               -0.158836
global_clustering * global_orthographic_density         0.058099
global_clustering * global_synonyms_count               0.188633
global_frequency * global_letters_count                -0.005882
global_frequency * global_orthographic_density          0.040807
global_frequency * global_synonyms_count               -0.055279
global_letters_count * global_orthographic_density      0.096976
global_letters_count * global_synonyms_count           -0.000319
global_orthographic_density * global_synonyms_count    -0.008496
dtype: float64

Regressing rel frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03506383309967587

intercept                     -6.412025
global_aoa                    -0.047547
global_clustering             -0.069124
global_frequency               0.261690
global_letters_count           0.071543
global_orthographic_density   -0.062212
global_synonyms_count          0.056531
dtype: float64

Regressing rel frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.049105320157902366

intercept                                              1.658419
global_aoa                                            -0.350987
global_clustering                                      0.674241
global_frequency                                      -0.456877
global_letters_count                                  -0.646165
global_orthographic_density                            0.356773
global_synonyms_count                                  0.661552
global_aoa * global_clustering                         0.015677
global_aoa * global_frequency                          0.037872
global_aoa * global_letters_count                      0.020016
global_aoa * global_orthographic_density              -0.064124
global_aoa * global_synonyms_count                     0.037905
global_clustering * global_frequency                  -0.068266
global_clustering * global_letters_count              -0.090160
global_clustering * global_orthographic_density        0.183357
global_clustering * global_synonyms_count              0.172376
global_frequency * global_letters_count               -0.002907
global_frequency * global_orthographic_density         0.056661
global_frequency * global_synonyms_count               0.025846
global_letters_count * global_orthographic_density     0.106279
global_letters_count * global_synonyms_count          -0.018845
global_orthographic_density * global_synonyms_count    0.068402
dtype: float64

Regressing global frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.045363440589129334

intercept                   9.327010
rel_aoa                    -0.007812
rel_clustering             -0.087175
rel_frequency               0.210992
rel_letters_count          -0.070916
rel_orthographic_density   -0.142122
rel_synonyms_count         -0.167581
dtype: float64

Regressing global frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.062484835378170955

intercept                                        9.185082
rel_aoa                                          0.101294
rel_clustering                                   0.270020
rel_frequency                                    0.179373
rel_letters_count                                0.040801
rel_orthographic_density                        -0.167428
rel_synonyms_count                              -0.123187
rel_aoa * rel_clustering                         0.048171
rel_aoa * rel_frequency                          0.031306
rel_aoa * rel_letters_count                     -0.010572
rel_aoa * rel_orthographic_density               0.019226
rel_aoa * rel_synonyms_count                     0.008873
rel_clustering * rel_frequency                   0.037745
rel_clustering * rel_letters_count              -0.135736
rel_clustering * rel_orthographic_density       -0.013805
rel_clustering * rel_synonyms_count              0.396358
rel_frequency * rel_letters_count                0.010919
rel_frequency * rel_orthographic_density         0.023709
rel_frequency * rel_synonyms_count               0.032458
rel_letters_count * rel_orthographic_density     0.029612
rel_letters_count * rel_synonyms_count          -0.042300
rel_orthographic_density * rel_synonyms_count    0.005567
dtype: float64

Regressing rel frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.2263350496912051

intercept                  -1.863160
rel_aoa                     0.009991
rel_clustering              0.107317
rel_frequency               0.591762
rel_letters_count          -0.165832
rel_orthographic_density   -0.301063
rel_synonyms_count         -0.072000
dtype: float64

Regressing rel frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.24170868111191002

intercept                                       -1.959730
rel_aoa                                          0.008660
rel_clustering                                   0.343094
rel_frequency                                    0.588158
rel_letters_count                               -0.044458
rel_orthographic_density                        -0.340055
rel_synonyms_count                              -0.003538
rel_aoa * rel_clustering                        -0.012808
rel_aoa * rel_frequency                         -0.017012
rel_aoa * rel_letters_count                      0.032937
rel_aoa * rel_orthographic_density               0.119597
rel_aoa * rel_synonyms_count                     0.147810
rel_clustering * rel_frequency                   0.027955
rel_clustering * rel_letters_count              -0.104644
rel_clustering * rel_orthographic_density       -0.094526
rel_clustering * rel_synonyms_count              0.225665
rel_frequency * rel_letters_count                0.025282
rel_frequency * rel_orthographic_density         0.039512
rel_frequency * rel_synonyms_count               0.003171
rel_letters_count * rel_orthographic_density     0.032702
rel_letters_count * rel_synonyms_count          -0.036644
rel_orthographic_density * rel_synonyms_count    0.155278
dtype: float64

Regressing global frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06663461609013277

intercept                      3.385176
global_aoa                    -0.038892
global_clustering             -0.276178
global_frequency               0.295337
global_letters_count           0.174977
global_orthographic_density    0.129016
global_synonyms_count          0.220563
rel_aoa                        0.025822
rel_clustering                 0.193437
rel_frequency                 -0.011051
rel_letters_count             -0.235878
rel_orthographic_density      -0.295384
rel_synonyms_count            -0.370501
dtype: float64

Regressing global frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1547613857988165

intercept                                                -68.546443
global_aoa                                                 2.349463
global_clustering                                        -12.855953
global_frequency                                           1.542518
global_letters_count                                       0.160699
global_orthographic_density                               13.975271
global_synonyms_count                                     18.305129
rel_aoa                                                   -1.875521
rel_clustering                                            18.600280
rel_frequency                                             -1.612511
rel_letters_count                                         -1.737001
rel_orthographic_density                                 -14.945428
rel_synonyms_count                                       -10.925430
global_aoa * global_clustering                             0.576238
global_aoa * global_frequency                              0.153257
global_aoa * global_letters_count                          0.045677
global_aoa * global_orthographic_density                  -0.230597
global_aoa * global_synonyms_count                        -0.538470
global_aoa * rel_aoa                                      -0.011039
global_aoa * rel_clustering                               -0.593478
global_aoa * rel_frequency                                -0.100944
global_aoa * rel_letters_count                            -0.044783
global_aoa * rel_orthographic_density                      0.207569
global_aoa * rel_synonyms_count                            0.303905
global_clustering * global_frequency                       0.358112
global_clustering * global_letters_count                   0.066897
global_clustering * global_orthographic_density            2.366675
global_clustering * global_synonyms_count                  0.727310
global_clustering * rel_aoa                               -0.560565
global_clustering * rel_clustering                         0.149404
global_clustering * rel_frequency                         -0.347845
global_clustering * rel_letters_count                     -0.308100
global_clustering * rel_orthographic_density              -1.926472
global_clustering * rel_synonyms_count                    -0.231619
global_frequency * global_letters_count                    0.020245
global_frequency * global_orthographic_density             0.182561
global_frequency * global_synonyms_count                  -0.570532
global_frequency * rel_aoa                                -0.154320
global_frequency * rel_clustering                         -0.585800
global_frequency * rel_frequency                          -0.024927
global_frequency * rel_letters_count                      -0.029079
global_frequency * rel_orthographic_density                0.075875
global_frequency * rel_synonyms_count                      0.422004
global_letters_count * global_orthographic_density         0.165485
global_letters_count * global_synonyms_count              -0.130455
global_letters_count * rel_aoa                             0.046446
global_letters_count * rel_clustering                     -0.390242
global_letters_count * rel_frequency                       0.042048
global_letters_count * rel_letters_count                   0.007415
global_letters_count * rel_orthographic_density            0.044325
global_letters_count * rel_synonyms_count                 -0.009127
global_orthographic_density * global_synonyms_count       -1.609650
global_orthographic_density * rel_aoa                     -0.032167
global_orthographic_density * rel_clustering              -2.535024
global_orthographic_density * rel_frequency               -0.115745
global_orthographic_density * rel_letters_count           -0.011260
global_orthographic_density * rel_orthographic_density     0.080664
global_orthographic_density * rel_synonyms_count           1.180059
global_synonyms_count * rel_aoa                            0.291904
global_synonyms_count * rel_clustering                    -1.508532
global_synonyms_count * rel_frequency                      0.386476
global_synonyms_count * rel_letters_count                  0.304045
global_synonyms_count * rel_orthographic_density           1.374429
global_synonyms_count * rel_synonyms_count                 0.054400
rel_aoa * rel_clustering                                   0.540381
rel_aoa * rel_frequency                                    0.077973
rel_aoa * rel_letters_count                               -0.021673
rel_aoa * rel_orthographic_density                         0.136124
rel_aoa * rel_synonyms_count                              -0.100937
rel_clustering * rel_frequency                             0.541519
rel_clustering * rel_letters_count                         0.376085
rel_clustering * rel_orthographic_density                  1.949281
rel_clustering * rel_synonyms_count                        1.441405
rel_frequency * rel_letters_count                          0.017914
rel_frequency * rel_orthographic_density                   0.014398
rel_frequency * rel_synonyms_count                        -0.231222
rel_letters_count * rel_orthographic_density              -0.081243
rel_letters_count * rel_synonyms_count                    -0.127030
rel_orthographic_density * rel_synonyms_count             -0.844008
dtype: float64

Regressing rel frequency with 511 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3106177016925421

intercept                      3.360544
global_aoa                    -0.040913
global_clustering             -0.184760
global_frequency              -0.642452
global_letters_count           0.198517
global_orthographic_density    0.105638
global_synonyms_count          0.149869
rel_aoa                        0.018469
rel_clustering                 0.145451
rel_frequency                  0.970183
rel_letters_count             -0.267174
rel_orthographic_density      -0.250504
rel_synonyms_count            -0.299390
dtype: float64

Regressing rel frequency with 511 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.37701676222029123

intercept                                                -65.751755
global_aoa                                                 2.313083
global_clustering                                        -12.562940
global_frequency                                           0.290293
global_letters_count                                       0.095103
global_orthographic_density                               14.278480
global_synonyms_count                                     15.357425
rel_aoa                                                   -1.790464
rel_clustering                                            19.335354
rel_frequency                                             -0.327875
rel_letters_count                                         -1.832532
rel_orthographic_density                                 -15.409260
rel_synonyms_count                                        -8.029872
global_aoa * global_clustering                             0.565049
global_aoa * global_frequency                              0.157085
global_aoa * global_letters_count                          0.046322
global_aoa * global_orthographic_density                  -0.277574
global_aoa * global_synonyms_count                        -0.521501
global_aoa * rel_aoa                                      -0.008717
global_aoa * rel_clustering                               -0.572041
global_aoa * rel_frequency                                -0.102438
global_aoa * rel_letters_count                            -0.039073
global_aoa * rel_orthographic_density                      0.264742
global_aoa * rel_synonyms_count                            0.293835
global_clustering * global_frequency                       0.336348
global_clustering * global_letters_count                   0.133078
global_clustering * global_orthographic_density            2.378253
global_clustering * global_synonyms_count                  0.592000
global_clustering * rel_aoa                               -0.566720
global_clustering * rel_clustering                         0.115741
global_clustering * rel_frequency                         -0.278835
global_clustering * rel_letters_count                     -0.349532
global_clustering * rel_orthographic_density              -1.954374
global_clustering * rel_synonyms_count                    -0.058988
global_frequency * global_letters_count                    0.067439
global_frequency * global_orthographic_density             0.182629
global_frequency * global_synonyms_count                  -0.447555
global_frequency * rel_aoa                                -0.164166
global_frequency * rel_clustering                         -0.681231
global_frequency * rel_frequency                          -0.006337
global_frequency * rel_letters_count                      -0.058603
global_frequency * rel_orthographic_density                0.061792
global_frequency * rel_synonyms_count                      0.318943
global_letters_count * global_orthographic_density         0.175235
global_letters_count * global_synonyms_count              -0.054743
global_letters_count * rel_aoa                             0.025129
global_letters_count * rel_clustering                     -0.450192
global_letters_count * rel_frequency                       0.032965
global_letters_count * rel_letters_count                   0.011990
global_letters_count * rel_orthographic_density            0.071438
global_letters_count * rel_synonyms_count                 -0.095729
global_orthographic_density * global_synonyms_count       -1.593157
global_orthographic_density * rel_aoa                     -0.008235
global_orthographic_density * rel_clustering              -2.485110
global_orthographic_density * rel_frequency               -0.086165
global_orthographic_density * rel_letters_count           -0.003625
global_orthographic_density * rel_orthographic_density     0.133604
global_orthographic_density * rel_synonyms_count           1.204671
global_synonyms_count * rel_aoa                            0.268911
global_synonyms_count * rel_clustering                    -1.399639
global_synonyms_count * rel_frequency                      0.233719
global_synonyms_count * rel_letters_count                  0.254619
global_synonyms_count * rel_orthographic_density           1.378024
global_synonyms_count * rel_synonyms_count                 0.053167
rel_aoa * rel_clustering                                   0.543705
rel_aoa * rel_frequency                                    0.079429
rel_aoa * rel_letters_count                               -0.008893
rel_aoa * rel_orthographic_density                         0.097768
rel_aoa * rel_synonyms_count                              -0.086150
rel_clustering * rel_frequency                             0.580725
rel_clustering * rel_letters_count                         0.446489
rel_clustering * rel_orthographic_density                  1.947753
rel_clustering * rel_synonyms_count                        1.287930
rel_frequency * rel_letters_count                          0.010350
rel_frequency * rel_orthographic_density                   0.001003
rel_frequency * rel_synonyms_count                        -0.081904
rel_letters_count * rel_orthographic_density              -0.102257
rel_letters_count * rel_synonyms_count                    -0.082964
rel_orthographic_density * rel_synonyms_count             -0.924073
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 472 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.05885800485879522

intercept                      8.318794
global_aoa                     0.168695
global_clustering              0.285376
global_frequency              -0.103551
global_letters_count           0.061366
global_orthographic_density   -0.032138
global_synonyms_count         -0.093702
dtype: float64

Regressing global aoa with 472 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10070695863466372

intercept                                              0.157272
global_aoa                                            -0.569554
global_clustering                                     -1.305176
global_frequency                                       0.759608
global_letters_count                                   1.357554
global_orthographic_density                           -1.594045
global_synonyms_count                                 -5.133870
global_aoa * global_clustering                        -0.036836
global_aoa * global_frequency                          0.021019
global_aoa * global_letters_count                      0.047340
global_aoa * global_orthographic_density              -0.009943
global_aoa * global_synonyms_count                     0.150199
global_clustering * global_frequency                   0.112245
global_clustering * global_letters_count               0.234407
global_clustering * global_orthographic_density       -0.267464
global_clustering * global_synonyms_count             -0.743055
global_frequency * global_letters_count               -0.038390
global_frequency * global_orthographic_density        -0.051507
global_frequency * global_synonyms_count              -0.058517
global_letters_count * global_orthographic_density     0.054934
global_letters_count * global_synonyms_count           0.013287
global_orthographic_density * global_synonyms_count    0.190928
dtype: float64

Regressing rel aoa with 472 measures, no interactions
           ^^^^^^^
R^2 = 0.0209758035234463

intercept                      2.652005
global_aoa                     0.052925
global_clustering              0.138310
global_frequency              -0.120164
global_letters_count           0.027905
global_orthographic_density    0.004514
global_synonyms_count         -0.016254
dtype: float64

Regressing rel aoa with 472 measures, with interactions
           ^^^^^^^
R^2 = 0.05818211825513952

intercept                                              5.174543
global_aoa                                            -0.586249
global_clustering                                      1.086815
global_frequency                                       0.377642
global_letters_count                                   0.763204
global_orthographic_density                           -2.546528
global_synonyms_count                                 -2.783780
global_aoa * global_clustering                        -0.053016
global_aoa * global_frequency                          0.018858
global_aoa * global_letters_count                      0.005243
global_aoa * global_orthographic_density               0.087891
global_aoa * global_synonyms_count                     0.056549
global_clustering * global_frequency                   0.007676
global_clustering * global_letters_count               0.020260
global_clustering * global_orthographic_density       -0.423534
global_clustering * global_synonyms_count             -0.675877
global_frequency * global_letters_count               -0.070506
global_frequency * global_orthographic_density        -0.062505
global_frequency * global_synonyms_count              -0.107589
global_letters_count * global_orthographic_density    -0.004131
global_letters_count * global_synonyms_count          -0.110160
global_orthographic_density * global_synonyms_count    0.101358
dtype: float64

Regressing global aoa with 472 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.029800538677308697

intercept                   6.331439
rel_aoa                    -0.019535
rel_clustering              0.338570
rel_frequency              -0.069815
rel_letters_count           0.066877
rel_orthographic_density   -0.213805
rel_synonyms_count         -0.163154
dtype: float64

Regressing global aoa with 472 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.07977396811617421

intercept                                        6.472300
rel_aoa                                         -0.188805
rel_clustering                                  -0.231230
rel_frequency                                   -0.063504
rel_letters_count                                0.036646
rel_orthographic_density                        -0.400977
rel_synonyms_count                              -0.228956
rel_aoa * rel_clustering                        -0.221909
rel_aoa * rel_frequency                         -0.051861
rel_aoa * rel_letters_count                      0.065388
rel_aoa * rel_orthographic_density               0.024942
rel_aoa * rel_synonyms_count                    -0.023813
rel_clustering * rel_frequency                   0.057848
rel_clustering * rel_letters_count               0.405436
rel_clustering * rel_orthographic_density        0.083783
rel_clustering * rel_synonyms_count             -1.036996
rel_frequency * rel_letters_count                0.010099
rel_frequency * rel_orthographic_density         0.007746
rel_frequency * rel_synonyms_count              -0.161505
rel_letters_count * rel_orthographic_density     0.110208
rel_letters_count * rel_synonyms_count           0.149613
rel_orthographic_density * rel_synonyms_count    0.336773
dtype: float64

Regressing rel aoa with 472 measures, no interactions
           ^^^^^^^
R^2 = 0.11891669994679643

intercept                   0.782530
rel_aoa                     0.409032
rel_clustering              0.035051
rel_frequency              -0.086760
rel_letters_count          -0.001981
rel_orthographic_density    0.096532
rel_synonyms_count         -0.058146
dtype: float64

Regressing rel aoa with 472 measures, with interactions
           ^^^^^^^
R^2 = 0.15422252118378732

intercept                                        1.061340
rel_aoa                                          0.521890
rel_clustering                                  -0.409077
rel_frequency                                   -0.050177
rel_letters_count                               -0.004181
rel_orthographic_density                         0.315130
rel_synonyms_count                              -0.139395
rel_aoa * rel_clustering                        -0.182839
rel_aoa * rel_frequency                          0.027635
rel_aoa * rel_letters_count                      0.059580
rel_aoa * rel_orthographic_density               0.079235
rel_aoa * rel_synonyms_count                     0.004535
rel_clustering * rel_frequency                   0.047069
rel_clustering * rel_letters_count               0.330375
rel_clustering * rel_orthographic_density        0.142785
rel_clustering * rel_synonyms_count             -0.514355
rel_frequency * rel_letters_count                0.035517
rel_frequency * rel_orthographic_density         0.163292
rel_frequency * rel_synonyms_count              -0.089825
rel_letters_count * rel_orthographic_density     0.064282
rel_letters_count * rel_synonyms_count           0.159639
rel_orthographic_density * rel_synonyms_count    0.372123
dtype: float64

Regressing global aoa with 472 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.07497881435442111

intercept                      6.776286
global_aoa                     0.371972
global_clustering              0.264588
global_frequency              -0.088157
global_letters_count           0.053360
global_orthographic_density    0.027119
global_synonyms_count          0.042069
rel_aoa                       -0.300611
rel_clustering                -0.023456
rel_frequency                 -0.036760
rel_letters_count              0.005576
rel_orthographic_density      -0.024624
rel_synonyms_count            -0.155214
dtype: float64

Regressing global aoa with 472 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.209906704226676

intercept                                                 64.975088
global_aoa                                                -0.071682
global_clustering                                          9.909939
global_frequency                                           0.879363
global_letters_count                                      -6.441959
global_orthographic_density                              -13.510475
global_synonyms_count                                    -18.489091
rel_aoa                                                    0.686767
rel_clustering                                            -9.244208
rel_frequency                                              2.060576
rel_letters_count                                          4.853382
rel_orthographic_density                                   8.450695
rel_synonyms_count                                        -2.014099
global_aoa * global_clustering                             0.076005
global_aoa * global_frequency                             -0.038160
global_aoa * global_letters_count                          0.101919
global_aoa * global_orthographic_density                   0.397041
global_aoa * global_synonyms_count                         0.178887
global_aoa * rel_aoa                                       0.021710
global_aoa * rel_clustering                               -0.053677
global_aoa * rel_frequency                                 0.066414
global_aoa * rel_letters_count                            -0.081379
global_aoa * rel_orthographic_density                     -0.404218
global_aoa * rel_synonyms_count                            0.140745
global_clustering * global_frequency                       0.055441
global_clustering * global_letters_count                  -0.562792
global_clustering * global_orthographic_density           -3.458720
global_clustering * global_synonyms_count                 -1.143701
global_clustering * rel_aoa                                0.049182
global_clustering * rel_clustering                         0.036280
global_clustering * rel_frequency                          0.279587
global_clustering * rel_letters_count                      0.290280
global_clustering * rel_orthographic_density               2.705724
global_clustering * rel_synonyms_count                    -0.090473
global_frequency * global_letters_count                    0.269786
global_frequency * global_orthographic_density            -0.661038
global_frequency * global_synonyms_count                  -0.062000
global_frequency * rel_aoa                                 0.078433
global_frequency * rel_clustering                         -0.027964
global_frequency * rel_frequency                          -0.037219
global_frequency * rel_letters_count                      -0.254444
global_frequency * rel_orthographic_density                0.671213
global_frequency * rel_synonyms_count                      0.783234
global_letters_count * global_orthographic_density        -0.667271
global_letters_count * global_synonyms_count               1.708587
global_letters_count * rel_aoa                            -0.159489
global_letters_count * rel_clustering                      0.689575
global_letters_count * rel_frequency                      -0.322505
global_letters_count * rel_letters_count                   0.027334
global_letters_count * rel_orthographic_density            0.605624
global_letters_count * rel_synonyms_count                 -1.371041
global_orthographic_density * global_synonyms_count        1.872865
global_orthographic_density * rel_aoa                     -0.421591
global_orthographic_density * rel_clustering               2.635400
global_orthographic_density * rel_frequency                0.481223
global_orthographic_density * rel_letters_count            0.314339
global_orthographic_density * rel_orthographic_density    -0.019809
global_orthographic_density * rel_synonyms_count          -1.301830
global_synonyms_count * rel_aoa                           -0.408604
global_synonyms_count * rel_clustering                     1.148881
global_synonyms_count * rel_frequency                     -0.528310
global_synonyms_count * rel_letters_count                 -1.809011
global_synonyms_count * rel_orthographic_density          -1.571219
global_synonyms_count * rel_synonyms_count                -0.180986
rel_aoa * rel_clustering                                  -0.256463
rel_aoa * rel_frequency                                   -0.088776
rel_aoa * rel_letters_count                                0.140657
rel_aoa * rel_orthographic_density                         0.370089
rel_aoa * rel_synonyms_count                               0.105333
rel_clustering * rel_frequency                            -0.295246
rel_clustering * rel_letters_count                        -0.029873
rel_clustering * rel_orthographic_density                 -1.771406
rel_clustering * rel_synonyms_count                       -0.818437
rel_frequency * rel_letters_count                          0.232479
rel_frequency * rel_orthographic_density                  -0.500189
rel_frequency * rel_synonyms_count                        -0.184662
rel_letters_count * rel_orthographic_density              -0.123813
rel_letters_count * rel_synonyms_count                     1.597442
rel_orthographic_density * rel_synonyms_count              1.400535
dtype: float64

Regressing rel aoa with 472 measures, no interactions
           ^^^^^^^
R^2 = 0.174244264190329

intercept                      5.078708
global_aoa                    -0.441616
global_clustering              0.277756
global_frequency              -0.024431
global_letters_count           0.066643
global_orthographic_density   -0.146983
global_synonyms_count          0.180193
rel_aoa                        0.730749
rel_clustering                -0.063966
rel_frequency                 -0.055099
rel_letters_count             -0.021288
rel_orthographic_density       0.080637
rel_synonyms_count            -0.265412
dtype: float64

Regressing rel aoa with 472 measures, with interactions
           ^^^^^^^
R^2 = 0.30228519622911676

intercept                                                 47.851489
global_aoa                                                -2.567980
global_clustering                                          6.653302
global_frequency                                           0.505554
global_letters_count                                      -2.467158
global_orthographic_density                              -11.490231
global_synonyms_count                                    -18.234472
rel_aoa                                                    2.905562
rel_clustering                                            -2.200749
rel_frequency                                              1.253360
rel_letters_count                                          2.403010
rel_orthographic_density                                   8.531837
rel_synonyms_count                                         1.145192
global_aoa * global_clustering                            -0.061330
global_aoa * global_frequency                              0.069653
global_aoa * global_letters_count                          0.044037
global_aoa * global_orthographic_density                   0.393112
global_aoa * global_synonyms_count                         0.117964
global_aoa * rel_aoa                                      -0.020944
global_aoa * rel_clustering                                0.012510
global_aoa * rel_frequency                                -0.007596
global_aoa * rel_letters_count                            -0.069829
global_aoa * rel_orthographic_density                     -0.463409
global_aoa * rel_synonyms_count                            0.180933
global_clustering * global_frequency                       0.065156
global_clustering * global_letters_count                  -0.237063
global_clustering * global_orthographic_density           -2.442238
global_clustering * global_synonyms_count                 -1.332560
global_clustering * rel_aoa                                0.151707
global_clustering * rel_clustering                         0.147048
global_clustering * rel_frequency                          0.045428
global_clustering * rel_letters_count                      0.059793
global_clustering * rel_orthographic_density               1.966046
global_clustering * rel_synonyms_count                     0.337128
global_frequency * global_letters_count                    0.101959
global_frequency * global_orthographic_density            -0.377359
global_frequency * global_synonyms_count                   0.017993
global_frequency * rel_aoa                                -0.012640
global_frequency * rel_clustering                         -0.217284
global_frequency * rel_frequency                          -0.031993
global_frequency * rel_letters_count                      -0.128434
global_frequency * rel_orthographic_density                0.377922
global_frequency * rel_synonyms_count                      0.558441
global_letters_count * global_orthographic_density        -0.463406
global_letters_count * global_synonyms_count               1.470483
global_letters_count * rel_aoa                            -0.052875
global_letters_count * rel_clustering                      0.282252
global_letters_count * rel_frequency                      -0.211251
global_letters_count * rel_letters_count                   0.012488
global_letters_count * rel_orthographic_density            0.481949
global_letters_count * rel_synonyms_count                 -1.263397
global_orthographic_density * global_synonyms_count        1.618497
global_orthographic_density * rel_aoa                     -0.322412
global_orthographic_density * rel_clustering               1.500828
global_orthographic_density * rel_frequency                0.102020
global_orthographic_density * rel_letters_count            0.078791
global_orthographic_density * rel_orthographic_density    -0.121769
global_orthographic_density * rel_synonyms_count          -0.944891
global_synonyms_count * rel_aoa                           -0.267317
global_synonyms_count * rel_clustering                     1.135200
global_synonyms_count * rel_frequency                     -0.401621
global_synonyms_count * rel_letters_count                 -1.570673
global_synonyms_count * rel_orthographic_density          -1.357186
global_synonyms_count * rel_synonyms_count                -0.141262
rel_aoa * rel_clustering                                  -0.295800
rel_aoa * rel_frequency                                   -0.035682
rel_aoa * rel_letters_count                                0.082988
rel_aoa * rel_orthographic_density                         0.352097
rel_aoa * rel_synonyms_count                              -0.022438
rel_clustering * rel_frequency                             0.105245
rel_clustering * rel_letters_count                         0.305993
rel_clustering * rel_orthographic_density                 -0.843379
rel_clustering * rel_synonyms_count                       -1.066939
rel_frequency * rel_letters_count                          0.182995
rel_frequency * rel_orthographic_density                  -0.071610
rel_frequency * rel_synonyms_count                        -0.276289
rel_letters_count * rel_orthographic_density              -0.025217
rel_letters_count * rel_synonyms_count                     1.450008
rel_orthographic_density * rel_synonyms_count              1.034962
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.039328780337313685

intercept                     -4.960501
global_aoa                     0.026110
global_clustering              0.124172
global_frequency              -0.042575
global_letters_count           0.008864
global_orthographic_density    0.057751
global_synonyms_count          0.032343
dtype: float64

Regressing global clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08498565565837723

intercept                                             -6.974861
global_aoa                                             0.236627
global_clustering                                      0.231839
global_frequency                                      -0.189770
global_letters_count                                   0.578149
global_orthographic_density                            1.082783
global_synonyms_count                                  0.502075
global_aoa * global_clustering                         0.000692
global_aoa * global_frequency                         -0.003940
global_aoa * global_letters_count                     -0.025440
global_aoa * global_orthographic_density              -0.019447
global_aoa * global_synonyms_count                     0.017869
global_clustering * global_frequency                  -0.039391
global_clustering * global_letters_count               0.049584
global_clustering * global_orthographic_density        0.014255
global_clustering * global_synonyms_count             -0.092656
global_frequency * global_letters_count               -0.000673
global_frequency * global_orthographic_density        -0.042572
global_frequency * global_synonyms_count              -0.010091
global_letters_count * global_orthographic_density    -0.065406
global_letters_count * global_synonyms_count          -0.131737
global_orthographic_density * global_synonyms_count   -0.193620
dtype: float64

Regressing rel clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.026472659473012472

intercept                      0.644869
global_aoa                     0.025853
global_clustering              0.091852
global_frequency              -0.017758
global_letters_count           0.027240
global_orthographic_density    0.096031
global_synonyms_count         -0.004328
dtype: float64

Regressing rel clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.04660917229003625

intercept                                             -1.724306
global_aoa                                             0.202926
global_clustering                                     -0.076489
global_frequency                                      -0.032722
global_letters_count                                   0.272384
global_orthographic_density                            0.908094
global_synonyms_count                                  0.794681
global_aoa * global_clustering                        -0.001899
global_aoa * global_frequency                         -0.005746
global_aoa * global_letters_count                     -0.017726
global_aoa * global_orthographic_density              -0.025302
global_aoa * global_synonyms_count                    -0.003186
global_clustering * global_frequency                  -0.002486
global_clustering * global_letters_count               0.027018
global_clustering * global_orthographic_density        0.035340
global_clustering * global_synonyms_count             -0.020672
global_frequency * global_letters_count                0.010403
global_frequency * global_orthographic_density        -0.018665
global_frequency * global_synonyms_count              -0.018398
global_letters_count * global_orthographic_density    -0.037683
global_letters_count * global_synonyms_count          -0.084451
global_orthographic_density * global_synonyms_count   -0.178735
dtype: float64

Regressing global clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.012989394335568451

intercept                  -5.869899
rel_aoa                     0.015793
rel_clustering              0.080446
rel_frequency              -0.023439
rel_letters_count          -0.004074
rel_orthographic_density    0.027991
rel_synonyms_count         -0.020682
dtype: float64

Regressing global clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05351253935090716

intercept                                       -5.935107
rel_aoa                                          0.001777
rel_clustering                                   0.041412
rel_frequency                                   -0.096264
rel_letters_count                               -0.024098
rel_orthographic_density                         0.081314
rel_synonyms_count                              -0.021602
rel_aoa * rel_clustering                        -0.018494
rel_aoa * rel_frequency                         -0.016242
rel_aoa * rel_letters_count                     -0.011123
rel_aoa * rel_orthographic_density               0.003056
rel_aoa * rel_synonyms_count                     0.052466
rel_clustering * rel_frequency                   0.029773
rel_clustering * rel_letters_count               0.059081
rel_clustering * rel_orthographic_density        0.001414
rel_clustering * rel_synonyms_count             -0.189734
rel_frequency * rel_letters_count                0.024101
rel_frequency * rel_orthographic_density        -0.019005
rel_frequency * rel_synonyms_count              -0.051371
rel_letters_count * rel_orthographic_density    -0.058071
rel_letters_count * rel_synonyms_count          -0.071395
rel_orthographic_density * rel_synonyms_count   -0.073928
dtype: float64

Regressing rel clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11090449166405747

intercept                   0.298784
rel_aoa                     0.006205
rel_clustering              0.343437
rel_frequency               0.000140
rel_letters_count           0.020747
rel_orthographic_density    0.067729
rel_synonyms_count          0.035302
dtype: float64

Regressing rel clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1402819590641503

intercept                                        0.216131
rel_aoa                                         -0.010387
rel_clustering                                   0.354396
rel_frequency                                   -0.062823
rel_letters_count                                0.026643
rel_orthographic_density                         0.093658
rel_synonyms_count                               0.018180
rel_aoa * rel_clustering                         0.013536
rel_aoa * rel_frequency                         -0.007817
rel_aoa * rel_letters_count                     -0.011413
rel_aoa * rel_orthographic_density              -0.012153
rel_aoa * rel_synonyms_count                     0.054072
rel_clustering * rel_frequency                   0.016472
rel_clustering * rel_letters_count              -0.013169
rel_clustering * rel_orthographic_density       -0.062384
rel_clustering * rel_synonyms_count             -0.116210
rel_frequency * rel_letters_count                0.017436
rel_frequency * rel_orthographic_density        -0.025673
rel_frequency * rel_synonyms_count              -0.039181
rel_letters_count * rel_orthographic_density    -0.040083
rel_letters_count * rel_synonyms_count          -0.043391
rel_orthographic_density * rel_synonyms_count   -0.029122
dtype: float64

Regressing global clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05287255337203889

intercept                     -4.403670
global_aoa                     0.038046
global_clustering              0.209035
global_frequency              -0.047252
global_letters_count           0.030943
global_orthographic_density    0.005913
global_synonyms_count          0.150018
rel_aoa                       -0.018678
rel_clustering                -0.108639
rel_frequency                  0.005011
rel_letters_count             -0.026670
rel_orthographic_density       0.066867
rel_synonyms_count            -0.150856
dtype: float64

Regressing global clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.21853828250302654

intercept                                                -1.646919
global_aoa                                                0.692001
global_clustering                                         4.072104
global_frequency                                         -0.522565
global_letters_count                                      3.239659
global_orthographic_density                               3.807884
global_synonyms_count                                    -4.372606
rel_aoa                                                   0.379683
rel_clustering                                           -5.007405
rel_frequency                                             0.131879
rel_letters_count                                        -2.639793
rel_orthographic_density                                 -2.597016
rel_synonyms_count                                       -1.629689
global_aoa * global_clustering                           -0.150831
global_aoa * global_frequency                            -0.065728
global_aoa * global_letters_count                        -0.161241
global_aoa * global_orthographic_density                 -0.085193
global_aoa * global_synonyms_count                        0.050784
global_aoa * rel_aoa                                      0.007965
global_aoa * rel_clustering                               0.228721
global_aoa * rel_frequency                                0.083501
global_aoa * rel_letters_count                            0.131909
global_aoa * rel_orthographic_density                     0.032314
global_aoa * rel_synonyms_count                           0.038935
global_clustering * global_frequency                     -0.268886
global_clustering * global_letters_count                  0.151372
global_clustering * global_orthographic_density          -0.187809
global_clustering * global_synonyms_count                -0.379769
global_clustering * rel_aoa                               0.111729
global_clustering * rel_clustering                       -0.086045
global_clustering * rel_frequency                         0.216584
global_clustering * rel_letters_count                    -0.067448
global_clustering * rel_orthographic_density              0.158048
global_clustering * rel_synonyms_count                    0.127294
global_frequency * global_letters_count                  -0.068447
global_frequency * global_orthographic_density           -0.262239
global_frequency * global_synonyms_count                  0.193934
global_frequency * rel_aoa                                0.006971
global_frequency * rel_clustering                         0.259674
global_frequency * rel_frequency                          0.031068
global_frequency * rel_letters_count                      0.088719
global_frequency * rel_orthographic_density               0.185884
global_frequency * rel_synonyms_count                     0.043500
global_letters_count * global_orthographic_density       -0.300052
global_letters_count * global_synonyms_count             -0.036606
global_letters_count * rel_aoa                            0.020667
global_letters_count * rel_clustering                    -0.059654
global_letters_count * rel_frequency                      0.014661
global_letters_count * rel_letters_count                  0.016730
global_letters_count * rel_orthographic_density           0.291788
global_letters_count * rel_synonyms_count                 0.237069
global_orthographic_density * global_synonyms_count      -0.197358
global_orthographic_density * rel_aoa                     0.085625
global_orthographic_density * rel_clustering              0.107646
global_orthographic_density * rel_frequency               0.223208
global_orthographic_density * rel_letters_count           0.216146
global_orthographic_density * rel_orthographic_density    0.032556
global_orthographic_density * rel_synonyms_count          0.367978
global_synonyms_count * rel_aoa                          -0.165255
global_synonyms_count * rel_clustering                    0.515038
global_synonyms_count * rel_frequency                    -0.334644
global_synonyms_count * rel_letters_count                -0.279572
global_synonyms_count * rel_orthographic_density         -0.203806
global_synonyms_count * rel_synonyms_count                0.048303
rel_aoa * rel_clustering                                 -0.169953
rel_aoa * rel_frequency                                  -0.006378
rel_aoa * rel_letters_count                              -0.023177
rel_aoa * rel_orthographic_density                       -0.069135
rel_aoa * rel_synonyms_count                              0.153123
rel_clustering * rel_frequency                           -0.196243
rel_clustering * rel_letters_count                        0.057839
rel_clustering * rel_orthographic_density                 0.016459
rel_clustering * rel_synonyms_count                      -0.414457
rel_frequency * rel_letters_count                        -0.025392
rel_frequency * rel_orthographic_density                 -0.201408
rel_frequency * rel_synonyms_count                        0.094054
rel_letters_count * rel_orthographic_density             -0.240057
rel_letters_count * rel_synonyms_count                    0.046427
rel_orthographic_density * rel_synonyms_count            -0.004431
dtype: float64

Regressing rel clustering with 405 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.18746140991379356

intercept                     -3.029464
global_aoa                     0.033345
global_clustering             -0.514307
global_frequency              -0.028019
global_letters_count           0.034812
global_orthographic_density    0.048587
global_synonyms_count          0.005089
rel_aoa                       -0.020621
rel_clustering                 0.758198
rel_frequency                 -0.002649
rel_letters_count             -0.014975
rel_orthographic_density       0.019555
rel_synonyms_count             0.002792
dtype: float64

Regressing rel clustering with 405 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3284944636286355

intercept                                                -2.892417
global_aoa                                                0.474510
global_clustering                                         2.244713
global_frequency                                          0.014201
global_letters_count                                      2.522824
global_orthographic_density                               2.330080
global_synonyms_count                                    -3.445307
rel_aoa                                                   0.569731
rel_clustering                                           -3.094905
rel_frequency                                            -0.140978
rel_letters_count                                        -2.264691
rel_orthographic_density                                 -1.455480
rel_synonyms_count                                       -2.239920
global_aoa * global_clustering                           -0.113208
global_aoa * global_frequency                            -0.041001
global_aoa * global_letters_count                        -0.126620
global_aoa * global_orthographic_density                 -0.085246
global_aoa * global_synonyms_count                        0.008984
global_aoa * rel_aoa                                      0.009989
global_aoa * rel_clustering                               0.185105
global_aoa * rel_frequency                                0.060153
global_aoa * rel_letters_count                            0.108621
global_aoa * rel_orthographic_density                     0.024727
global_aoa * rel_synonyms_count                           0.089835
global_clustering * global_frequency                     -0.151487
global_clustering * global_letters_count                  0.088911
global_clustering * global_orthographic_density          -0.241708
global_clustering * global_synonyms_count                -0.452154
global_clustering * rel_aoa                               0.105554
global_clustering * rel_clustering                       -0.133812
global_clustering * rel_frequency                         0.124547
global_clustering * rel_letters_count                    -0.046551
global_clustering * rel_orthographic_density              0.224731
global_clustering * rel_synonyms_count                    0.231255
global_frequency * global_letters_count                  -0.072117
global_frequency * global_orthographic_density           -0.195135
global_frequency * global_synonyms_count                  0.040399
global_frequency * rel_aoa                               -0.014686
global_frequency * rel_clustering                         0.139343
global_frequency * rel_frequency                          0.029713
global_frequency * rel_letters_count                      0.094662
global_frequency * rel_orthographic_density               0.139659
global_frequency * rel_synonyms_count                     0.169773
global_letters_count * global_orthographic_density       -0.212343
global_letters_count * global_synonyms_count             -0.010278
global_letters_count * rel_aoa                            0.009927
global_letters_count * rel_clustering                    -0.006435
global_letters_count * rel_frequency                      0.009930
global_letters_count * rel_letters_count                  0.016601
global_letters_count * rel_orthographic_density           0.239321
global_letters_count * rel_synonyms_count                 0.218739
global_orthographic_density * global_synonyms_count      -0.091524
global_orthographic_density * rel_aoa                     0.112889
global_orthographic_density * rel_clustering              0.127553
global_orthographic_density * rel_frequency               0.139324
global_orthographic_density * rel_letters_count           0.136814
global_orthographic_density * rel_orthographic_density    0.053103
global_orthographic_density * rel_synonyms_count          0.293812
global_synonyms_count * rel_aoa                          -0.169749
global_synonyms_count * rel_clustering                    0.600679
global_synonyms_count * rel_frequency                    -0.220007
global_synonyms_count * rel_letters_count                -0.226234
global_synonyms_count * rel_orthographic_density         -0.304272
global_synonyms_count * rel_synonyms_count                0.054901
rel_aoa * rel_clustering                                 -0.154494
rel_aoa * rel_frequency                                   0.012881
rel_aoa * rel_letters_count                              -0.012476
rel_aoa * rel_orthographic_density                       -0.080877
rel_aoa * rel_synonyms_count                              0.145844
rel_clustering * rel_frequency                           -0.107008
rel_clustering * rel_letters_count                        0.015016
rel_clustering * rel_orthographic_density                -0.050797
rel_clustering * rel_synonyms_count                      -0.518998
rel_frequency * rel_letters_count                        -0.024755
rel_frequency * rel_orthographic_density                 -0.133032
rel_frequency * rel_synonyms_count                       -0.002117
rel_letters_count * rel_orthographic_density             -0.161878
rel_letters_count * rel_synonyms_count                   -0.002240
rel_orthographic_density * rel_synonyms_count             0.046841
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06644758070760837

intercept                      3.621556
global_aoa                     0.100535
global_clustering             -0.048206
global_frequency               0.055011
global_letters_count           0.236918
global_orthographic_density    0.009182
global_synonyms_count         -0.224715
dtype: float64

Regressing global letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09936201219936869

intercept                                             -24.900562
global_aoa                                              1.450740
global_clustering                                      -4.313985
global_frequency                                        2.014875
global_letters_count                                    1.314181
global_orthographic_density                            -1.029393
global_synonyms_count                                   0.273400
global_aoa * global_clustering                          0.222275
global_aoa * global_frequency                           0.016070
global_aoa * global_letters_count                      -0.044591
global_aoa * global_orthographic_density                0.058826
global_aoa * global_synonyms_count                     -0.038259
global_clustering * global_frequency                    0.284238
global_clustering * global_letters_count                0.046127
global_clustering * global_orthographic_density        -0.084217
global_clustering * global_synonyms_count               0.053867
global_frequency * global_letters_count                -0.058786
global_frequency * global_orthographic_density         -0.006691
global_frequency * global_synonyms_count               -0.112442
global_letters_count * global_orthographic_density      0.018706
global_letters_count * global_synonyms_count            0.100082
global_orthographic_density * global_synonyms_count     0.423852
dtype: float64

Regressing rel letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.033649238342300936

intercept                      0.534737
global_aoa                     0.074824
global_clustering             -0.093641
global_frequency               0.032666
global_letters_count           0.164180
global_orthographic_density    0.041948
global_synonyms_count         -0.324964
dtype: float64

Regressing rel letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0762426449458754

intercept                                             -26.911637
global_aoa                                              1.700217
global_clustering                                      -3.919710
global_frequency                                        2.175628
global_letters_count                                    0.906925
global_orthographic_density                            -1.767219
global_synonyms_count                                   0.598534
global_aoa * global_clustering                          0.222624
global_aoa * global_frequency                          -0.004198
global_aoa * global_letters_count                      -0.061069
global_aoa * global_orthographic_density                0.067882
global_aoa * global_synonyms_count                     -0.095649
global_clustering * global_frequency                    0.300654
global_clustering * global_letters_count               -0.015277
global_clustering * global_orthographic_density        -0.246101
global_clustering * global_synonyms_count               0.016396
global_frequency * global_letters_count                -0.046284
global_frequency * global_orthographic_density         -0.015000
global_frequency * global_synonyms_count               -0.177891
global_letters_count * global_orthographic_density     -0.019480
global_letters_count * global_synonyms_count            0.134135
global_orthographic_density * global_synonyms_count     0.507551
dtype: float64

Regressing global letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04958322011158256

intercept                   5.845968
rel_aoa                    -0.006100
rel_clustering              0.102298
rel_frequency               0.036727
rel_letters_count           0.236482
rel_orthographic_density   -0.066329
rel_synonyms_count         -0.237248
dtype: float64

Regressing global letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07591838846254872

intercept                                        5.989906
rel_aoa                                          0.002558
rel_clustering                                   0.059984
rel_frequency                                    0.141510
rel_letters_count                                0.212596
rel_orthographic_density                        -0.279049
rel_synonyms_count                              -0.440125
rel_aoa * rel_clustering                         0.125314
rel_aoa * rel_frequency                         -0.031701
rel_aoa * rel_letters_count                     -0.070074
rel_aoa * rel_orthographic_density              -0.037361
rel_aoa * rel_synonyms_count                    -0.154914
rel_clustering * rel_frequency                   0.044537
rel_clustering * rel_letters_count               0.046055
rel_clustering * rel_orthographic_density        0.035432
rel_clustering * rel_synonyms_count             -0.570512
rel_frequency * rel_letters_count               -0.057243
rel_frequency * rel_orthographic_density        -0.053375
rel_frequency * rel_synonyms_count              -0.172292
rel_letters_count * rel_orthographic_density     0.064459
rel_letters_count * rel_synonyms_count           0.039423
rel_orthographic_density * rel_synonyms_count    0.120453
dtype: float64

Regressing rel letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12134000748352369

intercept                   1.663050
rel_aoa                    -0.028437
rel_clustering             -0.085287
rel_frequency              -0.159474
rel_letters_count           0.440006
rel_orthographic_density    0.239842
rel_synonyms_count         -0.240115
dtype: float64

Regressing rel letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16396175325472584

intercept                                        1.795573
rel_aoa                                          0.101242
rel_clustering                                  -0.074747
rel_frequency                                   -0.050569
rel_letters_count                                0.538782
rel_orthographic_density                         0.149039
rel_synonyms_count                              -0.326991
rel_aoa * rel_clustering                         0.194216
rel_aoa * rel_frequency                          0.004812
rel_aoa * rel_letters_count                     -0.122223
rel_aoa * rel_orthographic_density              -0.122717
rel_aoa * rel_synonyms_count                    -0.190756
rel_clustering * rel_frequency                   0.060472
rel_clustering * rel_letters_count               0.003997
rel_clustering * rel_orthographic_density        0.030624
rel_clustering * rel_synonyms_count             -0.506593
rel_frequency * rel_letters_count               -0.047788
rel_frequency * rel_orthographic_density        -0.004725
rel_frequency * rel_synonyms_count              -0.142032
rel_letters_count * rel_orthographic_density     0.100716
rel_letters_count * rel_synonyms_count           0.127623
rel_orthographic_density * rel_synonyms_count    0.353797
dtype: float64

Regressing global letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07759140758944427

intercept                      0.617764
global_aoa                     0.201563
global_clustering             -0.240664
global_frequency               0.192145
global_letters_count           0.248942
global_orthographic_density   -0.213324
global_synonyms_count         -0.106260
rel_aoa                       -0.160806
rel_clustering                 0.211039
rel_frequency                 -0.161869
rel_letters_count             -0.005823
rel_orthographic_density       0.261312
rel_synonyms_count            -0.124617
dtype: float64

Regressing global letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20402495550883692

intercept                                                 75.840985
global_aoa                                                -0.659042
global_clustering                                          7.244215
global_frequency                                          -2.061427
global_letters_count                                      -7.521044
global_orthographic_density                              -15.957168
global_synonyms_count                                     -9.646266
rel_aoa                                                   -0.493821
rel_clustering                                           -12.619453
rel_frequency                                              5.085753
rel_letters_count                                          7.971007
rel_orthographic_density                                  11.635324
rel_synonyms_count                                         3.928642
global_aoa * global_clustering                             0.417062
global_aoa * global_frequency                              0.159576
global_aoa * global_letters_count                          0.212472
global_aoa * global_orthographic_density                   0.532963
global_aoa * global_synonyms_count                        -0.610435
global_aoa * rel_aoa                                       0.077531
global_aoa * rel_clustering                               -0.348614
global_aoa * rel_frequency                                -0.147672
global_aoa * rel_letters_count                            -0.251815
global_aoa * rel_orthographic_density                     -0.311442
global_aoa * rel_synonyms_count                            0.572131
global_clustering * global_frequency                      -0.103405
global_clustering * global_letters_count                  -0.710802
global_clustering * global_orthographic_density           -2.256681
global_clustering * global_synonyms_count                  0.032301
global_clustering * rel_aoa                               -0.194529
global_clustering * rel_clustering                         0.122301
global_clustering * rel_frequency                          0.403638
global_clustering * rel_letters_count                      0.489307
global_clustering * rel_orthographic_density               2.093778
global_clustering * rel_synonyms_count                     0.271697
global_frequency * global_letters_count                    0.183297
global_frequency * global_orthographic_density            -0.036748
global_frequency * global_synonyms_count                   0.283011
global_frequency * rel_aoa                                 0.008155
global_frequency * rel_clustering                          0.619391
global_frequency * rel_frequency                          -0.034127
global_frequency * rel_letters_count                      -0.270125
global_frequency * rel_orthographic_density                0.214442
global_frequency * rel_synonyms_count                      0.052284
global_letters_count * global_orthographic_density        -0.262129
global_letters_count * global_synonyms_count               1.802403
global_letters_count * rel_aoa                            -0.103923
global_letters_count * rel_clustering                      0.976263
global_letters_count * rel_frequency                      -0.233040
global_letters_count * rel_letters_count                  -0.003780
global_letters_count * rel_orthographic_density            0.265490
global_letters_count * rel_synonyms_count                 -1.493078
global_orthographic_density * global_synonyms_count        1.006070
global_orthographic_density * rel_aoa                     -0.444819
global_orthographic_density * rel_clustering               1.627511
global_orthographic_density * rel_frequency               -0.160953
global_orthographic_density * rel_letters_count            0.009556
global_orthographic_density * rel_orthographic_density    -0.066780
global_orthographic_density * rel_synonyms_count           0.128608
global_synonyms_count * rel_aoa                            0.278306
global_synonyms_count * rel_clustering                     0.422458
global_synonyms_count * rel_frequency                     -0.867724
global_synonyms_count * rel_letters_count                 -1.728659
global_synonyms_count * rel_orthographic_density          -0.516513
global_synonyms_count * rel_synonyms_count                -0.336115
rel_aoa * rel_clustering                                   0.310799
rel_aoa * rel_frequency                                    0.045079
rel_aoa * rel_letters_count                               -0.018253
rel_aoa * rel_orthographic_density                         0.276145
rel_aoa * rel_synonyms_count                              -0.426889
rel_clustering * rel_frequency                            -0.668448
rel_clustering * rel_letters_count                        -0.694708
rel_clustering * rel_orthographic_density                 -1.369190
rel_clustering * rel_synonyms_count                       -0.830781
rel_frequency * rel_letters_count                          0.192329
rel_frequency * rel_orthographic_density                   0.049287
rel_frequency * rel_synonyms_count                         0.501088
rel_letters_count * rel_orthographic_density              -0.075798
rel_letters_count * rel_synonyms_count                     1.442324
rel_orthographic_density * rel_synonyms_count             -0.401551
dtype: float64

Regressing rel letters_count with 511 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18549091533948692

intercept                      0.225723
global_aoa                     0.176022
global_clustering             -0.219163
global_frequency               0.198780
global_letters_count          -0.648058
global_orthographic_density   -0.178404
global_synonyms_count         -0.102037
rel_aoa                       -0.138395
rel_clustering                 0.181166
rel_frequency                 -0.184949
rel_letters_count              0.924010
rel_orthographic_density       0.208143
rel_synonyms_count            -0.099383
dtype: float64

Regressing rel letters_count with 511 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.29861002140878723

intercept                                                 64.626262
global_aoa                                                -1.158579
global_clustering                                          5.173592
global_frequency                                          -1.990043
global_letters_count                                      -6.826855
global_orthographic_density                              -14.734198
global_synonyms_count                                     -7.209824
rel_aoa                                                   -0.089592
rel_clustering                                           -10.234721
rel_frequency                                              4.842416
rel_letters_count                                          7.901730
rel_orthographic_density                                  10.453118
rel_synonyms_count                                         2.870074
global_aoa * global_clustering                             0.398865
global_aoa * global_frequency                              0.185979
global_aoa * global_letters_count                          0.231781
global_aoa * global_orthographic_density                   0.489053
global_aoa * global_synonyms_count                        -0.596041
global_aoa * rel_aoa                                       0.066477
global_aoa * rel_clustering                               -0.339974
global_aoa * rel_frequency                                -0.184902
global_aoa * rel_letters_count                            -0.268096
global_aoa * rel_orthographic_density                     -0.289717
global_aoa * rel_synonyms_count                            0.532238
global_clustering * global_frequency                      -0.050874
global_clustering * global_letters_count                  -0.490940
global_clustering * global_orthographic_density           -1.943601
global_clustering * global_synonyms_count                  0.028658
global_clustering * rel_aoa                               -0.181150
global_clustering * rel_clustering                         0.096910
global_clustering * rel_frequency                          0.343789
global_clustering * rel_letters_count                      0.330477
global_clustering * rel_orthographic_density               1.829328
global_clustering * rel_synonyms_count                     0.324190
global_frequency * global_letters_count                    0.159934
global_frequency * global_orthographic_density             0.073751
global_frequency * global_synonyms_count                   0.156585
global_frequency * rel_aoa                                -0.013947
global_frequency * rel_clustering                          0.557951
global_frequency * rel_frequency                          -0.034125
global_frequency * rel_letters_count                      -0.241143
global_frequency * rel_orthographic_density                0.132306
global_frequency * rel_synonyms_count                      0.168822
global_letters_count * global_orthographic_density        -0.246307
global_letters_count * global_synonyms_count               1.680986
global_letters_count * rel_aoa                            -0.129387
global_letters_count * rel_clustering                      0.753575
global_letters_count * rel_frequency                      -0.202840
global_letters_count * rel_letters_count                  -0.019585
global_letters_count * rel_orthographic_density            0.256741
global_letters_count * rel_synonyms_count                 -1.444123
global_orthographic_density * global_synonyms_count        0.778033
global_orthographic_density * rel_aoa                     -0.351555
global_orthographic_density * rel_clustering               1.197175
global_orthographic_density * rel_frequency               -0.213887
global_orthographic_density * rel_letters_count           -0.053651
global_orthographic_density * rel_orthographic_density    -0.046990
global_orthographic_density * rel_synonyms_count           0.170207
global_synonyms_count * rel_aoa                            0.323868
global_synonyms_count * rel_clustering                     0.288680
global_synonyms_count * rel_frequency                     -0.757815
global_synonyms_count * rel_letters_count                 -1.680676
global_synonyms_count * rel_orthographic_density          -0.336333
global_synonyms_count * rel_synonyms_count                -0.338942
rel_aoa * rel_clustering                                   0.340679
rel_aoa * rel_frequency                                    0.087580
rel_aoa * rel_letters_count                                0.012941
rel_aoa * rel_orthographic_density                         0.217008
rel_aoa * rel_synonyms_count                              -0.445634
rel_clustering * rel_frequency                            -0.611107
rel_clustering * rel_letters_count                        -0.539692
rel_clustering * rel_orthographic_density                 -0.985690
rel_clustering * rel_synonyms_count                       -0.735282
rel_frequency * rel_letters_count                          0.163312
rel_frequency * rel_orthographic_density                   0.066710
rel_frequency * rel_synonyms_count                         0.429447
rel_letters_count * rel_orthographic_density              -0.018866
rel_letters_count * rel_synonyms_count                     1.448364
rel_orthographic_density * rel_synonyms_count             -0.423029
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.009538679384343718

intercept                     -0.089965
global_aoa                     0.005559
global_clustering             -0.004964
global_frequency               0.020863
global_letters_count           0.018991
global_orthographic_density    0.035508
global_synonyms_count          0.094560
dtype: float64

Regressing global synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.039382016786131506

intercept                                             -5.313987
global_aoa                                             0.479690
global_clustering                                     -0.364160
global_frequency                                       0.230428
global_letters_count                                   0.451838
global_orthographic_density                            0.748026
global_synonyms_count                                 -0.741589
global_aoa * global_clustering                         0.044834
global_aoa * global_frequency                         -0.003818
global_aoa * global_letters_count                     -0.025444
global_aoa * global_orthographic_density              -0.020869
global_aoa * global_synonyms_count                     0.041098
global_clustering * global_frequency                   0.003133
global_clustering * global_letters_count               0.000628
global_clustering * global_orthographic_density        0.037776
global_clustering * global_synonyms_count              0.011636
global_frequency * global_letters_count               -0.024889
global_frequency * global_orthographic_density        -0.025104
global_frequency * global_synonyms_count               0.050211
global_letters_count * global_orthographic_density    -0.021664
global_letters_count * global_synonyms_count           0.019654
global_orthographic_density * global_synonyms_count    0.044470
dtype: float64

Regressing rel synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.006183776402040153

intercept                     -0.293378
global_aoa                     0.005149
global_clustering             -0.015812
global_frequency               0.006096
global_letters_count           0.013379
global_orthographic_density    0.042732
global_synonyms_count          0.061766
dtype: float64

Regressing rel synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0369618602441959

intercept                                             -3.998666
global_aoa                                             0.406785
global_clustering                                     -0.349487
global_frequency                                       0.099144
global_letters_count                                   0.253742
global_orthographic_density                            0.575698
global_synonyms_count                                 -0.701859
global_aoa * global_clustering                         0.056386
global_aoa * global_frequency                          0.007012
global_aoa * global_letters_count                     -0.019642
global_aoa * global_orthographic_density              -0.011907
global_aoa * global_synonyms_count                     0.022981
global_clustering * global_frequency                   0.002946
global_clustering * global_letters_count              -0.016731
global_clustering * global_orthographic_density        0.036235
global_clustering * global_synonyms_count              0.032772
global_frequency * global_letters_count               -0.019959
global_frequency * global_orthographic_density        -0.012089
global_frequency * global_synonyms_count               0.054449
global_letters_count * global_orthographic_density    -0.021386
global_letters_count * global_synonyms_count           0.044011
global_orthographic_density * global_synonyms_count    0.037221
dtype: float64

Regressing global synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.010357435466021856

intercept                   0.406382
rel_aoa                    -0.007414
rel_clustering             -0.036609
rel_frequency               0.019617
rel_letters_count           0.021574
rel_orthographic_density    0.029477
rel_synonyms_count          0.073479
dtype: float64

Regressing global synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.028784214813037834

intercept                                        0.472259
rel_aoa                                         -0.015952
rel_clustering                                  -0.091594
rel_frequency                                    0.044622
rel_letters_count                               -0.001268
rel_orthographic_density                         0.106951
rel_synonyms_count                               0.026472
rel_aoa * rel_clustering                         0.008778
rel_aoa * rel_frequency                         -0.005818
rel_aoa * rel_letters_count                     -0.008513
rel_aoa * rel_orthographic_density              -0.013807
rel_aoa * rel_synonyms_count                     0.030325
rel_clustering * rel_frequency                  -0.013095
rel_clustering * rel_letters_count               0.011760
rel_clustering * rel_orthographic_density        0.023499
rel_clustering * rel_synonyms_count             -0.097367
rel_frequency * rel_letters_count               -0.002865
rel_frequency * rel_orthographic_density         0.010393
rel_frequency * rel_synonyms_count               0.014217
rel_letters_count * rel_orthographic_density    -0.021781
rel_letters_count * rel_synonyms_count           0.019983
rel_orthographic_density * rel_synonyms_count   -0.039108
dtype: float64

Regressing rel synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.05034760348086486

intercept                   0.084106
rel_aoa                    -0.012972
rel_clustering              0.031701
rel_frequency               0.020829
rel_letters_count           0.016323
rel_orthographic_density    0.021058
rel_synonyms_count          0.226817
dtype: float64

Regressing rel synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.06985522300526203

intercept                                        0.137746
rel_aoa                                         -0.011176
rel_clustering                                  -0.034735
rel_frequency                                    0.037709
rel_letters_count                               -0.003336
rel_orthographic_density                         0.089146
rel_synonyms_count                               0.274877
rel_aoa * rel_clustering                         0.036374
rel_aoa * rel_frequency                          0.000453
rel_aoa * rel_letters_count                     -0.003366
rel_aoa * rel_orthographic_density               0.001706
rel_aoa * rel_synonyms_count                     0.011586
rel_clustering * rel_frequency                  -0.012041
rel_clustering * rel_letters_count               0.011679
rel_clustering * rel_orthographic_density        0.023505
rel_clustering * rel_synonyms_count             -0.083757
rel_frequency * rel_letters_count               -0.002155
rel_frequency * rel_orthographic_density         0.009963
rel_frequency * rel_synonyms_count               0.023305
rel_letters_count * rel_orthographic_density    -0.018608
rel_letters_count * rel_synonyms_count           0.025240
rel_orthographic_density * rel_synonyms_count    0.021523
dtype: float64

Regressing global synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.01695233322730727

intercept                      0.670747
global_aoa                     0.025811
global_clustering              0.061418
global_frequency              -0.005214
global_letters_count          -0.014370
global_orthographic_density    0.027884
global_synonyms_count          0.127838
rel_aoa                       -0.028059
rel_clustering                -0.085852
rel_frequency                  0.027824
rel_letters_count              0.034163
rel_orthographic_density       0.011533
rel_synonyms_count            -0.042685
dtype: float64

Regressing global synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1666212781463056

intercept                                                 4.958313
global_aoa                                                0.140480
global_clustering                                         1.863552
global_frequency                                         -0.149586
global_letters_count                                      0.522900
global_orthographic_density                               0.858844
global_synonyms_count                                     2.896958
rel_aoa                                                  -0.247436
rel_clustering                                           -2.863095
rel_frequency                                             0.375589
rel_letters_count                                        -0.512830
rel_orthographic_density                                 -0.859195
rel_synonyms_count                                       -7.132185
global_aoa * global_clustering                           -0.063768
global_aoa * global_frequency                            -0.007146
global_aoa * global_letters_count                        -0.016263
global_aoa * global_orthographic_density                 -0.180966
global_aoa * global_synonyms_count                        0.049073
global_aoa * rel_aoa                                     -0.014231
global_aoa * rel_clustering                               0.084841
global_aoa * rel_frequency                               -0.032105
global_aoa * rel_letters_count                            0.002312
global_aoa * rel_orthographic_density                     0.208737
global_aoa * rel_synonyms_count                           0.015634
global_clustering * global_frequency                     -0.105885
global_clustering * global_letters_count                  0.060098
global_clustering * global_orthographic_density          -0.147131
global_clustering * global_synonyms_count                 0.093023
global_clustering * rel_aoa                               0.040184
global_clustering * rel_clustering                       -0.000743
global_clustering * rel_frequency                         0.069772
global_clustering * rel_letters_count                    -0.162221
global_clustering * rel_orthographic_density              0.053900
global_clustering * rel_synonyms_count                   -0.178784
global_frequency * global_letters_count                  -0.036106
global_frequency * global_orthographic_density           -0.114351
global_frequency * global_synonyms_count                 -0.080867
global_frequency * rel_aoa                                0.047401
global_frequency * rel_clustering                         0.121305
global_frequency * rel_frequency                          0.000957
global_frequency * rel_letters_count                     -0.025159
global_frequency * rel_orthographic_density               0.061364
global_frequency * rel_synonyms_count                     0.270748
global_letters_count * global_orthographic_density        0.145814
global_letters_count * global_synonyms_count              0.037268
global_letters_count * rel_aoa                            0.001819
global_letters_count * rel_clustering                     0.003252
global_letters_count * rel_frequency                      0.037664
global_letters_count * rel_letters_count                 -0.005898
global_letters_count * rel_orthographic_density          -0.153677
global_letters_count * rel_synonyms_count                 0.104860
global_orthographic_density * global_synonyms_count      -0.662082
global_orthographic_density * rel_aoa                     0.016612
global_orthographic_density * rel_clustering              0.339142
global_orthographic_density * rel_frequency               0.029532
global_orthographic_density * rel_letters_count          -0.094393
global_orthographic_density * rel_orthographic_density   -0.065953
global_orthographic_density * rel_synonyms_count          0.943295
global_synonyms_count * rel_aoa                           0.010090
global_synonyms_count * rel_clustering                    0.098114
global_synonyms_count * rel_frequency                     0.073535
global_synonyms_count * rel_letters_count                 0.001759
global_synonyms_count * rel_orthographic_density          0.952126
global_synonyms_count * rel_synonyms_count                0.129627
rel_aoa * rel_clustering                                 -0.012099
rel_aoa * rel_frequency                                  -0.010712
rel_aoa * rel_letters_count                              -0.006020
rel_aoa * rel_orthographic_density                       -0.085460
rel_aoa * rel_synonyms_count                             -0.034348
rel_clustering * rel_frequency                           -0.056020
rel_clustering * rel_letters_count                        0.105595
rel_clustering * rel_orthographic_density                -0.201150
rel_clustering * rel_synonyms_count                      -0.079021
rel_frequency * rel_letters_count                         0.000582
rel_frequency * rel_orthographic_density                 -0.013109
rel_frequency * rel_synonyms_count                       -0.230733
rel_letters_count * rel_orthographic_density              0.036799
rel_letters_count * rel_synonyms_count                   -0.102032
rel_orthographic_density * rel_synonyms_count            -1.186197
dtype: float64

Regressing rel synonyms_count with 493 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.12789544312152024

intercept                      0.393172
global_aoa                     0.019148
global_clustering             -0.002091
global_frequency              -0.034838
global_letters_count          -0.012373
global_orthographic_density    0.093433
global_synonyms_count         -0.585550
rel_aoa                       -0.021642
rel_clustering                -0.013955
rel_frequency                  0.043547
rel_letters_count              0.026939
rel_orthographic_density      -0.055805
rel_synonyms_count             0.785637
dtype: float64

Regressing rel synonyms_count with 493 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.25338962895257944

intercept                                                 7.786504
global_aoa                                                0.003371
global_clustering                                         2.017415
global_frequency                                         -0.357129
global_letters_count                                      0.289550
global_orthographic_density                               0.727178
global_synonyms_count                                    -0.162208
rel_aoa                                                   0.092541
rel_clustering                                           -3.113298
rel_frequency                                             0.505675
rel_letters_count                                        -0.581961
rel_orthographic_density                                 -0.996448
rel_synonyms_count                                       -3.675307
global_aoa * global_clustering                           -0.073108
global_aoa * global_frequency                            -0.008409
global_aoa * global_letters_count                        -0.013211
global_aoa * global_orthographic_density                 -0.149028
global_aoa * global_synonyms_count                        0.048844
global_aoa * rel_aoa                                     -0.008959
global_aoa * rel_clustering                               0.103342
global_aoa * rel_frequency                               -0.017312
global_aoa * rel_letters_count                            0.002556
global_aoa * rel_orthographic_density                     0.154074
global_aoa * rel_synonyms_count                           0.035683
global_clustering * global_frequency                     -0.115646
global_clustering * global_letters_count                  0.029199
global_clustering * global_orthographic_density          -0.117669
global_clustering * global_synonyms_count                 0.034512
global_clustering * rel_aoa                               0.073538
global_clustering * rel_clustering                        0.014888
global_clustering * rel_frequency                         0.079960
global_clustering * rel_letters_count                    -0.147567
global_clustering * rel_orthographic_density              0.001362
global_clustering * rel_synonyms_count                   -0.068844
global_frequency * global_letters_count                  -0.030660
global_frequency * global_orthographic_density           -0.089352
global_frequency * global_synonyms_count                  0.036427
global_frequency * rel_aoa                                0.041882
global_frequency * rel_clustering                         0.153186
global_frequency * rel_frequency                          0.003526
global_frequency * rel_letters_count                     -0.017296
global_frequency * rel_orthographic_density               0.046756
global_frequency * rel_synonyms_count                     0.170605
global_letters_count * global_orthographic_density        0.115217
global_letters_count * global_synonyms_count              0.110679
global_letters_count * rel_aoa                           -0.007347
global_letters_count * rel_clustering                     0.015580
global_letters_count * rel_frequency                      0.020459
global_letters_count * rel_letters_count                  0.000865
global_letters_count * rel_orthographic_density          -0.098332
global_letters_count * rel_synonyms_count                 0.016466
global_orthographic_density * global_synonyms_count      -0.523540
global_orthographic_density * rel_aoa                     0.002762
global_orthographic_density * rel_clustering              0.295390
global_orthographic_density * rel_frequency               0.006214
global_orthographic_density * rel_letters_count          -0.062113
global_orthographic_density * rel_orthographic_density   -0.037841
global_orthographic_density * rel_synonyms_count          0.736812
global_synonyms_count * rel_aoa                          -0.012908
global_synonyms_count * rel_clustering                    0.120287
global_synonyms_count * rel_frequency                    -0.021476
global_synonyms_count * rel_letters_count                -0.081865
global_synonyms_count * rel_orthographic_density          0.722634
global_synonyms_count * rel_synonyms_count                0.127171
rel_aoa * rel_clustering                                 -0.051601
rel_aoa * rel_frequency                                  -0.012430
rel_aoa * rel_letters_count                               0.000091
rel_aoa * rel_orthographic_density                       -0.037460
rel_aoa * rel_synonyms_count                             -0.044229
rel_clustering * rel_frequency                           -0.085462
rel_clustering * rel_letters_count                        0.093492
rel_clustering * rel_orthographic_density                -0.160733
rel_clustering * rel_synonyms_count                      -0.157722
rel_frequency * rel_letters_count                         0.006800
rel_frequency * rel_orthographic_density                  0.003908
rel_frequency * rel_synonyms_count                       -0.146058
rel_letters_count * rel_orthographic_density              0.008945
rel_letters_count * rel_synonyms_count                   -0.014753
rel_orthographic_density * rel_synonyms_count            -0.888328
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07121448706422207

intercept                      2.278306
global_aoa                    -0.025894
global_clustering              0.082316
global_frequency              -0.005078
global_letters_count          -0.071411
global_orthographic_density    0.132944
global_synonyms_count          0.048974
dtype: float64

Regressing global orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10301625552796823

intercept                                              6.238040
global_aoa                                            -0.476433
global_clustering                                      0.543454
global_frequency                                      -0.242219
global_letters_count                                  -0.292009
global_orthographic_density                            0.972665
global_synonyms_count                                  0.418100
global_aoa * global_clustering                        -0.003761
global_aoa * global_frequency                          0.031521
global_aoa * global_letters_count                      0.013590
global_aoa * global_orthographic_density               0.042182
global_aoa * global_synonyms_count                     0.037160
global_clustering * global_frequency                  -0.030373
global_clustering * global_letters_count              -0.045452
global_clustering * global_orthographic_density        0.090023
global_clustering * global_synonyms_count              0.049145
global_frequency * global_letters_count               -0.011350
global_frequency * global_orthographic_density        -0.041924
global_frequency * global_synonyms_count              -0.010002
global_letters_count * global_orthographic_density    -0.029654
global_letters_count * global_synonyms_count          -0.038992
global_orthographic_density * global_synonyms_count    0.036758
dtype: float64

Regressing rel orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.057882491373775546

intercept                     -0.161062
global_aoa                    -0.006345
global_clustering              0.099393
global_frequency               0.014047
global_letters_count          -0.071769
global_orthographic_density    0.099315
global_synonyms_count          0.065495
dtype: float64

Regressing rel orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08622896200634234

intercept                                              4.602772
global_aoa                                            -0.346161
global_clustering                                      0.674075
global_frequency                                      -0.211953
global_letters_count                                  -0.500760
global_orthographic_density                            0.510881
global_synonyms_count                                  0.337789
global_aoa * global_clustering                         0.017177
global_aoa * global_frequency                          0.029842
global_aoa * global_letters_count                      0.019161
global_aoa * global_orthographic_density               0.041542
global_aoa * global_synonyms_count                     0.025842
global_clustering * global_frequency                  -0.031063
global_clustering * global_letters_count              -0.084625
global_clustering * global_orthographic_density        0.075973
global_clustering * global_synonyms_count              0.031705
global_frequency * global_letters_count               -0.020013
global_frequency * global_orthographic_density        -0.015738
global_frequency * global_synonyms_count               0.001601
global_letters_count * global_orthographic_density    -0.006175
global_letters_count * global_synonyms_count          -0.030095
global_orthographic_density * global_synonyms_count   -0.046676
dtype: float64

Regressing global orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.060173644152595185

intercept                   1.658213
rel_aoa                     0.034633
rel_clustering              0.084777
rel_frequency              -0.003727
rel_letters_count          -0.089159
rel_orthographic_density    0.154298
rel_synonyms_count          0.018739
dtype: float64

Regressing global orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09018557710643671

intercept                                        1.611965
rel_aoa                                          0.105372
rel_clustering                                   0.201528
rel_frequency                                   -0.045350
rel_letters_count                               -0.089784
rel_orthographic_density                         0.295130
rel_synonyms_count                               0.167665
rel_aoa * rel_clustering                         0.057043
rel_aoa * rel_frequency                          0.023050
rel_aoa * rel_letters_count                      0.020537
rel_aoa * rel_orthographic_density               0.077047
rel_aoa * rel_synonyms_count                     0.076325
rel_clustering * rel_frequency                  -0.020989
rel_clustering * rel_letters_count              -0.079733
rel_clustering * rel_orthographic_density        0.049772
rel_clustering * rel_synonyms_count              0.077589
rel_frequency * rel_letters_count                0.020864
rel_frequency * rel_orthographic_density         0.026385
rel_frequency * rel_synonyms_count               0.060939
rel_letters_count * rel_orthographic_density    -0.064573
rel_letters_count * rel_synonyms_count          -0.015419
rel_orthographic_density * rel_synonyms_count   -0.009926
dtype: float64

Regressing rel orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11044897196921721

intercept                  -0.487571
rel_aoa                     0.034987
rel_clustering              0.106912
rel_frequency               0.039316
rel_letters_count          -0.085075
rel_orthographic_density    0.212391
rel_synonyms_count          0.019984
dtype: float64

Regressing rel orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13822488296128987

intercept                                       -0.492613
rel_aoa                                          0.070516
rel_clustering                                   0.207305
rel_frequency                                    0.035014
rel_letters_count                               -0.088106
rel_orthographic_density                         0.337092
rel_synonyms_count                               0.050004
rel_aoa * rel_clustering                         0.058521
rel_aoa * rel_frequency                          0.009159
rel_aoa * rel_letters_count                      0.019754
rel_aoa * rel_orthographic_density               0.075566
rel_aoa * rel_synonyms_count                     0.069621
rel_clustering * rel_frequency                  -0.031701
rel_clustering * rel_letters_count              -0.084528
rel_clustering * rel_orthographic_density        0.039571
rel_clustering * rel_synonyms_count              0.051361
rel_frequency * rel_letters_count                0.011608
rel_frequency * rel_orthographic_density         0.028398
rel_frequency * rel_synonyms_count               0.042169
rel_letters_count * rel_orthographic_density    -0.049550
rel_letters_count * rel_synonyms_count          -0.035099
rel_orthographic_density * rel_synonyms_count   -0.116771
dtype: float64

Regressing global orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08602014686472004

intercept                      2.054252
global_aoa                    -0.095499
global_clustering             -0.000699
global_frequency              -0.028364
global_letters_count          -0.015082
global_orthographic_density    0.193548
global_synonyms_count          0.133597
rel_aoa                        0.105472
rel_clustering                 0.121539
rel_frequency                  0.029013
rel_letters_count             -0.070474
rel_orthographic_density      -0.086319
rel_synonyms_count            -0.113666
dtype: float64

Regressing global orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.26698253715062525

intercept                                                -33.175582
global_aoa                                                -0.324361
global_clustering                                         -1.528058
global_frequency                                           1.380258
global_letters_count                                       2.823782
global_orthographic_density                               12.326865
global_synonyms_count                                     17.695921
rel_aoa                                                   -1.103758
rel_clustering                                             4.536707
rel_frequency                                             -0.890303
rel_letters_count                                         -0.746268
rel_orthographic_density                                 -10.769512
rel_synonyms_count                                        -9.201502
global_aoa * global_clustering                            -0.232128
global_aoa * global_frequency                             -0.056717
global_aoa * global_letters_count                         -0.039976
global_aoa * global_orthographic_density                  -0.186614
global_aoa * global_synonyms_count                         0.221819
global_aoa * rel_aoa                                      -0.036684
global_aoa * rel_clustering                                0.249020
global_aoa * rel_frequency                                 0.055113
global_aoa * rel_letters_count                             0.078966
global_aoa * rel_orthographic_density                      0.272854
global_aoa * rel_synonyms_count                           -0.322138
global_clustering * global_frequency                      -0.050236
global_clustering * global_letters_count                   0.081016
global_clustering * global_orthographic_density            1.225040
global_clustering * global_synonyms_count                  0.373493
global_clustering * rel_aoa                                0.196119
global_clustering * rel_clustering                         0.019804
global_clustering * rel_frequency                          0.036606
global_clustering * rel_letters_count                      0.170806
global_clustering * rel_orthographic_density              -0.970404
global_clustering * rel_synonyms_count                    -0.245306
global_frequency * global_letters_count                   -0.124686
global_frequency * global_orthographic_density            -0.282270
global_frequency * global_synonyms_count                  -0.842015
global_frequency * rel_aoa                                 0.195539
global_frequency * rel_clustering                         -0.113719
global_frequency * rel_frequency                           0.000041
global_frequency * rel_letters_count                       0.086966
global_frequency * rel_orthographic_density                0.258647
global_frequency * rel_synonyms_count                      0.416214
global_letters_count * global_orthographic_density        -0.056175
global_letters_count * global_synonyms_count              -1.455685
global_letters_count * rel_aoa                             0.147509
global_letters_count * rel_clustering                     -0.315863
global_letters_count * rel_frequency                       0.077916
global_letters_count * rel_letters_count                  -0.058327
global_letters_count * rel_orthographic_density            0.104315
global_letters_count * rel_synonyms_count                  1.111808
global_orthographic_density * global_synonyms_count       -0.333420
global_orthographic_density * rel_aoa                     -0.084311
global_orthographic_density * rel_clustering              -1.168525
global_orthographic_density * rel_frequency                0.084576
global_orthographic_density * rel_letters_count            0.026773
global_orthographic_density * rel_orthographic_density    -0.064304
global_orthographic_density * rel_synonyms_count           0.030511
global_synonyms_count * rel_aoa                           -0.086882
global_synonyms_count * rel_clustering                    -0.123906
global_synonyms_count * rel_frequency                      0.955101
global_synonyms_count * rel_letters_count                  1.378698
global_synonyms_count * rel_orthographic_density           0.392554
global_synonyms_count * rel_synonyms_count                -0.084484
rel_aoa * rel_clustering                                  -0.140466
rel_aoa * rel_frequency                                   -0.183198
rel_aoa * rel_letters_count                               -0.134651
rel_aoa * rel_orthographic_density                         0.113876
rel_aoa * rel_synonyms_count                               0.231498
rel_clustering * rel_frequency                             0.117153
rel_clustering * rel_letters_count                        -0.068332
rel_clustering * rel_orthographic_density                  0.848510
rel_clustering * rel_synonyms_count                        0.198412
rel_frequency * rel_letters_count                         -0.047078
rel_frequency * rel_orthographic_density                  -0.090093
rel_frequency * rel_synonyms_count                        -0.560119
rel_letters_count * rel_orthographic_density              -0.263583
rel_letters_count * rel_synonyms_count                    -1.168146
rel_orthographic_density * rel_synonyms_count             -0.277596
dtype: float64

Regressing rel orthographic_density with 405 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14891583677860598

intercept                      0.910744
global_aoa                    -0.068863
global_clustering              0.004358
global_frequency              -0.001831
global_letters_count           0.019993
global_orthographic_density   -0.502409
global_synonyms_count          0.159204
rel_aoa                        0.073846
rel_clustering                 0.132006
rel_frequency                  0.023012
rel_letters_count             -0.107140
rel_orthographic_density       0.677299
rel_synonyms_count            -0.154354
dtype: float64

Regressing rel orthographic_density with 405 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.32042134418603974

intercept                                                -31.448746
global_aoa                                                 0.202327
global_clustering                                         -0.827990
global_frequency                                           1.659872
global_letters_count                                       2.022448
global_orthographic_density                               10.526803
global_synonyms_count                                     14.754944
rel_aoa                                                   -1.297669
rel_clustering                                             3.660719
rel_frequency                                             -1.213300
rel_letters_count                                         -0.518968
rel_orthographic_density                                  -9.047435
rel_synonyms_count                                        -6.828948
global_aoa * global_clustering                            -0.186618
global_aoa * global_frequency                             -0.072067
global_aoa * global_letters_count                         -0.039024
global_aoa * global_orthographic_density                  -0.190034
global_aoa * global_synonyms_count                         0.162371
global_aoa * rel_aoa                                      -0.031281
global_aoa * rel_clustering                                0.178823
global_aoa * rel_frequency                                 0.075223
global_aoa * rel_letters_count                             0.090164
global_aoa * rel_orthographic_density                      0.289824
global_aoa * rel_synonyms_count                           -0.234289
global_clustering * global_frequency                      -0.039291
global_clustering * global_letters_count                  -0.002426
global_clustering * global_orthographic_density            0.932297
global_clustering * global_synonyms_count                  0.231766
global_clustering * rel_aoa                                0.169979
global_clustering * rel_clustering                         0.050573
global_clustering * rel_frequency                          0.031648
global_clustering * rel_letters_count                      0.212366
global_clustering * rel_orthographic_density              -0.670787
global_clustering * rel_synonyms_count                    -0.062392
global_frequency * global_letters_count                   -0.111787
global_frequency * global_orthographic_density            -0.364101
global_frequency * global_synonyms_count                  -0.737508
global_frequency * rel_aoa                                 0.197066
global_frequency * rel_clustering                         -0.123546
global_frequency * rel_frequency                           0.003169
global_frequency * rel_letters_count                       0.085541
global_frequency * rel_orthographic_density                0.353390
global_frequency * rel_synonyms_count                      0.336619
global_letters_count * global_orthographic_density        -0.015965
global_letters_count * global_synonyms_count              -1.231175
global_letters_count * rel_aoa                             0.132587
global_letters_count * rel_clustering                     -0.183795
global_letters_count * rel_frequency                       0.073130
global_letters_count * rel_letters_count                  -0.051977
global_letters_count * rel_orthographic_density            0.068175
global_letters_count * rel_synonyms_count                  0.933690
global_orthographic_density * global_synonyms_count       -0.188956
global_orthographic_density * rel_aoa                     -0.093938
global_orthographic_density * rel_clustering              -0.771291
global_orthographic_density * rel_frequency                0.184969
global_orthographic_density * rel_letters_count            0.019947
global_orthographic_density * rel_orthographic_density    -0.045203
global_orthographic_density * rel_synonyms_count          -0.046240
global_synonyms_count * rel_aoa                           -0.097392
global_synonyms_count * rel_clustering                     0.094537
global_synonyms_count * rel_frequency                      0.846022
global_synonyms_count * rel_letters_count                  1.126857
global_synonyms_count * rel_orthographic_density           0.144361
global_synonyms_count * rel_synonyms_count                -0.073454
rel_aoa * rel_clustering                                  -0.110770
rel_aoa * rel_frequency                                   -0.194860
rel_aoa * rel_letters_count                               -0.138469
rel_aoa * rel_orthographic_density                         0.090441
rel_aoa * rel_synonyms_count                               0.206786
rel_clustering * rel_frequency                             0.110118
rel_clustering * rel_letters_count                        -0.162294
rel_clustering * rel_orthographic_density                  0.439814
rel_clustering * rel_synonyms_count                       -0.083981
rel_frequency * rel_letters_count                         -0.055941
rel_frequency * rel_orthographic_density                  -0.190958
rel_frequency * rel_synonyms_count                        -0.482045
rel_letters_count * rel_orthographic_density              -0.249254
rel_letters_count * rel_synonyms_count                    -0.948202
rel_orthographic_density * rel_synonyms_count             -0.089557
dtype: float64