Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, plot_grid() and plot_overlay() also write their figure to disk.
SAVE_FIGURES = False

# Features shown in the "paper" variants of the figures.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): presumably the number of PCA components; not used in the
# part of the notebook visible here -- confirm.
N_COMPONENTS = 3

# Number of bins first tried when binning source feature values; the
# plotting helpers fall back to fewer bins if binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Database setup; presumably returns the database engine -- TODO confirm
# against brainscopypaste.utils.init_db.
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
model = Model(time=Time.continuous, source=Source.all, past=Past.last_bin,
              durl=Durl.all, max_distance=1)

# First pass: collect only the ids of the substitutions mined with `model`,
# so that each substitution can then be loaded in its own session.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Second pass: one record per (substitution, feature) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Observed feature values of the source and destination words,
            # absolute and relative to the sentence median.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                feature=feature,
                source=source,
                source_rel=source_rel,
                destination=destination,
                destination_rel=destination_rel,
                # Null-hypothesis values (H_0, and H_00 using the synonyms
                # of the source word), absolute and sentence-relative.
                h0=substitution.feature_average(feature),
                h0_rel=substitution.feature_average(
                    feature, sentence_relative='median'),
                h0n=substitution.feature_average(
                    feature, source_synonyms=True),
                h0n_rel=substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            ))

original_variations = pd.DataFrame(data)
del data


Got 44026 substitutions for model Model(time=Time.continuous, source=Source.all, past=Past.last_bin, durl=Durl.all, max_distance=1)
100% (44026 of 44026) |####################| Elapsed Time: 0:09:58 Time: 0:09:58

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values: observed ones and the two null hypotheses,
# absolute and sentence-relative.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# First average rows sharing the same destination/occurrence/position/feature,
# then average per cluster and feature.
# The value columns are selected with a *list*: indexing a groupby with a bare
# tuple of column names is deprecated and later rejected by pandas. The
# grouping keys ('cluster_id', 'feature') come back as columns thanks to
# as_index=False, so 'feature' does not need to be selected again.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# NOTE(review): the 'variation' column computed above is left untouched for
# the cropped rows -- confirm this is intended.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Order features by the docstring of their transformed-feature function
# (the same string is used as plot title/label further down).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of `h0` (row
    ``H_0``) and of `h0n` (row ``H_00``). A bin gets ``*``, ``**`` or ``***``
    when the observed mean lies outside the null mean +/- the Student-t
    confidence half-width of the observed values at alpha = .05, .01 and .001
    respectively, and ``ns.`` otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like of int
        Bin index (0-based) of each observation.
    h0, h0n : array-like
        Null-hypothesis values for each observation.
    values : array-like
        Observed values for each observation.
    """
    bin_count = int(bins.max()) + 1
    alphas = [.05, .01, .001]

    rule = '-' * len(name)
    print()
    print(rule)
    print(name)
    print(rule)
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Observed means and CI half-widths do not depend on the null hypothesis,
    # so compute them once.
    bin_values = np.zeros(bin_count)
    cis = np.full((bin_count, len(alphas)), np.nan)
    for i in range(bin_count):
        indices = bins == i
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        if n > 1:
            # Standard error of the mean: sample std (ddof=1) over sqrt(n).
            # Dividing by sqrt(n - 1) would apply the correction twice.
            sem = values[indices].std(ddof=1) / np.sqrt(n)
            for j, alpha in enumerate(alphas):
                cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.zeros(bin_count)
        for i in range(bin_count):
            bin_nulls[i] = nulls[bins == i].mean()

        # differences[i, j]: is bin i significant at level alphas[j]?
        # Comparisons with NaN (empty or single-element bins) are False,
        # which yields 'ns.'.
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))

        cells = []
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level reached (0, 1 or 2).
                n_stars = np.where(differences[i])[0].max()
                cells.append('*' * (1 + n_stars) + ' ' * (2 - n_stars))
            else:
                cells.append('ns.')
        print(null_name + ' |'
              + ''.join(' ' + cell + ' |' for cell in cells))

In [7]:
def plot_variation(**kwargs):
    """Plot destination feature values against binned source feature values.

    Designed to be passed to `FacetGrid.map_dataframe` (see `plot_grid`);
    draws on the current pyplot axes and prints a significance table through
    `print_significance`.

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'destination', 'h0' and 'h0n' (or
        their '_rel' counterparts when `relative` is true) and
        `feature_field`.
    color : optional
        Line colour (default 'blue').
    relative : bool, optional
        Use the sentence-relative ('_rel') columns (default False).
    quantiles : bool, optional
        Bin the source values by quantiles (`pd.qcut`) instead of
        fixed-width bins (`pd.cut`) (default False).
    feature_field : str, optional
        Column whose first value names the significance table
        (default 'feature').

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and fall back to fewer bins
    # whenever the cut fails (e.g. non-unique quantile edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        raise ValueError("Could not bin {} source values into at most {} "
                         "bins".format(len(x), BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.full(bin_count, np.nan)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        if n > 1:
            # 95% CI half-width: t quantile times the standard error of the
            # mean, i.e. sample std (ddof=1) over sqrt(n) -- not sqrt(n - 1),
            # which would apply the small-sample correction twice.
            cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                      / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    muted = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=muted, alpha=0.2)
    plt.plot(middles, h0s, '--', color=muted,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=muted,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=muted,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently so that the upper limit is still widened when the lower
    # one had to be widened too.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one facet per feature with `plot_function`, three per row.

    Parameters
    ----------
    data : DataFrame
        Rows whose `feature_field` value is in `features` are plotted.
    features : list of str
        Features to plot, in facet (and hue) order.
    filename : str
        Name inserted into `settings.FIGURE` when SAVE_FIGURES is set.
    plot_function : callable
        Passed to `FacetGrid.map_dataframe`, with `plot_kws` as extra
        keyword arguments.
    xlabel, ylabel : str
        Axis labels applied to the facets.
    feature_field : str
        Column used to split the data into facets.
    plot_kws : dict
        Extra keyword arguments for `plot_function` (never modified here).
    """
    selected = data[data[feature_field].isin(features)]
    grid = sb.FacetGrid(data=selected,
                        col=feature_field, col_order=features,
                        hue=feature_field, hue_order=features,
                        col_wrap=3, sharex=False, sharey=False,
                        aspect=1.5, size=3)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if legend:
            # Something was plotted on these axes: style the legend box and
            # replace the raw feature name in the title by the docstring of
            # its transformed feature.
            box = legend.get_frame()
            box.set_facecolor('#f2f2f2')
            box.set_edgecolor('#000000')
            raw_title = ax.get_title()
            ax.set_title(
                Substitution._transformed_feature(raw_title).__doc__)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        grid.fig.savefig(target, bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (observed minus H_00) of one feature on `ax`.

    The x axis is the bin rank spread evenly over [0, 1]; the y axis is the
    per-bin mean destination value minus the per-bin mean of 'h0n', divided
    by the absolute mean of the per-bin 'h0' averages.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : DataFrame
        Rows of a single feature, with the columns 'feature', 'source',
        'destination', 'h0' and 'h0n' (or the '_rel' counterparts of the
        last four when `relative` is true).
    color :
        Line colour; a desaturated version is used for the CI band.
    ci : bool
        Also draw the 95% confidence band.
    relative : bool
        Use the sentence-relative ('_rel') columns.
    quantiles : bool
        Bin the source values by quantiles (`pd.qcut`) instead of
        fixed-width bins (`pd.cut`).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and fall back to fewer bins
    # whenever the cut fails (e.g. non-unique quantile edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        raise ValueError("Could not bin {} source values into at most {} "
                         "bins".format(len(x), BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.full(bin_count, np.nan)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        if n > 1:
            # 95% CI half-width: t quantile times the standard error of the
            # mean, i.e. sample std (ddof=1) over sqrt(n) -- not sqrt(n - 1),
            # which would apply the small-sample correction twice.
            cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                      / np.sqrt(n))

    # Plot.
    # Ignore empty bins (NaN averages) when computing the normalisation, so
    # that one empty fixed-width bin does not blank out the whole curve.
    scale = abs(np.nanmean(h0s))
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one curve per feature on a single 12x6 figure.

    For each feature, the rows of `data` for that feature (with rows
    containing NaN dropped) are handed to
    `plot_function(ax, rows, color=..., **plot_kws)`, each feature getting
    its own colour from the `palette_name` seaborn palette. The figure is
    saved under `filename` when SAVE_FIGURES is set.

    Returns the axes, so that callers can tweak them further.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = sb.color_palette(palette_name, len(features))

    for feature, feature_color in zip(features, colors):
        feature_rows = data[data.feature == feature].dropna()
        plot_function(ax, feature_rows, color=feature_color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        fig.savefig(target, bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, absolute feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | ns. | *** | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | *** | **  |
H_00 | *** | **  | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | ns. | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | *** | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | **  | *   | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay: all features, fixed-width bins, absolute values, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False});



In [13]:
# Paper features, fixed-width bins, absolute feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | *** | **  |

In [14]:
# Bias overlay: paper features, fixed-width bins, absolute values, with CIs.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-fixedbins_global',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all features',
              xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
              ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                      '\n(normalised to feature average)'))
 .set_ylim(-2, .7));


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | ns. | ns. | *** |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | *** | ns. | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *** | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | **  | ns. | *** |

In [16]:
# Bias overlay: all features, quantile bins, absolute values, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# Paper features, quantile bins, absolute feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

In [18]:
# Bias overlay: paper features, quantile bins, absolute values, with CIs.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-quantilebins_global',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all features',
              xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
              ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                      '\n(normalised to feature average)'),
              plot_kws={'quantiles': True})
 .set_ylim(-1.2, .6));


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | ns. | **  | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | **  | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | **  | *** | ns. | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *   | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | **  |
H_00 | *** | *** | **  | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | *** | *** |
H_00 | *** | *** | *   | ns. |

In [20]:
# Bias overlay: all features, fixed-width bins, sentence-relative, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True});



In [21]:
# Paper features, fixed-width bins, sentence-relative feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | **  | *** | ns. | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | **  |
H_00 | *** | *** | **  | ns. |

In [22]:
# Bias overlay: paper features, fixed-width bins, sentence-relative, with CIs.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-fixedbins_sentencerel',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all sentence-relative features',
              xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                      r' (normalised to $[0, 1]$)'),
              ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                      '\n(normalised to sentence-relative feature average)'),
              plot_kws={'relative': True})
 .set_ylim(-2, .7));


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | ns. | ns. | **  |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | ns. | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | ns. | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | *   | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | **  |

In [24]:
# Bias overlay: all features, quantile bins, sentence-relative, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features, quantile bins, sentence-relative feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | ns. | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

In [26]:
# Bias overlay: paper features, quantile bins, sentence-relative, with CIs.
# Uses the 'deep' palette like the three other paper-feature overlays, so
# that a given feature keeps the same colour across the paper figures
# ('husl' is what the all-features overlays use).
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see how absolute and relative feature values relate, i.e. how their effects interact. In particular, we want to know which effect wins among cognitive bias, attraction to the sentence average, and attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) space.

    Meant to be used through `FacetGrid.map_dataframe`, which provides the
    `data` and `color` keyword arguments.

    The source words are binned on a regular grid, using their absolute
    feature value (`source`, x axis) and their sentence-relative feature
    value (`source_rel`, y axis). For each grid cell, the arrow direction is
    the mean displacement from source to destination, and the color is the
    mean Euclidean length of that displacement. Cells containing no
    substitution get NaN values.

    Two reference lines are added: a solid horizontal line at y = 0, and a
    dashed vertical line at the first `h0` value found in `data`.

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : matplotlib color, optional
        Base color for the reference lines (desaturated before use);
        defaults to 'blue'.
    bin_count : int, optional
        Number of equal-width bins along each axis; defaults to 4.

    """

    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Arrays are indexed [y, x], i.e. rows follow the
    # relative-value axis, as the streamplot call below expects.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask and the displacements only once per cell.
            in_cell = (x_bins == x) & (y_bins == y)
            delta = dest[in_cell] - source[in_cell]
            delta_rel = dest_rel[in_cell] - source_rel[in_cell]
            u_values[y, x] = delta.mean()
            v_values[y, x] = delta_rel.mean()
            strength[y, x] = np.sqrt(delta ** 2 + delta_rel ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One streamplot panel per feature, three panels per row, each panel with
# its own axis limits and its own hue.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features,
                 hue='feature', hue_order=ordered_features,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes without a legend had nothing plotted on them and are left
        # untouched (including their title).
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name in the title with the docstring of
        # the corresponding transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same streamplot grid, restricted to the features exposed in the paper.
in_paper = variations['feature'].isin(PAPER_FEATURES)
g = sb.FacetGrid(data=variations[in_paper],
                 col='feature', col_order=PAPER_FEATURES,
                 hue='feature', hue_order=PAPER_FEATURES,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes without a legend had nothing plotted on them and are left
        # untouched (including their title).
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name in the title with the docstring of
        # the corresponding transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on per-cluster feature variations, keeping only the clusters
# for which every feature variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the PCA found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Loadings of the first components, one row per component (cell output).
component_labels = list(map('Component-{}'.format, range(N_COMPONENTS)))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=component_labels)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.51942384  0.18695104  0.08285583  0.07190614  0.03608469  0.02768275
  0.02047687  0.01876023  0.01457781  0.00929253  0.00691246]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 0.520881 -0.220351 0.070375 -0.211860 -0.212291 0.444163 -0.212616 -0.239523 0.416014 -0.286982 0.173024 0.001444
Component-1 0.420499 -0.342730 0.118748 -0.299205 -0.240911 -0.415728 0.130980 -0.285275 -0.452841 0.194741 -0.167909 0.038784
Component-2 0.457062 0.634376 -0.087370 0.252691 -0.481003 -0.093363 -0.005193 0.263490 -0.038947 0.046433 -0.044208 -0.044873

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one row per (substitution, component) couple.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Fields identifying the substitution, shared by all its rows.
        identity = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            h0 = substitution.component_average(component, pca,
                                                pcafeatures)
            h0n = substitution.component_average(component, pca,
                                                 pcafeatures,
                                                 source_synonyms=True)
            data.append(dict(identity,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (44026 of 44026) |####################| Elapsed Time: 0:06:51 Time: 0:06:51

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# First average the rows sharing a destination, occurrence, position and
# component ...
substitution_keys = ['destination_id', 'occurrence', 'position', 'component']
per_substitution = original_component_variations\
    .groupby(substitution_keys, as_index=False).mean()

# ... then average those inside each cluster, component by component.
# NOTE(review): selecting groupby columns with a bare tuple of names (as
# below) is deprecated in recent pandas -- confirm against the pinned version.
component_variations = per_substitution\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per principal component, each with its own axis limits and hue.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Axes without a legend had nothing plotted on them.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | ns. | *** | **  |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | **  | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *   |
H_00 | ns. | *** | *** | *** |

4.2 On a subset of relevant features


In [34]:
# Features the reduced PCA of this section is computed on.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit the PCA on per-cluster variations of the relevant features only,
# keeping the clusters for which all of those variations are defined.
pcafeatures = tuple(sorted(relevant_features))
is_pcafeature = variations['feature'].isin(pcafeatures)
pcavariations = variations[is_pcafeature]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the PCA found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of every component, one row per component (cell output).
component_labels = list(map('Component-{}'.format,
                            range(pca.n_components_)))
pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=component_labels)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.65598158  0.21098027]

Out[35]:
aoa frequency letters_count
Component-0 0.753213 -0.396061 0.525173
Component-1 -0.345657 0.440958 0.828298

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one row per (substitution, component) couple.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Fields identifying the substitution, shared by all its rows.
        identity = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            h0 = substitution.component_average(component, pca,
                                                pcafeatures)
            h0n = substitution.component_average(component, pca,
                                                 pcafeatures,
                                                 source_synonyms=True)
            data.append(dict(identity,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (44026 of 44026) |####################| Elapsed Time: 0:04:36 Time: 0:04:36

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# First average the rows sharing a destination, occurrence, position and
# component ...
substitution_keys = ['destination_id', 'occurrence', 'position', 'component']
per_substitution = original_component_variations\
    .groupby(substitution_keys, as_index=False).mean()

# ... then average those inside each cluster, component by component.
# NOTE(review): selecting groupby columns with a bare tuple of names (as
# below) is deprecated in recent pandas -- confirm against the pinned version.
component_variations = per_substitution\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components


In [38]:
# One panel per principal component, each with its own axis limits and hue.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes without a legend had nothing plotted on them.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | ns. |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up each transformed feature once, instead of once per word.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

# Calling a transformed feature with no argument gives the words it knows.
for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined.
# Start from the union of the words known to at least one feature ...
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# ... and build one row per word, one column per feature. The rows that
# survive dropna() below are the words for which every feature is defined.
words_list = list(words)
wordsdf = pd.DataFrame(dict(
    (feature, [tfeature(word) for word in words_list])
    for feature, tfeature in tfeatures
))
wordsdf['words'] = words_list
del words_list

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1107 (cluster-unique) substitutions, but the PCA is in fact computed on 871 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to test, as (label, binning function) pairs.
# Quantile binning is disabled; add ('quantiles', pd.qcut) to enable it.
cuts = [('fixed bins', pd.cut)]
# Feature-value variants, as (label, column-name suffix) pairs.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since every comparison with
    NaN is false).

    """

    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value, and for each enabled type of binning (currently fixed-width bins only; quantile binning is disabled in `cuts`). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# Pivot each source/destination column once (cluster_id x feature), instead
# of re-pivoting `variations` for every combination tested below.
pivots = dict(
    (kind + suffix,
     variations.pivot(index='cluster_id', columns='feature',
                      values=kind + suffix))
    for kind in ('source', 'destination')
    for (rel_label, suffix) in rels
)

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, falling back to fewer bins when the binning
            # function rejects the current bin count. The bins only depend
            # on the source values, so they are shared by all destinations.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    continue
            else:
                # No bin count worked: make sure we don't silently reuse
                # the bins of a previous combination.
                source_bins = None

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                destination = pivots['destination' + rel2][feature2]

                # One-way Anova of the destination values across the
                # source bins; only the p-value is used.
                p = stats.f_oneway(*[destination[source_bins == i]
                                     .dropna()
                                     for i in range(bin_count)])[1]
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
    * global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
   ** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot each source/destination column once (cluster_id x feature), instead
# of re-pivoting `variations` for every combination tested below.
pivots = dict(
    (kind + suffix,
     variations.pivot(index='cluster_id', columns='feature',
                      values=kind + suffix))
    for kind in ('source', 'destination')
    for (rel_label, suffix) in rels
)

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, falling back to fewer bins when the binning
            # function rejects the current bin count. The bins only depend
            # on the source values, so they are shared by all destinations.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    continue
            else:
                # No bin count worked: make sure we don't silently reuse
                # the bins of a previous combination.
                source_bins = None

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # What we test here is the variation of feature2 upon
                # substitution (destination - source); the name
                # `destination` is kept from the previous analysis.
                destination = pivots['destination' + rel2][feature2]\
                    - pivots['source' + rel2][feature2]

                # One-way Anova of the variations across the source bins;
                # only the p-value is used.
                p = stats.f_oneway(*[destination[source_bins == i]
                                     .dropna()
                                     for i in range(bin_count)])[1]
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" flag to a (short label, column-name suffix) pair.
# NOTE: this rebinds `rels`, which the Anova cells above iterate over as a
# list of (label, suffix) pairs; those cells can no longer be re-run once
# this cell has been executed.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Regress the destination value of `target` on source feature values.

    Fits a scikit-learn `LinearRegression` with one observation per
    `cluster_id`, then prints the number of observations, the R^2 score
    (computed on the fitting data) and the fitted coefficients.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table. The columns read are `cluster_id`, `feature`,
        the source column(s) selected by `source_rel` (`source` and/or
        `source_rel`) and the destination column selected by `dest_rel`
        (`destination` or `destination_rel`).
    features : iterable of str
        Names of the features used as regressors; sorted before use.
    target : str
        Name of the feature whose destination value is regressed.
    source_rel : bool or 'both', optional
        `False` uses the `source` column, `True` the `source_rel` column,
        `'both'` uses the two sets of regressors together.
    dest_rel : bool, optional
        `False` regresses the `destination` column, `True` the
        `destination_rel` column.
    interactions : bool, optional
        If true, add all pairwise products of regressors (degree-2,
        interaction-only polynomial features).

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If `source_rel` is not `True`, `False` or `'both'`, or if
        `dest_rel` is not a bool.
    """
    # Validate with isinstance rather than `in [True, False, 'both']`:
    # list membership uses ==, so 1/0 (or numpy bools) would pass and then
    # be silently treated as 'both' below.
    if not (isinstance(source_rel, bool)
            or (isinstance(source_rel, str) and source_rel == 'both')):
        raise ValueError("source_rel must be True, False or 'both', got {!r}"
                         .format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}".format(dest_rel))

    # Process source/destination relativeness arguments.
    source_rels = [source_rel] if isinstance(source_rel, bool) \
        else [False, True]
    dest_rel_name, dest_suffix = rels[dest_rel]

    features = tuple(sorted(features))
    # (column, feature) keys into the pivoted table, and the matching
    # human-readable names, in the same order.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rels
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rels
                     for feature in features]

    # Get source values: one row per cluster, one column per
    # (source column, feature). Rows missing any regressor are dropped.
    # Note that pivot_table averages duplicate (cluster_id, feature) rows.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rels],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Get destination values from `data` (not from a notebook global), and
    # align them on the clusters kept above. `reindex` yields NaN for
    # clusters that have no destination value, which `dropna` then removes;
    # `.loc` with missing labels would raise KeyError on recent pandas.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_suffix)[target]\
        .reindex(source.index).dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the regressors it multiplies;
        # the all-zero powers row is the constant (bias) column.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress. With interactions, the polynomial expansion already holds a
    # constant column, so the model must not fit a second intercept.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # Series.append was removed in pandas 2.0; concat is equivalent.
        intercept = pd.Series(index=['intercept'], data=[linreg.intercept_])
        coeffs = pd.concat([intercept, coeffs])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For each paper feature, run every regression variant: each source
# setting (global, relative, both) crossed with each destination setting
# (global, relative), first without and then with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel, with_interactions in product(
            [False, True, 'both'], [False, True], [False, True]):
        regress(variations, PAPER_FEATURES, target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=with_interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.19047678424659464

intercept                      3.726962
global_aoa                     0.099202
global_clustering              0.030530
global_frequency               0.507443
global_letters_count          -0.008235
global_orthographic_density    0.060528
global_synonyms_count          0.029526
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.2069830506690662

intercept                                              5.432534
global_aoa                                            -0.004253
global_clustering                                      1.023872
global_frequency                                       0.396384
global_letters_count                                   0.244999
global_orthographic_density                            2.148272
global_synonyms_count                                  0.345401
global_aoa * global_clustering                        -0.048037
global_aoa * global_frequency                         -0.005962
global_aoa * global_letters_count                     -0.008607
global_aoa * global_orthographic_density              -0.096420
global_aoa * global_synonyms_count                     0.059365
global_clustering * global_frequency                  -0.081925
global_clustering * global_letters_count              -0.006048
global_clustering * global_orthographic_density        0.122006
global_clustering * global_synonyms_count             -0.005581
global_frequency * global_letters_count               -0.026766
global_frequency * global_orthographic_density        -0.121701
global_frequency * global_synonyms_count              -0.027542
global_letters_count * global_orthographic_density     0.067813
global_letters_count * global_synonyms_count          -0.081084
global_orthographic_density * global_synonyms_count   -0.002169
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.11606494489510898

intercept                     -7.891936
global_aoa                     0.135882
global_clustering             -0.014251
global_frequency               0.452093
global_letters_count           0.014313
global_orthographic_density    0.001746
global_synonyms_count          0.138610
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.12812471724490293

intercept                                             -9.145767
global_aoa                                            -0.151421
global_clustering                                     -0.366393
global_frequency                                       0.512319
global_letters_count                                   0.146639
global_orthographic_density                            1.191634
global_synonyms_count                                  0.052717
global_aoa * global_clustering                        -0.018622
global_aoa * global_frequency                          0.004800
global_aoa * global_letters_count                      0.021770
global_aoa * global_orthographic_density              -0.031595
global_aoa * global_synonyms_count                     0.068469
global_clustering * global_frequency                  -0.000312
global_clustering * global_letters_count               0.038676
global_clustering * global_orthographic_density        0.151280
global_clustering * global_synonyms_count              0.231528
global_frequency * global_letters_count               -0.012895
global_frequency * global_orthographic_density        -0.060630
global_frequency * global_synonyms_count               0.115031
global_letters_count * global_orthographic_density     0.073343
global_letters_count * global_synonyms_count          -0.028316
global_orthographic_density * global_synonyms_count    0.110558
dtype: float64

Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09830436813397614

intercept                   9.633335
rel_aoa                     0.075844
rel_clustering             -0.037147
rel_frequency               0.312715
rel_letters_count           0.020224
rel_orthographic_density    0.073582
rel_synonyms_count         -0.145271
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11748649311635383

intercept                                        9.408814
rel_aoa                                          0.060851
rel_clustering                                   0.010946
rel_frequency                                    0.239298
rel_letters_count                                0.135718
rel_orthographic_density                        -0.062565
rel_synonyms_count                              -0.064851
rel_aoa * rel_clustering                        -0.032845
rel_aoa * rel_frequency                          0.004244
rel_aoa * rel_letters_count                     -0.015190
rel_aoa * rel_orthographic_density              -0.065792
rel_aoa * rel_synonyms_count                     0.017898
rel_clustering * rel_frequency                  -0.008241
rel_clustering * rel_letters_count              -0.004922
rel_clustering * rel_orthographic_density        0.035270
rel_clustering * rel_synonyms_count              0.143233
rel_frequency * rel_letters_count                0.013092
rel_frequency * rel_orthographic_density        -0.052116
rel_frequency * rel_synonyms_count               0.011058
rel_letters_count * rel_orthographic_density     0.037500
rel_letters_count * rel_synonyms_count          -0.161908
rel_orthographic_density * rel_synonyms_count   -0.291181
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3673232746824088

intercept                  -1.037386
rel_aoa                     0.084208
rel_clustering              0.080803
rel_frequency               0.676724
rel_letters_count          -0.044000
rel_orthographic_density   -0.072526
rel_synonyms_count         -0.000250
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.38294756429363525

intercept                                       -1.163849
rel_aoa                                          0.045675
rel_clustering                                   0.166338
rel_frequency                                    0.668117
rel_letters_count                                0.013722
rel_orthographic_density                        -0.288323
rel_synonyms_count                              -0.007402
rel_aoa * rel_clustering                        -0.051858
rel_aoa * rel_frequency                         -0.034156
rel_aoa * rel_letters_count                     -0.010806
rel_aoa * rel_orthographic_density               0.016690
rel_aoa * rel_synonyms_count                     0.080283
rel_clustering * rel_frequency                   0.012404
rel_clustering * rel_letters_count              -0.008656
rel_clustering * rel_orthographic_density       -0.014936
rel_clustering * rel_synonyms_count              0.177393
rel_frequency * rel_letters_count               -0.008394
rel_frequency * rel_orthographic_density        -0.063823
rel_frequency * rel_synonyms_count               0.009399
rel_letters_count * rel_orthographic_density     0.036523
rel_letters_count * rel_synonyms_count          -0.096638
rel_orthographic_density * rel_synonyms_count   -0.096232
dtype: float64

Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.19797884855269432

intercept                      2.747036
global_aoa                     0.088538
global_clustering              0.022229
global_frequency               0.562228
global_letters_count           0.016191
global_orthographic_density    0.113676
global_synonyms_count          0.464514
rel_aoa                        0.018255
rel_clustering                 0.000057
rel_frequency                 -0.060299
rel_letters_count             -0.025593
rel_orthographic_density      -0.068704
rel_synonyms_count            -0.494072
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.285920049974843

intercept                                                -19.656677
global_aoa                                                -0.015070
global_clustering                                         -4.081345
global_frequency                                           0.392742
global_letters_count                                       1.856105
global_orthographic_density                                5.764218
global_synonyms_count                                      9.710227
rel_aoa                                                    1.558929
rel_clustering                                             5.212280
rel_frequency                                             -0.564556
rel_letters_count                                         -0.877700
rel_orthographic_density                                  -1.184421
rel_synonyms_count                                        -8.365714
global_aoa * global_clustering                             0.129420
global_aoa * global_frequency                              0.092051
global_aoa * global_letters_count                          0.024820
global_aoa * global_orthographic_density                  -0.029194
global_aoa * global_synonyms_count                        -0.336740
global_aoa * rel_aoa                                      -0.014452
global_aoa * rel_clustering                               -0.108287
global_aoa * rel_frequency                                 0.002180
global_aoa * rel_letters_count                            -0.028297
global_aoa * rel_orthographic_density                     -0.109569
global_aoa * rel_synonyms_count                            0.365916
global_clustering * global_frequency                      -0.022018
global_clustering * global_letters_count                   0.183170
global_clustering * global_orthographic_density            1.179149
global_clustering * global_synonyms_count                  0.314235
global_clustering * rel_aoa                               -0.123110
global_clustering * rel_clustering                         0.105294
global_clustering * rel_frequency                         -0.126361
global_clustering * rel_letters_count                     -0.030328
global_clustering * rel_orthographic_density              -0.680159
global_clustering * rel_synonyms_count                    -0.295326
global_frequency * global_letters_count                   -0.130282
global_frequency * global_orthographic_density             0.062166
global_frequency * global_synonyms_count                  -0.620580
global_frequency * rel_aoa                                -0.192761
global_frequency * rel_clustering                          0.020551
global_frequency * rel_frequency                           0.014370
global_frequency * rel_letters_count                       0.115939
global_frequency * rel_orthographic_density               -0.228688
global_frequency * rel_synonyms_count                      0.551455
global_letters_count * global_orthographic_density         0.175928
global_letters_count * global_synonyms_count               0.184610
global_letters_count * rel_aoa                            -0.046984
global_letters_count * rel_clustering                     -0.313593
global_letters_count * rel_frequency                      -0.017142
global_letters_count * rel_letters_count                   0.025517
global_letters_count * rel_orthographic_density           -0.008455
global_letters_count * rel_synonyms_count                 -0.497391
global_orthographic_density * global_synonyms_count        0.139634
global_orthographic_density * rel_aoa                     -0.042767
global_orthographic_density * rel_clustering              -1.110315
global_orthographic_density * rel_frequency               -0.192044
global_orthographic_density * rel_letters_count           -0.193937
global_orthographic_density * rel_orthographic_density     0.057716
global_orthographic_density * rel_synonyms_count           0.157039
global_synonyms_count * rel_aoa                            0.321421
global_synonyms_count * rel_clustering                    -0.670574
global_synonyms_count * rel_frequency                      0.349510
global_synonyms_count * rel_letters_count                  0.220039
global_synonyms_count * rel_orthographic_density           0.385659
global_synonyms_count * rel_synonyms_count                -0.035949
rel_aoa * rel_clustering                                   0.030273
rel_aoa * rel_frequency                                    0.082948
rel_aoa * rel_letters_count                                0.040772
rel_aoa * rel_orthographic_density                         0.076619
rel_aoa * rel_synonyms_count                              -0.293131
rel_clustering * rel_frequency                             0.136826
rel_clustering * rel_letters_count                         0.149523
rel_clustering * rel_orthographic_density                  0.645926
rel_clustering * rel_synonyms_count                        0.957367
rel_frequency * rel_letters_count                          0.022290
rel_frequency * rel_orthographic_density                   0.267155
rel_frequency * rel_synonyms_count                        -0.223123
rel_letters_count * rel_orthographic_density               0.178850
rel_letters_count * rel_synonyms_count                    -0.001622
rel_orthographic_density * rel_synonyms_count             -0.866867
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.4257321099253483

intercept                      2.581178
global_aoa                     0.084128
global_clustering              0.108695
global_frequency              -0.384259
global_letters_count           0.054841
global_orthographic_density    0.137453
global_synonyms_count          0.426886
rel_aoa                        0.009671
rel_clustering                -0.054242
rel_frequency                  0.917238
rel_letters_count             -0.063186
rel_orthographic_density      -0.076745
rel_synonyms_count            -0.422662
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4850129858737386

intercept                                                -22.641827
global_aoa                                                 0.037243
global_clustering                                         -4.247854
global_frequency                                          -0.310068
global_letters_count                                       2.115625
global_orthographic_density                                5.956233
global_synonyms_count                                      8.994150
rel_aoa                                                    1.293726
rel_clustering                                             5.639225
rel_frequency                                              0.302927
rel_letters_count                                         -1.351099
rel_orthographic_density                                  -1.892239
rel_synonyms_count                                        -7.888416
global_aoa * global_clustering                             0.112817
global_aoa * global_frequency                              0.079453
global_aoa * global_letters_count                          0.019860
global_aoa * global_orthographic_density                  -0.038002
global_aoa * global_synonyms_count                        -0.312003
global_aoa * rel_aoa                                      -0.011807
global_aoa * rel_clustering                               -0.106317
global_aoa * rel_frequency                                 0.008214
global_aoa * rel_letters_count                            -0.010702
global_aoa * rel_orthographic_density                     -0.078742
global_aoa * rel_synonyms_count                            0.344122
global_clustering * global_frequency                       0.013262
global_clustering * global_letters_count                   0.248408
global_clustering * global_orthographic_density            1.089125
global_clustering * global_synonyms_count                  0.212903
global_clustering * rel_aoa                               -0.125091
global_clustering * rel_clustering                         0.077240
global_clustering * rel_frequency                         -0.114621
global_clustering * rel_letters_count                     -0.075586
global_clustering * rel_orthographic_density              -0.587125
global_clustering * rel_synonyms_count                    -0.223068
global_frequency * global_letters_count                   -0.095668
global_frequency * global_orthographic_density             0.024579
global_frequency * global_synonyms_count                  -0.578379
global_frequency * rel_aoa                                -0.175141
global_frequency * rel_clustering                         -0.034348
global_frequency * rel_frequency                           0.027492
global_frequency * rel_letters_count                       0.104442
global_frequency * rel_orthographic_density               -0.161197
global_frequency * rel_synonyms_count                      0.498954
global_letters_count * global_orthographic_density         0.137124
global_letters_count * global_synonyms_count               0.104261
global_letters_count * rel_aoa                            -0.045945
global_letters_count * rel_clustering                     -0.380758
global_letters_count * rel_frequency                      -0.028163
global_letters_count * rel_letters_count                   0.023415
global_letters_count * rel_orthographic_density            0.051542
global_letters_count * rel_synonyms_count                 -0.378660
global_orthographic_density * global_synonyms_count        0.012548
global_orthographic_density * rel_aoa                     -0.032001
global_orthographic_density * rel_clustering              -1.058357
global_orthographic_density * rel_frequency               -0.141589
global_orthographic_density * rel_letters_count           -0.130411
global_orthographic_density * rel_orthographic_density     0.069164
global_orthographic_density * rel_synonyms_count           0.286060
global_synonyms_count * rel_aoa                            0.311466
global_synonyms_count * rel_clustering                    -0.535651
global_synonyms_count * rel_frequency                      0.294825
global_synonyms_count * rel_letters_count                  0.255376
global_synonyms_count * rel_orthographic_density           0.451108
global_synonyms_count * rel_synonyms_count                -0.032051
rel_aoa * rel_clustering                                   0.042265
rel_aoa * rel_frequency                                    0.063912
rel_aoa * rel_letters_count                                0.024405
rel_aoa * rel_orthographic_density                         0.044945
rel_aoa * rel_synonyms_count                              -0.284740
rel_clustering * rel_frequency                             0.145516
rel_clustering * rel_letters_count                         0.198052
rel_clustering * rel_orthographic_density                  0.565061
rel_clustering * rel_synonyms_count                        0.833759
rel_frequency * rel_letters_count                          0.016430
rel_frequency * rel_orthographic_density                   0.189602
rel_frequency * rel_synonyms_count                        -0.167934
rel_letters_count * rel_orthographic_density               0.100142
rel_letters_count * rel_synonyms_count                    -0.074369
rel_orthographic_density * rel_synonyms_count             -0.929434
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 635 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.21641069215510733

intercept                      4.806943
global_aoa                     0.401947
global_clustering              0.065190
global_frequency              -0.020297
global_letters_count           0.069150
global_orthographic_density   -0.132825
global_synonyms_count         -0.138163
dtype: float64

Regressing global aoa with 635 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.2437773102289419

intercept                                              8.895359
global_aoa                                             0.710912
global_clustering                                      0.271603
global_frequency                                      -0.215595
global_letters_count                                  -0.436149
global_orthographic_density                           -2.261495
global_synonyms_count                                 -3.402136
global_aoa * global_clustering                         0.062796
global_aoa * global_frequency                         -0.032168
global_aoa * global_letters_count                      0.042144
global_aoa * global_orthographic_density               0.052381
global_aoa * global_synonyms_count                     0.008449
global_clustering * global_frequency                  -0.032738
global_clustering * global_letters_count               0.007301
global_clustering * global_orthographic_density       -0.203942
global_clustering * global_synonyms_count             -0.362871
global_frequency * global_letters_count                0.023045
global_frequency * global_orthographic_density         0.056691
global_frequency * global_synonyms_count               0.004243
global_letters_count * global_orthographic_density    -0.038905
global_letters_count * global_synonyms_count           0.099463
global_orthographic_density * global_synonyms_count    0.367290
dtype: float64

Regressing rel aoa with 635 measures, no interactions
           ^^^^^^^
R^2 = 0.07998043874484528

intercept                      0.515347
global_aoa                     0.207465
global_clustering              0.069952
global_frequency              -0.103604
global_letters_count           0.031130
global_orthographic_density    0.037391
global_synonyms_count         -0.060753
dtype: float64

Regressing rel aoa with 635 measures, with interactions
           ^^^^^^^
R^2 = 0.124643534937531

intercept                                              7.989280
global_aoa                                             1.291365
global_clustering                                      1.621711
global_frequency                                      -0.445815
global_letters_count                                  -1.054611
global_orthographic_density                           -2.447776
global_synonyms_count                                 -1.703775
global_aoa * global_clustering                         0.083184
global_aoa * global_frequency                         -0.067879
global_aoa * global_letters_count                     -0.009606
global_aoa * global_orthographic_density               0.030596
global_aoa * global_synonyms_count                     0.011235
global_clustering * global_frequency                  -0.078209
global_clustering * global_letters_count              -0.128279
global_clustering * global_orthographic_density       -0.369739
global_clustering * global_synonyms_count             -0.472220
global_frequency * global_letters_count                0.056607
global_frequency * global_orthographic_density         0.058160
global_frequency * global_synonyms_count              -0.155332
global_letters_count * global_orthographic_density    -0.101578
global_letters_count * global_synonyms_count          -0.005669
global_orthographic_density * global_synonyms_count    0.173391
dtype: float64

Regressing global aoa with 635 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.10495520827873804

intercept                   6.779423
rel_aoa                     0.198611
rel_clustering              0.337362
rel_frequency               0.117159
rel_letters_count           0.012207
rel_orthographic_density   -0.480142
rel_synonyms_count         -0.143660
dtype: float64

Regressing global aoa with 635 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1627213986056697

intercept                                        6.699849
rel_aoa                                         -0.115311
rel_clustering                                   0.200403
rel_frequency                                    0.162053
rel_letters_count                               -0.016259
rel_orthographic_density                        -0.646040
rel_synonyms_count                               0.246735
rel_aoa * rel_clustering                         0.115322
rel_aoa * rel_frequency                         -0.088856
rel_aoa * rel_letters_count                      0.041231
rel_aoa * rel_orthographic_density               0.082484
rel_aoa * rel_synonyms_count                     0.010895
rel_clustering * rel_frequency                   0.020219
rel_clustering * rel_letters_count               0.050381
rel_clustering * rel_orthographic_density       -0.028236
rel_clustering * rel_synonyms_count             -0.366351
rel_frequency * rel_letters_count               -0.007135
rel_frequency * rel_orthographic_density         0.004253
rel_frequency * rel_synonyms_count              -0.018936
rel_letters_count * rel_orthographic_density     0.030715
rel_letters_count * rel_synonyms_count           0.195804
rel_orthographic_density * rel_synonyms_count    0.832819
dtype: float64

Regressing rel aoa with 635 measures, no interactions
           ^^^^^^^
R^2 = 0.28814330901108043

intercept                   0.492751
rel_aoa                     0.541293
rel_clustering              0.084358
rel_frequency              -0.032739
rel_letters_count          -0.015979
rel_orthographic_density    0.021943
rel_synonyms_count         -0.122678
dtype: float64

Regressing rel aoa with 635 measures, with interactions
           ^^^^^^^
R^2 = 0.316313977277751

intercept                                        0.680226
rel_aoa                                          0.459187
rel_clustering                                  -0.166996
rel_frequency                                    0.076908
rel_letters_count                               -0.087764
rel_orthographic_density                         0.191656
rel_synonyms_count                               0.128068
rel_aoa * rel_clustering                         0.064564
rel_aoa * rel_frequency                         -0.016773
rel_aoa * rel_letters_count                     -0.006377
rel_aoa * rel_orthographic_density              -0.024868
rel_aoa * rel_synonyms_count                    -0.020501
rel_clustering * rel_frequency                  -0.079497
rel_clustering * rel_letters_count              -0.036186
rel_clustering * rel_orthographic_density       -0.083952
rel_clustering * rel_synonyms_count             -0.249283
rel_frequency * rel_letters_count               -0.016996
rel_frequency * rel_orthographic_density         0.041211
rel_frequency * rel_synonyms_count              -0.017309
rel_letters_count * rel_orthographic_density    -0.035389
rel_letters_count * rel_synonyms_count           0.086338
rel_orthographic_density * rel_synonyms_count    0.405007
dtype: float64

Regressing global aoa with 635 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.24774815816861873

intercept                      1.245493
global_aoa                     0.501079
global_clustering             -0.285844
global_frequency              -0.090223
global_letters_count           0.334097
global_orthographic_density    0.116325
global_synonyms_count         -0.032499
rel_aoa                       -0.162428
rel_clustering                 0.392111
rel_frequency                  0.044881
rel_letters_count             -0.289297
rel_orthographic_density      -0.211812
rel_synonyms_count            -0.115414
dtype: float64

Regressing global aoa with 635 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.36664748215715637

intercept                                                 83.329881
global_aoa                                                 0.146384
global_clustering                                         10.356099
global_frequency                                          -2.756273
global_letters_count                                      -4.999054
global_orthographic_density                              -16.949333
global_synonyms_count                                    -14.104684
rel_aoa                                                    0.589523
rel_clustering                                            -6.502502
rel_frequency                                              3.400140
rel_letters_count                                          3.506623
rel_orthographic_density                                  15.609520
rel_synonyms_count                                        -1.266979
global_aoa * global_clustering                            -0.050933
global_aoa * global_frequency                             -0.072005
global_aoa * global_letters_count                          0.080445
global_aoa * global_orthographic_density                   0.120687
global_aoa * global_synonyms_count                         0.008161
global_aoa * rel_aoa                                      -0.011969
global_aoa * rel_clustering                               -0.041701
global_aoa * rel_frequency                                -0.032589
global_aoa * rel_letters_count                            -0.052113
global_aoa * rel_orthographic_density                     -0.082434
global_aoa * rel_synonyms_count                            0.238509
global_clustering * global_frequency                      -0.238920
global_clustering * global_letters_count                  -0.253023
global_clustering * global_orthographic_density           -2.608138
global_clustering * global_synonyms_count                 -1.417339
global_clustering * rel_aoa                                0.230053
global_clustering * rel_clustering                         0.141797
global_clustering * rel_frequency                          0.425366
global_clustering * rel_letters_count                      0.033691
global_clustering * rel_orthographic_density               2.170266
global_clustering * rel_synonyms_count                    -0.263065
global_frequency * global_letters_count                    0.312041
global_frequency * global_orthographic_density             0.085279
global_frequency * global_synonyms_count                   0.342886
global_frequency * rel_aoa                                 0.112282
global_frequency * rel_clustering                          0.060664
global_frequency * rel_frequency                          -0.031440
global_frequency * rel_letters_count                      -0.296805
global_frequency * rel_orthographic_density               -0.137310
global_frequency * rel_synonyms_count                      0.012848
global_letters_count * global_orthographic_density        -0.032253
global_letters_count * global_synonyms_count               0.165747
global_letters_count * rel_aoa                            -0.049285
global_letters_count * rel_clustering                      0.338107
global_letters_count * rel_frequency                      -0.058983
global_letters_count * rel_letters_count                   0.003832
global_letters_count * rel_orthographic_density           -0.193546
global_letters_count * rel_synonyms_count                  0.157044
global_orthographic_density * global_synonyms_count        0.205903
global_orthographic_density * rel_aoa                     -0.095098
global_orthographic_density * rel_clustering               2.146654
global_orthographic_density * rel_frequency                0.142535
global_orthographic_density * rel_letters_count           -0.000250
global_orthographic_density * rel_orthographic_density     0.065884
global_orthographic_density * rel_synonyms_count          -1.014501
global_synonyms_count * rel_aoa                           -0.344316
global_synonyms_count * rel_clustering                     1.752412
global_synonyms_count * rel_frequency                     -0.600522
global_synonyms_count * rel_letters_count                 -0.707184
global_synonyms_count * rel_orthographic_density          -0.889151
global_synonyms_count * rel_synonyms_count                 0.064378
rel_aoa * rel_clustering                                  -0.000633
rel_aoa * rel_frequency                                   -0.059522
rel_aoa * rel_letters_count                                0.049469
rel_aoa * rel_orthographic_density                         0.073026
rel_aoa * rel_synonyms_count                               0.110993
rel_clustering * rel_frequency                            -0.309457
rel_clustering * rel_letters_count                        -0.092750
rel_clustering * rel_orthographic_density                 -1.666847
rel_clustering * rel_synonyms_count                       -0.777984
rel_frequency * rel_letters_count                          0.056902
rel_frequency * rel_orthographic_density                   0.012696
rel_frequency * rel_synonyms_count                         0.108166
rel_letters_count * rel_orthographic_density               0.225638
rel_letters_count * rel_synonyms_count                     0.367914
rel_orthographic_density * rel_synonyms_count              2.058311
dtype: float64

Regressing rel aoa with 635 measures, no interactions
           ^^^^^^^
R^2 = 0.3315548460995338

intercept                      1.703327
global_aoa                    -0.356353
global_clustering             -0.189946
global_frequency              -0.083607
global_letters_count           0.169457
global_orthographic_density   -0.027357
global_synonyms_count         -0.043866
rel_aoa                        0.796668
rel_clustering                 0.313938
rel_frequency                  0.034249
rel_letters_count             -0.147647
rel_orthographic_density      -0.100276
rel_synonyms_count            -0.129724
dtype: float64

Regressing rel aoa with 635 measures, with interactions
           ^^^^^^^
R^2 = 0.4319588075851787

intercept                                                 71.257771
global_aoa                                                -1.085352
global_clustering                                          9.317033
global_frequency                                          -3.025454
global_letters_count                                      -3.514004
global_orthographic_density                              -11.820954
global_synonyms_count                                     -6.468255
rel_aoa                                                    1.455040
rel_clustering                                            -5.082077
rel_frequency                                              2.993325
rel_letters_count                                          2.841240
rel_orthographic_density                                  11.342329
rel_synonyms_count                                        -4.151004
global_aoa * global_clustering                            -0.148827
global_aoa * global_frequency                             -0.085266
global_aoa * global_letters_count                          0.055310
global_aoa * global_orthographic_density                   0.135516
global_aoa * global_synonyms_count                         0.145334
global_aoa * rel_aoa                                      -0.018733
global_aoa * rel_clustering                                0.039230
global_aoa * rel_frequency                                -0.025293
global_aoa * rel_letters_count                            -0.041009
global_aoa * rel_orthographic_density                     -0.099986
global_aoa * rel_synonyms_count                            0.041918
global_clustering * global_frequency                      -0.336867
global_clustering * global_letters_count                  -0.108174
global_clustering * global_orthographic_density           -1.865235
global_clustering * global_synonyms_count                 -0.778964
global_clustering * rel_aoa                                0.213537
global_clustering * rel_clustering                         0.015198
global_clustering * rel_frequency                          0.409901
global_clustering * rel_letters_count                     -0.010568
global_clustering * rel_orthographic_density               1.397520
global_clustering * rel_synonyms_count                    -0.408214
global_frequency * global_letters_count                    0.286959
global_frequency * global_orthographic_density             0.056108
global_frequency * global_synonyms_count                   0.110057
global_frequency * rel_aoa                                 0.109879
global_frequency * rel_clustering                          0.060065
global_frequency * rel_frequency                          -0.011242
global_frequency * rel_letters_count                      -0.267153
global_frequency * rel_orthographic_density               -0.196393
global_frequency * rel_synonyms_count                      0.139561
global_letters_count * global_orthographic_density        -0.128176
global_letters_count * global_synonyms_count              -0.285104
global_letters_count * rel_aoa                            -0.040808
global_letters_count * rel_clustering                      0.152997
global_letters_count * rel_frequency                      -0.067293
global_letters_count * rel_letters_count                  -0.003328
global_letters_count * rel_orthographic_density           -0.101248
global_letters_count * rel_synonyms_count                  0.456753
global_orthographic_density * global_synonyms_count        0.173486
global_orthographic_density * rel_aoa                     -0.061887
global_orthographic_density * rel_clustering               1.551350
global_orthographic_density * rel_frequency                0.146643
global_orthographic_density * rel_letters_count            0.018550
global_orthographic_density * rel_orthographic_density     0.066784
global_orthographic_density * rel_synonyms_count          -0.813821
global_synonyms_count * rel_aoa                           -0.407957
global_synonyms_count * rel_clustering                     0.985851
global_synonyms_count * rel_frequency                     -0.247156
global_synonyms_count * rel_letters_count                 -0.089013
global_synonyms_count * rel_orthographic_density          -0.786604
global_synonyms_count * rel_synonyms_count                 0.107258
rel_aoa * rel_clustering                                   0.013893
rel_aoa * rel_frequency                                   -0.044233
rel_aoa * rel_letters_count                                0.032785
rel_aoa * rel_orthographic_density                         0.017713
rel_aoa * rel_synonyms_count                               0.210965
rel_clustering * rel_frequency                            -0.210709
rel_clustering * rel_letters_count                        -0.021494
rel_clustering * rel_orthographic_density                 -1.120671
rel_clustering * rel_synonyms_count                       -0.458011
rel_frequency * rel_letters_count                          0.055660
rel_frequency * rel_orthographic_density                   0.038556
rel_frequency * rel_synonyms_count                        -0.127160
rel_letters_count * rel_orthographic_density               0.183725
rel_letters_count * rel_synonyms_count                    -0.135598
rel_orthographic_density * rel_synonyms_count              1.613313
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.22149059116017444

intercept                     -2.640488
global_aoa                    -0.021017
global_clustering              0.467384
global_frequency              -0.041793
global_letters_count           0.014567
global_orthographic_density    0.022121
global_synonyms_count         -0.091479
dtype: float64

Regressing global clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2753754195501543

intercept                                             -2.257137
global_aoa                                             0.191561
global_clustering                                      0.083187
global_frequency                                      -0.584695
global_letters_count                                   0.074004
global_orthographic_density                            0.121794
global_synonyms_count                                 -0.915248
global_aoa * global_clustering                         0.059906
global_aoa * global_frequency                          0.003440
global_aoa * global_letters_count                      0.012512
global_aoa * global_orthographic_density               0.026621
global_aoa * global_synonyms_count                     0.011949
global_clustering * global_frequency                  -0.056900
global_clustering * global_letters_count               0.051876
global_clustering * global_orthographic_density        0.112550
global_clustering * global_synonyms_count             -0.012333
global_frequency * global_letters_count                0.016979
global_frequency * global_orthographic_density         0.052244
global_frequency * global_synonyms_count               0.042614
global_letters_count * global_orthographic_density    -0.020901
global_letters_count * global_synonyms_count           0.039082
global_orthographic_density * global_synonyms_count    0.043670
dtype: float64

Regressing rel clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.17335324029636512

intercept                      3.055699
global_aoa                    -0.017444
global_clustering              0.412083
global_frequency              -0.030841
global_letters_count           0.012851
global_orthographic_density    0.026619
global_synonyms_count         -0.082539
dtype: float64

Regressing rel clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.23203140121092014

intercept                                              3.727566
global_aoa                                             0.199911
global_clustering                                      0.053467
global_frequency                                      -0.590114
global_letters_count                                  -0.012813
global_orthographic_density                            0.270364
global_synonyms_count                                 -0.805273
global_aoa * global_clustering                         0.066602
global_aoa * global_frequency                          0.007069
global_aoa * global_letters_count                      0.013090
global_aoa * global_orthographic_density               0.028927
global_aoa * global_synonyms_count                    -0.002651
global_clustering * global_frequency                  -0.058778
global_clustering * global_letters_count               0.041318
global_clustering * global_orthographic_density        0.113837
global_clustering * global_synonyms_count              0.007185
global_frequency * global_letters_count                0.018003
global_frequency * global_orthographic_density         0.036887
global_frequency * global_synonyms_count               0.028132
global_letters_count * global_orthographic_density    -0.024060
global_letters_count * global_synonyms_count           0.073412
global_orthographic_density * global_synonyms_count    0.069777
dtype: float64

Regressing global clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.15384885685774163

intercept                  -5.922057
rel_aoa                    -0.001964
rel_clustering              0.404317
rel_frequency              -0.009993
rel_letters_count           0.000529
rel_orthographic_density    0.017514
rel_synonyms_count         -0.109583
dtype: float64

Regressing global clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.19593487253659447

intercept                                       -5.798641
rel_aoa                                         -0.011160
rel_clustering                                   0.193749
rel_frequency                                    0.037245
rel_letters_count                               -0.049763
rel_orthographic_density                         0.019896
rel_synonyms_count                              -0.231225
rel_aoa * rel_clustering                         0.040345
rel_aoa * rel_frequency                         -0.007916
rel_aoa * rel_letters_count                     -0.000392
rel_aoa * rel_orthographic_density               0.021048
rel_aoa * rel_synonyms_count                    -0.000376
rel_clustering * rel_frequency                  -0.033381
rel_clustering * rel_letters_count               0.061542
rel_clustering * rel_orthographic_density        0.059450
rel_clustering * rel_synonyms_count              0.015580
rel_frequency * rel_letters_count               -0.007772
rel_frequency * rel_orthographic_density         0.010034
rel_frequency * rel_synonyms_count              -0.027098
rel_letters_count * rel_orthographic_density    -0.007338
rel_letters_count * rel_synonyms_count           0.033995
rel_orthographic_density * rel_synonyms_count    0.055994
dtype: float64

Regressing rel clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3282700381477641

intercept                   0.172696
rel_aoa                    -0.013917
rel_clustering              0.591847
rel_frequency              -0.002682
rel_letters_count           0.011022
rel_orthographic_density    0.041643
rel_synonyms_count         -0.062482
dtype: float64

Regressing rel clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3654897625949718

intercept                                        0.276192
rel_aoa                                         -0.009324
rel_clustering                                   0.392704
rel_frequency                                    0.029282
rel_letters_count                               -0.041882
rel_orthographic_density                         0.005925
rel_synonyms_count                              -0.219623
rel_aoa * rel_clustering                         0.031383
rel_aoa * rel_frequency                         -0.003504
rel_aoa * rel_letters_count                     -0.000588
rel_aoa * rel_orthographic_density               0.018292
rel_aoa * rel_synonyms_count                    -0.015181
rel_clustering * rel_frequency                  -0.035429
rel_clustering * rel_letters_count               0.067197
rel_clustering * rel_orthographic_density        0.076605
rel_clustering * rel_synonyms_count             -0.049856
rel_frequency * rel_letters_count               -0.008026
rel_frequency * rel_orthographic_density        -0.004735
rel_frequency * rel_synonyms_count              -0.040768
rel_letters_count * rel_orthographic_density    -0.006045
rel_letters_count * rel_synonyms_count           0.046112
rel_orthographic_density * rel_synonyms_count    0.050190
dtype: float64

Regressing global clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2322294165719666

intercept                     -1.724040
global_aoa                    -0.032974
global_clustering              0.477473
global_frequency              -0.090206
global_letters_count           0.003774
global_orthographic_density   -0.056538
global_synonyms_count         -0.028321
rel_aoa                        0.011249
rel_clustering                -0.005473
rel_frequency                  0.053941
rel_letters_count              0.007546
rel_orthographic_density       0.083112
rel_synonyms_count            -0.082183
dtype: float64

Regressing global clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3668869102630491

intercept                                                 4.113247
global_aoa                                                0.296653
global_clustering                                         2.144053
global_frequency                                         -1.130608
global_letters_count                                      0.809482
global_orthographic_density                               1.775965
global_synonyms_count                                    -1.829821
rel_aoa                                                  -0.276797
rel_clustering                                           -3.314883
rel_frequency                                            -0.094624
rel_letters_count                                        -0.978348
rel_orthographic_density                                 -1.534269
rel_synonyms_count                                       -0.921822
global_aoa * global_clustering                            0.003079
global_aoa * global_frequency                             0.017620
global_aoa * global_letters_count                        -0.049370
global_aoa * global_orthographic_density                 -0.132401
global_aoa * global_synonyms_count                       -0.043525
global_aoa * rel_aoa                                      0.014838
global_aoa * rel_clustering                               0.056449
global_aoa * rel_frequency                                0.002242
global_aoa * rel_letters_count                            0.063170
global_aoa * rel_orthographic_density                     0.149225
global_aoa * rel_synonyms_count                           0.160996
global_clustering * global_frequency                     -0.174809
global_clustering * global_letters_count                  0.017032
global_clustering * global_orthographic_density           0.077170
global_clustering * global_synonyms_count                -0.366905
global_clustering * rel_aoa                              -0.013017
global_clustering * rel_clustering                       -0.058757
global_clustering * rel_frequency                         0.028036
global_clustering * rel_letters_count                     0.007477
global_clustering * rel_orthographic_density              0.080207
global_clustering * rel_synonyms_count                    0.532844
global_frequency * global_letters_count                  -0.031612
global_frequency * global_orthographic_density           -0.026575
global_frequency * global_synonyms_count                  0.207355
global_frequency * rel_aoa                               -0.035750
global_frequency * rel_clustering                         0.157907
global_frequency * rel_frequency                         -0.006275
global_frequency * rel_letters_count                      0.054409
global_frequency * rel_orthographic_density               0.108282
global_frequency * rel_synonyms_count                     0.023364
global_letters_count * global_orthographic_density        0.028009
global_letters_count * global_synonyms_count             -0.242871
global_letters_count * rel_aoa                            0.036985
global_letters_count * rel_clustering                     0.161678
global_letters_count * rel_frequency                      0.081822
global_letters_count * rel_letters_count                  0.003009
global_letters_count * rel_orthographic_density          -0.061678
global_letters_count * rel_synonyms_count                 0.275679
global_orthographic_density * global_synonyms_count      -0.670076
global_orthographic_density * rel_aoa                     0.170974
global_orthographic_density * rel_clustering             -0.020087
global_orthographic_density * rel_frequency               0.049223
global_orthographic_density * rel_letters_count          -0.040988
global_orthographic_density * rel_orthographic_density    0.034631
global_orthographic_density * rel_synonyms_count          0.753570
global_synonyms_count * rel_aoa                           0.006743
global_synonyms_count * rel_clustering                    0.270821
global_synonyms_count * rel_frequency                    -0.099560
global_synonyms_count * rel_letters_count                 0.127769
global_synonyms_count * rel_orthographic_density          0.240801
global_synonyms_count * rel_synonyms_count                0.043995
rel_aoa * rel_clustering                                 -0.011901
rel_aoa * rel_frequency                                   0.018966
rel_aoa * rel_letters_count                              -0.046195
rel_aoa * rel_orthographic_density                       -0.150701
rel_aoa * rel_synonyms_count                             -0.104551
rel_clustering * rel_frequency                           -0.115077
rel_clustering * rel_letters_count                       -0.095373
rel_clustering * rel_orthographic_density                 0.032737
rel_clustering * rel_synonyms_count                      -0.403529
rel_frequency * rel_letters_count                        -0.095566
rel_frequency * rel_orthographic_density                 -0.071879
rel_frequency * rel_synonyms_count                       -0.080767
rel_letters_count * rel_orthographic_density              0.075773
rel_letters_count * rel_synonyms_count                   -0.109571
rel_orthographic_density * rel_synonyms_count            -0.210977
dtype: float64

Regressing rel clustering with 583 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3672815003269452

intercept                     -0.996120
global_aoa                    -0.026118
global_clustering             -0.378315
global_frequency              -0.078600
global_letters_count          -0.007878
global_orthographic_density   -0.049937
global_synonyms_count         -0.002958
rel_aoa                        0.007100
rel_clustering                 0.906184
rel_frequency                  0.048031
rel_letters_count              0.016846
rel_orthographic_density       0.066891
rel_synonyms_count            -0.081581
dtype: float64

Regressing rel clustering with 583 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.47728176158445335

intercept                                                 6.143614
global_aoa                                                0.234737
global_clustering                                         1.135237
global_frequency                                         -1.121886
global_letters_count                                      0.181471
global_orthographic_density                               1.623418
global_synonyms_count                                    -0.868391
rel_aoa                                                  -0.162690
rel_clustering                                           -2.322648
rel_frequency                                             0.077387
rel_letters_count                                        -0.571854
rel_orthographic_density                                 -1.772874
rel_synonyms_count                                       -1.379124
global_aoa * global_clustering                            0.016423
global_aoa * global_frequency                             0.016856
global_aoa * global_letters_count                        -0.030282
global_aoa * global_orthographic_density                 -0.102459
global_aoa * global_synonyms_count                       -0.041918
global_aoa * rel_aoa                                      0.014018
global_aoa * rel_clustering                               0.042148
global_aoa * rel_frequency                               -0.003621
global_aoa * rel_letters_count                            0.047893
global_aoa * rel_orthographic_density                     0.127702
global_aoa * rel_synonyms_count                           0.133214
global_clustering * global_frequency                     -0.157898
global_clustering * global_letters_count                 -0.032638
global_clustering * global_orthographic_density           0.122626
global_clustering * global_synonyms_count                -0.256490
global_clustering * rel_aoa                              -0.031997
global_clustering * rel_clustering                       -0.081740
global_clustering * rel_frequency                         0.033291
global_clustering * rel_letters_count                     0.039359
global_clustering * rel_orthographic_density              0.007363
global_clustering * rel_synonyms_count                    0.412371
global_frequency * global_letters_count                  -0.012912
global_frequency * global_orthographic_density           -0.004787
global_frequency * global_synonyms_count                  0.145352
global_frequency * rel_aoa                               -0.037102
global_frequency * rel_clustering                         0.153134
global_frequency * rel_frequency                         -0.005413
global_frequency * rel_letters_count                      0.040719
global_frequency * rel_orthographic_density               0.091802
global_frequency * rel_synonyms_count                     0.062786
global_letters_count * global_orthographic_density        0.018028
global_letters_count * global_synonyms_count             -0.210383
global_letters_count * rel_aoa                            0.014048
global_letters_count * rel_clustering                     0.173052
global_letters_count * rel_frequency                      0.061315
global_letters_count * rel_letters_count                  0.005088
global_letters_count * rel_orthographic_density          -0.038568
global_letters_count * rel_synonyms_count                 0.243617
global_orthographic_density * global_synonyms_count      -0.574590
global_orthographic_density * rel_aoa                     0.124482
global_orthographic_density * rel_clustering             -0.071430
global_orthographic_density * rel_frequency               0.027130
global_orthographic_density * rel_letters_count          -0.024254
global_orthographic_density * rel_orthographic_density    0.047516
global_orthographic_density * rel_synonyms_count          0.584714
global_synonyms_count * rel_aoa                          -0.008997
global_synonyms_count * rel_clustering                    0.200006
global_synonyms_count * rel_frequency                    -0.076906
global_synonyms_count * rel_letters_count                 0.110081
global_synonyms_count * rel_orthographic_density          0.174273
global_synonyms_count * rel_synonyms_count                0.037264
rel_aoa * rel_clustering                                 -0.002111
rel_aoa * rel_frequency                                   0.022524
rel_aoa * rel_letters_count                              -0.029891
rel_aoa * rel_orthographic_density                       -0.112854
rel_aoa * rel_synonyms_count                             -0.077046
rel_clustering * rel_frequency                           -0.115324
rel_clustering * rel_letters_count                       -0.092884
rel_clustering * rel_orthographic_density                 0.079382
rel_clustering * rel_synonyms_count                      -0.322874
rel_frequency * rel_letters_count                        -0.079049
rel_frequency * rel_orthographic_density                 -0.065577
rel_frequency * rel_synonyms_count                       -0.097965
rel_letters_count * rel_orthographic_density              0.053314
rel_letters_count * rel_synonyms_count                   -0.109799
rel_orthographic_density * rel_synonyms_count            -0.125389
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20449070629011534

intercept                      3.515533
global_aoa                     0.001607
global_clustering              0.002727
global_frequency               0.035909
global_letters_count           0.442266
global_orthographic_density   -0.078975
global_synonyms_count         -0.370762
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.23087528203884689

intercept                                              1.547413
global_aoa                                             0.156593
global_clustering                                     -1.552266
global_frequency                                       0.010368
global_letters_count                                  -0.147580
global_orthographic_density                           -1.225063
global_synonyms_count                                 -2.679001
global_aoa * global_clustering                         0.136198
global_aoa * global_frequency                          0.058417
global_aoa * global_letters_count                      0.014074
global_aoa * global_orthographic_density               0.001666
global_aoa * global_synonyms_count                     0.093184
global_clustering * global_frequency                   0.109439
global_clustering * global_letters_count              -0.087833
global_clustering * global_orthographic_density        0.076549
global_clustering * global_synonyms_count              0.139467
global_frequency * global_letters_count               -0.005648
global_frequency * global_orthographic_density         0.171270
global_frequency * global_synonyms_count               0.148970
global_letters_count * global_orthographic_density    -0.019993
global_letters_count * global_synonyms_count           0.124546
global_orthographic_density * global_synonyms_count    0.361680
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.13636835261997182

intercept                      0.765714
global_aoa                    -0.043322
global_clustering              0.001756
global_frequency              -0.034820
global_letters_count           0.377041
global_orthographic_density    0.007295
global_synonyms_count         -0.446755
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16020607240455953

intercept                                             -3.500622
global_aoa                                             0.473026
global_clustering                                     -1.581015
global_frequency                                       0.176153
global_letters_count                                  -0.192677
global_orthographic_density                           -1.193094
global_synonyms_count                                 -3.118193
global_aoa * global_clustering                         0.162572
global_aoa * global_frequency                          0.044426
global_aoa * global_letters_count                     -0.001186
global_aoa * global_orthographic_density              -0.002389
global_aoa * global_synonyms_count                     0.101152
global_clustering * global_frequency                   0.123580
global_clustering * global_letters_count              -0.107503
global_clustering * global_orthographic_density        0.016257
global_clustering * global_synonyms_count             -0.061117
global_frequency * global_letters_count               -0.005243
global_frequency * global_orthographic_density         0.153701
global_frequency * global_synonyms_count               0.059260
global_letters_count * global_orthographic_density    -0.043438
global_letters_count * global_synonyms_count           0.113381
global_orthographic_density * global_synonyms_count    0.351649
dtype: float64

Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1531549765665412

intercept                   5.558578
rel_aoa                    -0.105043
rel_clustering              0.250331
rel_frequency               0.101497
rel_letters_count           0.332200
rel_orthographic_density   -0.301056
rel_synonyms_count         -0.263822
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17559030851368693

intercept                                        5.538064
rel_aoa                                         -0.228942
rel_clustering                                   0.436786
rel_frequency                                    0.178362
rel_letters_count                                0.345903
rel_orthographic_density                        -0.389600
rel_synonyms_count                               0.004997
rel_aoa * rel_clustering                         0.090193
rel_aoa * rel_frequency                         -0.023259
rel_aoa * rel_letters_count                     -0.021278
rel_aoa * rel_orthographic_density              -0.057393
rel_aoa * rel_synonyms_count                     0.101330
rel_clustering * rel_frequency                  -0.002103
rel_clustering * rel_letters_count              -0.078252
rel_clustering * rel_orthographic_density        0.102899
rel_clustering * rel_synonyms_count              0.203713
rel_frequency * rel_letters_count               -0.032035
rel_frequency * rel_orthographic_density         0.003472
rel_frequency * rel_synonyms_count               0.104788
rel_letters_count * rel_orthographic_density     0.032720
rel_letters_count * rel_synonyms_count           0.192388
rel_orthographic_density * rel_synonyms_count    0.573004
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.23886606053531134

intercept                   0.988260
rel_aoa                    -0.093061
rel_clustering              0.151150
rel_frequency              -0.102941
rel_letters_count           0.485701
rel_orthographic_density   -0.036166
rel_synonyms_count         -0.298112
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2632467128272423

intercept                                        0.909551
rel_aoa                                         -0.161632
rel_clustering                                   0.297125
rel_frequency                                   -0.070035
rel_letters_count                                0.564869
rel_orthographic_density                        -0.054508
rel_synonyms_count                               0.088686
rel_aoa * rel_clustering                         0.096454
rel_aoa * rel_frequency                          0.005173
rel_aoa * rel_letters_count                     -0.046331
rel_aoa * rel_orthographic_density              -0.125042
rel_aoa * rel_synonyms_count                     0.133286
rel_clustering * rel_frequency                   0.004765
rel_clustering * rel_letters_count              -0.044801
rel_clustering * rel_orthographic_density        0.124154
rel_clustering * rel_synonyms_count              0.175642
rel_frequency * rel_letters_count               -0.014611
rel_frequency * rel_orthographic_density         0.023339
rel_frequency * rel_synonyms_count               0.128860
rel_letters_count * rel_orthographic_density     0.043124
rel_letters_count * rel_synonyms_count           0.158527
rel_orthographic_density * rel_synonyms_count    0.531776
dtype: float64

Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2365064525162902

intercept                     -3.026144
global_aoa                     0.120817
global_clustering             -0.567948
global_frequency               0.114758
global_letters_count           0.682261
global_orthographic_density    0.113922
global_synonyms_count         -0.462468
rel_aoa                       -0.182091
rel_clustering                 0.631006
rel_frequency                 -0.126597
rel_letters_count             -0.259962
rel_orthographic_density      -0.159413
rel_synonyms_count             0.144941
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3420154408638349

intercept                                                -15.267036
global_aoa                                                 2.345221
global_clustering                                         -4.479587
global_frequency                                           0.543772
global_letters_count                                      -0.829321
global_orthographic_density                               -4.439372
global_synonyms_count                                     -2.303207
rel_aoa                                                   -3.946052
rel_clustering                                             1.416346
rel_frequency                                             -0.139905
rel_letters_count                                          0.425174
rel_orthographic_density                                   1.930782
rel_synonyms_count                                        -2.054580
global_aoa * global_clustering                             0.320683
global_aoa * global_frequency                             -0.039213
global_aoa * global_letters_count                         -0.067383
global_aoa * global_orthographic_density                   0.177510
global_aoa * global_synonyms_count                         0.085199
global_aoa * rel_aoa                                       0.008733
global_aoa * rel_clustering                               -0.249862
global_aoa * rel_frequency                                 0.040389
global_aoa * rel_letters_count                             0.094815
global_aoa * rel_orthographic_density                     -0.096063
global_aoa * rel_synonyms_count                           -0.086390
global_clustering * global_frequency                       0.291371
global_clustering * global_letters_count                  -0.000365
global_clustering * global_orthographic_density           -0.284532
global_clustering * global_synonyms_count                 -0.379172
global_clustering * rel_aoa                                0.016753
global_clustering * rel_clustering                         0.038951
global_clustering * rel_frequency                         -0.067514
global_clustering * rel_letters_count                     -0.167946
global_clustering * rel_orthographic_density               0.229654
global_clustering * rel_synonyms_count                     0.027425
global_frequency * global_letters_count                    0.246905
global_frequency * global_orthographic_density             0.295969
global_frequency * global_synonyms_count                   0.048304
global_frequency * rel_aoa                                 0.267363
global_frequency * rel_clustering                         -0.029024
global_frequency * rel_frequency                          -0.051249
global_frequency * rel_letters_count                      -0.298863
global_frequency * rel_orthographic_density               -0.052876
global_frequency * rel_synonyms_count                      0.041138
global_letters_count * global_orthographic_density        -0.295240
global_letters_count * global_synonyms_count              -0.154494
global_letters_count * rel_aoa                             0.238898
global_letters_count * rel_clustering                      0.144790
global_letters_count * rel_frequency                      -0.033632
global_letters_count * rel_letters_count                   0.029290
global_letters_count * rel_orthographic_density            0.065545
global_letters_count * rel_synonyms_count                  0.528163
global_orthographic_density * global_synonyms_count       -0.023118
global_orthographic_density * rel_aoa                     -0.049979
global_orthographic_density * rel_clustering               0.210898
global_orthographic_density * rel_frequency               -0.027212
global_orthographic_density * rel_letters_count            0.380005
global_orthographic_density * rel_orthographic_density     0.114355
global_orthographic_density * rel_synonyms_count           0.022438
global_synonyms_count * rel_aoa                           -0.020988
global_synonyms_count * rel_clustering                     0.394543
global_synonyms_count * rel_frequency                     -0.040169
global_synonyms_count * rel_letters_count                 -0.517246
global_synonyms_count * rel_orthographic_density          -0.515840
global_synonyms_count * rel_synonyms_count                -0.089194
rel_aoa * rel_clustering                                   0.131408
rel_aoa * rel_frequency                                   -0.228687
rel_aoa * rel_letters_count                               -0.289287
rel_aoa * rel_orthographic_density                        -0.039925
rel_aoa * rel_synonyms_count                               0.236016
rel_clustering * rel_frequency                            -0.089203
rel_clustering * rel_letters_count                        -0.074184
rel_clustering * rel_orthographic_density                  0.073694
rel_clustering * rel_synonyms_count                        0.191449
rel_frequency * rel_letters_count                          0.054489
rel_frequency * rel_orthographic_density                  -0.018389
rel_frequency * rel_synonyms_count                         0.098318
rel_letters_count * rel_orthographic_density              -0.076055
rel_letters_count * rel_synonyms_count                     0.255607
rel_orthographic_density * rel_synonyms_count              1.099866
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2737884852271619

intercept                     -2.961240
global_aoa                     0.095223
global_clustering             -0.551169
global_frequency               0.101362
global_letters_count          -0.253500
global_orthographic_density    0.099835
global_synonyms_count         -0.513166
rel_aoa                       -0.144992
rel_clustering                 0.601091
rel_frequency                 -0.139487
rel_letters_count              0.690122
rel_orthographic_density      -0.176692
rel_synonyms_count             0.179696
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3708436512615203

intercept                                                -23.380380
global_aoa                                                 1.663824
global_clustering                                         -6.567010
global_frequency                                           0.584159
global_letters_count                                      -0.433991
global_orthographic_density                               -3.830093
global_synonyms_count                                     -1.588340
rel_aoa                                                   -3.047901
rel_clustering                                             2.841485
rel_frequency                                             -0.276745
rel_letters_count                                          0.006535
rel_orthographic_density                                   0.870199
rel_synonyms_count                                        -3.362171
global_aoa * global_clustering                             0.286101
global_aoa * global_frequency                             -0.003727
global_aoa * global_letters_count                         -0.033540
global_aoa * global_orthographic_density                   0.138197
global_aoa * global_synonyms_count                         0.040871
global_aoa * rel_aoa                                       0.013127
global_aoa * rel_clustering                               -0.189070
global_aoa * rel_frequency                                 0.011297
global_aoa * rel_letters_count                             0.061899
global_aoa * rel_orthographic_density                     -0.066877
global_aoa * rel_synonyms_count                           -0.065449
global_clustering * global_frequency                       0.345467
global_clustering * global_letters_count                   0.226751
global_clustering * global_orthographic_density            0.061013
global_clustering * global_synonyms_count                 -0.385113
global_clustering * rel_aoa                                0.068866
global_clustering * rel_clustering                         0.025908
global_clustering * rel_frequency                         -0.135420
global_clustering * rel_letters_count                     -0.412976
global_clustering * rel_orthographic_density              -0.185981
global_clustering * rel_synonyms_count                    -0.064666
global_frequency * global_letters_count                    0.201428
global_frequency * global_orthographic_density             0.404747
global_frequency * global_synonyms_count                   0.071695
global_frequency * rel_aoa                                 0.219858
global_frequency * rel_clustering                         -0.089850
global_frequency * rel_frequency                          -0.049398
global_frequency * rel_letters_count                      -0.252203
global_frequency * rel_orthographic_density               -0.167975
global_frequency * rel_synonyms_count                      0.027788
global_letters_count * global_orthographic_density        -0.153329
global_letters_count * global_synonyms_count              -0.166633
global_letters_count * rel_aoa                             0.200507
global_letters_count * rel_clustering                     -0.068947
global_letters_count * rel_frequency                      -0.012689
global_letters_count * rel_letters_count                   0.023419
global_letters_count * rel_orthographic_density           -0.050989
global_letters_count * rel_synonyms_count                  0.529149
global_orthographic_density * global_synonyms_count       -0.374889
global_orthographic_density * rel_aoa                      0.017065
global_orthographic_density * rel_clustering               0.001918
global_orthographic_density * rel_frequency               -0.126697
global_orthographic_density * rel_letters_count            0.218299
global_orthographic_density * rel_orthographic_density     0.078567
global_orthographic_density * rel_synonyms_count           0.391372
global_synonyms_count * rel_aoa                            0.023609
global_synonyms_count * rel_clustering                     0.547529
global_synonyms_count * rel_frequency                     -0.074294
global_synonyms_count * rel_letters_count                 -0.492236
global_synonyms_count * rel_orthographic_density          -0.117435
global_synonyms_count * rel_synonyms_count                -0.106391
rel_aoa * rel_clustering                                   0.025504
rel_aoa * rel_frequency                                   -0.183980
rel_aoa * rel_letters_count                               -0.251892
rel_aoa * rel_orthographic_density                        -0.082048
rel_aoa * rel_synonyms_count                               0.220658
rel_clustering * rel_frequency                            -0.036188
rel_clustering * rel_letters_count                         0.178677
rel_clustering * rel_orthographic_density                  0.351066
rel_clustering * rel_synonyms_count                        0.189311
rel_frequency * rel_letters_count                          0.038944
rel_frequency * rel_orthographic_density                   0.071638
rel_frequency * rel_synonyms_count                         0.124063
rel_letters_count * rel_orthographic_density               0.051838
rel_letters_count * rel_synonyms_count                     0.252747
rel_orthographic_density * rel_synonyms_count              0.711916
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1670975732558042

intercept                      1.129930
global_aoa                    -0.027723
global_clustering              0.071488
global_frequency              -0.024347
global_letters_count          -0.011264
global_orthographic_density    0.005977
global_synonyms_count          0.380298
dtype: float64

Regressing global synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17734196539385194

intercept                                              0.727989
global_aoa                                            -0.039457
global_clustering                                      0.009296
global_frequency                                       0.026229
global_letters_count                                  -0.003273
global_orthographic_density                           -0.113743
global_synonyms_count                                  0.827528
global_aoa * global_clustering                         0.001364
global_aoa * global_frequency                         -0.003421
global_aoa * global_letters_count                      0.004218
global_aoa * global_orthographic_density               0.018096
global_aoa * global_synonyms_count                     0.016622
global_clustering * global_frequency                   0.000984
global_clustering * global_letters_count               0.002690
global_clustering * global_orthographic_density       -0.007213
global_clustering * global_synonyms_count              0.100464
global_frequency * global_letters_count               -0.002541
global_frequency * global_orthographic_density        -0.003720
global_frequency * global_synonyms_count               0.004420
global_letters_count * global_orthographic_density    -0.001570
global_letters_count * global_synonyms_count          -0.004399
global_orthographic_density * global_synonyms_count   -0.006138
dtype: float64

Regressing rel synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.12989050502737287

intercept                      0.724822
global_aoa                    -0.025610
global_clustering              0.060264
global_frequency              -0.019209
global_letters_count          -0.008912
global_orthographic_density   -0.005590
global_synonyms_count          0.327415
dtype: float64

Regressing rel synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.14297244259508646

intercept                                              0.409640
global_aoa                                            -0.070374
global_clustering                                     -0.019696
global_frequency                                      -0.004353
global_letters_count                                   0.011344
global_orthographic_density                           -0.085409
global_synonyms_count                                  1.024466
global_aoa * global_clustering                        -0.000031
global_aoa * global_frequency                          0.000440
global_aoa * global_letters_count                      0.003494
global_aoa * global_orthographic_density               0.017974
global_aoa * global_synonyms_count                     0.009012
global_clustering * global_frequency                  -0.000105
global_clustering * global_letters_count               0.003637
global_clustering * global_orthographic_density        0.010538
global_clustering * global_synonyms_count              0.116693
global_frequency * global_letters_count               -0.002798
global_frequency * global_orthographic_density         0.002646
global_frequency * global_synonyms_count              -0.002584
global_letters_count * global_orthographic_density     0.002596
global_letters_count * global_synonyms_count          -0.007686
global_orthographic_density * global_synonyms_count   -0.021315
dtype: float64

Regressing global synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12972887627658392

intercept                   0.349296
rel_aoa                    -0.007081
rel_clustering              0.018951
rel_frequency              -0.032731
rel_letters_count          -0.020665
rel_orthographic_density    0.026942
rel_synonyms_count          0.343419
dtype: float64

Regressing global synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1592784041508979

intercept                                        0.373150
rel_aoa                                         -0.040024
rel_clustering                                  -0.020427
rel_frequency                                   -0.031521
rel_letters_count                               -0.054958
rel_orthographic_density                         0.030368
rel_synonyms_count                               0.228427
rel_aoa * rel_clustering                         0.006399
rel_aoa * rel_frequency                         -0.009991
rel_aoa * rel_letters_count                      0.017001
rel_aoa * rel_orthographic_density               0.031983
rel_aoa * rel_synonyms_count                     0.014327
rel_clustering * rel_frequency                   0.011225
rel_clustering * rel_letters_count               0.009536
rel_clustering * rel_orthographic_density       -0.040366
rel_clustering * rel_synonyms_count              0.086261
rel_frequency * rel_letters_count                0.000328
rel_frequency * rel_orthographic_density         0.000189
rel_frequency * rel_synonyms_count              -0.005246
rel_letters_count * rel_orthographic_density    -0.010387
rel_letters_count * rel_synonyms_count          -0.010133
rel_orthographic_density * rel_synonyms_count   -0.066780
dtype: float64

Regressing rel synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.2487315958533145

intercept                   0.028354
rel_aoa                    -0.018347
rel_clustering              0.055725
rel_frequency              -0.020866
rel_letters_count          -0.017346
rel_orthographic_density   -0.000773
rel_synonyms_count          0.484819
dtype: float64

Regressing rel synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.26771471752014253

intercept                                        0.054589
rel_aoa                                         -0.036615
rel_clustering                                   0.020661
rel_frequency                                   -0.012610
rel_letters_count                               -0.050231
rel_orthographic_density                        -0.015266
rel_synonyms_count                               0.463532
rel_aoa * rel_clustering                         0.011713
rel_aoa * rel_frequency                         -0.003254
rel_aoa * rel_letters_count                      0.013450
rel_aoa * rel_orthographic_density               0.021016
rel_aoa * rel_synonyms_count                    -0.001686
rel_clustering * rel_frequency                   0.006707
rel_clustering * rel_letters_count               0.009858
rel_clustering * rel_orthographic_density       -0.014658
rel_clustering * rel_synonyms_count              0.092009
rel_frequency * rel_letters_count               -0.005477
rel_frequency * rel_orthographic_density         0.000315
rel_frequency * rel_synonyms_count               0.009415
rel_letters_count * rel_orthographic_density     0.000343
rel_letters_count * rel_synonyms_count          -0.009194
rel_orthographic_density * rel_synonyms_count   -0.036716
dtype: float64

Regressing global synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17371912532305622

intercept                      1.022339
global_aoa                    -0.037596
global_clustering              0.121011
global_frequency               0.007630
global_letters_count           0.031219
global_orthographic_density   -0.020426
global_synonyms_count          0.372700
rel_aoa                        0.013705
rel_clustering                -0.057676
rel_frequency                 -0.034670
rel_letters_count             -0.045671
rel_orthographic_density       0.030384
rel_synonyms_count             0.004691
dtype: float64

Regressing global synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2803640721528628

intercept                                                 8.599575
global_aoa                                               -0.354970
global_clustering                                         2.307858
global_frequency                                         -0.228971
global_letters_count                                      0.052331
global_orthographic_density                               0.293001
global_synonyms_count                                     5.301405
rel_aoa                                                  -0.257480
rel_clustering                                           -2.236507
rel_frequency                                            -0.121218
rel_letters_count                                        -0.347752
rel_orthographic_density                                 -0.617486
rel_synonyms_count                                       -5.365166
global_aoa * global_clustering                           -0.054460
global_aoa * global_frequency                             0.000962
global_aoa * global_letters_count                        -0.017913
global_aoa * global_orthographic_density                  0.020457
global_aoa * global_synonyms_count                        0.011632
global_aoa * rel_aoa                                     -0.003676
global_aoa * rel_clustering                               0.078397
global_aoa * rel_frequency                                0.000300
global_aoa * rel_letters_count                            0.016032
global_aoa * rel_orthographic_density                    -0.008143
global_aoa * rel_synonyms_count                           0.013386
global_clustering * global_frequency                     -0.097213
global_clustering * global_letters_count                 -0.084835
global_clustering * global_orthographic_density          -0.269272
global_clustering * global_synonyms_count                 0.517547
global_clustering * rel_aoa                              -0.072596
global_clustering * rel_clustering                       -0.020193
global_clustering * rel_frequency                         0.040322
global_clustering * rel_letters_count                     0.033269
global_clustering * rel_orthographic_density              0.259512
global_clustering * rel_synonyms_count                   -0.409390
global_frequency * global_letters_count                  -0.003882
global_frequency * global_orthographic_density           -0.135304
global_frequency * global_synonyms_count                 -0.089062
global_frequency * rel_aoa                               -0.001180
global_frequency * rel_clustering                         0.087593
global_frequency * rel_frequency                         -0.002976
global_frequency * rel_letters_count                     -0.002498
global_frequency * rel_orthographic_density               0.160449
global_frequency * rel_synonyms_count                     0.134027
global_letters_count * global_orthographic_density       -0.146863
global_letters_count * global_synonyms_count             -0.149714
global_letters_count * rel_aoa                           -0.011045
global_letters_count * rel_clustering                     0.079541
global_letters_count * rel_frequency                      0.005244
global_letters_count * rel_letters_count                 -0.001780
global_letters_count * rel_orthographic_density           0.144775
global_letters_count * rel_synonyms_count                 0.214493
global_orthographic_density * global_synonyms_count       0.048421
global_orthographic_density * rel_aoa                    -0.071347
global_orthographic_density * rel_clustering              0.183981
global_orthographic_density * rel_frequency               0.110172
global_orthographic_density * rel_letters_count           0.168929
global_orthographic_density * rel_orthographic_density   -0.021169
global_orthographic_density * rel_synonyms_count         -0.018720
global_synonyms_count * rel_aoa                           0.072142
global_synonyms_count * rel_clustering                   -0.316435
global_synonyms_count * rel_frequency                     0.247195
global_synonyms_count * rel_letters_count                 0.087969
global_synonyms_count * rel_orthographic_density         -0.167617
global_synonyms_count * rel_synonyms_count                0.149904
rel_aoa * rel_clustering                                  0.051192
rel_aoa * rel_frequency                                  -0.013129
rel_aoa * rel_letters_count                               0.023496
rel_aoa * rel_orthographic_density                        0.071871
rel_aoa * rel_synonyms_count                             -0.084077
rel_clustering * rel_frequency                           -0.021969
rel_clustering * rel_letters_count                       -0.037912
rel_clustering * rel_orthographic_density                -0.201512
rel_clustering * rel_synonyms_count                       0.249342
rel_frequency * rel_letters_count                        -0.000092
rel_frequency * rel_orthographic_density                 -0.131204
rel_frequency * rel_synonyms_count                       -0.280393
rel_letters_count * rel_orthographic_density             -0.186933
rel_letters_count * rel_synonyms_count                   -0.129075
rel_orthographic_density * rel_synonyms_count             0.142219
dtype: float64

Regressing rel synonyms_count with 665 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.29833498317517126

intercept                      0.745237
global_aoa                    -0.032387
global_clustering              0.100833
global_frequency               0.010756
global_letters_count           0.035891
global_orthographic_density   -0.012320
global_synonyms_count         -0.495137
rel_aoa                        0.009899
rel_clustering                -0.044855
rel_frequency                 -0.033727
rel_letters_count             -0.047480
rel_orthographic_density       0.013921
rel_synonyms_count             0.938250
dtype: float64

Regressing rel synonyms_count with 665 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.393929098905931

intercept                                                 7.797313
global_aoa                                               -0.226464
global_clustering                                         2.189310
global_frequency                                         -0.216060
global_letters_count                                     -0.030169
global_orthographic_density                               0.369558
global_synonyms_count                                     4.102248
rel_aoa                                                  -0.223065
rel_clustering                                           -2.135665
rel_frequency                                            -0.191956
rel_letters_count                                        -0.358369
rel_orthographic_density                                 -0.527738
rel_synonyms_count                                       -4.044709
global_aoa * global_clustering                           -0.041405
global_aoa * global_frequency                            -0.000296
global_aoa * global_letters_count                        -0.017430
global_aoa * global_orthographic_density                  0.003308
global_aoa * global_synonyms_count                       -0.011800
global_aoa * rel_aoa                                     -0.004104
global_aoa * rel_clustering                               0.060237
global_aoa * rel_frequency                               -0.002052
global_aoa * rel_letters_count                            0.017640
global_aoa * rel_orthographic_density                     0.005367
global_aoa * rel_synonyms_count                           0.038176
global_clustering * global_frequency                     -0.092950
global_clustering * global_letters_count                 -0.095227
global_clustering * global_orthographic_density          -0.256179
global_clustering * global_synonyms_count                 0.409504
global_clustering * rel_aoa                              -0.063798
global_clustering * rel_clustering                       -0.022654
global_clustering * rel_frequency                         0.022745
global_clustering * rel_letters_count                     0.041121
global_clustering * rel_orthographic_density              0.268140
global_clustering * rel_synonyms_count                   -0.332036
global_frequency * global_letters_count                  -0.004845
global_frequency * global_orthographic_density           -0.128845
global_frequency * global_synonyms_count                 -0.104693
global_frequency * rel_aoa                               -0.002001
global_frequency * rel_clustering                         0.084895
global_frequency * rel_frequency                         -0.003650
global_frequency * rel_letters_count                      0.006902
global_frequency * rel_orthographic_density               0.156561
global_frequency * rel_synonyms_count                     0.143301
global_letters_count * global_orthographic_density       -0.131276
global_letters_count * global_synonyms_count             -0.144117
global_letters_count * rel_aoa                           -0.008710
global_letters_count * rel_clustering                     0.097479
global_letters_count * rel_frequency                      0.007609
global_letters_count * rel_letters_count                 -0.004521
global_letters_count * rel_orthographic_density           0.121019
global_letters_count * rel_synonyms_count                 0.193493
global_orthographic_density * global_synonyms_count       0.026062
global_orthographic_density * rel_aoa                    -0.058592
global_orthographic_density * rel_clustering              0.169797
global_orthographic_density * rel_frequency               0.105309
global_orthographic_density * rel_letters_count           0.152453
global_orthographic_density * rel_orthographic_density   -0.025312
global_orthographic_density * rel_synonyms_count         -0.028876
global_synonyms_count * rel_aoa                           0.074344
global_synonyms_count * rel_clustering                   -0.252154
global_synonyms_count * rel_frequency                     0.256167
global_synonyms_count * rel_letters_count                 0.097676
global_synonyms_count * rel_orthographic_density         -0.152400
global_synonyms_count * rel_synonyms_count                0.136725
rel_aoa * rel_clustering                                  0.050817
rel_aoa * rel_frequency                                  -0.008743
rel_aoa * rel_letters_count                               0.017883
rel_aoa * rel_orthographic_density                        0.057119
rel_aoa * rel_synonyms_count                             -0.099025
rel_clustering * rel_frequency                           -0.008402
rel_clustering * rel_letters_count                       -0.056482
rel_clustering * rel_orthographic_density                -0.207505
rel_clustering * rel_synonyms_count                       0.195590
rel_frequency * rel_letters_count                        -0.011462
rel_frequency * rel_orthographic_density                 -0.127176
rel_frequency * rel_synonyms_count                       -0.284563
rel_letters_count * rel_orthographic_density             -0.161228
rel_letters_count * rel_synonyms_count                   -0.134458
rel_orthographic_density * rel_synonyms_count             0.147940
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2387828637434525

intercept                      0.713359
global_aoa                    -0.009510
global_clustering             -0.036699
global_frequency              -0.007320
global_letters_count          -0.023154
global_orthographic_density    0.418896
global_synonyms_count          0.104333
dtype: float64

Regressing global orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2590358704789478

intercept                                              3.004172
global_aoa                                            -0.242512
global_clustering                                      0.286978
global_frequency                                       0.116683
global_letters_count                                  -0.261109
global_orthographic_density                           -0.141368
global_synonyms_count                                  0.175663
global_aoa * global_clustering                        -0.030850
global_aoa * global_frequency                         -0.008835
global_aoa * global_letters_count                      0.011097
global_aoa * global_orthographic_density               0.058011
global_aoa * global_synonyms_count                    -0.012947
global_clustering * global_frequency                   0.012158
global_clustering * global_letters_count              -0.015989
global_clustering * global_orthographic_density       -0.080609
global_clustering * global_synonyms_count             -0.041330
global_frequency * global_letters_count                0.008342
global_frequency * global_orthographic_density        -0.030081
global_frequency * global_synonyms_count               0.011228
global_letters_count * global_orthographic_density     0.001285
global_letters_count * global_synonyms_count          -0.036469
global_orthographic_density * global_synonyms_count   -0.081567
dtype: float64

Regressing rel orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19548207888530622

intercept                     -1.759619
global_aoa                     0.003721
global_clustering             -0.057172
global_frequency               0.001832
global_letters_count          -0.016283
global_orthographic_density    0.375999
global_synonyms_count          0.122793
dtype: float64

Regressing rel orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2146347141842767

intercept                                              2.004563
global_aoa                                            -0.367828
global_clustering                                      0.054868
global_frequency                                      -0.059746
global_letters_count                                  -0.443433
global_orthographic_density                           -0.316275
global_synonyms_count                                 -0.293824
global_aoa * global_clustering                        -0.019282
global_aoa * global_frequency                         -0.000849
global_aoa * global_letters_count                      0.029164
global_aoa * global_orthographic_density               0.067641
global_aoa * global_synonyms_count                     0.017224
global_clustering * global_frequency                   0.009996
global_clustering * global_letters_count              -0.004886
global_clustering * global_orthographic_density       -0.024389
global_clustering * global_synonyms_count             -0.014608
global_frequency * global_letters_count                0.019143
global_frequency * global_orthographic_density         0.001349
global_frequency * global_synonyms_count               0.037477
global_letters_count * global_orthographic_density     0.015456
global_letters_count * global_synonyms_count          -0.012329
global_orthographic_density * global_synonyms_count   -0.037031
dtype: float64

Regressing global orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.195831147089373

intercept                   1.590282
rel_aoa                    -0.009981
rel_clustering             -0.113198
rel_frequency              -0.026871
rel_letters_count          -0.002417
rel_orthographic_density    0.419926
rel_synonyms_count          0.089738
dtype: float64

Regressing global orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20773519507663352

intercept                                        1.535309
rel_aoa                                          0.018156
rel_clustering                                  -0.049256
rel_frequency                                   -0.045452
rel_letters_count                                0.017267
rel_orthographic_density                         0.389664
rel_synonyms_count                               0.278019
rel_aoa * rel_clustering                         0.007787
rel_aoa * rel_frequency                          0.005277
rel_aoa * rel_letters_count                     -0.003885
rel_aoa * rel_orthographic_density               0.013182
rel_aoa * rel_synonyms_count                     0.011624
rel_clustering * rel_frequency                   0.015299
rel_clustering * rel_letters_count              -0.029478
rel_clustering * rel_orthographic_density       -0.044955
rel_clustering * rel_synonyms_count             -0.150173
rel_frequency * rel_letters_count               -0.002237
rel_frequency * rel_orthographic_density        -0.013087
rel_frequency * rel_synonyms_count               0.012026
rel_letters_count * rel_orthographic_density     0.004588
rel_letters_count * rel_synonyms_count          -0.053233
rel_orthographic_density * rel_synonyms_count   -0.031496
dtype: float64

Regressing rel orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.30217525993521577

intercept                  -0.437179
rel_aoa                    -0.002169
rel_clustering             -0.091656
rel_frequency               0.017833
rel_letters_count           0.012821
rel_orthographic_density    0.523498
rel_synonyms_count          0.075330
dtype: float64

Regressing rel orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3106015163970216

intercept                                       -0.437413
rel_aoa                                          0.019190
rel_clustering                                  -0.038763
rel_frequency                                    0.023423
rel_letters_count                                0.022506
rel_orthographic_density                         0.513019
rel_synonyms_count                               0.192754
rel_aoa * rel_clustering                         0.016352
rel_aoa * rel_frequency                          0.002471
rel_aoa * rel_letters_count                      0.003398
rel_aoa * rel_orthographic_density               0.033273
rel_aoa * rel_synonyms_count                     0.019300
rel_clustering * rel_frequency                   0.016523
rel_clustering * rel_letters_count              -0.008043
rel_clustering * rel_orthographic_density       -0.003436
rel_clustering * rel_synonyms_count             -0.132532
rel_frequency * rel_letters_count               -0.002606
rel_frequency * rel_orthographic_density         0.009523
rel_frequency * rel_synonyms_count               0.004506
rel_letters_count * rel_orthographic_density     0.009504
rel_letters_count * rel_synonyms_count          -0.049328
rel_orthographic_density * rel_synonyms_count   -0.046599
dtype: float64

Regressing global orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.26178626571596764

intercept                      3.508248
global_aoa                     0.006903
global_clustering              0.220414
global_frequency              -0.056067
global_letters_count          -0.183448
global_orthographic_density    0.370716
global_synonyms_count          0.104733
rel_aoa                       -0.015059
rel_clustering                -0.286705
rel_frequency                  0.063181
rel_letters_count              0.174260
rel_orthographic_density       0.049328
rel_synonyms_count             0.002248
dtype: float64

Regressing global orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.33683849794408127

intercept                                                 10.026141
global_aoa                                                -1.438427
global_clustering                                          0.876129
global_frequency                                           0.343193
global_letters_count                                      -0.562806
global_orthographic_density                               -1.455428
global_synonyms_count                                     -1.651990
rel_aoa                                                    0.364759
rel_clustering                                            -0.798210
rel_frequency                                             -0.182524
rel_letters_count                                          0.506819
rel_orthographic_density                                  -0.376436
rel_synonyms_count                                         6.879831
global_aoa * global_clustering                            -0.125225
global_aoa * global_frequency                              0.023832
global_aoa * global_letters_count                          0.043113
global_aoa * global_orthographic_density                   0.123199
global_aoa * global_synonyms_count                         0.041539
global_aoa * rel_aoa                                       0.007277
global_aoa * rel_clustering                                0.123603
global_aoa * rel_frequency                                -0.021135
global_aoa * rel_letters_count                            -0.021800
global_aoa * rel_orthographic_density                     -0.049432
global_aoa * rel_synonyms_count                           -0.068748
global_clustering * global_frequency                       0.065265
global_clustering * global_letters_count                  -0.107826
global_clustering * global_orthographic_density           -0.175868
global_clustering * global_synonyms_count                 -0.004405
global_clustering * rel_aoa                               -0.090959
global_clustering * rel_clustering                        -0.068461
global_clustering * rel_frequency                         -0.098001
global_clustering * rel_letters_count                      0.181353
global_clustering * rel_orthographic_density               0.048648
global_clustering * rel_synonyms_count                     0.513857
global_frequency * global_letters_count                   -0.048188
global_frequency * global_orthographic_density             0.012531
global_frequency * global_synonyms_count                   0.038936
global_frequency * rel_aoa                                -0.044858
global_frequency * rel_clustering                         -0.045942
global_frequency * rel_frequency                           0.021668
global_frequency * rel_letters_count                       0.094268
global_frequency * rel_orthographic_density                0.009852
global_frequency * rel_synonyms_count                     -0.116523
global_letters_count * global_orthographic_density        -0.037801
global_letters_count * global_synonyms_count               0.113153
global_letters_count * rel_aoa                            -0.072009
global_letters_count * rel_clustering                      0.061137
global_letters_count * rel_frequency                      -0.034167
global_letters_count * rel_letters_count                   0.005902
global_letters_count * rel_orthographic_density            0.172538
global_letters_count * rel_synonyms_count                 -0.250524
global_orthographic_density * global_synonyms_count        0.351917
global_orthographic_density * rel_aoa                     -0.096682
global_orthographic_density * rel_clustering              -0.029723
global_orthographic_density * rel_frequency               -0.132827
global_orthographic_density * rel_letters_count           -0.051853
global_orthographic_density * rel_orthographic_density     0.026191
global_orthographic_density * rel_synonyms_count          -0.492184
global_synonyms_count * rel_aoa                            0.069950
global_synonyms_count * rel_clustering                    -0.385862
global_synonyms_count * rel_frequency                      0.045162
global_synonyms_count * rel_letters_count                 -0.027480
global_synonyms_count * rel_orthographic_density          -0.339042
global_synonyms_count * rel_synonyms_count                -0.039487
rel_aoa * rel_clustering                                   0.062306
rel_aoa * rel_frequency                                    0.041507
rel_aoa * rel_letters_count                                0.043349
rel_aoa * rel_orthographic_density                         0.058104
rel_aoa * rel_synonyms_count                              -0.067666
rel_clustering * rel_frequency                             0.080958
rel_clustering * rel_letters_count                        -0.135495
rel_clustering * rel_orthographic_density                  0.079928
rel_clustering * rel_synonyms_count                       -0.209208
rel_frequency * rel_letters_count                         -0.008115
rel_frequency * rel_orthographic_density                   0.050121
rel_frequency * rel_synonyms_count                         0.045128
rel_letters_count * rel_orthographic_density              -0.047850
rel_letters_count * rel_synonyms_count                     0.142928
rel_orthographic_density * rel_synonyms_count              0.464393
dtype: float64

Regressing rel orthographic_density with 608 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3375633264141009

intercept                      2.666022
global_aoa                     0.008442
global_clustering              0.189439
global_frequency              -0.047148
global_letters_count          -0.143233
global_orthographic_density   -0.429364
global_synonyms_count          0.137199
rel_aoa                       -0.017890
rel_clustering                -0.245459
rel_frequency                  0.058131
rel_letters_count              0.125089
rel_orthographic_density       0.887773
rel_synonyms_count            -0.038564
dtype: float64

Regressing rel orthographic_density with 608 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.40149989098159944

intercept                                                 7.868545
global_aoa                                               -1.012643
global_clustering                                         1.091526
global_frequency                                          0.359958
global_letters_count                                     -0.666214
global_orthographic_density                              -1.496811
global_synonyms_count                                    -1.523054
rel_aoa                                                   0.052688
rel_clustering                                           -1.394885
rel_frequency                                            -0.446072
rel_letters_count                                         0.535029
rel_orthographic_density                                 -0.206606
rel_synonyms_count                                        6.308633
global_aoa * global_clustering                           -0.114333
global_aoa * global_frequency                             0.010937
global_aoa * global_letters_count                         0.027498
global_aoa * global_orthographic_density                  0.054517
global_aoa * global_synonyms_count                        0.004060
global_aoa * rel_aoa                                      0.004745
global_aoa * rel_clustering                               0.106158
global_aoa * rel_frequency                               -0.010866
global_aoa * rel_letters_count                            0.001667
global_aoa * rel_orthographic_density                     0.021645
global_aoa * rel_synonyms_count                          -0.014768
global_clustering * global_frequency                      0.043732
global_clustering * global_letters_count                 -0.111148
global_clustering * global_orthographic_density          -0.214979
global_clustering * global_synonyms_count                -0.052000
global_clustering * rel_aoa                              -0.086921
global_clustering * rel_clustering                       -0.054360
global_clustering * rel_frequency                        -0.084614
global_clustering * rel_letters_count                     0.211237
global_clustering * rel_orthographic_density              0.122828
global_clustering * rel_synonyms_count                    0.520236
global_frequency * global_letters_count                  -0.027761
global_frequency * global_orthographic_density           -0.050723
global_frequency * global_synonyms_count                  0.018873
global_frequency * rel_aoa                               -0.035775
global_frequency * rel_clustering                        -0.003464
global_frequency * rel_frequency                          0.017490
global_frequency * rel_letters_count                      0.076593
global_frequency * rel_orthographic_density               0.082993
global_frequency * rel_synonyms_count                    -0.107957
global_letters_count * global_orthographic_density       -0.014225
global_letters_count * global_synonyms_count              0.141232
global_letters_count * rel_aoa                           -0.046222
global_letters_count * rel_clustering                     0.094852
global_letters_count * rel_frequency                     -0.024325
global_letters_count * rel_letters_count                  0.003890
global_letters_count * rel_orthographic_density           0.152610
global_letters_count * rel_synonyms_count                -0.262522
global_orthographic_density * global_synonyms_count       0.346063
global_orthographic_density * rel_aoa                    -0.037613
global_orthographic_density * rel_clustering              0.079957
global_orthographic_density * rel_frequency              -0.010949
global_orthographic_density * rel_letters_count          -0.004551
global_orthographic_density * rel_orthographic_density    0.048048
global_orthographic_density * rel_synonyms_count         -0.435062
global_synonyms_count * rel_aoa                           0.098155
global_synonyms_count * rel_clustering                   -0.232931
global_synonyms_count * rel_frequency                     0.074919
global_synonyms_count * rel_letters_count                -0.076540
global_synonyms_count * rel_orthographic_density         -0.327679
global_synonyms_count * rel_synonyms_count               -0.070900
rel_aoa * rel_clustering                                  0.071551
rel_aoa * rel_frequency                                   0.032383
rel_aoa * rel_letters_count                               0.016638
rel_aoa * rel_orthographic_density                       -0.002987
rel_aoa * rel_synonyms_count                             -0.103650
rel_clustering * rel_frequency                            0.058188
rel_clustering * rel_letters_count                       -0.193338
rel_clustering * rel_orthographic_density                -0.048489
rel_clustering * rel_synonyms_count                      -0.312695
rel_frequency * rel_letters_count                        -0.022782
rel_frequency * rel_orthographic_density                 -0.074442
rel_frequency * rel_synonyms_count                        0.004293
rel_letters_count * rel_orthographic_density             -0.105057
rel_letters_count * rel_synonyms_count                    0.168414
rel_orthographic_density * rel_synonyms_count             0.400755
dtype: float64