Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
SAVE_FIGURES = False
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
N_COMPONENTS = 3
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
model = Model(time=Time.continuous, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
data = []

with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

original_variations = pd.DataFrame(data)
del data


Got 3641 substitutions for model Model(time=Time.continuous, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
100% (3641 of 3641) |######################| Elapsed Time: 0:01:19 Time: 0:01:19

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)\
    ['source', 'source_rel', 'destination', 'destination_rel', 'feature',
     'h0', 'h0_rel', 'h0n', 'h0n_rel'].mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               ['source', 'source_rel', 'destination', 'destination_rel',
                'h0', 'h0_rel', 'h0n', 'h0n_rel']] = np.nan

Prepare feature ordering.


In [5]:
ordered_features = sorted(
    Substitution.__features__,
    key=lambda f: Substitution._transformed_feature(f).__doc__
)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    bin_count = bins.max() + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))
    
    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_values = np.zeros(bin_count)
        bin_nulls = np.zeros(bin_count)
        cis = np.zeros((bin_count, 3))

        for i in range(bin_count):
            indices = bins == i
            n = (indices).sum()
            s = values[indices].std(ddof=1)

            bin_values[i] = values[indices].mean()
            bin_nulls[i] = nulls[indices].mean()
            for j, alpha in enumerate([.05, .01, .001]):
                cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                             * values[indices].std(ddof=1)
                             / np.sqrt(n - 1))

        print(null_name + ' |', end='')
        differences = ((bin_values[:,np.newaxis]
                        < bin_nulls[:,np.newaxis] - cis)
                       | (bin_values[:,np.newaxis]
                          > bin_nulls[:,np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]
    
    # Compute binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    middles = (bins[:-1] + bins[1:]) / 2
    
    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))
    
    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    g = sb.FacetGrid(data=data[data[feature_field]
                               .map(lambda f: f in features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]
    
    # Compute binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    middles = (bins[:-1] + bins[1:]) / 2
    
    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))
    
    # Plot.
    scale = abs(h0s.mean())
    ax.plot(np.linspace(0, 1, bin_count),
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(np.linspace(0, 1, bin_count),
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *   |
H_00 | *** | ns. | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | **  | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False});



In [13]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | **  | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

In [14]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)')\
    .set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *** | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | *** | *   | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *   | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *   | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | *   | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | ns. | ns. | *** |

In [16]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *   | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | *   | ns. | *** |

In [18]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'quantiles': True})\
    .set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | **  |
H_00 | ns. | *** | *   | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | **  | *   | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | **  | ns. | ns. | ns. |

In [20]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True});



In [21]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | **  | *   | *   |

In [22]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True})\
    .set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | **  | ns. | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | **  | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | *   | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | ns. | ns. | ns. |

In [24]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *** | *   | ns. | *** |

In [26]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how do their effects interact. Especially, we want to know who wins between cognitive bias, attraction to sentence average, or attraction to global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note we set the arrow size to something that turns out to be huge here, but gives normal sizes in the figures saves. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']
    
    # Compute binning.
    bin_count = 4
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2
    
    # Compute bin values.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            u_values[y, x] = (
                dest[(x_bins == x) & (y_bins == y)] -
                source[(x_bins == x) & (y_bins == y)]
            ).mean()
            v_values[y, x] = (
                dest_rel[(x_bins == x) & (y_bins == y)] -
                source_rel[(x_bins == x) & (y_bins == y)]
            ).mean()
            strength[y, x] = np.sqrt(
                (dest[(x_bins == x) & (y_bins == y)] - 
                 source[(x_bins == x) & (y_bins == y)]) ** 2 +
                (dest_rel[(x_bins == x) & (y_bins == y)] - 
                 source_rel[(x_bins == x) & (y_bins == y)]) ** 2
            ).mean()
    
    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2), 
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
g = sb.FacetGrid(data=variations,
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=ordered_features, hue_order=ordered_features)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
g = sb.FacetGrid(data=variations[variations['feature']
                                 .map(lambda f: f in PAPER_FEATURES)],
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations.pivot(index='cluster_id',
                                 columns='feature', values='variation')
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show 
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)])


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.53887324  0.17885153  0.07678351  0.06656826  0.03555517  0.02773656
  0.02037801  0.01865767  0.01656899  0.00933154]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.483704 0.228647 -0.076458 0.210872 0.216210 -0.454966 0.215477 0.253650 -0.433574 0.288111 -0.166944 0.007933
Component-1 -0.378151 0.387900 -0.164043 0.305139 0.259899 0.408160 -0.141917 0.305147 0.415313 -0.213335 0.144841 -0.011955
Component-2 -0.618874 -0.640249 0.070554 -0.213209 0.276023 0.097740 0.004217 -0.232912 0.067607 -0.084718 0.038280 0.058475

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
del data


100% (3641 of 3641) |######################| Elapsed Time: 0:01:09 Time: 0:01:09

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | **  | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | **  | ns. |

4.2 On a subset of relevant features


In [34]:
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature']
                           .map(lambda f: f in pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show 
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)])


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67738475  0.20303107]

Out[35]:
aoa frequency letters_count
Component-0 -0.750396 0.382365 -0.539168
Component-1 0.417695 -0.357882 -0.835136

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
del data


100% (3641 of 3641) |######################| Elapsed Time: 0:00:24 Time: 0:00:24

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components


In [38]:
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(Substitution
                               ._transformed_feature(feature)())))

# Compute the number of words that have all PAPER_FEATURES defined.
words = set()
for tfeature in [Substitution._transformed_feature(feature)
                 for feature in relevant_features]:
    words.update(tfeature())

data = dict((feature, []) for feature in relevant_features)
words_list = []
for word in words:
    words_list.append(word)
    for feature in relevant_features:
        data[feature].append(Substitution
                             ._transformed_feature(feature)(word))
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1305 (cluster-unique) substitutions, but the PCA is in fact computed on 1001 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    if p < .001:
        return '***'
    elif p < .01:
        return ' **'
    elif p < .05:
        return '  *'
    else:
        return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significative.


In [41]:
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel1)[feature1]
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]

                # Compute binning.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel1)[feature1]
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]\
                    - variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel2)[feature2]

                # Compute binning.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product([False, True, 'both'],
                                        [False, True]):
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel)
        print()
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=True)
        print()


----------------------------------------------------------------------
Regressing global frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.15238203928262029

intercept                      4.278016
global_aoa                     0.074426
global_clustering              0.087522
global_frequency               0.523831
global_letters_count          -0.022200
global_orthographic_density   -0.037976
global_synonyms_count          0.006073
dtype: float64

Regressing global frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.17338539451881185

intercept                                             -4.730022
global_aoa                                             0.600903
global_clustering                                     -0.270165
global_frequency                                       1.561821
global_letters_count                                   0.056856
global_orthographic_density                            1.664570
global_synonyms_count                                  1.543491
global_aoa * global_clustering                         0.029046
global_aoa * global_frequency                         -0.029697
global_aoa * global_letters_count                     -0.012385
global_aoa * global_orthographic_density               0.000162
global_aoa * global_synonyms_count                    -0.113689
global_clustering * global_frequency                   0.059402
global_clustering * global_letters_count              -0.074531
global_clustering * global_orthographic_density        0.043735
global_clustering * global_synonyms_count              0.099202
global_frequency * global_letters_count               -0.047164
global_frequency * global_orthographic_density        -0.170847
global_frequency * global_synonyms_count              -0.020936
global_letters_count * global_orthographic_density     0.036550
global_letters_count * global_synonyms_count           0.010412
global_orthographic_density * global_synonyms_count   -0.068843
dtype: float64

Regressing rel frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.08963333899032931

intercept                     -7.021526
global_aoa                     0.095082
global_clustering              0.037526
global_frequency               0.433858
global_letters_count           0.016450
global_orthographic_density   -0.125194
global_synonyms_count          0.084506
dtype: float64

Regressing rel frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.10759137368977456

intercept                                             -18.135481
global_aoa                                              0.545901
global_clustering                                      -0.579354
global_frequency                                        1.558287
global_letters_count                                    0.429399
global_orthographic_density                             1.945261
global_synonyms_count                                   1.336014
global_aoa * global_clustering                          0.011162
global_aoa * global_frequency                          -0.035318
global_aoa * global_letters_count                      -0.009885
global_aoa * global_orthographic_density                0.001764
global_aoa * global_synonyms_count                     -0.072495
global_clustering * global_frequency                    0.057655
global_clustering * global_letters_count               -0.025915
global_clustering * global_orthographic_density         0.111714
global_clustering * global_synonyms_count               0.139570
global_frequency * global_letters_count                -0.056687
global_frequency * global_orthographic_density         -0.175277
global_frequency * global_synonyms_count               -0.001094
global_letters_count * global_orthographic_density      0.045002
global_letters_count * global_synonyms_count            0.015519
global_orthographic_density * global_synonyms_count    -0.054211
dtype: float64

Regressing global frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06715415815823189

intercept                   9.630803
rel_aoa                     0.104500
rel_clustering             -0.087571
rel_frequency               0.293359
rel_letters_count          -0.011300
rel_orthographic_density   -0.004734
rel_synonyms_count         -0.013570
dtype: float64

Regressing global frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09473933308638473

intercept                                        9.492613
rel_aoa                                          0.251386
rel_clustering                                  -0.053641
rel_frequency                                    0.235237
rel_letters_count                                0.043183
rel_orthographic_density                        -0.207940
rel_synonyms_count                               0.144491
rel_aoa * rel_clustering                         0.024846
rel_aoa * rel_frequency                          0.045553
rel_aoa * rel_letters_count                     -0.002713
rel_aoa * rel_orthographic_density               0.028978
rel_aoa * rel_synonyms_count                    -0.102007
rel_clustering * rel_frequency                  -0.041440
rel_clustering * rel_letters_count               0.008419
rel_clustering * rel_orthographic_density        0.185138
rel_clustering * rel_synonyms_count              0.026296
rel_frequency * rel_letters_count                0.001350
rel_frequency * rel_orthographic_density        -0.049329
rel_frequency * rel_synonyms_count               0.034353
rel_letters_count * rel_orthographic_density     0.030490
rel_letters_count * rel_synonyms_count          -0.073891
rel_orthographic_density * rel_synonyms_count   -0.196602
dtype: float64

Regressing rel frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.28369272547783253

intercept                  -1.226112
rel_aoa                     0.071225
rel_clustering              0.125077
rel_frequency               0.655343
rel_letters_count          -0.065769
rel_orthographic_density   -0.214411
rel_synonyms_count          0.017318
dtype: float64

Regressing rel frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.307721220513119

intercept                                       -1.420023
rel_aoa                                          0.121700
rel_clustering                                   0.142557
rel_frequency                                    0.616754
rel_letters_count                                0.026229
rel_orthographic_density                        -0.495440
rel_synonyms_count                               0.022233
rel_aoa * rel_clustering                        -0.014863
rel_aoa * rel_frequency                         -0.023679
rel_aoa * rel_letters_count                     -0.011226
rel_aoa * rel_orthographic_density               0.079029
rel_aoa * rel_synonyms_count                    -0.024516
rel_clustering * rel_frequency                  -0.040547
rel_clustering * rel_letters_count              -0.031046
rel_clustering * rel_orthographic_density        0.041893
rel_clustering * rel_synonyms_count              0.046918
rel_frequency * rel_letters_count               -0.001699
rel_frequency * rel_orthographic_density        -0.090745
rel_frequency * rel_synonyms_count              -0.005302
rel_letters_count * rel_orthographic_density     0.031507
rel_letters_count * rel_synonyms_count          -0.073520
rel_orthographic_density * rel_synonyms_count   -0.130464
dtype: float64

Regressing global frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.16378579480233257

intercept                      3.711857
global_aoa                     0.022300
global_clustering              0.284489
global_frequency               0.628825
global_letters_count           0.089736
global_orthographic_density    0.142290
global_synonyms_count         -0.050268
rel_aoa                        0.082427
rel_clustering                -0.248539
rel_frequency                 -0.121708
rel_letters_count             -0.119273
rel_orthographic_density      -0.218920
rel_synonyms_count             0.066922
dtype: float64

Regressing global frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.25754173366526645

intercept                                                -14.875185
global_aoa                                                 0.341124
global_clustering                                         -4.544837
global_frequency                                           0.922017
global_letters_count                                       0.472746
global_orthographic_density                                3.723049
global_synonyms_count                                     -7.097896
rel_aoa                                                    2.728963
rel_clustering                                             2.005174
rel_frequency                                              0.145579
rel_letters_count                                          0.230142
rel_orthographic_density                                   0.918545
rel_synonyms_count                                        11.532972
global_aoa * global_clustering                             0.098067
global_aoa * global_frequency                              0.032791
global_aoa * global_letters_count                          0.022258
global_aoa * global_orthographic_density                  -0.034314
global_aoa * global_synonyms_count                        -0.306510
global_aoa * rel_aoa                                      -0.033550
global_aoa * rel_clustering                                0.043881
global_aoa * rel_frequency                                 0.016827
global_aoa * rel_letters_count                            -0.060836
global_aoa * rel_orthographic_density                     -0.096183
global_aoa * rel_synonyms_count                            0.127759
global_clustering * global_frequency                       0.143989
global_clustering * global_letters_count                   0.060767
global_clustering * global_orthographic_density            1.123963
global_clustering * global_synonyms_count                  0.231807
global_clustering * rel_aoa                               -0.086547
global_clustering * rel_clustering                         0.114561
global_clustering * rel_frequency                         -0.137408
global_clustering * rel_letters_count                     -0.054560
global_clustering * rel_orthographic_density              -0.861457
global_clustering * rel_synonyms_count                    -0.020761
global_frequency * global_letters_count                   -0.106077
global_frequency * global_orthographic_density             0.186540
global_frequency * global_synonyms_count                   0.456741
global_frequency * rel_aoa                                -0.201284
global_frequency * rel_clustering                          0.198649
global_frequency * rel_frequency                          -0.018614
global_frequency * rel_letters_count                       0.056010
global_frequency * rel_orthographic_density               -0.432299
global_frequency * rel_synonyms_count                     -0.645749
global_letters_count * global_orthographic_density         0.272706
global_letters_count * global_synonyms_count               0.796813
global_letters_count * rel_aoa                            -0.111953
global_letters_count * rel_clustering                     -0.202303
global_letters_count * rel_frequency                      -0.019818
global_letters_count * rel_letters_count                   0.019193
global_letters_count * rel_orthographic_density           -0.215640
global_letters_count * rel_synonyms_count                 -0.863283
global_orthographic_density * global_synonyms_count        0.604694
global_orthographic_density * rel_aoa                     -0.114347
global_orthographic_density * rel_clustering              -1.551591
global_orthographic_density * rel_frequency               -0.316515
global_orthographic_density * rel_letters_count           -0.299451
global_orthographic_density * rel_orthographic_density    -0.149519
global_orthographic_density * rel_synonyms_count          -0.458044
global_synonyms_count * rel_aoa                           -0.082372
global_synonyms_count * rel_clustering                     0.184012
global_synonyms_count * rel_frequency                     -0.546767
global_synonyms_count * rel_letters_count                 -0.640037
global_synonyms_count * rel_orthographic_density          -0.785010
global_synonyms_count * rel_synonyms_count                -0.058999
rel_aoa * rel_clustering                                  -0.016664
rel_aoa * rel_frequency                                    0.121697
rel_aoa * rel_letters_count                                0.131976
rel_aoa * rel_orthographic_density                         0.206431
rel_aoa * rel_synonyms_count                               0.124025
rel_clustering * rel_frequency                            -0.151644
rel_clustering * rel_letters_count                         0.113400
rel_clustering * rel_orthographic_density                  1.310787
rel_clustering * rel_synonyms_count                       -0.149510
rel_frequency * rel_letters_count                          0.034790
rel_frequency * rel_orthographic_density                   0.397721
rel_frequency * rel_synonyms_count                         0.745637
rel_letters_count * rel_orthographic_density               0.287393
rel_letters_count * rel_synonyms_count                     0.676302
rel_orthographic_density * rel_synonyms_count              0.453128
dtype: float64

Regressing rel frequency with 771 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.31876316905023827

intercept                      2.267130
global_aoa                     0.028854
global_clustering              0.281742
global_frequency              -0.272510
global_letters_count           0.137977
global_orthographic_density    0.199327
global_synonyms_count         -0.023770
rel_aoa                        0.050557
rel_clustering                -0.190402
rel_frequency                  0.809749
rel_letters_count             -0.163826
rel_orthographic_density      -0.284428
rel_synonyms_count             0.039622
dtype: float64

Regressing rel frequency with 771 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3922769932265918

intercept                                                -22.427558
global_aoa                                                 1.013680
global_clustering                                         -4.959802
global_frequency                                           0.279488
global_letters_count                                       0.393994
global_orthographic_density                                5.162188
global_synonyms_count                                     -4.826368
rel_aoa                                                    2.226021
rel_clustering                                             2.727478
rel_frequency                                              1.023623
rel_letters_count                                          0.171658
rel_orthographic_density                                  -0.267408
rel_synonyms_count                                        10.010112
global_aoa * global_clustering                             0.116403
global_aoa * global_frequency                             -0.001267
global_aoa * global_letters_count                          0.003617
global_aoa * global_orthographic_density                  -0.071705
global_aoa * global_synonyms_count                        -0.299479
global_aoa * rel_aoa                                      -0.026601
global_aoa * rel_clustering                                0.005611
global_aoa * rel_frequency                                 0.054445
global_aoa * rel_letters_count                            -0.028978
global_aoa * rel_orthographic_density                     -0.029933
global_aoa * rel_synonyms_count                            0.099216
global_clustering * global_frequency                       0.162571
global_clustering * global_letters_count                   0.103601
global_clustering * global_orthographic_density            1.128850
global_clustering * global_synonyms_count                  0.309560
global_clustering * rel_aoa                               -0.104843
global_clustering * rel_clustering                         0.123649
global_clustering * rel_frequency                         -0.123599
global_clustering * rel_letters_count                     -0.063509
global_clustering * rel_orthographic_density              -0.749540
global_clustering * rel_synonyms_count                    -0.003966
global_frequency * global_letters_count                   -0.048693
global_frequency * global_orthographic_density             0.110398
global_frequency * global_synonyms_count                   0.336556
global_frequency * rel_aoa                                -0.178776
global_frequency * rel_clustering                          0.159785
global_frequency * rel_frequency                          -0.021298
global_frequency * rel_letters_count                       0.033556
global_frequency * rel_orthographic_density               -0.337903
global_frequency * rel_synonyms_count                     -0.552386
global_letters_count * global_orthographic_density         0.240546
global_letters_count * global_synonyms_count               0.766249
global_letters_count * rel_aoa                            -0.116466
global_letters_count * rel_clustering                     -0.233511
global_letters_count * rel_frequency                      -0.073384
global_letters_count * rel_letters_count                   0.014512
global_letters_count * rel_orthographic_density           -0.165521
global_letters_count * rel_synonyms_count                 -0.762462
global_orthographic_density * global_synonyms_count        0.520684
global_orthographic_density * rel_aoa                     -0.103860
global_orthographic_density * rel_clustering              -1.528078
global_orthographic_density * rel_frequency               -0.242490
global_orthographic_density * rel_letters_count           -0.293766
global_orthographic_density * rel_orthographic_density    -0.135941
global_orthographic_density * rel_synonyms_count          -0.413782
global_synonyms_count * rel_aoa                           -0.008968
global_synonyms_count * rel_clustering                     0.128790
global_synonyms_count * rel_frequency                     -0.431534
global_synonyms_count * rel_letters_count                 -0.667042
global_synonyms_count * rel_orthographic_density          -0.629203
global_synonyms_count * rel_synonyms_count                -0.047461
rel_aoa * rel_clustering                                   0.002206
rel_aoa * rel_frequency                                    0.082853
rel_aoa * rel_letters_count                                0.112264
rel_aoa * rel_orthographic_density                         0.157201
rel_aoa * rel_synonyms_count                               0.072755
rel_clustering * rel_frequency                            -0.158308
rel_clustering * rel_letters_count                         0.111153
rel_clustering * rel_orthographic_density                  1.161761
rel_clustering * rel_synonyms_count                       -0.137226
rel_frequency * rel_letters_count                          0.053075
rel_frequency * rel_orthographic_density                   0.305228
rel_frequency * rel_synonyms_count                         0.642032
rel_letters_count * rel_orthographic_density               0.263180
rel_letters_count * rel_synonyms_count                     0.632079
rel_orthographic_density * rel_synonyms_count              0.328562
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.18471637025163323

intercept                      4.054256
global_aoa                     0.406554
global_clustering             -0.204398
global_frequency              -0.139763
global_letters_count           0.071210
global_orthographic_density   -0.060949
global_synonyms_count          0.018716
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.21164869067933667

intercept                                             -2.551977
global_aoa                                             0.826382
global_clustering                                     -1.247082
global_frequency                                       0.219421
global_letters_count                                   0.693668
global_orthographic_density                           -1.422048
global_synonyms_count                                 -2.289843
global_aoa * global_clustering                         0.082760
global_aoa * global_frequency                         -0.004568
global_aoa * global_letters_count                      0.011074
global_aoa * global_orthographic_density              -0.012927
global_aoa * global_synonyms_count                     0.132351
global_clustering * global_frequency                   0.030741
global_clustering * global_letters_count               0.073955
global_clustering * global_orthographic_density       -0.177408
global_clustering * global_synonyms_count             -0.131467
global_frequency * global_letters_count               -0.034325
global_frequency * global_orthographic_density         0.029491
global_frequency * global_synonyms_count               0.005701
global_letters_count * global_orthographic_density    -0.014447
global_letters_count * global_synonyms_count           0.035887
global_orthographic_density * global_synonyms_count    0.296303
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.07084865031173426

intercept                     -0.165182
global_aoa                     0.207027
global_clustering             -0.063131
global_frequency              -0.150789
global_letters_count           0.053706
global_orthographic_density    0.142566
global_synonyms_count          0.025417
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.10736833667280732

intercept                                             -1.800382
global_aoa                                             1.131062
global_clustering                                     -0.320713
global_frequency                                      -0.166344
global_letters_count                                  -0.156595
global_orthographic_density                           -1.737430
global_synonyms_count                                 -1.864833
global_aoa * global_clustering                         0.101954
global_aoa * global_frequency                         -0.042303
global_aoa * global_letters_count                     -0.000813
global_aoa * global_orthographic_density              -0.002138
global_aoa * global_synonyms_count                     0.109413
global_clustering * global_frequency                  -0.010886
global_clustering * global_letters_count              -0.003155
global_clustering * global_orthographic_density       -0.208667
global_clustering * global_synonyms_count             -0.199340
global_frequency * global_letters_count                0.024709
global_frequency * global_orthographic_density         0.075041
global_frequency * global_synonyms_count              -0.055456
global_letters_count * global_orthographic_density    -0.039801
global_letters_count * global_synonyms_count           0.016032
global_orthographic_density * global_synonyms_count    0.316225
dtype: float64

Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.06384174372868445

intercept                   6.601011
rel_aoa                     0.125436
rel_clustering              0.304875
rel_frequency               0.087080
rel_letters_count           0.042774
rel_orthographic_density   -0.407176
rel_synonyms_count         -0.106241
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.11027620977886654

intercept                                        6.624788
rel_aoa                                         -0.188718
rel_clustering                                   0.363972
rel_frequency                                    0.150970
rel_letters_count                               -0.050492
rel_orthographic_density                        -0.457304
rel_synonyms_count                              -0.004737
rel_aoa * rel_clustering                         0.133198
rel_aoa * rel_frequency                         -0.094214
rel_aoa * rel_letters_count                      0.021324
rel_aoa * rel_orthographic_density               0.044025
rel_aoa * rel_synonyms_count                     0.082265
rel_clustering * rel_frequency                   0.135096
rel_clustering * rel_letters_count               0.013873
rel_clustering * rel_orthographic_density       -0.174092
rel_clustering * rel_synonyms_count              0.045986
rel_frequency * rel_letters_count               -0.040061
rel_frequency * rel_orthographic_density        -0.002665
rel_frequency * rel_synonyms_count              -0.047481
rel_letters_count * rel_orthographic_density     0.019463
rel_letters_count * rel_synonyms_count           0.093696
rel_orthographic_density * rel_synonyms_count    0.515551
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.2403042309660497

intercept                   0.431268
rel_aoa                     0.519501
rel_clustering              0.008652
rel_frequency              -0.065284
rel_letters_count           0.016251
rel_orthographic_density    0.155669
rel_synonyms_count         -0.068576
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.27374094291220397

intercept                                        0.690534
rel_aoa                                          0.398785
rel_clustering                                  -0.063529
rel_frequency                                    0.055508
rel_letters_count                               -0.069998
rel_orthographic_density                         0.422684
rel_synonyms_count                               0.088165
rel_aoa * rel_clustering                         0.106244
rel_aoa * rel_frequency                         -0.022187
rel_aoa * rel_letters_count                      0.007844
rel_aoa * rel_orthographic_density              -0.000322
rel_aoa * rel_synonyms_count                     0.027904
rel_clustering * rel_frequency                   0.051110
rel_clustering * rel_letters_count               0.028988
rel_clustering * rel_orthographic_density       -0.068968
rel_clustering * rel_synonyms_count              0.014186
rel_frequency * rel_letters_count               -0.018468
rel_frequency * rel_orthographic_density         0.109104
rel_frequency * rel_synonyms_count               0.008895
rel_letters_count * rel_orthographic_density    -0.007508
rel_letters_count * rel_synonyms_count           0.037799
rel_orthographic_density * rel_synonyms_count    0.247256
dtype: float64

Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.2299215386078043

intercept                      0.851695
global_aoa                     0.543517
global_clustering             -0.732883
global_frequency              -0.255258
global_letters_count           0.201000
global_orthographic_density   -0.070983
global_synonyms_count          0.320383
rel_aoa                       -0.229022
rel_clustering                 0.644668
rel_frequency                  0.126313
rel_letters_count             -0.149247
rel_orthographic_density       0.080680
rel_synonyms_count            -0.369757
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.3275522185346883

intercept                                                 30.391233
global_aoa                                                 0.781518
global_clustering                                          5.110880
global_frequency                                           1.233840
global_letters_count                                      -1.848531
global_orthographic_density                              -13.913004
global_synonyms_count                                     -7.061642
rel_aoa                                                   -2.002165
rel_clustering                                             0.893933
rel_frequency                                              1.021992
rel_letters_count                                          2.141924
rel_orthographic_density                                  11.807087
rel_synonyms_count                                        -2.133510
global_aoa * global_clustering                             0.057083
global_aoa * global_frequency                             -0.002672
global_aoa * global_letters_count                         -0.025406
global_aoa * global_orthographic_density                   0.024513
global_aoa * global_synonyms_count                         0.018661
global_aoa * rel_aoa                                       0.055141
global_aoa * rel_clustering                               -0.187165
global_aoa * rel_frequency                                -0.028916
global_aoa * rel_letters_count                             0.018452
global_aoa * rel_orthographic_density                     -0.127395
global_aoa * rel_synonyms_count                            0.392031
global_clustering * global_frequency                       0.141510
global_clustering * global_letters_count                  -0.333911
global_clustering * global_orthographic_density           -2.623193
global_clustering * global_synonyms_count                 -1.470080
global_clustering * rel_aoa                               -0.094850
global_clustering * rel_clustering                         0.125790
global_clustering * rel_frequency                          0.102097
global_clustering * rel_letters_count                      0.262228
global_clustering * rel_orthographic_density               2.123982
global_clustering * rel_synonyms_count                     1.102931
global_frequency * global_letters_count                    0.015828
global_frequency * global_orthographic_density            -0.238675
global_frequency * global_synonyms_count                  -0.163242
global_frequency * rel_aoa                                 0.055772
global_frequency * rel_clustering                         -0.419642
global_frequency * rel_frequency                          -0.036307
global_frequency * rel_letters_count                      -0.067438
global_frequency * rel_orthographic_density                0.200376
global_frequency * rel_synonyms_count                      0.495545
global_letters_count * global_orthographic_density         0.079499
global_letters_count * global_synonyms_count               0.024127
global_letters_count * rel_aoa                             0.007253
global_letters_count * rel_clustering                      0.307281
global_letters_count * rel_frequency                       0.051144
global_letters_count * rel_letters_count                   0.016757
global_letters_count * rel_orthographic_density           -0.080116
global_letters_count * rel_synonyms_count                  0.183593
global_orthographic_density * global_synonyms_count        0.126086
global_orthographic_density * rel_aoa                      0.085944
global_orthographic_density * rel_clustering               2.041165
global_orthographic_density * rel_frequency                0.037749
global_orthographic_density * rel_letters_count           -0.200088
global_orthographic_density * rel_orthographic_density     0.182369
global_orthographic_density * rel_synonyms_count          -0.069217
global_synonyms_count * rel_aoa                            0.118585
global_synonyms_count * rel_clustering                     0.963643
global_synonyms_count * rel_frequency                      0.012344
global_synonyms_count * rel_letters_count                 -0.342725
global_synonyms_count * rel_orthographic_density          -0.469309
global_synonyms_count * rel_synonyms_count                 0.028366
rel_aoa * rel_clustering                                   0.299155
rel_aoa * rel_frequency                                   -0.042369
rel_aoa * rel_letters_count                               -0.022724
rel_aoa * rel_orthographic_density                         0.006955
rel_aoa * rel_synonyms_count                              -0.322097
rel_clustering * rel_frequency                             0.195196
rel_clustering * rel_letters_count                        -0.138714
rel_clustering * rel_orthographic_density                 -1.536353
rel_clustering * rel_synonyms_count                       -0.736085
rel_frequency * rel_letters_count                         -0.079782
rel_frequency * rel_orthographic_density                   0.072003
rel_frequency * rel_synonyms_count                        -0.350783
rel_letters_count * rel_orthographic_density               0.284407
rel_letters_count * rel_synonyms_count                     0.139055
rel_orthographic_density * rel_synonyms_count              0.960279
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.27687698570061847

intercept                      0.223551
global_aoa                    -0.284697
global_clustering             -0.556624
global_frequency              -0.190359
global_letters_count           0.141166
global_orthographic_density   -0.086753
global_synonyms_count          0.372020
rel_aoa                        0.717455
rel_clustering                 0.548820
rel_frequency                  0.053465
rel_letters_count             -0.084721
rel_orthographic_density       0.133697
rel_synonyms_count            -0.464582
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.35868355258826445

intercept                                                 24.647235
global_aoa                                                -0.000468
global_clustering                                          3.373046
global_frequency                                          -0.108797
global_letters_count                                      -1.509515
global_orthographic_density                               -9.080500
global_synonyms_count                                     -6.115387
rel_aoa                                                   -0.681396
rel_clustering                                             1.129341
rel_frequency                                              1.282873
rel_letters_count                                          2.461153
rel_orthographic_density                                   8.668105
rel_synonyms_count                                         0.391188
global_aoa * global_clustering                             0.085596
global_aoa * global_frequency                             -0.032997
global_aoa * global_letters_count                          0.020316
global_aoa * global_orthographic_density                   0.128037
global_aoa * global_synonyms_count                         0.195186
global_aoa * rel_aoa                                       0.031023
global_aoa * rel_clustering                               -0.129801
global_aoa * rel_frequency                                -0.023821
global_aoa * rel_letters_count                            -0.041291
global_aoa * rel_orthographic_density                     -0.202056
global_aoa * rel_synonyms_count                            0.051317
global_clustering * global_frequency                      -0.014448
global_clustering * global_letters_count                  -0.147877
global_clustering * global_orthographic_density           -1.534534
global_clustering * global_synonyms_count                 -1.099432
global_clustering * rel_aoa                               -0.108864
global_clustering * rel_clustering                         0.092984
global_clustering * rel_frequency                          0.141182
global_clustering * rel_letters_count                      0.166851
global_clustering * rel_orthographic_density               1.278377
global_clustering * rel_synonyms_count                     0.871308
global_frequency * global_letters_count                    0.074067
global_frequency * global_orthographic_density            -0.086058
global_frequency * global_synonyms_count                  -0.091775
global_frequency * rel_aoa                                 0.060260
global_frequency * rel_clustering                         -0.218703
global_frequency * rel_frequency                          -0.019222
global_frequency * rel_letters_count                      -0.111094
global_frequency * rel_orthographic_density                0.007410
global_frequency * rel_synonyms_count                      0.332454
global_letters_count * global_orthographic_density         0.008004
global_letters_count * global_synonyms_count              -0.091361
global_letters_count * rel_aoa                            -0.015895
global_letters_count * rel_clustering                      0.151674
global_letters_count * rel_frequency                       0.014049
global_letters_count * rel_letters_count                   0.017588
global_letters_count * rel_orthographic_density            0.015279
global_letters_count * rel_synonyms_count                  0.215017
global_orthographic_density * global_synonyms_count        0.097053
global_orthographic_density * rel_aoa                      0.001111
global_orthographic_density * rel_clustering               1.058276
global_orthographic_density * rel_frequency               -0.021196
global_orthographic_density * rel_letters_count           -0.168947
global_orthographic_density * rel_orthographic_density     0.223293
global_orthographic_density * rel_synonyms_count          -0.127720
global_synonyms_count * rel_aoa                           -0.046737
global_synonyms_count * rel_clustering                     0.591884
global_synonyms_count * rel_frequency                      0.033257
global_synonyms_count * rel_letters_count                 -0.158576
global_synonyms_count * rel_orthographic_density          -0.410310
global_synonyms_count * rel_synonyms_count                -0.010485
rel_aoa * rel_clustering                                   0.241415
rel_aoa * rel_frequency                                   -0.029351
rel_aoa * rel_letters_count                                0.016035
rel_aoa * rel_orthographic_density                         0.061169
rel_aoa * rel_synonyms_count                              -0.081860
rel_clustering * rel_frequency                             0.112225
rel_clustering * rel_letters_count                        -0.065960
rel_clustering * rel_orthographic_density                 -0.745102
rel_clustering * rel_synonyms_count                       -0.499042
rel_frequency * rel_letters_count                         -0.021416
rel_frequency * rel_orthographic_density                   0.172570
rel_frequency * rel_synonyms_count                        -0.225831
rel_letters_count * rel_orthographic_density               0.246422
rel_letters_count * rel_synonyms_count                     0.015288
rel_orthographic_density * rel_synonyms_count              0.784935
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09254025990086712

intercept                     -3.584313
global_aoa                    -0.030259
global_clustering              0.272269
global_frequency              -0.045219
global_letters_count           0.008991
global_orthographic_density   -0.034315
global_synonyms_count         -0.039287
dtype: float64

Regressing global clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1539911730188338

intercept                                             -1.171600
global_aoa                                             0.357906
global_clustering                                      0.576802
global_frequency                                      -0.557885
global_letters_count                                  -0.103616
global_orthographic_density                           -0.303794
global_synonyms_count                                 -0.682515
global_aoa * global_clustering                         0.050829
global_aoa * global_frequency                         -0.003615
global_aoa * global_letters_count                     -0.002534
global_aoa * global_orthographic_density              -0.034618
global_aoa * global_synonyms_count                    -0.004160
global_clustering * global_frequency                  -0.066787
global_clustering * global_letters_count              -0.008415
global_clustering * global_orthographic_density        0.015924
global_clustering * global_synonyms_count             -0.083128
global_frequency * global_letters_count                0.008245
global_frequency * global_orthographic_density         0.056996
global_frequency * global_synonyms_count               0.054873
global_letters_count * global_orthographic_density     0.019157
global_letters_count * global_synonyms_count          -0.020682
global_orthographic_density * global_synonyms_count   -0.153513
dtype: float64

Regressing rel clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.06501173510619607

intercept                      2.304740
global_aoa                    -0.019038
global_clustering              0.218825
global_frequency              -0.041913
global_letters_count          -0.015018
global_orthographic_density   -0.057715
global_synonyms_count         -0.021955
dtype: float64

Regressing rel clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10552788078628306

intercept                                              5.447327
global_aoa                                             0.255271
global_clustering                                      0.435515
global_frequency                                      -0.561568
global_letters_count                                  -0.279654
global_orthographic_density                           -0.508555
global_synonyms_count                                 -0.586167
global_aoa * global_clustering                         0.050690
global_aoa * global_frequency                          0.003155
global_aoa * global_letters_count                      0.003696
global_aoa * global_orthographic_density              -0.022356
global_aoa * global_synonyms_count                    -0.010511
global_clustering * global_frequency                  -0.054850
global_clustering * global_letters_count              -0.012420
global_clustering * global_orthographic_density        0.016472
global_clustering * global_synonyms_count             -0.079242
global_frequency * global_letters_count                0.014993
global_frequency * global_orthographic_density         0.060209
global_frequency * global_synonyms_count               0.022209
global_letters_count * global_orthographic_density     0.029689
global_letters_count * global_synonyms_count           0.014661
global_orthographic_density * global_synonyms_count   -0.093904
dtype: float64

Regressing global clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05806201498991548

intercept                  -5.871030
rel_aoa                    -0.007508
rel_clustering              0.246671
rel_frequency               0.008460
rel_letters_count           0.016763
rel_orthographic_density    0.012285
rel_synonyms_count          0.013580
dtype: float64

Regressing global clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09484974130300461

intercept                                       -5.810048
rel_aoa                                         -0.034244
rel_clustering                                   0.153224
rel_frequency                                    0.062083
rel_letters_count                                0.031128
rel_orthographic_density                         0.038001
rel_synonyms_count                              -0.079787
rel_aoa * rel_clustering                         0.049569
rel_aoa * rel_frequency                         -0.015816
rel_aoa * rel_letters_count                     -0.024561
rel_aoa * rel_orthographic_density              -0.023239
rel_aoa * rel_synonyms_count                    -0.019205
rel_clustering * rel_frequency                  -0.014369
rel_clustering * rel_letters_count              -0.021769
rel_clustering * rel_orthographic_density       -0.075975
rel_clustering * rel_synonyms_count              0.000414
rel_frequency * rel_letters_count               -0.014376
rel_frequency * rel_orthographic_density        -0.001730
rel_frequency * rel_synonyms_count              -0.008935
rel_letters_count * rel_orthographic_density     0.010037
rel_letters_count * rel_synonyms_count          -0.008907
rel_orthographic_density * rel_synonyms_count   -0.078604
dtype: float64

Regressing rel clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1791621076040274

intercept                   0.202977
rel_aoa                    -0.025905
rel_clustering              0.430414
rel_frequency               0.005374
rel_letters_count           0.006022
rel_orthographic_density   -0.009255
rel_synonyms_count          0.043054
dtype: float64

Regressing rel clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.20606793353748876

intercept                                        0.252142
rel_aoa                                         -0.047646
rel_clustering                                   0.304278
rel_frequency                                    0.043380
rel_letters_count                                0.011955
rel_orthographic_density                        -0.014187
rel_synonyms_count                              -0.051627
rel_aoa * rel_clustering                         0.033621
rel_aoa * rel_frequency                         -0.008606
rel_aoa * rel_letters_count                     -0.022850
rel_aoa * rel_orthographic_density              -0.039374
rel_aoa * rel_synonyms_count                    -0.013835
rel_clustering * rel_frequency                  -0.018584
rel_clustering * rel_letters_count               0.016191
rel_clustering * rel_orthographic_density       -0.024450
rel_clustering * rel_synonyms_count              0.002158
rel_frequency * rel_letters_count               -0.011550
rel_frequency * rel_orthographic_density        -0.007952
rel_frequency * rel_synonyms_count              -0.015745
rel_letters_count * rel_orthographic_density     0.013021
rel_letters_count * rel_synonyms_count          -0.004638
rel_orthographic_density * rel_synonyms_count   -0.054506
dtype: float64

Regressing global clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11731306798835972

intercept                     -1.815265
global_aoa                    -0.035070
global_clustering              0.291075
global_frequency              -0.121861
global_letters_count          -0.060342
global_orthographic_density   -0.195659
global_synonyms_count         -0.196749
rel_aoa                        0.005974
rel_clustering                -0.008454
rel_frequency                  0.092897
rel_letters_count              0.071637
rel_orthographic_density       0.186820
rel_synonyms_count             0.174228
dtype: float64

Regressing global clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2582679109621435

intercept                                                 11.999013
global_aoa                                                 0.337024
global_clustering                                          2.234348
global_frequency                                          -1.436005
global_letters_count                                      -0.428575
global_orthographic_density                               -2.137132
global_synonyms_count                                     -0.844683
rel_aoa                                                   -0.960745
rel_clustering                                            -2.477160
rel_frequency                                              0.196833
rel_letters_count                                          0.148343
rel_orthographic_density                                   1.032530
rel_synonyms_count                                        -2.218989
global_aoa * global_clustering                             0.000593
global_aoa * global_frequency                              0.010680
global_aoa * global_letters_count                         -0.037807
global_aoa * global_orthographic_density                  -0.201487
global_aoa * global_synonyms_count                         0.020843
global_aoa * rel_aoa                                       0.018269
global_aoa * rel_clustering                                0.072348
global_aoa * rel_frequency                                -0.016248
global_aoa * rel_letters_count                             0.050800
global_aoa * rel_orthographic_density                      0.177843
global_aoa * rel_synonyms_count                            0.026752
global_clustering * global_frequency                      -0.161666
global_clustering * global_letters_count                  -0.033846
global_clustering * global_orthographic_density           -0.101648
global_clustering * global_synonyms_count                 -0.226691
global_clustering * rel_aoa                               -0.040967
global_clustering * rel_clustering                        -0.126110
global_clustering * rel_frequency                          0.022378
global_clustering * rel_letters_count                      0.057677
global_clustering * rel_orthographic_density               0.082008
global_clustering * rel_synonyms_count                     0.255604
global_frequency * global_letters_count                    0.000497
global_frequency * global_orthographic_density             0.146519
global_frequency * global_synonyms_count                   0.140649
global_frequency * rel_aoa                                 0.007331
global_frequency * rel_clustering                          0.084280
global_frequency * rel_frequency                           0.008715
global_frequency * rel_letters_count                       0.021239
global_frequency * rel_orthographic_density               -0.024788
global_frequency * rel_synonyms_count                      0.084543
global_letters_count * global_orthographic_density         0.260382
global_letters_count * global_synonyms_count              -0.203746
global_letters_count * rel_aoa                             0.052122
global_letters_count * rel_clustering                      0.036790
global_letters_count * rel_frequency                       0.039818
global_letters_count * rel_letters_count                  -0.000893
global_letters_count * rel_orthographic_density           -0.275214
global_letters_count * rel_synonyms_count                  0.359646
global_orthographic_density * global_synonyms_count       -0.667489
global_orthographic_density * rel_aoa                      0.175663
global_orthographic_density * rel_clustering               0.171746
global_orthographic_density * rel_frequency               -0.046204
global_orthographic_density * rel_letters_count           -0.164269
global_orthographic_density * rel_orthographic_density    -0.017924
global_orthographic_density * rel_synonyms_count           0.545593
global_synonyms_count * rel_aoa                           -0.007038
global_synonyms_count * rel_clustering                     0.066259
global_synonyms_count * rel_frequency                     -0.099402
global_synonyms_count * rel_letters_count                  0.002198
global_synonyms_count * rel_orthographic_density           0.326156
global_synonyms_count * rel_synonyms_count                -0.052148
rel_aoa * rel_clustering                                   0.041589
rel_aoa * rel_frequency                                    0.001510
rel_aoa * rel_letters_count                               -0.072824
rel_aoa * rel_orthographic_density                        -0.154145
rel_aoa * rel_synonyms_count                              -0.045224
rel_clustering * rel_frequency                            -0.012053
rel_clustering * rel_letters_count                        -0.078846
rel_clustering * rel_orthographic_density                 -0.156070
rel_clustering * rel_synonyms_count                       -0.095765
rel_frequency * rel_letters_count                         -0.053195
rel_frequency * rel_orthographic_density                  -0.012783
rel_frequency * rel_synonyms_count                        -0.085336
rel_letters_count * rel_orthographic_density               0.186115
rel_letters_count * rel_synonyms_count                    -0.142968
rel_orthographic_density * rel_synonyms_count             -0.228006
dtype: float64

Regressing rel clustering with 634 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.26563760243992907

intercept                     -1.083643
global_aoa                    -0.028892
global_clustering             -0.541314
global_frequency              -0.107050
global_letters_count          -0.060115
global_orthographic_density   -0.166107
global_synonyms_count         -0.184788
rel_aoa                        0.002169
rel_clustering                 0.890154
rel_frequency                  0.081101
rel_letters_count              0.062779
rel_orthographic_density       0.135520
rel_synonyms_count             0.190699
dtype: float64

Regressing rel clustering with 634 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.37146486212094565

intercept                                                 10.836352
global_aoa                                                 0.143360
global_clustering                                          0.716230
global_frequency                                          -1.174387
global_letters_count                                      -0.675153
global_orthographic_density                               -2.230579
global_synonyms_count                                     -0.909766
rel_aoa                                                   -0.733459
rel_clustering                                            -0.959036
rel_frequency                                              0.165845
rel_letters_count                                          0.293755
rel_orthographic_density                                   1.005383
rel_synonyms_count                                        -1.791894
global_aoa * global_clustering                            -0.002138
global_aoa * global_frequency                              0.013440
global_aoa * global_letters_count                         -0.021346
global_aoa * global_orthographic_density                  -0.157190
global_aoa * global_synonyms_count                         0.026330
global_aoa * rel_aoa                                       0.016777
global_aoa * rel_clustering                                0.056433
global_aoa * rel_frequency                                -0.017636
global_aoa * rel_letters_count                             0.035002
global_aoa * rel_orthographic_density                      0.141280
global_aoa * rel_synonyms_count                            0.004081
global_clustering * global_frequency                      -0.101794
global_clustering * global_letters_count                  -0.038730
global_clustering * global_orthographic_density           -0.056010
global_clustering * global_synonyms_count                 -0.139700
global_clustering * rel_aoa                               -0.036080
global_clustering * rel_clustering                        -0.122942
global_clustering * rel_frequency                         -0.001187
global_clustering * rel_letters_count                      0.039785
global_clustering * rel_orthographic_density               0.006094
global_clustering * rel_synonyms_count                     0.190748
global_frequency * global_letters_count                    0.013499
global_frequency * global_orthographic_density             0.161527
global_frequency * global_synonyms_count                   0.122347
global_frequency * rel_aoa                                 0.004833
global_frequency * rel_clustering                          0.038109
global_frequency * rel_frequency                           0.006605
global_frequency * rel_letters_count                       0.004447
global_frequency * rel_orthographic_density               -0.058548
global_frequency * rel_synonyms_count                      0.086168
global_letters_count * global_orthographic_density         0.234855
global_letters_count * global_synonyms_count              -0.137373
global_letters_count * rel_aoa                             0.035222
global_letters_count * rel_clustering                      0.046470
global_letters_count * rel_frequency                       0.026904
global_letters_count * rel_letters_count                   0.000013
global_letters_count * rel_orthographic_density           -0.237697
global_letters_count * rel_synonyms_count                  0.289540
global_orthographic_density * global_synonyms_count       -0.455750
global_orthographic_density * rel_aoa                      0.130181
global_orthographic_density * rel_clustering               0.119040
global_orthographic_density * rel_frequency               -0.065679
global_orthographic_density * rel_letters_count           -0.141959
global_orthographic_density * rel_orthographic_density    -0.011218
global_orthographic_density * rel_synonyms_count           0.378914
global_synonyms_count * rel_aoa                           -0.036673
global_synonyms_count * rel_clustering                     0.061123
global_synonyms_count * rel_frequency                     -0.098382
global_synonyms_count * rel_letters_count                 -0.063596
global_synonyms_count * rel_orthographic_density           0.099175
global_synonyms_count * rel_synonyms_count                -0.050027
rel_aoa * rel_clustering                                   0.028366
rel_aoa * rel_frequency                                   -0.001538
rel_aoa * rel_letters_count                               -0.060426
rel_aoa * rel_orthographic_density                        -0.124836
rel_aoa * rel_synonyms_count                              -0.005184
rel_clustering * rel_frequency                             0.007935
rel_clustering * rel_letters_count                        -0.040213
rel_clustering * rel_orthographic_density                 -0.074288
rel_clustering * rel_synonyms_count                       -0.085270
rel_frequency * rel_letters_count                         -0.035834
rel_frequency * rel_orthographic_density                   0.010927
rel_frequency * rel_synonyms_count                        -0.078009
rel_letters_count * rel_orthographic_density               0.160369
rel_letters_count * rel_synonyms_count                    -0.091171
rel_orthographic_density * rel_synonyms_count             -0.070529
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1319651195765712

intercept                      4.627240
global_aoa                    -0.008766
global_clustering             -0.108881
global_frequency              -0.078299
global_letters_count           0.329320
global_orthographic_density   -0.154702
global_synonyms_count         -0.243266
dtype: float64

Regressing global letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.163248258726011

intercept                                             -1.339165
global_aoa                                            -0.182549
global_clustering                                     -2.775700
global_frequency                                      -0.017928
global_letters_count                                   0.115146
global_orthographic_density                           -1.689523
global_synonyms_count                                  1.768247
global_aoa * global_clustering                         0.136737
global_aoa * global_frequency                          0.072379
global_aoa * global_letters_count                      0.039139
global_aoa * global_orthographic_density               0.026296
global_aoa * global_synonyms_count                     0.107180
global_clustering * global_frequency                   0.148500
global_clustering * global_letters_count               0.018879
global_clustering * global_orthographic_density        0.086665
global_clustering * global_synonyms_count              0.346267
global_frequency * global_letters_count                0.003917
global_frequency * global_orthographic_density         0.195455
global_frequency * global_synonyms_count               0.039097
global_letters_count * global_orthographic_density     0.010701
global_letters_count * global_synonyms_count          -0.129462
global_orthographic_density * global_synonyms_count   -0.225029
dtype: float64

Regressing rel letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07394644708522835

intercept                      1.501368
global_aoa                    -0.055352
global_clustering             -0.087624
global_frequency              -0.086046
global_letters_count           0.257999
global_orthographic_density   -0.089721
global_synonyms_count         -0.293910
dtype: float64

Regressing rel letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.10954898889835907

intercept                                             -5.697870
global_aoa                                             0.240262
global_clustering                                     -3.028784
global_frequency                                      -0.138032
global_letters_count                                  -0.108943
global_orthographic_density                           -1.914599
global_synonyms_count                                  1.456926
global_aoa * global_clustering                         0.186641
global_aoa * global_frequency                          0.072144
global_aoa * global_letters_count                      0.015561
global_aoa * global_orthographic_density              -0.007943
global_aoa * global_synonyms_count                     0.090599
global_clustering * global_frequency                   0.164504
global_clustering * global_letters_count              -0.003563
global_clustering * global_orthographic_density        0.045650
global_clustering * global_synonyms_count              0.247192
global_frequency * global_letters_count                0.029109
global_frequency * global_orthographic_density         0.236380
global_frequency * global_synonyms_count               0.021963
global_letters_count * global_orthographic_density    -0.005447
global_letters_count * global_synonyms_count          -0.135805
global_orthographic_density * global_synonyms_count   -0.213781
dtype: float64

Regressing global letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11559722648906667

intercept                   5.394513
rel_aoa                    -0.150419
rel_clustering              0.184276
rel_frequency               0.044852
rel_letters_count           0.315865
rel_orthographic_density   -0.278232
rel_synonyms_count         -0.273982
dtype: float64

Regressing global letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13149006911378902

intercept                                        5.359096
rel_aoa                                         -0.286758
rel_clustering                                   0.188947
rel_frequency                                    0.082255
rel_letters_count                                0.448497
rel_orthographic_density                        -0.205383
rel_synonyms_count                              -0.296831
rel_aoa * rel_clustering                         0.066509
rel_aoa * rel_frequency                         -0.010143
rel_aoa * rel_letters_count                      0.044217
rel_aoa * rel_orthographic_density               0.020913
rel_aoa * rel_synonyms_count                     0.090435
rel_clustering * rel_frequency                   0.016537
rel_clustering * rel_letters_count               0.002993
rel_clustering * rel_orthographic_density        0.024352
rel_clustering * rel_synonyms_count              0.282969
rel_frequency * rel_letters_count                0.024637
rel_frequency * rel_orthographic_density         0.107681
rel_frequency * rel_synonyms_count               0.027790
rel_letters_count * rel_orthographic_density     0.084901
rel_letters_count * rel_synonyms_count          -0.039558
rel_orthographic_density * rel_synonyms_count   -0.004692
dtype: float64

Regressing rel letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1736356283011109

intercept                   0.966435
rel_aoa                    -0.106908
rel_clustering              0.019310
rel_frequency              -0.144303
rel_letters_count           0.448482
rel_orthographic_density    0.001020
rel_synonyms_count         -0.258664
dtype: float64

Regressing rel letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1927364722767667

intercept                                        0.986070
rel_aoa                                         -0.159268
rel_clustering                                   0.039902
rel_frequency                                   -0.114872
rel_letters_count                                0.578928
rel_orthographic_density                         0.177761
rel_synonyms_count                              -0.237731
rel_aoa * rel_clustering                         0.094960
rel_aoa * rel_frequency                          0.027419
rel_aoa * rel_letters_count                      0.003074
rel_aoa * rel_orthographic_density              -0.069951
rel_aoa * rel_synonyms_count                     0.099745
rel_clustering * rel_frequency                   0.044604
rel_clustering * rel_letters_count               0.065448
rel_clustering * rel_orthographic_density        0.130136
rel_clustering * rel_synonyms_count              0.217397
rel_frequency * rel_letters_count                0.028192
rel_frequency * rel_orthographic_density         0.139288
rel_frequency * rel_synonyms_count               0.036105
rel_letters_count * rel_orthographic_density     0.075771
rel_letters_count * rel_synonyms_count          -0.062122
rel_orthographic_density * rel_synonyms_count   -0.073862
dtype: float64

Regressing global letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15998648486540645

intercept                      1.791230
global_aoa                     0.147329
global_clustering             -0.618428
global_frequency              -0.149960
global_letters_count           0.202127
global_orthographic_density   -0.160383
global_synonyms_count          0.094284
rel_aoa                       -0.241973
rel_clustering                 0.599520
rel_frequency                  0.071291
rel_letters_count              0.140197
rel_orthographic_density       0.038198
rel_synonyms_count            -0.362432
dtype: float64

Regressing global letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24659034136250257

intercept                                                 8.737240
global_aoa                                                0.307989
global_clustering                                         2.294101
global_frequency                                          1.585391
global_letters_count                                     -2.309978
global_orthographic_density                              -4.594256
global_synonyms_count                                     8.970861
rel_aoa                                                  -4.233190
rel_clustering                                           -3.778229
rel_frequency                                             0.474867
rel_letters_count                                         3.299279
rel_orthographic_density                                 -0.251172
rel_synonyms_count                                       -7.275644
global_aoa * global_clustering                            0.173942
global_aoa * global_frequency                             0.041538
global_aoa * global_letters_count                        -0.000501
global_aoa * global_orthographic_density                  0.183454
global_aoa * global_synonyms_count                        0.125217
global_aoa * rel_aoa                                      0.067118
global_aoa * rel_clustering                              -0.120203
global_aoa * rel_frequency                               -0.013595
global_aoa * rel_letters_count                           -0.021067
global_aoa * rel_orthographic_density                    -0.195526
global_aoa * rel_synonyms_count                           0.010413
global_clustering * global_frequency                      0.151074
global_clustering * global_letters_count                 -0.685646
global_clustering * global_orthographic_density          -1.133187
global_clustering * global_synonyms_count                -0.366323
global_clustering * rel_aoa                              -0.149352
global_clustering * rel_clustering                        0.080100
global_clustering * rel_frequency                         0.142550
global_clustering * rel_letters_count                     0.656838
global_clustering * rel_orthographic_density              0.827402
global_clustering * rel_synonyms_count                    0.764817
global_frequency * global_letters_count                  -0.086115
global_frequency * global_orthographic_density           -0.164472
global_frequency * global_synonyms_count                 -0.591291
global_frequency * rel_aoa                                0.185557
global_frequency * rel_clustering                         0.097059
global_frequency * rel_frequency                         -0.011875
global_frequency * rel_letters_count                      0.038316
global_frequency * rel_orthographic_density               0.442383
global_frequency * rel_synonyms_count                     0.673883
global_letters_count * global_orthographic_density       -0.294202
global_letters_count * global_synonyms_count             -0.632899
global_letters_count * rel_aoa                            0.101648
global_letters_count * rel_clustering                     0.672040
global_letters_count * rel_frequency                      0.050396
global_letters_count * rel_letters_count                  0.026264
global_letters_count * rel_orthographic_density           0.257279
global_letters_count * rel_synonyms_count                 0.592807
global_orthographic_density * global_synonyms_count      -0.948406
global_orthographic_density * rel_aoa                     0.059069
global_orthographic_density * rel_clustering              0.746728
global_orthographic_density * rel_frequency               0.097896
global_orthographic_density * rel_letters_count           0.201177
global_orthographic_density * rel_orthographic_density    0.095973
global_orthographic_density * rel_synonyms_count          0.513482
global_synonyms_count * rel_aoa                           0.381876
global_synonyms_count * rel_clustering                    0.218855
global_synonyms_count * rel_frequency                     0.587078
global_synonyms_count * rel_letters_count                 0.294989
global_synonyms_count * rel_orthographic_density          0.892019
global_synonyms_count * rel_synonyms_count                0.032669
rel_aoa * rel_clustering                                  0.200457
rel_aoa * rel_frequency                                  -0.136576
rel_aoa * rel_letters_count                              -0.098951
rel_aoa * rel_orthographic_density                        0.004520
rel_aoa * rel_synonyms_count                             -0.323545
rel_clustering * rel_frequency                           -0.261695
rel_clustering * rel_letters_count                       -0.631366
rel_clustering * rel_orthographic_density                -0.278494
rel_clustering * rel_synonyms_count                      -0.364787
rel_frequency * rel_letters_count                        -0.011027
rel_frequency * rel_orthographic_density                 -0.150247
rel_frequency * rel_synonyms_count                       -0.593656
rel_letters_count * rel_orthographic_density             -0.065714
rel_letters_count * rel_synonyms_count                   -0.383091
rel_orthographic_density * rel_synonyms_count            -0.558830
dtype: float64

Regressing rel letters_count with 771 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2183900994705117

intercept                      1.006558
global_aoa                     0.080667
global_clustering             -0.571925
global_frequency              -0.095427
global_letters_count          -0.590311
global_orthographic_density   -0.175266
global_synonyms_count          0.042872
rel_aoa                       -0.154688
rel_clustering                 0.540752
rel_frequency                  0.013521
rel_letters_count              0.950250
rel_orthographic_density       0.043416
rel_synonyms_count            -0.301121
dtype: float64

Regressing rel letters_count with 771 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2947149423857002

intercept                                                 13.285382
global_aoa                                                -0.691248
global_clustering                                          0.315999
global_frequency                                           0.708147
global_letters_count                                      -3.405989
global_orthographic_density                               -7.769035
global_synonyms_count                                      8.755045
rel_aoa                                                   -2.974644
rel_clustering                                            -1.941032
rel_frequency                                              0.993148
rel_letters_count                                          4.056629
rel_orthographic_density                                   1.082669
rel_synonyms_count                                        -7.425840
global_aoa * global_clustering                             0.131282
global_aoa * global_frequency                              0.086196
global_aoa * global_letters_count                          0.048668
global_aoa * global_orthographic_density                   0.160737
global_aoa * global_synonyms_count                         0.033984
global_aoa * rel_aoa                                       0.066690
global_aoa * rel_clustering                               -0.059018
global_aoa * rel_frequency                                -0.062705
global_aoa * rel_letters_count                            -0.061311
global_aoa * rel_orthographic_density                     -0.180506
global_aoa * rel_synonyms_count                            0.083272
global_clustering * global_frequency                       0.222139
global_clustering * global_letters_count                  -0.454394
global_clustering * global_orthographic_density           -0.988699
global_clustering * global_synonyms_count                  0.085762
global_clustering * rel_aoa                               -0.123156
global_clustering * rel_clustering                        -0.006102
global_clustering * rel_frequency                          0.077067
global_clustering * rel_letters_count                      0.393540
global_clustering * rel_orthographic_density               0.501179
global_clustering * rel_synonyms_count                     0.017538
global_frequency * global_letters_count                    0.017796
global_frequency * global_orthographic_density             0.146784
global_frequency * global_synonyms_count                  -0.381961
global_frequency * rel_aoa                                 0.130010
global_frequency * rel_clustering                         -0.047538
global_frequency * rel_frequency                          -0.001272
global_frequency * rel_letters_count                      -0.054022
global_frequency * rel_orthographic_density                0.196065
global_frequency * rel_synonyms_count                      0.370536
global_letters_count * global_orthographic_density        -0.153378
global_letters_count * global_synonyms_count              -0.465856
global_letters_count * rel_aoa                             0.035046
global_letters_count * rel_clustering                      0.470091
global_letters_count * rel_frequency                      -0.008249
global_letters_count * rel_letters_count                   0.010989
global_letters_count * rel_orthographic_density            0.140464
global_letters_count * rel_synonyms_count                  0.369605
global_orthographic_density * global_synonyms_count       -0.804775
global_orthographic_density * rel_aoa                      0.033465
global_orthographic_density * rel_clustering               0.657317
global_orthographic_density * rel_frequency               -0.144342
global_orthographic_density * rel_letters_count            0.113767
global_orthographic_density * rel_orthographic_density     0.076060
global_orthographic_density * rel_synonyms_count           0.378392
global_synonyms_count * rel_aoa                            0.383282
global_synonyms_count * rel_clustering                    -0.090318
global_synonyms_count * rel_frequency                      0.421277
global_synonyms_count * rel_letters_count                  0.198719
global_synonyms_count * rel_orthographic_density           0.691440
global_synonyms_count * rel_synonyms_count                -0.008866
rel_aoa * rel_clustering                                   0.137127
rel_aoa * rel_frequency                                   -0.070676
rel_aoa * rel_letters_count                               -0.037116
rel_aoa * rel_orthographic_density                         0.042427
rel_aoa * rel_synonyms_count                              -0.268683
rel_clustering * rel_frequency                            -0.160615
rel_clustering * rel_letters_count                        -0.393994
rel_clustering * rel_orthographic_density                 -0.039500
rel_clustering * rel_synonyms_count                        0.140617
rel_frequency * rel_letters_count                          0.044075
rel_frequency * rel_orthographic_density                   0.031633
rel_frequency * rel_synonyms_count                        -0.366420
rel_letters_count * rel_orthographic_density              -0.013527
rel_letters_count * rel_synonyms_count                    -0.251723
rel_orthographic_density * rel_synonyms_count             -0.326880
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06727962027554635

intercept                      1.020130
global_aoa                    -0.009552
global_clustering              0.037504
global_frequency              -0.016365
global_letters_count          -0.039924
global_orthographic_density   -0.018949
global_synonyms_count          0.205264
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09119994992310188

intercept                                              3.547043
global_aoa                                            -0.104664
global_clustering                                      0.362033
global_frequency                                      -0.109939
global_letters_count                                  -0.163134
global_orthographic_density                           -0.695460
global_synonyms_count                                 -0.129863
global_aoa * global_clustering                        -0.011871
global_aoa * global_frequency                         -0.008404
global_aoa * global_letters_count                      0.008968
global_aoa * global_orthographic_density               0.029520
global_aoa * global_synonyms_count                     0.048942
global_clustering * global_frequency                  -0.019939
global_clustering * global_letters_count              -0.001945
global_clustering * global_orthographic_density       -0.044406
global_clustering * global_synonyms_count              0.038143
global_frequency * global_letters_count                0.004077
global_frequency * global_orthographic_density         0.016789
global_frequency * global_synonyms_count              -0.002644
global_letters_count * global_orthographic_density     0.002633
global_letters_count * global_synonyms_count           0.014056
global_orthographic_density * global_synonyms_count    0.119056
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.04531237878029348

intercept                      0.677974
global_aoa                    -0.013804
global_clustering              0.019824
global_frequency              -0.019826
global_letters_count          -0.034631
global_orthographic_density   -0.033397
global_synonyms_count          0.160492
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07074688275717289

intercept                                              4.455618
global_aoa                                            -0.155339
global_clustering                                      0.530906
global_frequency                                      -0.225000
global_letters_count                                  -0.179520
global_orthographic_density                           -0.777129
global_synonyms_count                                 -0.078319
global_aoa * global_clustering                        -0.017301
global_aoa * global_frequency                         -0.002663
global_aoa * global_letters_count                      0.004745
global_aoa * global_orthographic_density               0.023692
global_aoa * global_synonyms_count                     0.044759
global_clustering * global_frequency                  -0.030730
global_clustering * global_letters_count              -0.009764
global_clustering * global_orthographic_density       -0.052022
global_clustering * global_synonyms_count              0.044169
global_frequency * global_letters_count                0.004873
global_frequency * global_orthographic_density         0.024487
global_frequency * global_synonyms_count              -0.005195
global_letters_count * global_orthographic_density     0.004217
global_letters_count * global_synonyms_count           0.016177
global_orthographic_density * global_synonyms_count    0.100828
dtype: float64

Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06345763843169527

intercept                   0.420593
rel_aoa                     0.017530
rel_clustering             -0.030578
rel_frequency              -0.022247
rel_letters_count          -0.045992
rel_orthographic_density   -0.008486
rel_synonyms_count          0.209356
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09541159040140601

intercept                                        0.516953
rel_aoa                                          0.024114
rel_clustering                                  -0.145272
rel_frequency                                    0.003046
rel_letters_count                               -0.094751
rel_orthographic_density                         0.035409
rel_synonyms_count                               0.161771
rel_aoa * rel_clustering                        -0.009063
rel_aoa * rel_frequency                          0.000593
rel_aoa * rel_letters_count                      0.022230
rel_aoa * rel_orthographic_density               0.047362
rel_aoa * rel_synonyms_count                     0.026827
rel_clustering * rel_frequency                  -0.017427
rel_clustering * rel_letters_count               0.019267
rel_clustering * rel_orthographic_density       -0.034697
rel_clustering * rel_synonyms_count              0.040498
rel_frequency * rel_letters_count               -0.000415
rel_frequency * rel_orthographic_density         0.019584
rel_frequency * rel_synonyms_count               0.004053
rel_letters_count * rel_orthographic_density    -0.012042
rel_letters_count * rel_synonyms_count           0.022697
rel_orthographic_density * rel_synonyms_count    0.046399
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.13275095707629714

intercept                   0.073491
rel_aoa                     0.000636
rel_clustering             -0.011444
rel_frequency              -0.022061
rel_letters_count          -0.036195
rel_orthographic_density   -0.029017
rel_synonyms_count          0.348122
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.16210136694761043

intercept                                        0.177848
rel_aoa                                          0.013342
rel_clustering                                  -0.127215
rel_frequency                                    0.014247
rel_letters_count                               -0.079769
rel_orthographic_density                         0.017303
rel_synonyms_count                               0.370360
rel_aoa * rel_clustering                        -0.001927
rel_aoa * rel_frequency                          0.004278
rel_aoa * rel_letters_count                      0.016413
rel_aoa * rel_orthographic_density               0.033881
rel_aoa * rel_synonyms_count                     0.019584
rel_clustering * rel_frequency                  -0.023352
rel_clustering * rel_letters_count               0.007853
rel_clustering * rel_orthographic_density       -0.042394
rel_clustering * rel_synonyms_count              0.040315
rel_frequency * rel_letters_count               -0.004820
rel_frequency * rel_orthographic_density         0.021346
rel_frequency * rel_synonyms_count               0.014099
rel_letters_count * rel_orthographic_density    -0.005899
rel_letters_count * rel_synonyms_count           0.026976
rel_orthographic_density * rel_synonyms_count    0.086557
dtype: float64

Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08555473122715951

intercept                      1.645725
global_aoa                    -0.030928
global_clustering              0.182674
global_frequency               0.000435
global_letters_count          -0.001432
global_orthographic_density    0.017453
global_synonyms_count          0.111472
rel_aoa                        0.034100
rel_clustering                -0.169444
rel_frequency                 -0.017448
rel_letters_count             -0.042606
rel_orthographic_density      -0.043090
rel_synonyms_count             0.104493
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20394207476722326

intercept                                                 4.476159
global_aoa                                                0.432203
global_clustering                                         1.908550
global_frequency                                         -0.209538
global_letters_count                                      0.378805
global_orthographic_density                               1.741854
global_synonyms_count                                     0.508562
rel_aoa                                                  -0.205505
rel_clustering                                           -2.023260
rel_frequency                                            -0.198931
rel_letters_count                                        -0.795853
rel_orthographic_density                                 -0.462677
rel_synonyms_count                                       -2.107635
global_aoa * global_clustering                           -0.021892
global_aoa * global_frequency                            -0.035418
global_aoa * global_letters_count                        -0.030619
global_aoa * global_orthographic_density                 -0.041791
global_aoa * global_synonyms_count                        0.084177
global_aoa * rel_aoa                                     -0.003218
global_aoa * rel_clustering                               0.051832
global_aoa * rel_frequency                                0.026191
global_aoa * rel_letters_count                            0.026953
global_aoa * rel_orthographic_density                     0.053850
global_aoa * rel_synonyms_count                          -0.005810
global_clustering * global_frequency                     -0.116810
global_clustering * global_letters_count                 -0.028095
global_clustering * global_orthographic_density          -0.072112
global_clustering * global_synonyms_count                 0.345463
global_clustering * rel_aoa                              -0.032987
global_clustering * rel_clustering                        0.018705
global_clustering * rel_frequency                         0.057285
global_clustering * rel_letters_count                    -0.014125
global_clustering * rel_orthographic_density              0.182364
global_clustering * rel_synonyms_count                   -0.444334
global_frequency * global_letters_count                  -0.017991
global_frequency * global_orthographic_density           -0.136846
global_frequency * global_synonyms_count                  0.168238
global_frequency * rel_aoa                                0.001926
global_frequency * rel_clustering                         0.153387
global_frequency * rel_frequency                          0.006674
global_frequency * rel_letters_count                      0.013430
global_frequency * rel_orthographic_density               0.081217
global_frequency * rel_synonyms_count                    -0.149271
global_letters_count * global_orthographic_density       -0.075984
global_letters_count * global_synonyms_count             -0.060427
global_letters_count * rel_aoa                            0.015540
global_letters_count * rel_clustering                    -0.010047
global_letters_count * rel_frequency                      0.017665
global_letters_count * rel_letters_count                  0.005138
global_letters_count * rel_orthographic_density           0.056022
global_letters_count * rel_synonyms_count                 0.112089
global_orthographic_density * global_synonyms_count      -0.101806
global_orthographic_density * rel_aoa                    -0.013765
global_orthographic_density * rel_clustering             -0.090853
global_orthographic_density * rel_frequency               0.145821
global_orthographic_density * rel_letters_count           0.168386
global_orthographic_density * rel_orthographic_density    0.068174
global_orthographic_density * rel_synonyms_count          0.214337
global_synonyms_count * rel_aoa                          -0.060585
global_synonyms_count * rel_clustering                   -0.096908
global_synonyms_count * rel_frequency                    -0.042155
global_synonyms_count * rel_letters_count                -0.010880
global_synonyms_count * rel_orthographic_density         -0.030567
global_synonyms_count * rel_synonyms_count                0.134507
rel_aoa * rel_clustering                                 -0.017850
rel_aoa * rel_frequency                                  -0.001647
rel_aoa * rel_letters_count                               0.012483
rel_aoa * rel_orthographic_density                        0.038979
rel_aoa * rel_synonyms_count                              0.031226
rel_clustering * rel_frequency                           -0.099860
rel_clustering * rel_letters_count                        0.045733
rel_clustering * rel_orthographic_density                -0.081934
rel_clustering * rel_synonyms_count                       0.207058
rel_frequency * rel_letters_count                        -0.007987
rel_frequency * rel_orthographic_density                 -0.078899
rel_frequency * rel_synonyms_count                        0.025515
rel_letters_count * rel_orthographic_density             -0.111092
rel_letters_count * rel_synonyms_count                    0.011955
rel_orthographic_density * rel_synonyms_count             0.053093
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21808616729247787

intercept                      1.060961
global_aoa                    -0.023219
global_clustering              0.128543
global_frequency               0.007164
global_letters_count           0.004054
global_orthographic_density    0.020592
global_synonyms_count         -0.607983
rel_aoa                        0.023097
rel_clustering                -0.126866
rel_frequency                 -0.019902
rel_letters_count             -0.039244
rel_orthographic_density      -0.047745
rel_synonyms_count             0.911203
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.3272581968950826

intercept                                                 2.077724
global_aoa                                                0.213722
global_clustering                                         1.447286
global_frequency                                         -0.078443
global_letters_count                                      0.488732
global_orthographic_density                               1.603631
global_synonyms_count                                     1.313961
rel_aoa                                                   0.049366
rel_clustering                                           -1.552267
rel_frequency                                            -0.432557
rel_letters_count                                        -0.805397
rel_orthographic_density                                 -0.409948
rel_synonyms_count                                       -2.973804
global_aoa * global_clustering                           -0.028134
global_aoa * global_frequency                            -0.021820
global_aoa * global_letters_count                        -0.021360
global_aoa * global_orthographic_density                 -0.037008
global_aoa * global_synonyms_count                        0.053188
global_aoa * rel_aoa                                     -0.002141
global_aoa * rel_clustering                               0.057853
global_aoa * rel_frequency                                0.018091
global_aoa * rel_letters_count                            0.017484
global_aoa * rel_orthographic_density                     0.043048
global_aoa * rel_synonyms_count                           0.034521
global_clustering * global_frequency                     -0.085114
global_clustering * global_letters_count                 -0.010207
global_clustering * global_orthographic_density          -0.096066
global_clustering * global_synonyms_count                 0.339347
global_clustering * rel_aoa                              -0.021609
global_clustering * rel_clustering                        0.018724
global_clustering * rel_frequency                         0.005408
global_clustering * rel_letters_count                    -0.010182
global_clustering * rel_orthographic_density              0.234707
global_clustering * rel_synonyms_count                   -0.404736
global_frequency * global_letters_count                  -0.019393
global_frequency * global_orthographic_density           -0.131731
global_frequency * global_synonyms_count                  0.083981
global_frequency * rel_aoa                               -0.012431
global_frequency * rel_clustering                         0.117179
global_frequency * rel_frequency                          0.002543
global_frequency * rel_letters_count                      0.021447
global_frequency * rel_orthographic_density               0.095764
global_frequency * rel_synonyms_count                    -0.036031
global_letters_count * global_orthographic_density       -0.081172
global_letters_count * global_synonyms_count             -0.097983
global_letters_count * rel_aoa                            0.001018
global_letters_count * rel_clustering                    -0.031480
global_letters_count * rel_frequency                      0.013742
global_letters_count * rel_letters_count                  0.004970
global_letters_count * rel_orthographic_density           0.078200
global_letters_count * rel_synonyms_count                 0.152063
global_orthographic_density * global_synonyms_count      -0.225110
global_orthographic_density * rel_aoa                    -0.009788
global_orthographic_density * rel_clustering             -0.047729
global_orthographic_density * rel_frequency               0.142662
global_orthographic_density * rel_letters_count           0.155094
global_orthographic_density * rel_orthographic_density    0.055902
global_orthographic_density * rel_synonyms_count          0.331748
global_synonyms_count * rel_aoa                          -0.035072
global_synonyms_count * rel_clustering                   -0.085488
global_synonyms_count * rel_frequency                     0.051122
global_synonyms_count * rel_letters_count                 0.070905
global_synonyms_count * rel_orthographic_density          0.151780
global_synonyms_count * rel_synonyms_count                0.156068
rel_aoa * rel_clustering                                 -0.024731
rel_aoa * rel_frequency                                   0.009011
rel_aoa * rel_letters_count                               0.019535
rel_aoa * rel_orthographic_density                        0.027005
rel_aoa * rel_synonyms_count                             -0.007705
rel_clustering * rel_frequency                           -0.051748
rel_clustering * rel_letters_count                        0.037767
rel_clustering * rel_orthographic_density                -0.149659
rel_clustering * rel_synonyms_count                       0.143448
rel_frequency * rel_letters_count                        -0.014196
rel_frequency * rel_orthographic_density                 -0.094650
rel_frequency * rel_synonyms_count                       -0.087144
rel_letters_count * rel_orthographic_density             -0.114901
rel_letters_count * rel_synonyms_count                   -0.070066
rel_orthographic_density * rel_synonyms_count            -0.119468
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14939126795936597

intercept                      1.033383
global_aoa                    -0.024468
global_clustering              0.032832
global_frequency               0.026955
global_letters_count          -0.009255
global_orthographic_density    0.335449
global_synonyms_count          0.026876
dtype: float64

Regressing global orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18143793314190848

intercept                                             -0.605878
global_aoa                                             0.033848
global_clustering                                      0.180025
global_frequency                                       0.379396
global_letters_count                                   0.060560
global_orthographic_density                            0.584955
global_synonyms_count                                 -0.377798
global_aoa * global_clustering                        -0.051028
global_aoa * global_frequency                         -0.035429
global_aoa * global_letters_count                     -0.012015
global_aoa * global_orthographic_density               0.036013
global_aoa * global_synonyms_count                    -0.012706
global_clustering * global_frequency                   0.022297
global_clustering * global_letters_count               0.005409
global_clustering * global_orthographic_density       -0.027612
global_clustering * global_synonyms_count              0.027539
global_frequency * global_letters_count                0.009588
global_frequency * global_orthographic_density        -0.055043
global_frequency * global_synonyms_count               0.081200
global_letters_count * global_orthographic_density    -0.021746
global_letters_count * global_synonyms_count          -0.019141
global_orthographic_density * global_synonyms_count    0.015941
dtype: float64

Regressing rel orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11684833275777773

intercept                     -1.276627
global_aoa                    -0.011321
global_clustering              0.009537
global_frequency               0.023350
global_letters_count          -0.015192
global_orthographic_density    0.279076
global_synonyms_count          0.049068
dtype: float64

Regressing rel orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14912771766733568

intercept                                             -1.364772
global_aoa                                            -0.036309
global_clustering                                      0.499514
global_frequency                                       0.312450
global_letters_count                                   0.063439
global_orthographic_density                            0.611099
global_synonyms_count                                 -0.963420
global_aoa * global_clustering                        -0.056350
global_aoa * global_frequency                         -0.029711
global_aoa * global_letters_count                     -0.010286
global_aoa * global_orthographic_density               0.028529
global_aoa * global_synonyms_count                    -0.005479
global_clustering * global_frequency                   0.000642
global_clustering * global_letters_count              -0.008410
global_clustering * global_orthographic_density       -0.022392
global_clustering * global_synonyms_count             -0.066607
global_frequency * global_letters_count               -0.003768
global_frequency * global_orthographic_density        -0.062779
global_frequency * global_synonyms_count               0.065324
global_letters_count * global_orthographic_density    -0.010007
global_letters_count * global_synonyms_count           0.004787
global_orthographic_density * global_synonyms_count    0.030615
dtype: float64

Regressing global orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11962092761561027

intercept                   1.666333
rel_aoa                     0.019084
rel_clustering             -0.081243
rel_frequency              -0.004443
rel_letters_count          -0.005614
rel_orthographic_density    0.359271
rel_synonyms_count          0.086284
dtype: float64

Regressing global orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14116367257419582

intercept                                        1.635122
rel_aoa                                          0.085362
rel_clustering                                   0.069587
rel_frequency                                   -0.037883
rel_letters_count                               -0.033152
rel_orthographic_density                         0.377951
rel_synonyms_count                               0.393595
rel_aoa * rel_clustering                        -0.002708
rel_aoa * rel_frequency                          0.006468
rel_aoa * rel_letters_count                     -0.014163
rel_aoa * rel_orthographic_density               0.020511
rel_aoa * rel_synonyms_count                     0.016377
rel_clustering * rel_frequency                   0.038430
rel_clustering * rel_letters_count               0.014224
rel_clustering * rel_orthographic_density        0.090522
rel_clustering * rel_synonyms_count              0.024810
rel_frequency * rel_letters_count                0.002431
rel_frequency * rel_orthographic_density        -0.006437
rel_frequency * rel_synonyms_count               0.071742
rel_letters_count * rel_orthographic_density    -0.037129
rel_letters_count * rel_synonyms_count          -0.026621
rel_orthographic_density * rel_synonyms_count    0.054535
dtype: float64

Regressing rel orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1862456481670256

intercept                  -0.426263
rel_aoa                     0.022588
rel_clustering             -0.067796
rel_frequency               0.033689
rel_letters_count           0.000621
rel_orthographic_density    0.427074
rel_synonyms_count          0.065403
dtype: float64

Regressing rel orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20304733709827072

intercept                                       -0.416611
rel_aoa                                          0.085432
rel_clustering                                   0.048997
rel_frequency                                    0.037023
rel_letters_count                               -0.012248
rel_orthographic_density                         0.434905
rel_synonyms_count                               0.279742
rel_aoa * rel_clustering                        -0.012590
rel_aoa * rel_frequency                         -0.000971
rel_aoa * rel_letters_count                     -0.013550
rel_aoa * rel_orthographic_density               0.030519
rel_aoa * rel_synonyms_count                     0.011359
rel_clustering * rel_frequency                   0.021600
rel_clustering * rel_letters_count              -0.000336
rel_clustering * rel_orthographic_density        0.059493
rel_clustering * rel_synonyms_count              0.004084
rel_frequency * rel_letters_count               -0.004385
rel_frequency * rel_orthographic_density         0.001492
rel_frequency * rel_synonyms_count               0.048681
rel_letters_count * rel_orthographic_density    -0.016681
rel_letters_count * rel_synonyms_count          -0.024590
rel_orthographic_density * rel_synonyms_count    0.028806
dtype: float64

Regressing global orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17030118618339052

intercept                      3.290688
global_aoa                    -0.057714
global_clustering              0.267333
global_frequency               0.006301
global_letters_count          -0.071613
global_orthographic_density    0.309871
global_synonyms_count         -0.209041
rel_aoa                        0.058190
rel_clustering                -0.272144
rel_frequency                  0.029626
rel_letters_count              0.069643
rel_orthographic_density       0.021428
rel_synonyms_count             0.280614
dtype: float64

Regressing global orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24957591497232998

intercept                                                 4.710798
global_aoa                                               -0.062297
global_clustering                                         1.232199
global_frequency                                         -0.491047
global_letters_count                                     -0.015151
global_orthographic_density                               4.594833
global_synonyms_count                                    -2.045016
rel_aoa                                                   0.563576
rel_clustering                                           -1.325332
rel_frequency                                             0.576538
rel_letters_count                                        -0.484927
rel_orthographic_density                                 -4.272344
rel_synonyms_count                                        3.202532
global_aoa * global_clustering                           -0.074625
global_aoa * global_frequency                            -0.042884
global_aoa * global_letters_count                        -0.015228
global_aoa * global_orthographic_density                  0.052191
global_aoa * global_synonyms_count                       -0.027729
global_aoa * rel_aoa                                     -0.012314
global_aoa * rel_clustering                               0.046426
global_aoa * rel_frequency                                0.023358
global_aoa * rel_letters_count                            0.035848
global_aoa * rel_orthographic_density                     0.005917
global_aoa * rel_synonyms_count                          -0.002898
global_clustering * global_frequency                     -0.142751
global_clustering * global_letters_count                 -0.023386
global_clustering * global_orthographic_density           0.470618
global_clustering * global_synonyms_count                 0.254863
global_clustering * rel_aoa                               0.021619
global_clustering * rel_clustering                       -0.035415
global_clustering * rel_frequency                         0.059551
global_clustering * rel_letters_count                     0.063384
global_clustering * rel_orthographic_density             -0.346334
global_clustering * rel_synonyms_count                    0.046408
global_frequency * global_letters_count                   0.007014
global_frequency * global_orthographic_density           -0.118288
global_frequency * global_synonyms_count                  0.068472
global_frequency * rel_aoa                               -0.026843
global_frequency * rel_clustering                         0.201707
global_frequency * rel_frequency                          0.019430
global_frequency * rel_letters_count                      0.073014
global_frequency * rel_orthographic_density               0.122324
global_frequency * rel_synonyms_count                     0.020909
global_letters_count * global_orthographic_density       -0.129347
global_letters_count * global_synonyms_count              0.239297
global_letters_count * rel_aoa                           -0.005201
global_letters_count * rel_clustering                     0.029330
global_letters_count * rel_frequency                     -0.091219
global_letters_count * rel_letters_count                 -0.009433
global_letters_count * rel_orthographic_density           0.202907
global_letters_count * rel_synonyms_count                -0.250884
global_orthographic_density * global_synonyms_count       0.816188
global_orthographic_density * rel_aoa                     0.020939
global_orthographic_density * rel_clustering             -0.620961
global_orthographic_density * rel_frequency              -0.031206
global_orthographic_density * rel_letters_count           0.014360
global_orthographic_density * rel_orthographic_density    0.052519
global_orthographic_density * rel_synonyms_count         -0.722489
global_synonyms_count * rel_aoa                          -0.016072
global_synonyms_count * rel_clustering                   -0.784712
global_synonyms_count * rel_frequency                    -0.054032
global_synonyms_count * rel_letters_count                -0.104623
global_synonyms_count * rel_orthographic_density         -0.832362
global_synonyms_count * rel_synonyms_count               -0.005430
rel_aoa * rel_clustering                                 -0.004046
rel_aoa * rel_frequency                                   0.025866
rel_aoa * rel_letters_count                              -0.010953
rel_aoa * rel_orthographic_density                       -0.023353
rel_aoa * rel_synonyms_count                              0.032883
rel_clustering * rel_frequency                           -0.054741
rel_clustering * rel_letters_count                       -0.067951
rel_clustering * rel_orthographic_density                 0.495303
rel_clustering * rel_synonyms_count                       0.445763
rel_frequency * rel_letters_count                         0.020811
rel_frequency * rel_orthographic_density                 -0.024557
rel_frequency * rel_synonyms_count                        0.039049
rel_letters_count * rel_orthographic_density             -0.109656
rel_letters_count * rel_synonyms_count                    0.082548
rel_orthographic_density * rel_synonyms_count             0.733074
dtype: float64

Regressing rel orthographic_density with 653 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.21376506351513724

intercept                      2.276773
global_aoa                    -0.035809
global_clustering              0.263547
global_frequency               0.020708
global_letters_count          -0.057644
global_orthographic_density   -0.390290
global_synonyms_count         -0.128641
rel_aoa                        0.037413
rel_clustering                -0.265662
rel_frequency                  0.017558
rel_letters_count              0.041397
rel_orthographic_density       0.752283
rel_synonyms_count             0.182976
dtype: float64

Regressing rel orthographic_density with 653 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2885892622502735

intercept                                                 7.147348
global_aoa                                                0.293719
global_clustering                                         2.043075
global_frequency                                         -0.711270
global_letters_count                                     -0.442241
global_orthographic_density                               3.981906
global_synonyms_count                                    -2.412572
rel_aoa                                                   0.153107
rel_clustering                                           -2.283056
rel_frequency                                             0.464693
rel_letters_count                                        -0.070803
rel_orthographic_density                                 -3.436693
rel_synonyms_count                                        3.360187
global_aoa * global_clustering                           -0.043435
global_aoa * global_frequency                            -0.040771
global_aoa * global_letters_count                        -0.009001
global_aoa * global_orthographic_density                 -0.022973
global_aoa * global_synonyms_count                       -0.044382
global_aoa * rel_aoa                                     -0.012840
global_aoa * rel_clustering                               0.033694
global_aoa * rel_frequency                                0.029639
global_aoa * rel_letters_count                            0.027195
global_aoa * rel_orthographic_density                     0.081230
global_aoa * rel_synonyms_count                           0.001193
global_clustering * global_frequency                     -0.197634
global_clustering * global_letters_count                 -0.058908
global_clustering * global_orthographic_density           0.382647
global_clustering * global_synonyms_count                 0.062084
global_clustering * rel_aoa                              -0.005300
global_clustering * rel_clustering                       -0.027054
global_clustering * rel_frequency                         0.082955
global_clustering * rel_letters_count                     0.103741
global_clustering * rel_orthographic_density             -0.196206
global_clustering * rel_synonyms_count                    0.193430
global_frequency * global_letters_count                   0.009180
global_frequency * global_orthographic_density           -0.165339
global_frequency * global_synonyms_count                  0.045300
global_frequency * rel_aoa                               -0.021171
global_frequency * rel_clustering                         0.242558
global_frequency * rel_frequency                          0.016022
global_frequency * rel_letters_count                      0.069851
global_frequency * rel_orthographic_density               0.176943
global_frequency * rel_synonyms_count                     0.031224
global_letters_count * global_orthographic_density       -0.031601
global_letters_count * global_synonyms_count              0.214461
global_letters_count * rel_aoa                            0.004036
global_letters_count * rel_clustering                     0.045072
global_letters_count * rel_frequency                     -0.085027
global_letters_count * rel_letters_count                 -0.010995
global_letters_count * rel_orthographic_density           0.131241
global_letters_count * rel_synonyms_count                -0.213955
global_orthographic_density * global_synonyms_count       0.709127
global_orthographic_density * rel_aoa                     0.069315
global_orthographic_density * rel_clustering             -0.390377
global_orthographic_density * rel_frequency               0.060152
global_orthographic_density * rel_letters_count          -0.064060
global_orthographic_density * rel_orthographic_density    0.089186
global_orthographic_density * rel_synonyms_count         -0.624644
global_synonyms_count * rel_aoa                           0.001139
global_synonyms_count * rel_clustering                   -0.565631
global_synonyms_count * rel_frequency                    -0.020247
global_synonyms_count * rel_letters_count                -0.090462
global_synonyms_count * rel_orthographic_density         -0.706476
global_synonyms_count * rel_synonyms_count               -0.020602
rel_aoa * rel_clustering                                  0.005159
rel_aoa * rel_frequency                                   0.011952
rel_aoa * rel_letters_count                              -0.019142
rel_aoa * rel_orthographic_density                       -0.076763
rel_aoa * rel_synonyms_count                              0.002313
rel_clustering * rel_frequency                           -0.065582
rel_clustering * rel_letters_count                       -0.081994
rel_clustering * rel_orthographic_density                 0.223477
rel_clustering * rel_synonyms_count                       0.268979
rel_frequency * rel_letters_count                         0.009040
rel_frequency * rel_orthographic_density                 -0.119288
rel_frequency * rel_synonyms_count                       -0.010326
rel_letters_count * rel_orthographic_density             -0.062667
rel_letters_count * rel_synonyms_count                    0.069740
rel_orthographic_density * rel_synonyms_count             0.579875
dtype: float64