Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# Set to True to write each figure to disk (checked by the plotting helpers).
SAVE_FIGURES = False

# Subset of features plotted in the "paper-*" figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not referenced in the visible part of this notebook;
# presumably a number of PCA components used further down -- confirm.
N_COMPONENTS = 3

# Maximum number of bins along the source-feature axis; the plotting helpers
# fall back to fewer bins when binning with this many fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution-mining model whose substitutions are analysed below.
model = Model(time=Time.continuous, source=Source.majority,
              past=Past.all, durl=Durl.all, max_distance=1)
data = []

# First pass: only fetch the ids of the matching substitutions.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row_id for (row_id,) in substitutions]

# Second pass: one row per (substitution, feature), each substitution being
# loaded in its own session.
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            # Identification of the substitution, then observed values.
            row = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
            }
            # Null hypotheses: H_0 (feature average) and H_00 (average over
            # synonyms of the source word), absolute and sentence-relative.
            row['h0'] = substitution.feature_average(feature)
            row['h0_rel'] = substitution.feature_average(
                feature, sentence_relative='median')
            row['h0n'] = substitution.feature_average(
                feature, source_synonyms=True)
            row['h0n_rel'] = substitution.feature_average(
                feature, source_synonyms=True, sentence_relative='median')
            data.append(row)

original_variations = pd.DataFrame(data)
del data


Got 14485 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.all, durl=Durl.all, max_distance=1)
100% (14485 of 14485) |####################| Elapsed Time: 0:03:40 Time: 0:03:40

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values: observed source/destination values and the
# two null hypotheses, each in absolute and sentence-relative form.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average in two steps: first over rows sharing the same destination,
# occurrence, position and feature, then over each cluster, so that every
# cluster contributes a single point per feature.
# The value columns are selected with a *list* (tuple-style selection
# `gb['a', 'b']` is rejected by current pandas) and the grouping key
# 'feature' is left out of the selection: with `as_index=False` the keys
# are already returned as columns, and a string column cannot be averaged.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
# NOTE(review): 'variation' is computed before the AoA crop below, so it is
# not blanked for the cropped rows -- confirm this is intended.
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
def _feature_title(feature):
    """Return the docstring of the transformed feature function of `feature`.

    The same docstring is used as subplot title and legend label by the
    plotting helpers, so sorting on it orders features as they are displayed.
    """
    return Substitution._transformed_feature(feature).__doc__


ordered_features = sorted(Substitution.__features__, key=_feature_title)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a table of per-bin significance of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of each null
    (`h0` for the 'H_0' row, `h0n` for the 'H_00' row). A bin gets one, two
    or three stars when the observed mean lies outside the null mean plus or
    minus the Student-t confidence half-width at alpha = .05, .01 and .001
    respectively, and 'ns.' otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        Zero-based bin label of each observation; NaN labels (observations
        that could not be binned) match no bin and are ignored.
    h0, h0n : pandas.Series
        Null-hypothesis values, aligned with `values`.
    values : pandas.Series
        Observed values.
    """
    # Bin labels are floats as soon as one of them is NaN; `range` and
    # `np.zeros` need a true integer.
    bin_count = int(bins.max()) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    alphas = [.05, .01, .001]

    # Observed means and confidence half-widths do not depend on the null
    # hypothesis, so compute them once for both rows.
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i in range(bin_count):
        in_bin = bins == i
        n = in_bin.sum()
        bin_values[i] = values[in_bin].mean()
        # Standard error of the mean: sample std (ddof=1) over sqrt(n).
        # Dividing by sqrt(n - 1) would apply Bessel's correction twice.
        sem = values[in_bin].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[bins == i].mean()
                              for i in range(bin_count)])

        print(null_name + ' |', end='')
        # differences[i, j]: the observed mean of bin i is outside the
        # interval around the null at significance level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level reached gives the star count.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the binned destination feature against the source feature.

    Meant to be used through `FacetGrid.map_dataframe`, which supplies the
    `data` and `color` keyword arguments. Draws, on the current axes, the
    mean destination value per source bin with its 95% confidence band, the
    per-bin means of the two null hypotheses (`h0`, `h0n`) and the `y = x`
    line, then prints the per-bin significance table.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows for one feature, with columns 'source', 'destination', 'h0',
        'h0n' (and their '_rel' variants when `relative` is set).
    color : colour, default 'blue'
    relative : bool, default False
        Use the sentence-relative ('_rel') columns.
    quantiles : bool, default False
        Bin the source values by quantiles (`pd.qcut`) instead of
        fixed-width bins (`pd.cut`).
    feature_field : str, default 'feature'
        Column whose first value names the significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail on unbound variables.
        raise ValueError("Could not bin '{}' into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% half-width: t quantile times the standard error of the mean,
        # i.e. sample std (ddof=1) over sqrt(n). Dividing by sqrt(n - 1)
        # would apply Bessel's correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    # NOTE(review): because of the `elif`, the upper limit is not extended
    # when the nulls overflow on both sides -- confirm this is intended.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one subplot per feature with `plot_function`, three per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to plot; only rows whose `feature_field` is in `features` are
        kept.
    features : list
        Values of `feature_field` to plot, in subplot (and hue) order.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Function handed to `FacetGrid.map_dataframe`.
    xlabel, ylabel : str
        Axis labels applied to every subplot.
    feature_field : str, default 'feature'
        Column used to split the data into subplots.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a mutable default argument shared between calls.
    if plot_kws is None:
        plot_kws = {}

    # NOTE(review): `size` was renamed `height` in later seaborn releases;
    # kept as-is to match the seaborn version this notebook was run with.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    # Titles are first set to the raw feature name so that it can be read
    # back below and replaced by the feature's human-readable docstring.
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the bias of one feature relative to H_00, on a normalised scale.

    For each bin of source values, plots the mean destination value minus
    the mean of `h0n`, divided by the absolute mean of the per-bin `h0`
    means. Bins are spread evenly over [0, 1] on the x axis so that several
    features can be overlaid on the same axes.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature, with columns 'feature', 'source',
        'destination', 'h0', 'h0n' (and '_rel' variants when `relative`).
    color : colour
        Line colour; the confidence band uses a desaturated version.
    ci : bool, default True
        Also draw the 95% confidence band.
    relative : bool, default False
        Use the sentence-relative ('_rel') columns.
    quantiles : bool, default False
        Bin the source values by quantiles (`pd.qcut`) instead of
        fixed-width bins (`pd.cut`).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail on unbound variables.
        raise ValueError("Could not bin '{}' into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% half-width: t quantile times the standard error of the mean,
        # i.e. sample std (ddof=1) over sqrt(n). Dividing by sqrt(n - 1)
        # would apply Bessel's correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions,
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single pair of axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Data with a 'feature' column; rows containing NaN are dropped
        before being handed to `plot_function`.
    features : list
        Features to plot, one colour of the palette each.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Name of the seaborn colour palette to draw colours from.
    plot_function : callable
        Called as `plot_function(ax, feature_data, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Title and axis labels of the figure.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes the curves were drawn on.
    """
    # Avoid a mutable default argument shared between calls.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *   |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | **  | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay for all features, fixed-width bins, absolute feature values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:
# Paper features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | **  | *   |

In [14]:
# Bias overlay for the paper features, fixed-width bins, absolute values;
# the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | *** |
H_00 | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

In [16]:
# Bias overlay for all features, quantile bins, absolute feature values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:
# Paper features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

In [18]:
# Bias overlay for the paper features, quantile bins, absolute values;
# the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | ns. |
H_00 | ns. | *** | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | ns. |
H_00 | ns. | *** | *** | **  |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | **  | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *   | ns. | ns. |

In [20]:
# Bias overlay for all features, fixed-width bins, sentence-relative values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:
# Paper features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | ns. |
H_00 | ns. | *** | *** | **  |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | **  | *   | ns. |

In [22]:
# Bias overlay for the paper features, fixed-width bins, sentence-relative
# values; the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | **  |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | *   | ns. | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *   | ns. | ns. |

In [24]:
# Bias overlay for all features, quantile bins, sentence-relative values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:
# Paper features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | *   | ns. | *   |

In [26]:
# Bias overlay for the paper features, quantile bins, sentence-relative
# values. Uses the 'deep' palette like the three other paper-feature bias
# overlays, so that each feature keeps the same colour across paper figures.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) space.

    Meant to be used through ``FacetGrid.map_dataframe``. The source words
    are binned on a regular grid over their absolute feature value (x axis)
    and their sentence-relative feature value (y axis). For each grid cell,
    the arrow is the mean displacement from source to destination, and the
    colour is the mean length of the individual displacements.

    Keyword arguments
    -----------------
    data : DataFrame
        Must provide the 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0' columns.
    color : optional
        Base colour of the two reference lines (defaults to 'blue').
    bin_count : int, optional
        Number of bins along each axis (defaults to 4).

    Grid cells that contain no substitution get NaN values, so no
    streamline is drawn through them.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning: integer bin codes for each substitution, plus the bin
    # edges from which we get the bin centres used as grid coordinates.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Displacement of each substitution along both axes, and its length.
    # Computed once here instead of once per grid cell.
    delta = dest - source
    delta_rel = dest_rel - source_rel
    length = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute bin values. Arrays are indexed [y, x], as streamplot expects.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_column = x_bins == x
        for y in range(bin_count):
            in_cell = in_column & (y_bins == y)
            u_values[y, x] = delta[in_cell].mean()
            v_values[y, x] = delta_rel[in_cell].mean()
            strength[y, x] = length[in_cell].mean()

    # Plot. The arrow size looks huge inline but comes out at a normal size
    # in the saved figures.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: relative value 0, i.e. the sentence average.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the first 'h0' value of this facet's data.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One streamplot panel per feature, three panels per row, each panel with
# its own x and y scales.
g = sb.FacetGrid(
    data=variations,
    col='feature', col_order=ordered_features, col_wrap=3,
    hue='feature', hue_order=ordered_features,
    sharex=False, sharey=False,
    size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: frame the legend and replace
        # the raw feature name in the title by the feature's docstring.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same streamplots as for all features, restricted to the paper features.
g = sb.FacetGrid(
    data=variations[variations['feature'].isin(PAPER_FEATURES)],
    col='feature', col_order=PAPER_FEATURES, col_wrap=3,
    hue='feature', hue_order=PAPER_FEATURES,
    sharex=False, sharey=False,
    size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: frame the legend and replace
        # the raw feature name in the title by the feature's docstring.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on the table of per-cluster variations (one column per
# feature), after dropping the clusters where some variation is missing.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the loadings of the components we go on to plot.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)],
             columns=pcafeatures)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.51887095  0.17667846  0.08802866  0.07560441  0.03353084  0.03064661
  0.02046165  0.01843159  0.01653748  0.00906346  0.00708615]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.460364 0.294531 -0.080336 0.242995 0.250933 -0.423141 0.231074 0.293937 -0.385455 0.288662 -0.148595 -0.004976
Component-1 -0.266251 0.391018 -0.128295 0.292191 0.301223 0.422910 -0.183974 0.307144 0.427293 -0.258071 0.164004 -0.023056
Component-2 0.733642 0.205838 -0.124213 0.052973 0.608861 -0.127989 -0.019514 0.002379 -0.064045 0.081507 -0.017942 -0.051183

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one record per (substitution, component) for the first N_COMPONENTS
# components of the PCA fitted above: the source and destination component
# values, plus two component averages stored as 'h0' and 'h0n' (presumably
# the H_0 and H_00 baselines of the text -- confirm in Substitution).
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A new session scope is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                # Same average, but computed with source_synonyms=True.
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

# Turn the records into a frame and drop the temporary list.
original_component_variations = pd.DataFrame(data)
del data


100% (14485 of 14485) |####################| Elapsed Time: 0:02:50 Time: 0:02:50

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows sharing the same
# (destination_id, occurrence, position, component), then over each
# (cluster_id, component), so that every cluster contributes a single row
# per component.
# The columns are selected with a list: indexing a groupby with a bare
# tuple of names (`[...]['a', 'b']`) is rejected by recent pandas versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One variation panel per PCA component, each with its own scales.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Something was plotted on these axes: frame the legend.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | ns. | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *   | ns. |

4.2 On a subset of relevant features


In [34]:
# Subset of features on which the PCA of this section is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit the PCA on the per-cluster variations of the relevant features only,
# after dropping the clusters where one of these variations is missing.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the loadings of every component that was kept.
pd.DataFrame(pca.components_,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)],
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67536175  0.1811119 ]

Out[35]:
aoa frequency letters_count
Component-0 -0.735084 0.373119 -0.566069
Component-1 0.374534 -0.472487 -0.797797

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one record per (substitution, component) for every component kept
# by the PCA fitted on the relevant features: the source and destination
# component values, plus two component averages stored as 'h0' and 'h0n'
# (presumably the H_0 and H_00 baselines of the text -- confirm in
# Substitution).
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A new session scope is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                # Same average, but computed with source_synonyms=True.
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

# Turn the records into a frame and drop the temporary list.
original_component_variations = pd.DataFrame(data)
del data


100% (14485 of 14485) |####################| Elapsed Time: 0:01:35 Time: 0:01:35

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows sharing the same
# (destination_id, occurrence, position, component), then over each
# (cluster_id, component), so that every cluster contributes a single row
# per component.
# The columns are selected with a list: indexing a groupby with a bare
# tuple of names (`[...]['a', 'b']`) is rejected by recent pandas versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One variation panel per PCA component, each with its own scales.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: frame the legend.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Size of the word set behind each of the relevant features.
for feature in relevant_features:
    tfeature = Substitution._transformed_feature(feature)
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Count the words for which all of the relevant features are defined.
# Start from the union of the words known to at least one of them...
words = set()
for feature in relevant_features:
    tfeature = Substitution._transformed_feature(feature)
    words.update(tfeature())

# ... then evaluate every relevant feature on every word of that union: one
# row per word, one column per feature.
words_list = list(words)
data = dict(
    (feature, [Substitution._transformed_feature(feature)(word)
               for word in words_list])
    for feature in relevant_features)
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

# Rows with a missing value are dropped before counting.
print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

# Compare the number of distinct clusters in `variations` with the number of
# rows the PCA was actually fitted on.
print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1459 (cluster-unique) substitutions, but the PCA is in fact computed on 1163 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to run, as (label, binning function) pairs. Quantile
# binning (pd.qcut) is currently disabled by the trailing comment.
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
# (label, column suffix) pairs: the suffix is appended to 'source' or
# 'destination' to pick the global or the sentence-relative column.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05 and 'ns.'
    otherwise (which includes a NaN p-value, since it compares below no
    threshold).
    """
    # Thresholds are checked from the strictest to the loosest.
    for threshold, marker in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars mean $p < .001$, two mean $p < .01$, one means $p < .05$, and ns. means non-significant.


In [41]:
# One-way ANOVA of feature2's destination value across bins of feature1's
# source value, for every pair of paper features and every combination of
# global / sentence-relative values.

# Pivot every value column once (cluster_id x feature). These tables do not
# depend on the loop variables, so they need not be rebuilt for each
# combination below.
anova_tables = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = anova_tables['source' + rel1][feature1]

            # Compute binning, falling back to fewer bins whenever the
            # binning function refuses the current count. `source_bins`
            # stays None if no count works, so that bins left over from a
            # previous iteration are never reused.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                destination = anova_tables['destination' + rel2][feature2]

                # Empty bins are left out: an empty sample makes f_oneway
                # return NaN, which would be reported as 'ns.' although no
                # test was actually run.
                groups = []
                if source_bins is not None:
                    groups = [destination[source_bins == i].dropna()
                              for i in range(bin_count)]
                    groups = [group for group in groups if len(group) > 0]

                if len(groups) >= 2:
                    _, p = stats.f_oneway(*groups)
                else:
                    # Not enough groups to compare; star_level reports a
                    # NaN p-value as 'ns.'.
                    p = float('nan')
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way ANOVA of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of paper features and every
# combination of global / sentence-relative values.

# Pivot every value column once (cluster_id x feature). These tables do not
# depend on the loop variables, so they need not be rebuilt for each
# combination below.
anova_tables = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = anova_tables['source' + rel1][feature1]

            # Compute binning, falling back to fewer bins whenever the
            # binning function refuses the current count. `source_bins`
            # stays None if no count works, so that bins left over from a
            # previous iteration are never reused.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                # Variation of feature2 upon substitution.
                variation = (anova_tables['destination' + rel2][feature2]
                             - anova_tables['source' + rel2][feature2])

                # Empty bins are left out: an empty sample makes f_oneway
                # return NaN, which would be reported as 'ns.' although no
                # test was actually run.
                groups = []
                if source_bins is not None:
                    groups = [variation[source_bins == i].dropna()
                              for i in range(bin_count)]
                    groups = [group for group in groups if len(group) > 0]

                if len(groups) >= 2:
                    _, p = stats.f_oneway(*groups)
                else:
                    # Not enough groups to compare; star_level reports a
                    # NaN p-value as 'ns.'.
                    p = float('nan')
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at higher-order interactions through this lens (that is, at the interaction of pairs of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a boolean "relative" flag to a (name prefix, column suffix) pair.
# NOTE(review): this rebinds `rels`, which the ANOVA cells above iterate as a
# list of (label, suffix) pairs; re-running those cells after this one would
# fail on unpacking. Consider a distinct name.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# Run every regression variant for each paper feature: all combinations of
# source relativeness (global, sentence-relative, both) and destination
# relativeness (global, sentence-relative), each one first without and then
# with pairwise interaction terms. A dashed rule separates the targets and a
# blank line follows each report.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel in (False, True, 'both'):
        for dest_rel in (False, True):
            for with_interactions in (False, True):
                regress(variations, PAPER_FEATURES, target,
                        source_rel=source_rel, dest_rel=dest_rel,
                        interactions=with_interactions)
                print()


----------------------------------------------------------------------
Regressing global frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07554179478047864

intercept                      5.083533
global_aoa                     0.023919
global_clustering             -0.070556
global_frequency               0.343079
global_letters_count          -0.002503
global_orthographic_density   -0.053057
global_synonyms_count         -0.029173
dtype: float64

Regressing global frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09264280134434444

intercept                                              11.444985
global_aoa                                             -0.162748
global_clustering                                       1.514430
global_frequency                                        0.344817
global_letters_count                                   -0.596063
global_orthographic_density                             0.619156
global_synonyms_count                                   0.730841
global_aoa * global_clustering                         -0.056223
global_aoa * global_frequency                          -0.010616
global_aoa * global_letters_count                       0.007012
global_aoa * global_orthographic_density               -0.076900
global_aoa * global_synonyms_count                      0.005824
global_clustering * global_frequency                   -0.054942
global_clustering * global_letters_count               -0.128937
global_clustering * global_orthographic_density        -0.006413
global_clustering * global_synonyms_count               0.120648
global_frequency * global_letters_count                -0.029899
global_frequency * global_orthographic_density         -0.077640
global_frequency * global_synonyms_count                0.022459
global_letters_count * global_orthographic_density      0.102308
global_letters_count * global_synonyms_count           -0.048419
global_orthographic_density * global_synonyms_count    -0.019064
dtype: float64

Regressing rel frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.04421487548767922

intercept                     -7.455016
global_aoa                     0.056437
global_clustering             -0.035246
global_frequency               0.310850
global_letters_count           0.117639
global_orthographic_density    0.047377
global_synonyms_count          0.128585
dtype: float64

Regressing rel frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.05865572184730561

intercept                                              0.483862
global_aoa                                            -0.086557
global_clustering                                      0.851589
global_frequency                                      -0.051944
global_letters_count                                  -0.879904
global_orthographic_density                           -0.032421
global_synonyms_count                                 -0.946057
global_aoa * global_clustering                        -0.009928
global_aoa * global_frequency                         -0.011820
global_aoa * global_letters_count                      0.036245
global_aoa * global_orthographic_density              -0.040601
global_aoa * global_synonyms_count                     0.065013
global_clustering * global_frequency                  -0.045665
global_clustering * global_letters_count              -0.081529
global_clustering * global_orthographic_density        0.037178
global_clustering * global_synonyms_count              0.105308
global_frequency * global_letters_count                0.020245
global_frequency * global_orthographic_density        -0.003111
global_frequency * global_synonyms_count               0.112892
global_letters_count * global_orthographic_density     0.101936
global_letters_count * global_synonyms_count           0.004949
global_orthographic_density * global_synonyms_count    0.169579
dtype: float64

Regressing global frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.05765716080014249

intercept                   9.424116
rel_aoa                     0.016673
rel_clustering             -0.157779
rel_frequency               0.230127
rel_letters_count          -0.046747
rel_orthographic_density   -0.098722
rel_synonyms_count         -0.159352
dtype: float64

Regressing global frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07108492556187285

intercept                                        9.260601
rel_aoa                                          0.045061
rel_clustering                                  -0.002172
rel_frequency                                    0.202747
rel_letters_count                                0.069165
rel_orthographic_density                        -0.170139
rel_synonyms_count                               0.257085
rel_aoa * rel_clustering                         0.015164
rel_aoa * rel_frequency                          0.012687
rel_aoa * rel_letters_count                     -0.013807
rel_aoa * rel_orthographic_density              -0.025440
rel_aoa * rel_synonyms_count                     0.044439
rel_clustering * rel_frequency                  -0.002727
rel_clustering * rel_letters_count              -0.094828
rel_clustering * rel_orthographic_density       -0.014160
rel_clustering * rel_synonyms_count              0.168952
rel_frequency * rel_letters_count                0.002646
rel_frequency * rel_orthographic_density        -0.014430
rel_frequency * rel_synonyms_count               0.109193
rel_letters_count * rel_orthographic_density     0.039909
rel_letters_count * rel_synonyms_count          -0.123958
rel_orthographic_density * rel_synonyms_count   -0.148384
dtype: float64

Regressing rel frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.25587826590396745

intercept                  -1.586471
rel_aoa                     0.036167
rel_clustering              0.060193
rel_frequency               0.622167
rel_letters_count          -0.114744
rel_orthographic_density   -0.228411
rel_synonyms_count          0.001505
dtype: float64

Regressing rel frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.27411721934408195

intercept                                       -1.628079
rel_aoa                                          0.020144
rel_clustering                                   0.032683
rel_frequency                                    0.653112
rel_letters_count                               -0.077194
rel_orthographic_density                        -0.337754
rel_synonyms_count                               0.367941
rel_aoa * rel_clustering                        -0.035450
rel_aoa * rel_frequency                         -0.033247
rel_aoa * rel_letters_count                      0.013015
rel_aoa * rel_orthographic_density               0.095572
rel_aoa * rel_synonyms_count                     0.178256
rel_clustering * rel_frequency                  -0.052026
rel_clustering * rel_letters_count              -0.139698
rel_clustering * rel_orthographic_density       -0.225136
rel_clustering * rel_synonyms_count              0.129093
rel_frequency * rel_letters_count               -0.010013
rel_frequency * rel_orthographic_density        -0.049915
rel_frequency * rel_synonyms_count               0.100874
rel_letters_count * rel_orthographic_density    -0.002081
rel_letters_count * rel_synonyms_count          -0.100913
rel_orthographic_density * rel_synonyms_count    0.014646
dtype: float64

Regressing global frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08430382909439647

intercept                      3.994678
global_aoa                     0.005532
global_clustering              0.012271
global_frequency               0.373263
global_letters_count           0.214766
global_orthographic_density    0.160143
global_synonyms_count          0.239483
rel_aoa                        0.020794
rel_clustering                -0.113838
rel_frequency                 -0.037971
rel_letters_count             -0.242438
rel_orthographic_density      -0.231600
rel_synonyms_count            -0.335777
dtype: float64

Regressing global frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1596773371556316

intercept                                                -23.099414
global_aoa                                                 1.761313
global_clustering                                         -3.710961
global_frequency                                           1.364059
global_letters_count                                      -1.304228
global_orthographic_density                                7.148595
global_synonyms_count                                     11.188414
rel_aoa                                                   -1.030090
rel_clustering                                             6.658929
rel_frequency                                             -0.335811
rel_letters_count                                          0.866177
rel_orthographic_density                                  -4.542086
rel_synonyms_count                                        -7.047452
global_aoa * global_clustering                             0.391993
global_aoa * global_frequency                              0.088450
global_aoa * global_letters_count                          0.014000
global_aoa * global_orthographic_density                  -0.088167
global_aoa * global_synonyms_count                        -0.509018
global_aoa * rel_aoa                                      -0.009370
global_aoa * rel_clustering                               -0.456590
global_aoa * rel_frequency                                -0.008353
global_aoa * rel_letters_count                             0.013975
global_aoa * rel_orthographic_density                     -0.006475
global_aoa * rel_synonyms_count                            0.436022
global_clustering * global_frequency                       0.124927
global_clustering * global_letters_count                  -0.311293
global_clustering * global_orthographic_density            0.756189
global_clustering * global_synonyms_count                  0.335723
global_clustering * rel_aoa                               -0.525747
global_clustering * rel_clustering                         0.097784
global_clustering * rel_frequency                         -0.085522
global_clustering * rel_letters_count                      0.233238
global_clustering * rel_orthographic_density              -0.360179
global_clustering * rel_synonyms_count                     0.432007
global_frequency * global_letters_count                   -0.043099
global_frequency * global_orthographic_density            -0.148943
global_frequency * global_synonyms_count                  -0.572575
global_frequency * rel_aoa                                -0.224531
global_frequency * rel_clustering                         -0.206881
global_frequency * rel_frequency                          -0.005663
global_frequency * rel_letters_count                       0.035356
global_frequency * rel_orthographic_density                0.140647
global_frequency * rel_synonyms_count                      0.610847
global_letters_count * global_orthographic_density        -0.054933
global_letters_count * global_synonyms_count               0.174596
global_letters_count * rel_aoa                            -0.007630
global_letters_count * rel_clustering                      0.136619
global_letters_count * rel_frequency                      -0.086577
global_letters_count * rel_letters_count                  -0.021058
global_letters_count * rel_orthographic_density            0.166109
global_letters_count * rel_synonyms_count                 -0.146686
global_orthographic_density * global_synonyms_count       -0.364022
global_orthographic_density * rel_aoa                      0.104566
global_orthographic_density * rel_clustering              -0.848303
global_orthographic_density * rel_frequency                0.038712
global_orthographic_density * rel_letters_count            0.054212
global_orthographic_density * rel_orthographic_density    -0.058784
global_orthographic_density * rel_synonyms_count           0.642836
global_synonyms_count * rel_aoa                            0.305391
global_synonyms_count * rel_clustering                    -0.996366
global_synonyms_count * rel_frequency                      0.243333
global_synonyms_count * rel_letters_count                  0.067057
global_synonyms_count * rel_orthographic_density           0.366057
global_synonyms_count * rel_synonyms_count                 0.023909
rel_aoa * rel_clustering                                   0.510550
rel_aoa * rel_frequency                                    0.106641
rel_aoa * rel_letters_count                                0.000045
rel_aoa * rel_orthographic_density                        -0.025817
rel_aoa * rel_synonyms_count                              -0.227309
rel_clustering * rel_frequency                             0.129402
rel_clustering * rel_letters_count                        -0.214040
rel_clustering * rel_orthographic_density                  0.313365
rel_clustering * rel_synonyms_count                        0.602316
rel_frequency * rel_letters_count                          0.070357
rel_frequency * rel_orthographic_density                  -0.052880
rel_frequency * rel_synonyms_count                        -0.141924
rel_letters_count * rel_orthographic_density              -0.145032
rel_letters_count * rel_synonyms_count                    -0.132705
rel_orthographic_density * rel_synonyms_count             -0.676084
dtype: float64

Regressing rel frequency with 863 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.353860730379452

intercept                      4.071493
global_aoa                     0.003430
global_clustering              0.088658
global_frequency              -0.583548
global_letters_count           0.231510
global_orthographic_density    0.148275
global_synonyms_count          0.193845
rel_aoa                        0.010422
rel_clustering                -0.150275
rel_frequency                  0.958467
rel_letters_count             -0.259233
rel_orthographic_density      -0.200289
rel_synonyms_count            -0.280468
dtype: float64

Regressing rel frequency with 863 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.40681553494928135

intercept                                                -25.448124
global_aoa                                                 1.954200
global_clustering                                         -3.814042
global_frequency                                           0.468756
global_letters_count                                      -0.987482
global_orthographic_density                                7.467211
global_synonyms_count                                      9.501353
rel_aoa                                                   -1.411610
rel_clustering                                             7.443880
rel_frequency                                              0.516238
rel_letters_count                                          0.585342
rel_orthographic_density                                  -4.754983
rel_synonyms_count                                        -5.578240
global_aoa * global_clustering                             0.377514
global_aoa * global_frequency                              0.072200
global_aoa * global_letters_count                          0.000423
global_aoa * global_orthographic_density                  -0.109731
global_aoa * global_synonyms_count                        -0.505118
global_aoa * rel_aoa                                      -0.008104
global_aoa * rel_clustering                               -0.452158
global_aoa * rel_frequency                                 0.005806
global_aoa * rel_letters_count                             0.035844
global_aoa * rel_orthographic_density                      0.028332
global_aoa * rel_synonyms_count                            0.430588
global_clustering * global_frequency                       0.130364
global_clustering * global_letters_count                  -0.237262
global_clustering * global_orthographic_density            0.739961
global_clustering * global_synonyms_count                  0.305391
global_clustering * rel_aoa                               -0.530723
global_clustering * rel_clustering                         0.089563
global_clustering * rel_frequency                         -0.064301
global_clustering * rel_letters_count                      0.184132
global_clustering * rel_orthographic_density              -0.354741
global_clustering * rel_synonyms_count                     0.422499
global_frequency * global_letters_count                   -0.014984
global_frequency * global_orthographic_density            -0.165982
global_frequency * global_synonyms_count                  -0.480609
global_frequency * rel_aoa                                -0.202147
global_frequency * rel_clustering                         -0.263682
global_frequency * rel_frequency                           0.006568
global_frequency * rel_letters_count                       0.013072
global_frequency * rel_orthographic_density                0.137832
global_frequency * rel_synonyms_count                      0.514052
global_letters_count * global_orthographic_density        -0.064699
global_letters_count * global_synonyms_count               0.257608
global_letters_count * rel_aoa                             0.000211
global_letters_count * rel_clustering                      0.053626
global_letters_count * rel_frequency                      -0.079426
global_letters_count * rel_letters_count                  -0.023451
global_letters_count * rel_orthographic_density            0.176841
global_letters_count * rel_synonyms_count                 -0.224959
global_orthographic_density * global_synonyms_count       -0.357180
global_orthographic_density * rel_aoa                      0.114158
global_orthographic_density * rel_clustering              -0.827536
global_orthographic_density * rel_frequency                0.085487
global_orthographic_density * rel_letters_count            0.088194
global_orthographic_density * rel_orthographic_density    -0.018881
global_orthographic_density * rel_synonyms_count           0.633737
global_synonyms_count * rel_aoa                            0.317494
global_synonyms_count * rel_clustering                    -0.985784
global_synonyms_count * rel_frequency                      0.145476
global_synonyms_count * rel_letters_count                 -0.011054
global_synonyms_count * rel_orthographic_density           0.372958
global_synonyms_count * rel_synonyms_count                 0.041066
rel_aoa * rel_clustering                                   0.519883
rel_aoa * rel_frequency                                    0.079394
rel_aoa * rel_letters_count                               -0.014547
rel_aoa * rel_orthographic_density                        -0.047219
rel_aoa * rel_synonyms_count                              -0.231453
rel_clustering * rel_frequency                             0.153220
rel_clustering * rel_letters_count                        -0.138621
rel_clustering * rel_orthographic_density                  0.328337
rel_clustering * rel_synonyms_count                        0.607615
rel_frequency * rel_letters_count                          0.065458
rel_frequency * rel_orthographic_density                  -0.069102
rel_frequency * rel_synonyms_count                        -0.043818
rel_letters_count * rel_orthographic_density              -0.170648
rel_letters_count * rel_synonyms_count                    -0.057751
rel_orthographic_density * rel_synonyms_count             -0.692914
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 795 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.07439762967745889

intercept                      6.796165
global_aoa                     0.211209
global_clustering              0.127795
global_frequency              -0.076420
global_letters_count           0.076114
global_orthographic_density   -0.047351
global_synonyms_count         -0.219339
dtype: float64

Regressing global aoa with 795 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10434180972792662

intercept                                              0.080742
global_aoa                                             0.427635
global_clustering                                     -1.549254
global_frequency                                       0.338225
global_letters_count                                   0.476407
global_orthographic_density                           -1.242169
global_synonyms_count                                 -4.519813
global_aoa * global_clustering                         0.040211
global_aoa * global_frequency                         -0.032263
global_aoa * global_letters_count                      0.035345
global_aoa * global_orthographic_density               0.056858
global_aoa * global_synonyms_count                     0.030980
global_clustering * global_frequency                   0.084121
global_clustering * global_letters_count               0.180252
global_clustering * global_orthographic_density       -0.088267
global_clustering * global_synonyms_count             -0.747271
global_frequency * global_letters_count                0.042867
global_frequency * global_orthographic_density         0.041209
global_frequency * global_synonyms_count              -0.085036
global_letters_count * global_orthographic_density    -0.049623
global_letters_count * global_synonyms_count           0.051455
global_orthographic_density * global_synonyms_count    0.136225
dtype: float64

Regressing rel aoa with 795 measures, no interactions
           ^^^^^^^
R^2 = 0.01919306554366096

intercept                      1.823342
global_aoa                     0.057244
global_clustering              0.012703
global_frequency              -0.142145
global_letters_count           0.030689
global_orthographic_density    0.072396
global_synonyms_count         -0.147335
dtype: float64

Regressing rel aoa with 795 measures, with interactions
           ^^^^^^^
R^2 = 0.04263468177259555

intercept                                             -2.057796
global_aoa                                             0.531664
global_clustering                                      0.054455
global_frequency                                       0.520694
global_letters_count                                   0.247205
global_orthographic_density                           -0.843630
global_synonyms_count                                 -2.402455
global_aoa * global_clustering                         0.033999
global_aoa * global_frequency                         -0.033263
global_aoa * global_letters_count                     -0.007637
global_aoa * global_orthographic_density               0.034051
global_aoa * global_synonyms_count                     0.016550
global_clustering * global_frequency                   0.039574
global_clustering * global_letters_count              -0.000520
global_clustering * global_orthographic_density       -0.257495
global_clustering * global_synonyms_count             -0.685073
global_frequency * global_letters_count               -0.011051
global_frequency * global_orthographic_density        -0.057604
global_frequency * global_synonyms_count              -0.171908
global_letters_count * global_orthographic_density    -0.068422
global_letters_count * global_synonyms_count          -0.052849
global_orthographic_density * global_synonyms_count    0.001505
dtype: float64

Regressing global aoa with 795 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.03443712111913455

intercept                   6.533094
rel_aoa                     0.037231
rel_clustering              0.331847
rel_frequency               0.005068
rel_letters_count           0.009371
rel_orthographic_density   -0.332201
rel_synonyms_count         -0.250378
dtype: float64

Regressing global aoa with 795 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.06696652808984116

intercept                                        6.570977
rel_aoa                                         -0.131766
rel_clustering                                   0.022168
rel_frequency                                   -0.005377
rel_letters_count                               -0.022649
rel_orthographic_density                        -0.405473
rel_synonyms_count                              -0.295292
rel_aoa * rel_clustering                        -0.002721
rel_aoa * rel_frequency                         -0.063020
rel_aoa * rel_letters_count                      0.056194
rel_aoa * rel_orthographic_density               0.109012
rel_aoa * rel_synonyms_count                    -0.014010
rel_clustering * rel_frequency                   0.006294
rel_clustering * rel_letters_count               0.248903
rel_clustering * rel_orthographic_density        0.132983
rel_clustering * rel_synonyms_count             -0.641908
rel_frequency * rel_letters_count                0.027078
rel_frequency * rel_orthographic_density         0.011391
rel_frequency * rel_synonyms_count              -0.075585
rel_letters_count * rel_orthographic_density     0.015072
rel_letters_count * rel_synonyms_count           0.183134
rel_orthographic_density * rel_synonyms_count    0.444947
dtype: float64

Regressing rel aoa with 795 measures, no interactions
           ^^^^^^^
R^2 = 0.14251370297954624

intercept                   0.679102
rel_aoa                     0.429118
rel_clustering              0.015116
rel_frequency              -0.097632
rel_letters_count          -0.005259
rel_orthographic_density    0.154100
rel_synonyms_count         -0.195964
dtype: float64

Regressing rel aoa with 795 measures, with interactions
           ^^^^^^^
R^2 = 0.16249234314330874

intercept                                        0.880329
rel_aoa                                          0.487440
rel_clustering                                  -0.205484
rel_frequency                                   -0.061846
rel_letters_count                               -0.022034
rel_orthographic_density                         0.387956
rel_synonyms_count                              -0.202689
rel_aoa * rel_clustering                         0.030397
rel_aoa * rel_frequency                          0.014932
rel_aoa * rel_letters_count                      0.021444
rel_aoa * rel_orthographic_density               0.057866
rel_aoa * rel_synonyms_count                    -0.085766
rel_clustering * rel_frequency                   0.011009
rel_clustering * rel_letters_count               0.221275
rel_clustering * rel_orthographic_density        0.243000
rel_clustering * rel_synonyms_count             -0.340289
rel_frequency * rel_letters_count                0.028953
rel_frequency * rel_orthographic_density         0.138159
rel_frequency * rel_synonyms_count              -0.044079
rel_letters_count * rel_orthographic_density     0.011609
rel_letters_count * rel_synonyms_count           0.129400
rel_orthographic_density * rel_synonyms_count    0.280552
dtype: float64

Regressing global aoa with 795 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.09298488946551198

intercept                      3.667225
global_aoa                     0.353446
global_clustering             -0.133089
global_frequency              -0.081209
global_letters_count           0.215931
global_orthographic_density    0.049721
global_synonyms_count         -0.027761
rel_aoa                       -0.225437
rel_clustering                 0.280488
rel_frequency                 -0.021247
rel_letters_count             -0.165445
rel_orthographic_density      -0.069244
rel_synonyms_count            -0.231679
dtype: float64

Regressing global aoa with 795 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.18844140933893402

intercept                                                 70.669092
global_aoa                                                -0.219569
global_clustering                                          6.650020
global_frequency                                          -0.956782
global_letters_count                                      -6.296930
global_orthographic_density                              -20.647811
global_synonyms_count                                     -8.535420
rel_aoa                                                    0.776721
rel_clustering                                            -6.978625
rel_frequency                                              2.483161
rel_letters_count                                          4.806202
rel_orthographic_density                                  16.826306
rel_synonyms_count                                        -3.166250
global_aoa * global_clustering                            -0.147030
global_aoa * global_frequency                             -0.123394
global_aoa * global_letters_count                          0.102240
global_aoa * global_orthographic_density                   0.231393
global_aoa * global_synonyms_count                        -0.222775
global_aoa * rel_aoa                                       0.007529
global_aoa * rel_clustering                                0.136961
global_aoa * rel_frequency                                 0.025320
global_aoa * rel_letters_count                            -0.112815
global_aoa * rel_orthographic_density                     -0.233420
global_aoa * rel_synonyms_count                            0.387301
global_clustering * global_frequency                       0.089270
global_clustering * global_letters_count                  -0.306524
global_clustering * global_orthographic_density           -2.312994
global_clustering * global_synonyms_count                 -1.179787
global_clustering * rel_aoa                                0.220196
global_clustering * rel_clustering                         0.130584
global_clustering * rel_frequency                          0.184395
global_clustering * rel_letters_count                      0.069804
global_clustering * rel_orthographic_density               1.361371
global_clustering * rel_synonyms_count                    -0.387554
global_frequency * global_letters_count                    0.305460
global_frequency * global_orthographic_density             0.385386
global_frequency * global_synonyms_count                  -0.305096
global_frequency * rel_aoa                                 0.138093
global_frequency * rel_clustering                          0.038397
global_frequency * rel_frequency                          -0.041602
global_frequency * rel_letters_count                      -0.318334
global_frequency * rel_orthographic_density               -0.497823
global_frequency * rel_synonyms_count                      0.533002
global_letters_count * global_orthographic_density         0.150110
global_letters_count * global_synonyms_count               0.736774
global_letters_count * rel_aoa                            -0.101284
global_letters_count * rel_clustering                      0.483096
global_letters_count * rel_frequency                      -0.085084
global_letters_count * rel_letters_count                   0.041647
global_letters_count * rel_orthographic_density           -0.367783
global_letters_count * rel_synonyms_count                 -0.784167
global_orthographic_density * global_synonyms_count        0.905382
global_orthographic_density * rel_aoa                     -0.327895
global_orthographic_density * rel_clustering               1.782840
global_orthographic_density * rel_frequency               -0.253394
global_orthographic_density * rel_letters_count           -0.153519
global_orthographic_density * rel_orthographic_density     0.081870
global_orthographic_density * rel_synonyms_count          -1.626309
global_synonyms_count * rel_aoa                            0.076070
global_synonyms_count * rel_clustering                     1.432875
global_synonyms_count * rel_frequency                     -0.347834
global_synonyms_count * rel_letters_count                 -0.966273
global_synonyms_count * rel_orthographic_density          -1.147402
global_synonyms_count * rel_synonyms_count                 0.001700
rel_aoa * rel_clustering                                  -0.155211
rel_aoa * rel_frequency                                   -0.073989
rel_aoa * rel_letters_count                                0.112257
rel_aoa * rel_orthographic_density                         0.362980
rel_aoa * rel_synonyms_count                              -0.255230
rel_clustering * rel_frequency                            -0.280915
rel_clustering * rel_letters_count                        -0.026229
rel_clustering * rel_orthographic_density                 -0.649270
rel_clustering * rel_synonyms_count                       -0.647768
rel_frequency * rel_letters_count                          0.121699
rel_frequency * rel_orthographic_density                   0.462044
rel_frequency * rel_synonyms_count                         0.083714
rel_letters_count * rel_orthographic_density               0.446907
rel_letters_count * rel_synonyms_count                     1.155284
rel_orthographic_density * rel_synonyms_count              2.236531
dtype: float64

Regressing rel aoa with 795 measures, no interactions
           ^^^^^^^
R^2 = 0.19831909547824667

intercept                      1.917121
global_aoa                    -0.424863
global_clustering             -0.110561
global_frequency              -0.031630
global_letters_count           0.165912
global_orthographic_density    0.013421
global_synonyms_count          0.165822
rel_aoa                        0.736364
rel_clustering                 0.220903
rel_frequency                 -0.061268
rel_letters_count             -0.138345
rel_orthographic_density      -0.082332
rel_synonyms_count            -0.403228
dtype: float64

Regressing rel aoa with 795 measures, with interactions
           ^^^^^^^
R^2 = 0.27642176883781155

intercept                                                 44.265407
global_aoa                                                -1.348925
global_clustering                                          3.454583
global_frequency                                          -0.239490
global_letters_count                                      -3.987386
global_orthographic_density                              -15.630031
global_synonyms_count                                     -5.048262
rel_aoa                                                    2.084031
rel_clustering                                            -1.772411
rel_frequency                                              1.497598
rel_letters_count                                          3.336280
rel_orthographic_density                                  13.503088
rel_synonyms_count                                        -3.598099
global_aoa * global_clustering                            -0.192585
global_aoa * global_frequency                             -0.095253
global_aoa * global_letters_count                          0.056031
global_aoa * global_orthographic_density                   0.197479
global_aoa * global_synonyms_count                        -0.027026
global_aoa * rel_aoa                                      -0.010859
global_aoa * rel_clustering                                0.116903
global_aoa * rel_frequency                                 0.011228
global_aoa * rel_letters_count                            -0.097292
global_aoa * rel_orthographic_density                     -0.279707
global_aoa * rel_synonyms_count                            0.175009
global_clustering * global_frequency                       0.153555
global_clustering * global_letters_count                  -0.135413
global_clustering * global_orthographic_density           -1.496858
global_clustering * global_synonyms_count                 -1.063951
global_clustering * rel_aoa                                0.233981
global_clustering * rel_clustering                         0.109058
global_clustering * rel_frequency                          0.025595
global_clustering * rel_letters_count                     -0.057552
global_clustering * rel_orthographic_density               0.653994
global_clustering * rel_synonyms_count                    -0.221775
global_frequency * global_letters_count                    0.224364
global_frequency * global_orthographic_density             0.391396
global_frequency * global_synonyms_count                  -0.389304
global_frequency * rel_aoa                                 0.115064
global_frequency * rel_clustering                         -0.180715
global_frequency * rel_frequency                          -0.026545
global_frequency * rel_letters_count                      -0.270385
global_frequency * rel_orthographic_density               -0.585386
global_frequency * rel_synonyms_count                      0.531767
global_letters_count * global_orthographic_density         0.163641
global_letters_count * global_synonyms_count               0.288431
global_letters_count * rel_aoa                            -0.087612
global_letters_count * rel_clustering                      0.225826
global_letters_count * rel_frequency                      -0.090168
global_letters_count * rel_letters_count                   0.042314
global_letters_count * rel_orthographic_density           -0.286541
global_letters_count * rel_synonyms_count                 -0.501949
global_orthographic_density * global_synonyms_count        0.576330
global_orthographic_density * rel_aoa                     -0.290191
global_orthographic_density * rel_clustering               1.101172
global_orthographic_density * rel_frequency               -0.358485
global_orthographic_density * rel_letters_count           -0.186271
global_orthographic_density * rel_orthographic_density     0.032374
global_orthographic_density * rel_synonyms_count          -1.145518
global_synonyms_count * rel_aoa                            0.003430
global_synonyms_count * rel_clustering                     1.369030
global_synonyms_count * rel_frequency                     -0.089708
global_synonyms_count * rel_letters_count                 -0.468043
global_synonyms_count * rel_orthographic_density          -0.774614
global_synonyms_count * rel_synonyms_count                 0.005692
rel_aoa * rel_clustering                                  -0.097790
rel_aoa * rel_frequency                                   -0.050673
rel_aoa * rel_letters_count                                0.113565
rel_aoa * rel_orthographic_density                         0.371675
rel_aoa * rel_synonyms_count                              -0.162657
rel_clustering * rel_frequency                             0.023390
rel_clustering * rel_letters_count                         0.158389
rel_clustering * rel_orthographic_density                 -0.189886
rel_clustering * rel_synonyms_count                       -0.766840
rel_frequency * rel_letters_count                          0.146753
rel_frequency * rel_orthographic_density                   0.587538
rel_frequency * rel_synonyms_count                        -0.114375
rel_letters_count * rel_orthographic_density               0.373290
rel_letters_count * rel_synonyms_count                     0.772402
rel_orthographic_density * rel_synonyms_count              1.582483
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04701303645099974

intercept                     -4.584033
global_aoa                    -0.013284
global_clustering              0.164024
global_frequency              -0.042293
global_letters_count           0.032639
global_orthographic_density    0.030044
global_synonyms_count         -0.032274
dtype: float64

Regressing global clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08861446164806286

intercept                                             -3.596717
global_aoa                                             0.144891
global_clustering                                      0.179581
global_frequency                                      -0.610648
global_letters_count                                   0.301689
global_orthographic_density                            0.554884
global_synonyms_count                                 -0.144156
global_aoa * global_clustering                         0.034590
global_aoa * global_frequency                          0.009763
global_aoa * global_letters_count                     -0.003675
global_aoa * global_orthographic_density              -0.011274
global_aoa * global_synonyms_count                    -0.012434
global_clustering * global_frequency                  -0.067174
global_clustering * global_letters_count               0.050581
global_clustering * global_orthographic_density        0.093108
global_clustering * global_synonyms_count             -0.081526
global_frequency * global_letters_count                0.010842
global_frequency * global_orthographic_density         0.033732
global_frequency * global_synonyms_count               0.018570
global_letters_count * global_orthographic_density    -0.032544
global_letters_count * global_synonyms_count          -0.046128
global_orthographic_density * global_synonyms_count   -0.129526
dtype: float64

Regressing rel clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.02838470498593637

intercept                      1.093599
global_aoa                    -0.009781
global_clustering              0.138145
global_frequency              -0.014688
global_letters_count           0.038996
global_orthographic_density    0.051191
global_synonyms_count         -0.052144
dtype: float64

Regressing rel clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.05953008004941463

intercept                                              3.852048
global_aoa                                             0.066001
global_clustering                                      0.365484
global_frequency                                      -0.584982
global_letters_count                                  -0.004664
global_orthographic_density                            0.424505
global_synonyms_count                                  0.475329
global_aoa * global_clustering                         0.026975
global_aoa * global_frequency                          0.013833
global_aoa * global_letters_count                     -0.000864
global_aoa * global_orthographic_density              -0.019686
global_aoa * global_synonyms_count                    -0.037569
global_clustering * global_frequency                  -0.060261
global_clustering * global_letters_count               0.009534
global_clustering * global_orthographic_density        0.072215
global_clustering * global_synonyms_count              0.004301
global_frequency * global_letters_count                0.014245
global_frequency * global_orthographic_density         0.034720
global_frequency * global_synonyms_count              -0.000170
global_letters_count * global_orthographic_density    -0.017440
global_letters_count * global_synonyms_count          -0.013209
global_orthographic_density * global_synonyms_count   -0.125253
dtype: float64

Regressing global clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.023524486648381467

intercept                  -5.909797
rel_aoa                    -0.003583
rel_clustering              0.140697
rel_frequency              -0.012239
rel_letters_count           0.024761
rel_orthographic_density    0.029248
rel_synonyms_count         -0.041916
dtype: float64

Regressing global clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04472029423044466

intercept                                       -5.903864
rel_aoa                                         -0.024464
rel_clustering                                   0.077778
rel_frequency                                   -0.022272
rel_letters_count                               -0.001624
rel_orthographic_density                         0.074121
rel_synonyms_count                              -0.092537
rel_aoa * rel_clustering                         0.064341
rel_aoa * rel_frequency                         -0.002108
rel_aoa * rel_letters_count                     -0.007683
rel_aoa * rel_orthographic_density              -0.009273
rel_aoa * rel_synonyms_count                    -0.010005
rel_clustering * rel_frequency                   0.006367
rel_clustering * rel_letters_count               0.033269
rel_clustering * rel_orthographic_density        0.037471
rel_clustering * rel_synonyms_count              0.014476
rel_frequency * rel_letters_count                0.002894
rel_frequency * rel_orthographic_density        -0.000405
rel_frequency * rel_synonyms_count              -0.013258
rel_letters_count * rel_orthographic_density    -0.027353
rel_letters_count * rel_synonyms_count          -0.002793
rel_orthographic_density * rel_synonyms_count   -0.024397
dtype: float64

Regressing rel clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.14368336267989656

intercept                   0.247090
rel_aoa                    -0.012637
rel_clustering              0.396946
rel_frequency               0.011395
rel_letters_count           0.039043
rel_orthographic_density    0.058197
rel_synonyms_count         -0.000883
dtype: float64

Regressing rel clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.16312327849388397

intercept                                        0.218053
rel_aoa                                         -0.026183
rel_clustering                                   0.327447
rel_frequency                                   -0.009045
rel_letters_count                                0.044066
rel_orthographic_density                         0.065726
rel_synonyms_count                              -0.062528
rel_aoa * rel_clustering                         0.051752
rel_aoa * rel_frequency                          0.003347
rel_aoa * rel_letters_count                     -0.011800
rel_aoa * rel_orthographic_density              -0.030401
rel_aoa * rel_synonyms_count                    -0.018917
rel_clustering * rel_frequency                  -0.005597
rel_clustering * rel_letters_count               0.028571
rel_clustering * rel_orthographic_density        0.047250
rel_clustering * rel_synonyms_count             -0.024589
rel_frequency * rel_letters_count                0.005251
rel_frequency * rel_orthographic_density        -0.008976
rel_frequency * rel_synonyms_count              -0.020539
rel_letters_count * rel_orthographic_density    -0.010043
rel_letters_count * rel_synonyms_count           0.015789
rel_orthographic_density * rel_synonyms_count    0.006564
dtype: float64

Regressing global clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.054068861824777215

intercept                     -3.631314
global_aoa                    -0.016452
global_clustering              0.185951
global_frequency              -0.079644
global_letters_count          -0.001784
global_orthographic_density   -0.066736
global_synonyms_count          0.027269
rel_aoa                        0.002289
rel_clustering                -0.021709
rel_frequency                  0.042417
rel_letters_count              0.032932
rel_orthographic_density       0.103336
rel_synonyms_count            -0.080065
dtype: float64

Regressing global clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1702282467222056

intercept                                                 23.423094
global_aoa                                                -0.769267
global_clustering                                          5.001104
global_frequency                                          -2.106961
global_letters_count                                       0.671831
global_orthographic_density                               -0.446621
global_synonyms_count                                     -3.225505
rel_aoa                                                    0.885928
rel_clustering                                            -4.992490
rel_frequency                                              1.068455
rel_letters_count                                         -0.427292
rel_orthographic_density                                   0.076167
rel_synonyms_count                                         0.591836
global_aoa * global_clustering                            -0.145074
global_aoa * global_frequency                              0.014501
global_aoa * global_letters_count                         -0.032909
global_aoa * global_orthographic_density                  -0.092930
global_aoa * global_synonyms_count                         0.116554
global_aoa * rel_aoa                                      -0.003191
global_aoa * rel_clustering                                0.188674
global_aoa * rel_frequency                                 0.004112
global_aoa * rel_letters_count                             0.030825
global_aoa * rel_orthographic_density                      0.064661
global_aoa * rel_synonyms_count                           -0.083815
global_clustering * global_frequency                      -0.325659
global_clustering * global_letters_count                   0.031876
global_clustering * global_orthographic_density           -0.165840
global_clustering * global_synonyms_count                 -0.457450
global_clustering * rel_aoa                                0.080460
global_clustering * rel_clustering                        -0.056479
global_clustering * rel_frequency                          0.220974
global_clustering * rel_letters_count                     -0.010585
global_clustering * rel_orthographic_density               0.101214
global_clustering * rel_synonyms_count                     0.281318
global_frequency * global_letters_count                   -0.021242
global_frequency * global_orthographic_density             0.019195
global_frequency * global_synonyms_count                   0.139020
global_frequency * rel_aoa                                -0.032493
global_frequency * rel_clustering                          0.256575
global_frequency * rel_frequency                           0.016271
global_frequency * rel_letters_count                       0.036238
global_frequency * rel_orthographic_density                0.012906
global_frequency * rel_synonyms_count                      0.012060
global_letters_count * global_orthographic_density         0.026140
global_letters_count * global_synonyms_count              -0.180011
global_letters_count * rel_aoa                            -0.027414
global_letters_count * rel_clustering                      0.036670
global_letters_count * rel_frequency                       0.036558
global_letters_count * rel_letters_count                   0.003224
global_letters_count * rel_orthographic_density           -0.023075
global_letters_count * rel_synonyms_count                  0.230850
global_orthographic_density * global_synonyms_count       -0.437770
global_orthographic_density * rel_aoa                      0.074397
global_orthographic_density * rel_clustering               0.209920
global_orthographic_density * rel_frequency                0.028014
global_orthographic_density * rel_letters_count           -0.114813
global_orthographic_density * rel_orthographic_density    -0.013627
global_orthographic_density * rel_synonyms_count           0.167336
global_synonyms_count * rel_aoa                           -0.099436
global_synonyms_count * rel_clustering                     0.249918
global_synonyms_count * rel_frequency                     -0.161419
global_synonyms_count * rel_letters_count                 -0.096993
global_synonyms_count * rel_orthographic_density           0.073332
global_synonyms_count * rel_synonyms_count                -0.018241
rel_aoa * rel_clustering                                  -0.038047
rel_aoa * rel_frequency                                    0.034628
rel_aoa * rel_letters_count                                0.013785
rel_aoa * rel_orthographic_density                        -0.093585
rel_aoa * rel_synonyms_count                               0.053756
rel_clustering * rel_frequency                            -0.183132
rel_clustering * rel_letters_count                        -0.013404
rel_clustering * rel_orthographic_density                 -0.018430
rel_clustering * rel_synonyms_count                       -0.156692
rel_frequency * rel_letters_count                         -0.044510
rel_frequency * rel_orthographic_density                  -0.040282
rel_frequency * rel_synonyms_count                         0.023520
rel_letters_count * rel_orthographic_density               0.084305
rel_letters_count * rel_synonyms_count                     0.026726
rel_orthographic_density * rel_synonyms_count              0.151027
dtype: float64

Regressing rel clustering with 695 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.22994432347540428

intercept                     -2.329419
global_aoa                    -0.014345
global_clustering             -0.549852
global_frequency              -0.059926
global_letters_count          -0.001491
global_orthographic_density   -0.030981
global_synonyms_count         -0.054042
rel_aoa                       -0.002338
rel_clustering                 0.837942
rel_frequency                  0.035593
rel_letters_count              0.037297
rel_orthographic_density       0.064417
rel_synonyms_count             0.017429
dtype: float64

Regressing rel clustering with 695 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3179767735198272

intercept                                                 19.916527
global_aoa                                                -0.529237
global_clustering                                          3.184514
global_frequency                                          -1.638613
global_letters_count                                       0.164032
global_orthographic_density                               -0.895460
global_synonyms_count                                     -1.624709
rel_aoa                                                    0.795875
rel_clustering                                            -3.271081
rel_frequency                                              0.870239
rel_letters_count                                         -0.392481
rel_orthographic_density                                   0.212203
rel_synonyms_count                                        -0.681388
global_aoa * global_clustering                            -0.088203
global_aoa * global_frequency                              0.019445
global_aoa * global_letters_count                         -0.023829
global_aoa * global_orthographic_density                  -0.080646
global_aoa * global_synonyms_count                         0.065437
global_aoa * rel_aoa                                      -0.002956
global_aoa * rel_clustering                                0.122252
global_aoa * rel_frequency                                -0.000194
global_aoa * rel_letters_count                             0.027791
global_aoa * rel_orthographic_density                      0.057547
global_aoa * rel_synonyms_count                           -0.032330
global_clustering * global_frequency                      -0.236467
global_clustering * global_letters_count                  -0.022594
global_clustering * global_orthographic_density           -0.138893
global_clustering * global_synonyms_count                 -0.392574
global_clustering * rel_aoa                                0.054180
global_clustering * rel_clustering                        -0.089067
global_clustering * rel_frequency                          0.163304
global_clustering * rel_letters_count                     -0.014258
global_clustering * rel_orthographic_density               0.073750
global_clustering * rel_synonyms_count                     0.265469
global_frequency * global_letters_count                   -0.015170
global_frequency * global_orthographic_density             0.057413
global_frequency * global_synonyms_count                   0.002820
global_frequency * rel_aoa                                -0.036445
global_frequency * rel_clustering                          0.182606
global_frequency * rel_frequency                           0.014792
global_frequency * rel_letters_count                       0.033707
global_frequency * rel_orthographic_density               -0.016236
global_frequency * rel_synonyms_count                      0.134167
global_letters_count * global_orthographic_density         0.046814
global_letters_count * global_synonyms_count              -0.167187
global_letters_count * rel_aoa                            -0.029680
global_letters_count * rel_clustering                      0.079579
global_letters_count * rel_frequency                       0.022284
global_letters_count * rel_letters_count                   0.005923
global_letters_count * rel_orthographic_density           -0.021739
global_letters_count * rel_synonyms_count                  0.219365
global_orthographic_density * global_synonyms_count       -0.287653
global_orthographic_density * rel_aoa                      0.066567
global_orthographic_density * rel_clustering               0.164120
global_orthographic_density * rel_frequency               -0.018131
global_orthographic_density * rel_letters_count           -0.123496
global_orthographic_density * rel_orthographic_density     0.020358
global_orthographic_density * rel_synonyms_count           0.062304
global_synonyms_count * rel_aoa                           -0.117474
global_synonyms_count * rel_clustering                     0.250791
global_synonyms_count * rel_frequency                     -0.068743
global_synonyms_count * rel_letters_count                 -0.030395
global_synonyms_count * rel_orthographic_density          -0.064781
global_synonyms_count * rel_synonyms_count                -0.008311
rel_aoa * rel_clustering                                  -0.018589
rel_aoa * rel_frequency                                    0.033927
rel_aoa * rel_letters_count                                0.013305
rel_aoa * rel_orthographic_density                        -0.085017
rel_aoa * rel_synonyms_count                               0.067371
rel_clustering * rel_frequency                            -0.134234
rel_clustering * rel_letters_count                        -0.011091
rel_clustering * rel_orthographic_density                 -0.007843
rel_clustering * rel_synonyms_count                       -0.190161
rel_frequency * rel_letters_count                         -0.034271
rel_frequency * rel_orthographic_density                  -0.009210
rel_frequency * rel_synonyms_count                        -0.066637
rel_letters_count * rel_orthographic_density               0.100220
rel_letters_count * rel_synonyms_count                    -0.039811
rel_orthographic_density * rel_synonyms_count              0.231563
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08950213355306913

intercept                      4.922234
global_aoa                     0.031985
global_clustering              0.044474
global_frequency               0.036039
global_letters_count           0.245358
global_orthographic_density   -0.131614
global_synonyms_count         -0.368958
dtype: float64

Regressing global letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10785453114908283

intercept                                             -7.743553
global_aoa                                             0.559276
global_clustering                                     -2.886417
global_frequency                                       0.768106
global_letters_count                                   0.369139
global_orthographic_density                           -1.380300
global_synonyms_count                                 -2.080544
global_aoa * global_clustering                         0.154623
global_aoa * global_frequency                          0.020934
global_aoa * global_letters_count                      0.014900
global_aoa * global_orthographic_density               0.083537
global_aoa * global_synonyms_count                    -0.028266
global_clustering * global_frequency                   0.174760
global_clustering * global_letters_count               0.051127
global_clustering * global_orthographic_density        0.010181
global_clustering * global_synonyms_count             -0.013214
global_frequency * global_letters_count                0.002392
global_frequency * global_orthographic_density         0.088845
global_frequency * global_synonyms_count               0.004750
global_letters_count * global_orthographic_density    -0.044363
global_letters_count * global_synonyms_count           0.221216
global_orthographic_density * global_synonyms_count    0.374909
dtype: float64

Regressing rel letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.040257553697416126

intercept                      2.172494
global_aoa                    -0.010022
global_clustering             -0.006654
global_frequency              -0.002223
global_letters_count           0.137245
global_orthographic_density   -0.126621
global_synonyms_count         -0.435997
dtype: float64

Regressing rel letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06122926768599457

intercept                                             -12.104509
global_aoa                                              0.750552
global_clustering                                      -2.666362
global_frequency                                        1.086721
global_letters_count                                    0.269934
global_orthographic_density                            -1.613496
global_synonyms_count                                  -1.523050
global_aoa * global_clustering                          0.165507
global_aoa * global_frequency                           0.019885
global_aoa * global_letters_count                      -0.009240
global_aoa * global_orthographic_density                0.072718
global_aoa * global_synonyms_count                     -0.051858
global_clustering * global_frequency                    0.200582
global_clustering * global_letters_count               -0.019934
global_clustering * global_orthographic_density        -0.119423
global_clustering * global_synonyms_count              -0.017601
global_frequency * global_letters_count                -0.021914
global_frequency * global_orthographic_density          0.057871
global_frequency * global_synonyms_count               -0.034219
global_letters_count * global_orthographic_density     -0.072141
global_letters_count * global_synonyms_count            0.208472
global_orthographic_density * global_synonyms_count     0.315610
dtype: float64

Regressing global letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07102607663539284

intercept                   5.765140
rel_aoa                    -0.037773
rel_clustering              0.221568
rel_frequency               0.047335
rel_letters_count           0.182771
rel_orthographic_density   -0.275835
rel_synonyms_count         -0.361346
dtype: float64

Regressing global letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08108393213726983

intercept                                        5.800156
rel_aoa                                         -0.064975
rel_clustering                                   0.069294
rel_frequency                                    0.074566
rel_letters_count                                0.198918
rel_orthographic_density                        -0.379885
rel_synonyms_count                              -0.434517
rel_aoa * rel_clustering                         0.030176
rel_aoa * rel_frequency                         -0.025722
rel_aoa * rel_letters_count                      0.008861
rel_aoa * rel_orthographic_density               0.059392
rel_aoa * rel_synonyms_count                    -0.040823
rel_clustering * rel_frequency                  -0.023699
rel_clustering * rel_letters_count               0.042590
rel_clustering * rel_orthographic_density        0.007112
rel_clustering * rel_synonyms_count             -0.137252
rel_frequency * rel_letters_count               -0.000619
rel_frequency * rel_orthographic_density        -0.013003
rel_frequency * rel_synonyms_count              -0.025426
rel_letters_count * rel_orthographic_density     0.025777
rel_letters_count * rel_synonyms_count           0.188801
rel_orthographic_density * rel_synonyms_count    0.401841
dtype: float64

Regressing rel letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12648783842404787

intercept                   1.504569
rel_aoa                    -0.042459
rel_clustering              0.054537
rel_frequency              -0.142864
rel_letters_count           0.376222
rel_orthographic_density    0.036736
rel_synonyms_count         -0.387021
dtype: float64

Regressing rel letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1436697278251735

intercept                                        1.463866
rel_aoa                                         -0.008658
rel_clustering                                   0.021789
rel_frequency                                   -0.150054
rel_letters_count                                0.537292
rel_orthographic_density                         0.089345
rel_synonyms_count                              -0.379713
rel_aoa * rel_clustering                         0.101050
rel_aoa * rel_frequency                          0.008412
rel_aoa * rel_letters_count                     -0.033446
rel_aoa * rel_orthographic_density              -0.038299
rel_aoa * rel_synonyms_count                    -0.077916
rel_clustering * rel_frequency                   0.030336
rel_clustering * rel_letters_count               0.075900
rel_clustering * rel_orthographic_density        0.128273
rel_clustering * rel_synonyms_count             -0.106881
rel_frequency * rel_letters_count                0.028267
rel_frequency * rel_orthographic_density         0.071047
rel_frequency * rel_synonyms_count               0.012295
rel_letters_count * rel_orthographic_density     0.069731
rel_letters_count * rel_synonyms_count           0.230582
rel_orthographic_density * rel_synonyms_count    0.423263
dtype: float64

Regressing global letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0997546513002523

intercept                      0.468589
global_aoa                     0.098251
global_clustering             -0.305347
global_frequency               0.143810
global_letters_count           0.382069
global_orthographic_density   -0.116027
global_synonyms_count         -0.224088
rel_aoa                       -0.106902
rel_clustering                 0.399332
rel_frequency                 -0.135893
rel_letters_count             -0.146934
rel_orthographic_density       0.000814
rel_synonyms_count            -0.157678
dtype: float64

Regressing global letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17483607823473324

intercept                                                 4.494470
global_aoa                                                0.945095
global_clustering                                        -1.813668
global_frequency                                          0.478084
global_letters_count                                     -0.710759
global_orthographic_density                              -8.038010
global_synonyms_count                                    -2.525586
rel_aoa                                                  -1.711433
rel_clustering                                           -5.218413
rel_frequency                                             0.758704
rel_letters_count                                         0.494506
rel_orthographic_density                                  3.508553
rel_synonyms_count                                       -2.155221
global_aoa * global_clustering                            0.218511
global_aoa * global_frequency                             0.021562
global_aoa * global_letters_count                         0.027656
global_aoa * global_orthographic_density                  0.101056
global_aoa * global_synonyms_count                       -0.381774
global_aoa * rel_aoa                                      0.032488
global_aoa * rel_clustering                              -0.036501
global_aoa * rel_frequency                               -0.047773
global_aoa * rel_letters_count                           -0.043666
global_aoa * rel_orthographic_density                    -0.001728
global_aoa * rel_synonyms_count                           0.379002
global_clustering * global_frequency                      0.223728
global_clustering * global_letters_count                  0.147449
global_clustering * global_orthographic_density          -0.937505
global_clustering * global_synonyms_count                -0.478729
global_clustering * rel_aoa                               0.072728
global_clustering * rel_clustering                        0.032492
global_clustering * rel_frequency                         0.036227
global_clustering * rel_letters_count                    -0.349580
global_clustering * rel_orthographic_density              0.638151
global_clustering * rel_synonyms_count                   -0.101040
global_frequency * global_letters_count                   0.173231
global_frequency * global_orthographic_density            0.165807
global_frequency * global_synonyms_count                 -0.314014
global_frequency * rel_aoa                                0.172391
global_frequency * rel_clustering                         0.239114
global_frequency * rel_frequency                         -0.031982
global_frequency * rel_letters_count                     -0.275389
global_frequency * rel_orthographic_density              -0.015026
global_frequency * rel_synonyms_count                     0.334089
global_letters_count * global_orthographic_density       -0.125875
global_letters_count * global_synonyms_count              0.710117
global_letters_count * rel_aoa                            0.098690
global_letters_count * rel_clustering                     0.211481
global_letters_count * rel_frequency                      0.004163
global_letters_count * rel_letters_count                  0.040217
global_letters_count * rel_orthographic_density           0.105729
global_letters_count * rel_synonyms_count                -0.586512
global_orthographic_density * global_synonyms_count       0.917291
global_orthographic_density * rel_aoa                    -0.190673
global_orthographic_density * rel_clustering              0.707498
global_orthographic_density * rel_frequency              -0.087689
global_orthographic_density * rel_letters_count           0.161188
global_orthographic_density * rel_orthographic_density    0.127464
global_orthographic_density * rel_synonyms_count         -0.865056
global_synonyms_count * rel_aoa                           0.299097
global_synonyms_count * rel_clustering                    1.155377
global_synonyms_count * rel_frequency                     0.164457
global_synonyms_count * rel_letters_count                -0.493222
global_synonyms_count * rel_orthographic_density         -0.550714
global_synonyms_count * rel_synonyms_count               -0.170641
rel_aoa * rel_clustering                                 -0.104282
rel_aoa * rel_frequency                                  -0.123461
rel_aoa * rel_letters_count                              -0.130137
rel_aoa * rel_orthographic_density                        0.182506
rel_aoa * rel_synonyms_count                             -0.292871
rel_clustering * rel_frequency                           -0.355283
rel_clustering * rel_letters_count                        0.045458
rel_clustering * rel_orthographic_density                -0.252152
rel_clustering * rel_synonyms_count                      -0.529243
rel_frequency * rel_letters_count                         0.079038
rel_frequency * rel_orthographic_density                  0.043250
rel_frequency * rel_synonyms_count                       -0.161272
rel_letters_count * rel_orthographic_density             -0.073245
rel_letters_count * rel_synonyms_count                    0.550761
rel_orthographic_density * rel_synonyms_count             0.879603
dtype: float64

Regressing rel letters_count with 863 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1826005578753488

intercept                     -0.059588
global_aoa                     0.083942
global_clustering             -0.310186
global_frequency               0.140780
global_letters_count          -0.530024
global_orthographic_density   -0.052305
global_synonyms_count         -0.192849
rel_aoa                       -0.091474
rel_clustering                 0.377447
rel_frequency                 -0.156761
rel_letters_count              0.791704
rel_orthographic_density      -0.086812
rel_synonyms_count            -0.160409
dtype: float64

Regressing rel letters_count with 863 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.25345964710260505

intercept                                                -6.234080
global_aoa                                                0.897517
global_clustering                                        -3.272079
global_frequency                                          0.698086
global_letters_count                                     -0.635322
global_orthographic_density                              -6.371918
global_synonyms_count                                     0.087048
rel_aoa                                                  -1.763376
rel_clustering                                           -3.915664
rel_frequency                                             0.394457
rel_letters_count                                         0.944174
rel_orthographic_density                                  1.979312
rel_synonyms_count                                       -4.554200
global_aoa * global_clustering                            0.213230
global_aoa * global_frequency                             0.028977
global_aoa * global_letters_count                         0.029055
global_aoa * global_orthographic_density                  0.058039
global_aoa * global_synonyms_count                       -0.398260
global_aoa * rel_aoa                                      0.027377
global_aoa * rel_clustering                              -0.028845
global_aoa * rel_frequency                               -0.058023
global_aoa * rel_letters_count                           -0.044803
global_aoa * rel_orthographic_density                     0.040124
global_aoa * rel_synonyms_count                           0.389240
global_clustering * global_frequency                      0.256864
global_clustering * global_letters_count                  0.240256
global_clustering * global_orthographic_density          -0.664683
global_clustering * global_synonyms_count                -0.378913
global_clustering * rel_aoa                               0.039678
global_clustering * rel_clustering                        0.017888
global_clustering * rel_frequency                        -0.025495
global_clustering * rel_letters_count                    -0.400140
global_clustering * rel_orthographic_density              0.384327
global_clustering * rel_synonyms_count                   -0.163793
global_frequency * global_letters_count                   0.133421
global_frequency * global_orthographic_density            0.204824
global_frequency * global_synonyms_count                 -0.360197
global_frequency * rel_aoa                                0.156224
global_frequency * rel_clustering                         0.201443
global_frequency * rel_frequency                         -0.035062
global_frequency * rel_letters_count                     -0.232365
global_frequency * rel_orthographic_density              -0.050506
global_frequency * rel_synonyms_count                     0.397664
global_letters_count * global_orthographic_density       -0.096059
global_letters_count * global_synonyms_count              0.587160
global_letters_count * rel_aoa                            0.086000
global_letters_count * rel_clustering                     0.116506
global_letters_count * rel_frequency                      0.024954
global_letters_count * rel_letters_count                  0.022399
global_letters_count * rel_orthographic_density           0.065532
global_letters_count * rel_synonyms_count                -0.450133
global_orthographic_density * global_synonyms_count       0.583494
global_orthographic_density * rel_aoa                    -0.116925
global_orthographic_density * rel_clustering              0.459432
global_orthographic_density * rel_frequency              -0.108547
global_orthographic_density * rel_letters_count           0.097392
global_orthographic_density * rel_orthographic_density    0.131649
global_orthographic_density * rel_synonyms_count         -0.579003
global_synonyms_count * rel_aoa                           0.319780
global_synonyms_count * rel_clustering                    1.016409
global_synonyms_count * rel_frequency                     0.207114
global_synonyms_count * rel_letters_count                -0.469353
global_synonyms_count * rel_orthographic_density         -0.303061
global_synonyms_count * rel_synonyms_count               -0.162367
rel_aoa * rel_clustering                                 -0.066812
rel_aoa * rel_frequency                                  -0.100573
rel_aoa * rel_letters_count                              -0.113282
rel_aoa * rel_orthographic_density                        0.126648
rel_aoa * rel_synonyms_count                             -0.310895
rel_clustering * rel_frequency                           -0.304442
rel_clustering * rel_letters_count                        0.098506
rel_clustering * rel_orthographic_density                -0.034392
rel_clustering * rel_synonyms_count                      -0.451116
rel_frequency * rel_letters_count                         0.057740
rel_frequency * rel_orthographic_density                  0.049217
rel_frequency * rel_synonyms_count                       -0.216727
rel_letters_count * rel_orthographic_density             -0.006350
rel_letters_count * rel_synonyms_count                    0.497879
rel_orthographic_density * rel_synonyms_count             0.675922
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.026808993182996566

intercept                      0.363205
global_aoa                    -0.006615
global_clustering              0.014168
global_frequency              -0.003945
global_letters_count           0.013329
global_orthographic_density    0.034848
global_synonyms_count          0.165091
dtype: float64

Regressing global synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0476111970445795

intercept                                              0.611805
global_aoa                                             0.086789
global_clustering                                      0.206689
global_frequency                                      -0.004827
global_letters_count                                   0.052418
global_orthographic_density                           -0.114963
global_synonyms_count                                 -0.088865
global_aoa * global_clustering                         0.003520
global_aoa * global_frequency                         -0.005918
global_aoa * global_letters_count                     -0.006501
global_aoa * global_orthographic_density               0.010444
global_aoa * global_synonyms_count                     0.032440
global_clustering * global_frequency                  -0.015124
global_clustering * global_letters_count              -0.008476
global_clustering * global_orthographic_density       -0.031209
global_clustering * global_synonyms_count              0.086109
global_frequency * global_letters_count               -0.004746
global_frequency * global_orthographic_density        -0.010543
global_frequency * global_synonyms_count               0.015259
global_letters_count * global_orthographic_density    -0.009060
global_letters_count * global_synonyms_count           0.035408
global_orthographic_density * global_synonyms_count    0.153490
dtype: float64

Regressing rel synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.020566580299250847

intercept                      0.011341
global_aoa                    -0.003261
global_clustering              0.005293
global_frequency              -0.005601
global_letters_count           0.013513
global_orthographic_density    0.032527
global_synonyms_count          0.137495
dtype: float64

Regressing rel synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.040196284216821754

intercept                                              0.593246
global_aoa                                             0.068437
global_clustering                                      0.061723
global_frequency                                      -0.088940
global_letters_count                                  -0.031970
global_orthographic_density                           -0.233991
global_synonyms_count                                  0.026086
global_aoa * global_clustering                         0.010399
global_aoa * global_frequency                         -0.000204
global_aoa * global_letters_count                     -0.004560
global_aoa * global_orthographic_density               0.012987
global_aoa * global_synonyms_count                     0.020758
global_clustering * global_frequency                  -0.009958
global_clustering * global_letters_count              -0.007036
global_clustering * global_orthographic_density       -0.015159
global_clustering * global_synonyms_count              0.107942
global_frequency * global_letters_count                0.003358
global_frequency * global_orthographic_density         0.009837
global_frequency * global_synonyms_count               0.014521
global_letters_count * global_orthographic_density    -0.006469
global_letters_count * global_synonyms_count           0.049114
global_orthographic_density * global_synonyms_count    0.147464
dtype: float64

Regressing global synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0254892985165901

intercept                   0.398464
rel_aoa                     0.001110
rel_clustering             -0.031885
rel_frequency               0.002609
rel_letters_count           0.018495
rel_orthographic_density    0.048921
rel_synonyms_count          0.171520
dtype: float64

Regressing global synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.037631913982344445

intercept                                        0.420057
rel_aoa                                         -0.021221
rel_clustering                                  -0.085125
rel_frequency                                    0.007350
rel_letters_count                               -0.013497
rel_orthographic_density                         0.045119
rel_synonyms_count                               0.195598
rel_aoa * rel_clustering                        -0.007712
rel_aoa * rel_frequency                         -0.007483
rel_aoa * rel_letters_count                      0.001901
rel_aoa * rel_orthographic_density               0.001076
rel_aoa * rel_synonyms_count                     0.029012
rel_clustering * rel_frequency                  -0.004443
rel_clustering * rel_letters_count               0.009717
rel_clustering * rel_orthographic_density       -0.026052
rel_clustering * rel_synonyms_count              0.101920
rel_frequency * rel_letters_count               -0.003301
rel_frequency * rel_orthographic_density        -0.009013
rel_frequency * rel_synonyms_count               0.016316
rel_letters_count * rel_orthographic_density    -0.012992
rel_letters_count * rel_synonyms_count           0.021063
rel_orthographic_density * rel_synonyms_count    0.089238
dtype: float64

Regressing rel synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.09038108483883256

intercept                   0.079993
rel_aoa                    -0.008295
rel_clustering              0.023370
rel_frequency               0.009704
rel_letters_count           0.015876
rel_orthographic_density    0.028825
rel_synonyms_count          0.313933
dtype: float64

Regressing rel synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.10611152710518978

intercept                                        0.089420
rel_aoa                                         -0.024943
rel_clustering                                  -0.031858
rel_frequency                                    0.008472
rel_letters_count                               -0.005871
rel_orthographic_density                         0.033754
rel_synonyms_count                               0.416739
rel_aoa * rel_clustering                         0.011709
rel_aoa * rel_frequency                         -0.003991
rel_aoa * rel_letters_count                      0.003157
rel_aoa * rel_orthographic_density               0.005978
rel_aoa * rel_synonyms_count                     0.010160
rel_clustering * rel_frequency                  -0.001793
rel_clustering * rel_letters_count               0.022725
rel_clustering * rel_orthographic_density        0.006444
rel_clustering * rel_synonyms_count              0.094221
rel_frequency * rel_letters_count                0.000414
rel_frequency * rel_orthographic_density        -0.000732
rel_frequency * rel_synonyms_count               0.029211
rel_letters_count * rel_orthographic_density    -0.009940
rel_letters_count * rel_synonyms_count           0.034547
rel_orthographic_density * rel_synonyms_count    0.138871
dtype: float64

Regressing global synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03559917974871929

intercept                      1.632815
global_aoa                    -0.009948
global_clustering              0.121864
global_frequency              -0.026638
global_letters_count          -0.028758
global_orthographic_density   -0.014947
global_synonyms_count          0.079948
rel_aoa                        0.006349
rel_clustering                -0.125437
rel_frequency                  0.028682
rel_letters_count              0.045116
rel_orthographic_density       0.055815
rel_synonyms_count             0.097190
dtype: float64

Regressing global synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12322790979923691

intercept                                                 12.431519
global_aoa                                                -0.340013
global_clustering                                          2.291192
global_frequency                                          -0.578738
global_letters_count                                       0.523433
global_orthographic_density                               -1.374090
global_synonyms_count                                      1.741321
rel_aoa                                                    0.121293
rel_clustering                                            -1.482720
rel_frequency                                              0.282546
rel_letters_count                                         -0.795033
rel_orthographic_density                                   1.049293
rel_synonyms_count                                        -3.950578
global_aoa * global_clustering                            -0.051372
global_aoa * global_frequency                              0.012491
global_aoa * global_letters_count                         -0.016511
global_aoa * global_orthographic_density                  -0.026576
global_aoa * global_synonyms_count                         0.028347
global_aoa * rel_aoa                                      -0.009080
global_aoa * rel_clustering                                0.075219
global_aoa * rel_frequency                                -0.021996
global_aoa * rel_letters_count                             0.009242
global_aoa * rel_orthographic_density                      0.045499
global_aoa * rel_synonyms_count                            0.034052
global_clustering * global_frequency                      -0.114109
global_clustering * global_letters_count                   0.026163
global_clustering * global_orthographic_density           -0.298746
global_clustering * global_synonyms_count                  0.222718
global_clustering * rel_aoa                               -0.010902
global_clustering * rel_clustering                         0.035318
global_clustering * rel_frequency                          0.066836
global_clustering * rel_letters_count                     -0.112650
global_clustering * rel_orthographic_density               0.201958
global_clustering * rel_synonyms_count                    -0.268512
global_frequency * global_letters_count                   -0.026300
global_frequency * global_orthographic_density            -0.032223
global_frequency * global_synonyms_count                   0.036736
global_frequency * rel_aoa                                -0.013873
global_frequency * rel_clustering                          0.073902
global_frequency * rel_frequency                           0.005661
global_frequency * rel_letters_count                       0.001210
global_frequency * rel_orthographic_density                0.006745
global_frequency * rel_synonyms_count                      0.021096
global_letters_count * global_orthographic_density         0.013564
global_letters_count * global_synonyms_count              -0.013724
global_letters_count * rel_aoa                            -0.006605
global_letters_count * rel_clustering                     -0.061114
global_letters_count * rel_frequency                       0.030223
global_letters_count * rel_letters_count                   0.000092
global_letters_count * rel_orthographic_density           -0.023245
global_letters_count * rel_synonyms_count                  0.141389
global_orthographic_density * global_synonyms_count       -0.206547
global_orthographic_density * rel_aoa                      0.004075
global_orthographic_density * rel_clustering               0.214174
global_orthographic_density * rel_frequency                0.027554
global_orthographic_density * rel_letters_count            0.011431
global_orthographic_density * rel_orthographic_density    -0.040874
global_orthographic_density * rel_synonyms_count           0.462757
global_synonyms_count * rel_aoa                            0.063843
global_synonyms_count * rel_clustering                     0.094148
global_synonyms_count * rel_frequency                      0.056268
global_synonyms_count * rel_letters_count                 -0.000066
global_synonyms_count * rel_orthographic_density           0.386507
global_synonyms_count * rel_synonyms_count                 0.164567
rel_aoa * rel_clustering                                  -0.000597
rel_aoa * rel_frequency                                    0.018225
rel_aoa * rel_letters_count                                0.011192
rel_aoa * rel_orthographic_density                        -0.042880
rel_aoa * rel_synonyms_count                              -0.101835
rel_clustering * rel_frequency                            -0.019322
rel_clustering * rel_letters_count                         0.136242
rel_clustering * rel_orthographic_density                 -0.122483
rel_clustering * rel_synonyms_count                       -0.019460
rel_frequency * rel_letters_count                         -0.008129
rel_frequency * rel_orthographic_density                  -0.022176
rel_frequency * rel_synonyms_count                        -0.104997
rel_letters_count * rel_orthographic_density              -0.035151
rel_letters_count * rel_synonyms_count                    -0.049400
rel_orthographic_density * rel_synonyms_count             -0.446660
dtype: float64

Regressing rel synonyms_count with 833 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.17081454280501474

intercept                      1.170282
global_aoa                    -0.008402
global_clustering              0.073297
global_frequency              -0.034522
global_letters_count          -0.017105
global_orthographic_density    0.030856
global_synonyms_count         -0.647509
rel_aoa                        0.003167
rel_clustering                -0.076757
rel_frequency                  0.032233
rel_letters_count              0.033263
rel_orthographic_density       0.005251
rel_synonyms_count             0.926885
dtype: float64

Regressing rel synonyms_count with 833 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.24859007751311182

intercept                                                 12.267180
global_aoa                                                -0.315442
global_clustering                                          2.049483
global_frequency                                          -0.610417
global_letters_count                                       0.217810
global_orthographic_density                               -1.327200
global_synonyms_count                                      0.367049
rel_aoa                                                    0.204742
rel_clustering                                            -1.519494
rel_frequency                                              0.358722
rel_letters_count                                         -0.510679
rel_orthographic_density                                   1.087035
rel_synonyms_count                                        -2.363717
global_aoa * global_clustering                            -0.047184
global_aoa * global_frequency                              0.006759
global_aoa * global_letters_count                         -0.007586
global_aoa * global_orthographic_density                  -0.020969
global_aoa * global_synonyms_count                         0.040102
global_aoa * rel_aoa                                      -0.005357
global_aoa * rel_clustering                                0.063785
global_aoa * rel_frequency                                -0.017310
global_aoa * rel_letters_count                             0.003101
global_aoa * rel_orthographic_density                      0.031318
global_aoa * rel_synonyms_count                            0.030800
global_clustering * global_frequency                      -0.103219
global_clustering * global_letters_count                   0.009335
global_clustering * global_orthographic_density           -0.254876
global_clustering * global_synonyms_count                  0.192034
global_clustering * rel_aoa                               -0.001647
global_clustering * rel_clustering                         0.027190
global_clustering * rel_frequency                          0.053775
global_clustering * rel_letters_count                     -0.088131
global_clustering * rel_orthographic_density               0.182358
global_clustering * rel_synonyms_count                    -0.197262
global_frequency * global_letters_count                   -0.012274
global_frequency * global_orthographic_density            -0.011689
global_frequency * global_synonyms_count                   0.060048
global_frequency * rel_aoa                                -0.011967
global_frequency * rel_clustering                          0.084348
global_frequency * rel_frequency                           0.005171
global_frequency * rel_letters_count                      -0.005358
global_frequency * rel_orthographic_density               -0.003409
global_frequency * rel_synonyms_count                      0.016374
global_letters_count * global_orthographic_density         0.008438
global_letters_count * global_synonyms_count              -0.002520
global_letters_count * rel_aoa                            -0.016689
global_letters_count * rel_clustering                     -0.034415
global_letters_count * rel_frequency                       0.010771
global_letters_count * rel_letters_count                  -0.001200
global_letters_count * rel_orthographic_density           -0.014508
global_letters_count * rel_synonyms_count                  0.119488
global_orthographic_density * global_synonyms_count       -0.192323
global_orthographic_density * rel_aoa                      0.001159
global_orthographic_density * rel_clustering               0.173451
global_orthographic_density * rel_frequency               -0.005195
global_orthographic_density * rel_letters_count            0.013199
global_orthographic_density * rel_orthographic_density    -0.038700
global_orthographic_density * rel_synonyms_count           0.423847
global_synonyms_count * rel_aoa                            0.038160
global_synonyms_count * rel_clustering                     0.045277
global_synonyms_count * rel_frequency                      0.015948
global_synonyms_count * rel_letters_count                 -0.043679
global_synonyms_count * rel_orthographic_density           0.271651
global_synonyms_count * rel_synonyms_count                 0.162843
rel_aoa * rel_clustering                                  -0.001657
rel_aoa * rel_frequency                                    0.017663
rel_aoa * rel_letters_count                                0.017754
rel_aoa * rel_orthographic_density                        -0.026783
rel_aoa * rel_synonyms_count                              -0.100607
rel_clustering * rel_frequency                            -0.023772
rel_clustering * rel_letters_count                         0.103075
rel_clustering * rel_orthographic_density                 -0.107754
rel_clustering * rel_synonyms_count                       -0.010219
rel_frequency * rel_letters_count                          0.002712
rel_frequency * rel_orthographic_density                   0.005084
rel_frequency * rel_synonyms_count                        -0.076425
rel_letters_count * rel_orthographic_density              -0.033236
rel_letters_count * rel_synonyms_count                    -0.002889
rel_orthographic_density * rel_synonyms_count             -0.316457
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08798396812975995

intercept                      1.810253
global_aoa                    -0.026249
global_clustering             -0.003266
global_frequency              -0.018859
global_letters_count          -0.066540
global_orthographic_density    0.155306
global_synonyms_count          0.072444
dtype: float64

Regressing global orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10893664141577175

intercept                                              6.494923
global_aoa                                            -0.366595
global_clustering                                      1.019832
global_frequency                                      -0.110332
global_letters_count                                  -0.222601
global_orthographic_density                            0.333227
global_synonyms_count                                  0.563230
global_aoa * global_clustering                        -0.038658
global_aoa * global_frequency                          0.006555
global_aoa * global_letters_count                      0.001671
global_aoa * global_orthographic_density               0.037032
global_aoa * global_synonyms_count                     0.010037
global_clustering * global_frequency                  -0.040958
global_clustering * global_letters_count              -0.067688
global_clustering * global_orthographic_density       -0.017996
global_clustering * global_synonyms_count              0.056086
global_frequency * global_letters_count               -0.021890
global_frequency * global_orthographic_density        -0.038761
global_frequency * global_synonyms_count               0.010994
global_letters_count * global_orthographic_density    -0.026843
global_letters_count * global_synonyms_count          -0.056282
global_orthographic_density * global_synonyms_count    0.007016
dtype: float64

Regressing rel orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.062201866531399586

intercept                     -0.531062
global_aoa                    -0.014708
global_clustering             -0.007622
global_frequency              -0.019303
global_letters_count          -0.059091
global_orthographic_density    0.119845
global_synonyms_count          0.067439
dtype: float64

Regressing rel orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07957456186523504

intercept                                              3.978576
global_aoa                                            -0.227618
global_clustering                                      0.995643
global_frequency                                      -0.047824
global_letters_count                                  -0.343330
global_orthographic_density                            0.036942
global_synonyms_count                                  0.394417
global_aoa * global_clustering                        -0.028461
global_aoa * global_frequency                         -0.001463
global_aoa * global_letters_count                      0.003469
global_aoa * global_orthographic_density               0.035508
global_aoa * global_synonyms_count                    -0.002505
global_clustering * global_frequency                  -0.034229
global_clustering * global_letters_count              -0.083757
global_clustering * global_orthographic_density       -0.025246
global_clustering * global_synonyms_count              0.013846
global_frequency * global_letters_count               -0.021634
global_frequency * global_orthographic_density        -0.021444
global_frequency * global_synonyms_count              -0.002139
global_letters_count * global_orthographic_density    -0.014951
global_letters_count * global_synonyms_count          -0.035087
global_orthographic_density * global_synonyms_count   -0.007825
dtype: float64

Regressing global orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.061860452387010656

intercept                   1.582715
rel_aoa                     0.003287
rel_clustering             -0.037684
rel_frequency              -0.016845
rel_letters_count          -0.044847
rel_orthographic_density    0.197687
rel_synonyms_count          0.073212
dtype: float64

Regressing global orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07801307435846072

intercept                                        1.595082
rel_aoa                                          0.089837
rel_clustering                                   0.035549
rel_frequency                                   -0.020758
rel_letters_count                               -0.038354
rel_orthographic_density                         0.293041
rel_synonyms_count                               0.259915
rel_aoa * rel_clustering                         0.027091
rel_aoa * rel_frequency                          0.024260
rel_aoa * rel_letters_count                     -0.007516
rel_aoa * rel_orthographic_density               0.023704
rel_aoa * rel_synonyms_count                     0.030957
rel_clustering * rel_frequency                  -0.003883
rel_clustering * rel_letters_count              -0.035418
rel_clustering * rel_orthographic_density        0.052372
rel_clustering * rel_synonyms_count             -0.023278
rel_frequency * rel_letters_count                0.006168
rel_frequency * rel_orthographic_density         0.029177
rel_frequency * rel_synonyms_count               0.035607
rel_letters_count * rel_orthographic_density    -0.029564
rel_letters_count * rel_synonyms_count          -0.033515
rel_orthographic_density * rel_synonyms_count    0.006195
dtype: float64

Regressing rel orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10871203782605145

intercept                  -0.548367
rel_aoa                     0.013442
rel_clustering             -0.026775
rel_frequency               0.020052
rel_letters_count          -0.040778
rel_orthographic_density    0.263582
rel_synonyms_count          0.059577
dtype: float64

Regressing rel orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12078360956592592

intercept                                       -0.459745
rel_aoa                                          0.081781
rel_clustering                                  -0.022145
rel_frequency                                    0.057023
rel_letters_count                               -0.072286
rel_orthographic_density                         0.340347
rel_synonyms_count                               0.165083
rel_aoa * rel_clustering                         0.015383
rel_aoa * rel_frequency                          0.010147
rel_aoa * rel_letters_count                     -0.001501
rel_aoa * rel_orthographic_density               0.046920
rel_aoa * rel_synonyms_count                     0.039138
rel_clustering * rel_frequency                  -0.023805
rel_clustering * rel_letters_count              -0.035102
rel_clustering * rel_orthographic_density        0.013234
rel_clustering * rel_synonyms_count             -0.037161
rel_frequency * rel_letters_count               -0.006569
rel_frequency * rel_orthographic_density         0.021349
rel_frequency * rel_synonyms_count               0.015144
rel_letters_count * rel_orthographic_density    -0.029790
rel_letters_count * rel_synonyms_count          -0.043910
rel_orthographic_density * rel_synonyms_count   -0.024707
dtype: float64

Regressing global orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09713576474750862

intercept                      3.145444
global_aoa                    -0.055264
global_clustering              0.051823
global_frequency              -0.081375
global_letters_count          -0.138034
global_orthographic_density    0.232440
global_synonyms_count          0.054370
rel_aoa                        0.043219
rel_clustering                -0.060517
rel_frequency                  0.070767
rel_letters_count              0.074024
rel_orthographic_density      -0.106183
rel_synonyms_count             0.015751
dtype: float64

Regressing global orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16441263569297138

intercept                                                -6.897459
global_aoa                                               -0.240760
global_clustering                                        -0.158301
global_frequency                                          0.657686
global_letters_count                                      0.241673
global_orthographic_density                               3.628179
global_synonyms_count                                     5.187979
rel_aoa                                                  -0.119246
rel_clustering                                            3.122210
rel_frequency                                            -0.407990
rel_letters_count                                         0.022589
rel_orthographic_density                                 -2.302904
rel_synonyms_count                                        0.361288
global_aoa * global_clustering                           -0.076074
global_aoa * global_frequency                            -0.023026
global_aoa * global_letters_count                        -0.012118
global_aoa * global_orthographic_density                  0.016090
global_aoa * global_synonyms_count                        0.077847
global_aoa * rel_aoa                                     -0.004238
global_aoa * rel_clustering                              -0.009263
global_aoa * rel_frequency                                0.019130
global_aoa * rel_letters_count                            0.033767
global_aoa * rel_orthographic_density                     0.019642
global_aoa * rel_synonyms_count                          -0.128264
global_clustering * global_frequency                      0.006700
global_clustering * global_letters_count                 -0.090894
global_clustering * global_orthographic_density           0.329052
global_clustering * global_synonyms_count                 0.237476
global_clustering * rel_aoa                              -0.061419
global_clustering * rel_clustering                       -0.026905
global_clustering * rel_frequency                        -0.082028
global_clustering * rel_letters_count                     0.131366
global_clustering * rel_orthographic_density             -0.203194
global_clustering * rel_synonyms_count                    0.009293
global_frequency * global_letters_count                  -0.057767
global_frequency * global_orthographic_density           -0.117326
global_frequency * global_synonyms_count                 -0.176847
global_frequency * rel_aoa                               -0.005917
global_frequency * rel_clustering                        -0.141968
global_frequency * rel_frequency                         -0.002176
global_frequency * rel_letters_count                      0.076163
global_frequency * rel_orthographic_density               0.088371
global_frequency * rel_synonyms_count                    -0.028580
global_letters_count * global_orthographic_density       -0.047038
global_letters_count * global_synonyms_count             -0.482796
global_letters_count * rel_aoa                           -0.016060
global_letters_count * rel_clustering                    -0.123893
global_letters_count * rel_frequency                     -0.039014
global_letters_count * rel_letters_count                 -0.027586
global_letters_count * rel_orthographic_density           0.025885
global_letters_count * rel_synonyms_count                 0.282163
global_orthographic_density * global_synonyms_count      -0.013454
global_orthographic_density * rel_aoa                    -0.008158
global_orthographic_density * rel_clustering             -0.447381
global_orthographic_density * rel_frequency              -0.027192
global_orthographic_density * rel_letters_count          -0.080136
global_orthographic_density * rel_orthographic_density   -0.106394
global_orthographic_density * rel_synonyms_count         -0.142264
global_synonyms_count * rel_aoa                          -0.106979
global_synonyms_count * rel_clustering                   -0.207514
global_synonyms_count * rel_frequency                     0.215563
global_synonyms_count * rel_letters_count                 0.376206
global_synonyms_count * rel_orthographic_density         -0.093659
global_synonyms_count * rel_synonyms_count               -0.066984
rel_aoa * rel_clustering                                  0.110107
rel_aoa * rel_frequency                                   0.017640
rel_aoa * rel_letters_count                               0.004891
rel_aoa * rel_orthographic_density                        0.009933
rel_aoa * rel_synonyms_count                              0.167717
rel_clustering * rel_frequency                            0.160657
rel_clustering * rel_letters_count                        0.002018
rel_clustering * rel_orthographic_density                 0.251641
rel_clustering * rel_synonyms_count                      -0.041121
rel_frequency * rel_letters_count                         0.000650
rel_frequency * rel_orthographic_density                  0.016372
rel_frequency * rel_synonyms_count                       -0.031945
rel_letters_count * rel_orthographic_density             -0.006171
rel_letters_count * rel_synonyms_count                   -0.263722
rel_orthographic_density * rel_synonyms_count             0.211542
dtype: float64

Regressing rel orthographic_density with 702 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1475939954963369

intercept                      2.255868
global_aoa                    -0.044492
global_clustering              0.046165
global_frequency              -0.066259
global_letters_count          -0.090103
global_orthographic_density   -0.530227
global_synonyms_count          0.063824
rel_aoa                        0.036150
rel_clustering                -0.045878
rel_frequency                  0.069241
rel_letters_count              0.027169
rel_orthographic_density       0.730601
rel_synonyms_count            -0.012228
dtype: float64

Regressing rel orthographic_density with 702 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.21098102428985244

intercept                                                -2.865236
global_aoa                                               -0.049936
global_clustering                                         0.630553
global_frequency                                          0.447039
global_letters_count                                     -0.223931
global_orthographic_density                               2.504581
global_synonyms_count                                     4.045263
rel_aoa                                                  -0.116245
rel_clustering                                            1.973197
rel_frequency                                            -0.334381
rel_letters_count                                         0.115605
rel_orthographic_density                                 -1.359082
rel_synonyms_count                                        0.944649
global_aoa * global_clustering                           -0.063505
global_aoa * global_frequency                            -0.023341
global_aoa * global_letters_count                        -0.011942
global_aoa * global_orthographic_density                 -0.025729
global_aoa * global_synonyms_count                        0.036089
global_aoa * rel_aoa                                     -0.003537
global_aoa * rel_clustering                              -0.023633
global_aoa * rel_frequency                                0.021217
global_aoa * rel_letters_count                            0.038948
global_aoa * rel_orthographic_density                     0.059740
global_aoa * rel_synonyms_count                          -0.081834
global_clustering * global_frequency                     -0.029580
global_clustering * global_letters_count                 -0.133880
global_clustering * global_orthographic_density           0.233916
global_clustering * global_synonyms_count                 0.165153
global_clustering * rel_aoa                              -0.061681
global_clustering * rel_clustering                       -0.014173
global_clustering * rel_frequency                        -0.045061
global_clustering * rel_letters_count                     0.150785
global_clustering * rel_orthographic_density             -0.102372
global_clustering * rel_synonyms_count                    0.068010
global_frequency * global_letters_count                  -0.047249
global_frequency * global_orthographic_density           -0.139104
global_frequency * global_synonyms_count                 -0.120683
global_frequency * rel_aoa                               -0.004866
global_frequency * rel_clustering                        -0.097593
global_frequency * rel_frequency                         -0.001489
global_frequency * rel_letters_count                      0.068192
global_frequency * rel_orthographic_density               0.125399
global_frequency * rel_synonyms_count                    -0.065934
global_letters_count * global_orthographic_density        0.017163
global_letters_count * global_synonyms_count             -0.377116
global_letters_count * rel_aoa                           -0.017302
global_letters_count * rel_clustering                    -0.035559
global_letters_count * rel_frequency                     -0.024773
global_letters_count * rel_letters_count                 -0.020174
global_letters_count * rel_orthographic_density          -0.003741
global_letters_count * rel_synonyms_count                 0.207163
global_orthographic_density * global_synonyms_count      -0.067571
global_orthographic_density * rel_aoa                    -0.024047
global_orthographic_density * rel_clustering             -0.281447
global_orthographic_density * rel_frequency               0.019353
global_orthographic_density * rel_letters_count          -0.092256
global_orthographic_density * rel_orthographic_density   -0.071488
global_orthographic_density * rel_synonyms_count         -0.066453
global_synonyms_count * rel_aoa                          -0.091325
global_synonyms_count * rel_clustering                   -0.080340
global_synonyms_count * rel_frequency                     0.171766
global_synonyms_count * rel_letters_count                 0.264641
global_synonyms_count * rel_orthographic_density         -0.106547
global_synonyms_count * rel_synonyms_count               -0.066587
rel_aoa * rel_clustering                                  0.107288
rel_aoa * rel_frequency                                   0.010940
rel_aoa * rel_letters_count                              -0.000879
rel_aoa * rel_orthographic_density                        0.017901
rel_aoa * rel_synonyms_count                              0.148422
rel_clustering * rel_frequency                            0.113703
rel_clustering * rel_letters_count                       -0.061361
rel_clustering * rel_orthographic_density                 0.083284
rel_clustering * rel_synonyms_count                      -0.166950
rel_frequency * rel_letters_count                        -0.014558
rel_frequency * rel_orthographic_density                 -0.031017
rel_frequency * rel_synonyms_count                       -0.008256
rel_letters_count * rel_orthographic_density             -0.015666
rel_letters_count * rel_synonyms_count                   -0.167148
rel_orthographic_density * rel_synonyms_count             0.205311
dtype: float64