Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, every figure is also written to disk (path built from
# `settings.FIGURE`) in addition to being displayed inline.
SAVE_FIGURES = False

# Features shown in the "paper-*" figures (a subset of all features).
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# Not used in the cells visible here — presumably the number of PCA
# components used further down (TODO confirm).
N_COMPONENTS = 3

# Number of bins first tried when binning source feature values; the
# plotting functions fall back to fewer bins when binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Initialise database access through the project helper; the returned object
# is kept as `engine` (it is not referenced again in the cells visible here).
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed below.
model = Model(time=Time.discrete, source=Source.all, past=Past.last_bin,
              durl=Durl.all, max_distance=1)

# First pass: only collect the ids of the matching substitutions.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Second pass: one record per (substitution, feature) pair. Each
# substitution is loaded in its own session.
records = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Raw and sentence-relative feature values of both words.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            record = {
                # Identification of the substitution.
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                # Observed feature values.
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis values: H_0 (plain feature average) and
                # H_00 (average over synonyms of the source word).
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                    feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                    feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            }
            records.append(record)

original_variations = pd.DataFrame(records)
del records


Got 26478 substitutions for model Model(time=Time.discrete, source=Source.all, past=Past.last_bin, durl=Durl.all, max_distance=1)
100% (26478 of 26478) |####################| Elapsed Time: 0:06:13 Time: 0:06:13

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values (as opposed to identifiers / group keys).
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-stage averaging: first over rows sharing the same
# (destination, occurrence, position, feature), then over each cluster, so
# that every cluster contributes a single point per feature.
# The value columns are selected with a *list*: indexing a GroupBy with
# several bare keys (an implicit tuple) is deprecated in pandas 1.x and
# raises in pandas 2.x. The grouping key 'feature' is not selected as a value
# column; `as_index=False` already returns it as a regular column.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# Note that the 'variation' column computed above is left untouched.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Order the features by the docstring of their transformed-feature function
# (the same text that `plot_grid` uses as subplot title).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted first with absolute (global) feature values, then with sentence-relative feature values; in each case with fixed-width bins first, then with quantile bins.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared with the mean of `h0`
    (row ``H_0``) and with the mean of `h0n` (row ``H_00``). A cell shows
    ``*``, ``**`` or ``***`` when the observed mean lies outside the
    Student-t confidence interval around the null mean at the .05, .01 or
    .001 level respectively, and ``ns.`` otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like
        Zero-based bin label of each observation. Float labels and NaN
        (observations that fell in no bin) are accepted; NaN entries are
        ignored.
    h0, h0n, values : array-like
        Null-hypothesis values and observed values, aligned with `bins`
        and supporting boolean-mask indexing (numpy arrays or pandas Series).

    Returns
    -------
    None
        The table is written to standard output.
    """
    # Bin labels may be floats (possibly with NaN), e.g. when produced by
    # `pd.cut(..., labels=False)` on data with missing values; `range` and
    # `np.zeros` need a plain int.
    bin_count = int(np.nanmax(bins)) + 1

    rule = '-' * len(name)
    print()
    print(rule)
    print(name)
    print(rule)
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    alphas = [.05, .01, .001]
    masks = [bins == i for i in range(bin_count)]

    # Observed means and CI half-widths do not depend on the null hypothesis,
    # so compute them once for both table rows.
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        in_bin = values[indices]
        bin_values[i] = in_bin.mean()
        spread = in_bin.std(ddof=1)
        for j, alpha in enumerate(alphas):
            # NOTE(review): the usual standard error is std(ddof=1)/sqrt(n);
            # the sqrt(n - 1) used here gives slightly wider intervals. Kept
            # unchanged so that the plotted CIs and this table stay in sync.
            cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                         * spread
                         / np.sqrt(n - 1))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks],
                             dtype=float)

        # differences[i, j] is True when bin i's observed mean lies outside
        # the CI of level alphas[j] centred on the null mean.
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))

        row = null_name + ' |'
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level passed decides the number of stars; pad to
                # a fixed width of three characters.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            row += ' ' + bin_stars + ' |'
        print(row)

In [7]:
def plot_variation(**kwargs):
    """Plot binned destination feature values against source feature values.

    Draws on the current matplotlib axes, so it can be used through
    `FacetGrid.map_dataframe`. Four curves are drawn over the source-value
    bins: the mean destination value with a 95% confidence band, the mean
    H_0 value, the mean H_00 value, and the line y = x. A significance table
    is then printed with `print_significance`.

    Keyword arguments
    -----------------
    data : DataFrame
        Rows to plot, with columns `source`, `destination`, `h0`, `h0n`
        (or their `_rel` counterparts when `relative` is true) and the
        `feature_field` column. The `feature_field` value of the first row
        names the significance table.
    color : matplotlib color, default 'blue'
    relative : bool, default False
        Use the sentence-relative (`*_rel`) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` (quantile bins) instead of `pd.cut`
        (fixed-width bins).
    feature_field : str, default 'feature'

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # whenever the cut fails (e.g. non-unique quantile edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, `bins` would be unbound below and surface as a
        # confusing NameError.
        raise ValueError("Could not bin '{}' into between 1 and {} bins"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # NOTE(review): the usual standard error is std(ddof=1)/sqrt(n);
        # kept as-is to stay consistent with `print_significance`.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    null_color = sb.desaturate(color, 0.2)
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=null_color, alpha=0.2)
    plt.plot(middles, h0s, '--', color=null_color,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=null_color,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=null_color,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently, so a null curve exceeding the range on both sides is
    # not clipped at the top.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one subplot per feature, three per row, with `plot_function`.

    Parameters
    ----------
    data : DataFrame
        Data to plot; only rows whose `feature_field` is in `features` are
        kept.
    features : list of str
        Features to plot, in subplot order.
    filename : str
        Name substituted into `settings.FIGURE` when SAVE_FIGURES is set.
    plot_function : callable
        Passed to `FacetGrid.map_dataframe`; receives `data`, `color` and
        `plot_kws` as keyword arguments.
    xlabel, ylabel : str
        Axis labels applied to every subplot.
    feature_field : str, default 'feature'
        Column of `data` holding the feature name.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    # NOTE(review): `size=` was renamed `height=` in seaborn 0.9; left as-is
    # because the seaborn version this notebook runs on is not known here.
    g = sb.FacetGrid(data=data[data[feature_field]
                               .map(lambda f: f in features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        # Skip if nothing was plotted on these axes. The handles are checked
        # explicitly: depending on the matplotlib version, `ax.legend()` on
        # empty axes returns either None or an empty Legend object.
        handles, _ = ax.get_legend_handles_labels()
        if not handles:
            continue
        legend = ax.legend(frameon=True, loc='best')
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name by its human-readable description.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (observed minus H_00) of one feature on `ax`.

    Source values are binned; for each bin the mean destination value minus
    the mean H_00 value is plotted, divided by the absolute mean of the
    binned H_0 values. Bins are spread evenly over [0, 1] on the x axis so
    that different features can be overlaid.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    data : DataFrame
        Rows of a single feature (the `feature` value of the first row is
        used for the legend label), with columns `source`, `destination`,
        `h0`, `h0n` or their `_rel` counterparts.
    color : matplotlib color
    ci : bool, default True
        Also draw the 95% confidence band.
    relative : bool, default False
        Use the sentence-relative (`*_rel`) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of `pd.cut`.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # whenever the cut fails (e.g. non-unique quantile edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins = cut(x, bin_count, labels=False, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, `x_bins` would be unbound below and surface as a
        # confusing NameError.
        raise ValueError("Could not bin '{}' into between 1 and {} bins"
                         .format('source' + rel, BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # NOTE(review): the usual standard error is std(ddof=1)/sqrt(n);
        # kept as-is to stay consistent with the other plots.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    # NOTE(review): `scale` can be close to zero, notably for
    # sentence-relative values; the curve then blows up.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None,
                 ylim=None):
    """Overlay one curve per feature on a single set of axes.

    Parameters
    ----------
    data : DataFrame
        Data to plot; must have a `feature` column. Rows with any missing
        value are dropped before plotting each feature.
    features : list of str
        Features to overlay, one color each.
    filename : str
        Name substituted into `settings.FIGURE` when SAVE_FIGURES is set.
    palette_name : str
        Seaborn palette from which `len(features)` colors are drawn.
    plot_function : callable
        Called as `plot_function(ax, feature_data, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Axes title and labels.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    ylim : (float, float), optional
        y-axis limits, applied *before* the figure is saved. Limits set by
        the caller on the returned axes only affect the inline display,
        not the saved file.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if ylim is not None:
        ax.set_ylim(*ylim)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, global feature values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | ns. | *** | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | *** | **  |
H_00 | *** | *   | **  | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *** | ns. | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | **  | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *   | ns. | *   |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias of all features, global values, fixed-width bins, no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False});



In [13]:
# Paper features only, global feature values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *   | **  | *   |

In [14]:
# Bias of the paper features, global values, fixed-width bins, with CIs.
# NOTE: the y-limits are set on the returned axes, i.e. after plot_overlay
# has already saved the figure when SAVE_FIGURES is on.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, global feature values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | ns. | ns. | *** |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | *** | ns. | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | **  | ns. | **  |

In [16]:
# Bias of all features, global values, quantile bins, no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# Paper features only, global feature values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *** |

In [18]:
# Bias of the paper features, global values, quantile bins, with CIs.
# NOTE: the y-limits are set on the returned axes, i.e. after plot_overlay
# has already saved the figure when SAVE_FIGURES is on.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative feature values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *** | *** | *** | **  |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | ns. | *** | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | **  | **  | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | *   | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | ns. |
H_00 | *** | *** | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | **  |
H_00 | *** | *** | ns. | ns. |

In [20]:
# Bias of all features, sentence-relative values, fixed-width bins,
# no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True});



In [21]:
# Paper features only, sentence-relative feature values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | ns. |
H_00 | *** | *** | *   | ns. |

In [22]:
# Bias of the paper features, sentence-relative values, fixed-width bins,
# with CIs.
# NOTE: the y-limits are set on the returned axes, i.e. after plot_overlay
# has already saved the figure when SAVE_FIGURES is on.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative feature values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | ns. | ns. | *** |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | ns. | *   | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *   | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *   |

In [24]:
# Bias of all features, sentence-relative values, quantile bins,
# no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features only, sentence-relative feature values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *** | ns. | *   |

In [26]:
# Bias of the paper features, sentence-relative values, quantile bins,
# with CIs. Uses the 'deep' palette like the other PAPER_FEATURES overlays,
# so that each feature keeps the same color across the paper figures.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how do their effects interact. Especially, we want to know who wins between cognitive bias, attraction to sentence average, or attraction to global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that looks huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Stream plot of substitutions in (absolute, sentence-relative) space.

    Written to be called through `FacetGrid.map_dataframe`, so everything
    comes in as keyword arguments.

    Source words are binned on a regular `bin_count` x `bin_count` grid:
    absolute feature value on the x axis, sentence-relative feature value
    on the y axis. In each grid cell the arrow is the mean
    (destination - source) displacement along both axes, and the colour is
    the mean length of the individual displacements. Two reference lines
    are added: y = 0, and x = h0 (the first `h0` value of the data).

    Keyword arguments:
    data -- DataFrame with `source`, `source_rel`, `destination`,
            `destination_rel` and `h0` columns (required)
    color -- base colour of the reference lines, desaturated before use
             (default 'blue')
    bin_count -- number of bins along each axis (default 4)

    Grid cells that contain no substitution get NaN values.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning: equal-width, left-closed bins on both axes. The bin
    # middles are the grid coordinates handed to streamplot.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Per-substitution displacements, computed once for the whole grid
    # instead of once per cell.
    delta = dest - source
    delta_rel = dest_rel - source_rel
    length = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute bin values. Arrays are indexed [y, x] (row = y bin), which is
    # the layout streamplot expects for its u/v grids.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_x = x_bins == x
        for y in range(bin_count):
            cell = in_x & (y_bins == y)
            u_values[y, x] = delta[cell].mean()
            v_values[y, x] = delta_rel[cell].mean()
            strength[y, x] = length[cell].mean()

    # Plot the streams, then the two reference lines.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    # Restrict the view to the span of the bin middles, i.e. the region
    # where the vector field is defined.
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature; each facet keeps its own axis ranges since
# neither axis is shared.
g = sb.FacetGrid(data=variations, col='feature', col_wrap=3,
                 col_order=ordered_features,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False, size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes with nothing plotted on them are left untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; swap it
        # for the docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', col_wrap=3,
                 col_order=PAPER_FEATURES,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False, size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes with nothing plotted on them are left untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; swap it
        # for the docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on per-cluster feature variations, keeping only the clusters
# for which every feature has a variation value.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of the components plotted below (last expression = cell output).
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(data=pca.components_[:N_COMPONENTS],
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)],
             columns=pcafeatures)


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.53006544  0.17932205  0.08141898  0.07051413  0.03214494  0.030518
  0.02116978  0.01834918  0.01486674  0.00951714]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.520267 0.261026 -0.085568 0.229056 0.251848 -0.428476 0.193606 0.262045 -0.386474 0.271120 -0.159254 0.000484
Component-1 0.326613 -0.400405 0.112798 -0.277205 -0.243586 -0.428554 0.161222 -0.279969 -0.467448 0.223314 -0.164824 0.022861
Component-2 -0.601661 -0.657915 0.102492 -0.198629 0.293584 0.143075 0.004068 -0.187559 0.074046 -0.054288 0.050142 0.048905

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# One row per (substitution, component): the source and destination values
# of the component, plus the two component averages stored as h0 and h0n.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            ))

original_component_variations = pd.DataFrame(data)
del data


100% (26478 of 26478) |####################| Elapsed Time: 0:04:25 Time: 0:04:25

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two steps: first over rows sharing the same destination,
# occurrence, position and component, then over each (cluster, component).
# The columns are selected with a list: indexing a GroupBy with a bare
# tuple of names is deprecated (and eventually rejected) by newer pandas.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One facet per PCA component, drawn with plot_variation.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False, size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Axes with nothing plotted on them are left untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | **  | **  | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

4.2 On a subset of relevant features


In [34]:
# Subset of features on which the PCA of this section is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit the PCA on per-cluster variations of the relevant features only,
# keeping the clusters for which all of them have a variation value.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of every retained component (last expression = cell output).
pd.DataFrame(data=pca.components_,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)],
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.66788175  0.20182029]

Out[35]:
aoa frequency letters_count
Component-0 -0.752382 0.394130 -0.527809
Component-1 0.325555 -0.474083 -0.818083

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# One row per (substitution, component): the source and destination values
# of the component, plus the two component averages stored as h0 and h0n.
# Every component retained by the PCA is used here.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            ))

original_component_variations = pd.DataFrame(data)
del data


100% (26478 of 26478) |####################| Elapsed Time: 0:02:53 Time: 0:02:53

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two steps: first over rows sharing the same destination,
# occurrence, position and component, then over each (cluster, component).
# The columns are selected with a list: indexing a GroupBy with a bare
# tuple of names is deprecated (and eventually rejected) by newer pandas.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One facet per PCA component, drawn with plot_variation.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False, size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes with nothing plotted on them are left untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# How many words is each relevant feature defined on?
for feature in relevant_features:
    tfeature = Substitution._transformed_feature(feature)
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Pool the words known to any of the relevant features, then tabulate every
# feature value for every pooled word; rows without missing values are the
# words for which all relevant features are defined.
words = set()
for feature in relevant_features:
    words.update(Substitution._transformed_feature(feature)())

words_list = list(words)
data = {feature: [] for feature in relevant_features}
for word in words_list:
    for feature in relevant_features:
        tfeature = Substitution._transformed_feature(feature)
        data[feature].append(tfeature(word))
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact computed on {} of them "
      "(those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1002 (cluster-unique) substitutions, but the PCA is in fact computed on 786 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# (label, binning function) pairs tried by the Anova cells. Only fixed-width
# binning is active; the quantile variant is commented out at the end of the
# line.
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
# (label, column-name suffix) pairs: the suffix is appended to 'source' /
# 'destination' to pick the global or the sentence-relative value column.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the 3-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since every comparison with
    NaN is false).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# Pivot each value column once (cluster_id x feature). The loops below only
# pick feature columns out of these tables instead of re-pivoting
# `variations` for every combination.
pivots = {
    column: variations.pivot(index='cluster_id', columns='feature',
                             values=column)
    for column in (['source' + rel for _, rel in rels] +
                   ['destination' + rel for _, rel in rels])
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivots['source' + rel1][feature1]
                destination = pivots['destination' + rel2][feature2]

                # Compute binning, retrying with fewer bins whenever `cut`
                # raises ValueError for the current bin count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: say so rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                # One-way Anova of feature2's destination values across
                # the bins of feature1's source values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot each value column once (cluster_id x feature). The loops below only
# pick feature columns out of these tables instead of re-pivoting
# `variations` for every combination.
pivots = {
    column: variations.pivot(index='cluster_id', columns='feature',
                             values=column)
    for column in (['source' + rel for _, rel in rels] +
                   ['destination' + rel for _, rel in rels])
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivots['source' + rel1][feature1]
                # Despite its name, this holds feature2's variation
                # (destination - source), which is what is tested here.
                destination = pivots['destination' + rel2][feature2]\
                    - pivots['source' + rel2][feature2]

                # Compute binning, retrying with fewer bins whenever `cut`
                # raises ValueError for the current bin count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: say so rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                # One-way Anova of feature2's variation across the bins of
                # feature1's source values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps the "use sentence-relative values?" flag to a (label, column-name
# suffix) pair, as looked up by regress() below.
# NOTE(review): this rebinds `rels`, which an earlier cell defines as a list
# of (label, suffix) tuples iterated by the Anova cells; those cells will not
# work as-is if re-run after this one.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Linearly regress a feature's destination values on source features.

    The regression is fitted across clusters: `data` is pivoted on
    `cluster_id`, and only clusters with a value for every requested source
    column and for the target's destination column are kept. The number of
    samples, the R^2 score and the fitted coefficients are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with at least the columns `cluster_id`, `feature`,
        and the `source` / `source_rel` / `destination` / `destination_rel`
        columns selected by `source_rel` and `dest_rel`.
    features : iterable of str
        Names of the features whose source values are the regressors.
    target : str
        Name of the feature whose destination value is regressed.
    source_rel : bool or 'both'
        `False` uses the `source` column, `True` uses `source_rel`, and
        `'both'` uses both sets of columns as regressors.
    dest_rel : bool
        `False` regresses `destination`, `True` regresses `destination_rel`.
    interactions : bool
        If true, add all pairwise products of the regressors (degree-2,
        interaction-only polynomial features).

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If `source_rel` or `dest_rel` has an unsupported value, or if no
        cluster has complete data for the requested regression.
    """
    # Validate and normalise the relativeness arguments. The type is checked
    # explicitly because `1 in [True, False]` is true in Python, which would
    # let integers through and then silently treat them as 'both'.
    if isinstance(source_rel, bool):
        source_rels = [source_rel]
    elif isinstance(source_rel, str) and source_rel == 'both':
        source_rels = [False, True]
    else:
        raise ValueError("source_rel must be True, False or 'both', got {!r}"
                         .format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}"
                         .format(dest_rel))
    dest_rel_name, dest_suffix = rels[dest_rel]

    features = tuple(sorted(features))
    # Column keys into the pivoted source table, and matching printed names;
    # both lists are built in the same order.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rels
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rels
                     for feature in features]

    # Get source and destination values, one row per cluster.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rels],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination from `data` (not from a notebook-level global) so
    # the function really operates on its argument. `reindex` tolerates
    # clusters missing from the destination table; they become NaN and are
    # dropped, where `.loc` would raise a KeyError.
    destination = data[data['feature'] == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_suffix)[target]\
        .reindex(source.index).dropna()
    if len(destination) == 0:
        raise ValueError("No cluster has complete data to regress {!r}"
                         .format(target))
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the inputs it multiplies; the
        # all-zero powers row is the constant (bias) column.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress. With interactions the polynomial expansion already contains
    # a constant column, so the model must not fit a second intercept.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # `Series.append` was removed in pandas 2.0; `pd.concat` keeps the
        # same ordering (intercept first).
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For each paper feature, run every regression variant: each combination of
# source relativeness (global, sentence-relative, both) and destination
# relativeness, first without and then with interaction terms.
SOURCE_REL_OPTIONS = [False, True, 'both']
DEST_REL_OPTIONS = [False, True]

for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product(SOURCE_REL_OPTIONS,
                                        DEST_REL_OPTIONS):
        for with_interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=source_rel, dest_rel=dest_rel,
                    interactions=with_interactions)
            print()


----------------------------------------------------------------------
Regressing global frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.13458594534634838

intercept                      4.732901
global_aoa                     0.100633
global_clustering              0.068417
global_frequency               0.442892
global_letters_count          -0.032332
global_orthographic_density   -0.000199
global_synonyms_count         -0.078338
dtype: float64

Regressing global frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.170893743423971

intercept                                              13.053435
global_aoa                                             -0.572500
global_clustering                                       2.196326
global_frequency                                        0.034688
global_letters_count                                    0.095987
global_orthographic_density                             2.528096
global_synonyms_count                                   0.375394
global_aoa * global_clustering                         -0.100323
global_aoa * global_frequency                           0.008315
global_aoa * global_letters_count                       0.012507
global_aoa * global_orthographic_density               -0.096089
global_aoa * global_synonyms_count                      0.088756
global_clustering * global_frequency                   -0.155730
global_clustering * global_letters_count               -0.022429
global_clustering * global_orthographic_density         0.104592
global_clustering * global_synonyms_count               0.087098
global_frequency * global_letters_count                -0.043835
global_frequency * global_orthographic_density         -0.203181
global_frequency * global_synonyms_count               -0.027674
global_letters_count * global_orthographic_density      0.100170
global_letters_count * global_synonyms_count           -0.071316
global_orthographic_density * global_synonyms_count     0.120140
dtype: float64

Regressing rel frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.07562999107589308

intercept                     -6.613194
global_aoa                     0.129686
global_clustering              0.038649
global_frequency               0.377230
global_letters_count          -0.012955
global_orthographic_density   -0.094368
global_synonyms_count          0.045140
dtype: float64

Regressing rel frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.10551560094845724

intercept                                              4.082295
global_aoa                                            -0.866179
global_clustering                                      1.650276
global_frequency                                      -0.133788
global_letters_count                                  -0.335169
global_orthographic_density                            1.167313
global_synonyms_count                                 -0.080437
global_aoa * global_clustering                        -0.097650
global_aoa * global_frequency                          0.013542
global_aoa * global_letters_count                      0.049922
global_aoa * global_orthographic_density              -0.046679
global_aoa * global_synonyms_count                     0.106075
global_clustering * global_frequency                  -0.118693
global_clustering * global_letters_count              -0.007141
global_clustering * global_orthographic_density        0.092113
global_clustering * global_synonyms_count              0.284270
global_frequency * global_letters_count               -0.020741
global_frequency * global_orthographic_density        -0.136593
global_frequency * global_synonyms_count               0.100313
global_letters_count * global_orthographic_density     0.138223
global_letters_count * global_synonyms_count          -0.010780
global_orthographic_density * global_synonyms_count    0.212512
dtype: float64

Regressing global frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08154542308494228

intercept                   9.562106
rel_aoa                     0.101387
rel_clustering              0.031312
rel_frequency               0.287423
rel_letters_count          -0.034603
rel_orthographic_density    0.010887
rel_synonyms_count         -0.239911
dtype: float64

Regressing global frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.10606229826814495

intercept                                        9.408402
rel_aoa                                          0.137693
rel_clustering                                   0.016419
rel_frequency                                    0.244690
rel_letters_count                               -0.014864
rel_orthographic_density                        -0.256035
rel_synonyms_count                              -0.027917
rel_aoa * rel_clustering                        -0.077607
rel_aoa * rel_frequency                          0.014252
rel_aoa * rel_letters_count                     -0.017245
rel_aoa * rel_orthographic_density              -0.072429
rel_aoa * rel_synonyms_count                     0.072534
rel_clustering * rel_frequency                  -0.053153
rel_clustering * rel_letters_count              -0.043274
rel_clustering * rel_orthographic_density       -0.026559
rel_clustering * rel_synonyms_count              0.235719
rel_frequency * rel_letters_count               -0.022796
rel_frequency * rel_orthographic_density        -0.125298
rel_frequency * rel_synonyms_count               0.035312
rel_letters_count * rel_orthographic_density     0.023846
rel_letters_count * rel_synonyms_count          -0.172249
rel_orthographic_density * rel_synonyms_count   -0.188230
dtype: float64

Regressing rel frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3165306259587841

intercept                  -1.163446
rel_aoa                     0.095437
rel_clustering              0.180373
rel_frequency               0.654566
rel_letters_count          -0.106076
rel_orthographic_density   -0.184974
rel_synonyms_count         -0.113782
dtype: float64

Regressing rel frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3413519471665122

intercept                                       -1.196273
rel_aoa                                          0.115264
rel_clustering                                   0.161733
rel_frequency                                    0.687578
rel_letters_count                               -0.107333
rel_orthographic_density                        -0.504287
rel_synonyms_count                              -0.016866
rel_aoa * rel_clustering                        -0.105389
rel_aoa * rel_frequency                         -0.014090
rel_aoa * rel_letters_count                      0.004660
rel_aoa * rel_orthographic_density               0.025015
rel_aoa * rel_synonyms_count                     0.151518
rel_clustering * rel_frequency                  -0.054190
rel_clustering * rel_letters_count              -0.065974
rel_clustering * rel_orthographic_density       -0.107445
rel_clustering * rel_synonyms_count              0.230923
rel_frequency * rel_letters_count               -0.039088
rel_frequency * rel_orthographic_density        -0.114137
rel_frequency * rel_synonyms_count               0.024887
rel_letters_count * rel_orthographic_density     0.046534
rel_letters_count * rel_synonyms_count          -0.090211
rel_orthographic_density * rel_synonyms_count    0.046519
dtype: float64

Regressing global frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.14682942473613114

intercept                      2.600373
global_aoa                     0.048623
global_clustering             -0.095522
global_frequency               0.477730
global_letters_count           0.115126
global_orthographic_density    0.128606
global_synonyms_count          0.476346
rel_aoa                        0.066661
rel_clustering                 0.174989
rel_frequency                 -0.043909
rel_letters_count             -0.163481
rel_orthographic_density      -0.154911
rel_synonyms_count            -0.644758
dtype: float64

Regressing global frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.2734486285377

intercept                                                -31.312451
global_aoa                                                 0.487788
global_clustering                                         -6.063320
global_frequency                                           1.019127
global_letters_count                                       1.565928
global_orthographic_density                                7.251818
global_synonyms_count                                     10.209770
rel_aoa                                                    1.519201
rel_clustering                                             7.609259
rel_frequency                                             -1.973176
rel_letters_count                                         -1.264181
rel_orthographic_density                                   0.080753
rel_synonyms_count                                        -6.879203
global_aoa * global_clustering                             0.271333
global_aoa * global_frequency                              0.100831
global_aoa * global_letters_count                          0.025348
global_aoa * global_orthographic_density                   0.077263
global_aoa * global_synonyms_count                        -0.262201
global_aoa * rel_aoa                                      -0.017622
global_aoa * rel_clustering                               -0.265454
global_aoa * rel_frequency                                 0.047122
global_aoa * rel_letters_count                             0.007317
global_aoa * rel_orthographic_density                     -0.245236
global_aoa * rel_synonyms_count                            0.273872
global_clustering * global_frequency                       0.054697
global_clustering * global_letters_count                   0.079930
global_clustering * global_orthographic_density            1.342365
global_clustering * global_synonyms_count                  0.701642
global_clustering * rel_aoa                               -0.220217
global_clustering * rel_clustering                         0.169575
global_clustering * rel_frequency                         -0.234925
global_clustering * rel_letters_count                      0.057017
global_clustering * rel_orthographic_density              -0.769322
global_clustering * rel_synonyms_count                    -0.505825
global_frequency * global_letters_count                   -0.159051
global_frequency * global_orthographic_density            -0.053122
global_frequency * global_synonyms_count                  -0.584292
global_frequency * rel_aoa                                -0.216044
global_frequency * rel_clustering                         -0.047626
global_frequency * rel_frequency                           0.040442
global_frequency * rel_letters_count                       0.137898
global_frequency * rel_orthographic_density               -0.297945
global_frequency * rel_synonyms_count                      0.466604
global_letters_count * global_orthographic_density         0.127643
global_letters_count * global_synonyms_count               0.330127
global_letters_count * rel_aoa                            -0.062610
global_letters_count * rel_clustering                     -0.231260
global_letters_count * rel_frequency                      -0.005274
global_letters_count * rel_letters_count                   0.032556
global_letters_count * rel_orthographic_density           -0.049678
global_letters_count * rel_synonyms_count                 -0.655297
global_orthographic_density * global_synonyms_count        0.393843
global_orthographic_density * rel_aoa                     -0.111785
global_orthographic_density * rel_clustering              -1.247651
global_orthographic_density * rel_frequency               -0.049337
global_orthographic_density * rel_letters_count           -0.005983
global_orthographic_density * rel_orthographic_density     0.034096
global_orthographic_density * rel_synonyms_count          -0.196541
global_synonyms_count * rel_aoa                            0.286723
global_synonyms_count * rel_clustering                    -1.178420
global_synonyms_count * rel_frequency                      0.384614
global_synonyms_count * rel_letters_count                  0.204406
global_synonyms_count * rel_orthographic_density           0.345825
global_synonyms_count * rel_synonyms_count                -0.064036
rel_aoa * rel_clustering                                   0.084219
rel_aoa * rel_frequency                                    0.085279
rel_aoa * rel_letters_count                                0.041359
rel_aoa * rel_orthographic_density                         0.139123
rel_aoa * rel_synonyms_count                              -0.181705
rel_clustering * rel_frequency                             0.189849
rel_clustering * rel_letters_count                         0.070867
rel_clustering * rel_orthographic_density                  0.742465
rel_clustering * rel_synonyms_count                        1.371243
rel_frequency * rel_letters_count                          0.002353
rel_frequency * rel_orthographic_density                   0.237991
rel_frequency * rel_synonyms_count                        -0.176970
rel_letters_count * rel_orthographic_density               0.135490
rel_letters_count * rel_synonyms_count                     0.030819
rel_orthographic_density * rel_synonyms_count             -0.638591
dtype: float64

Regressing rel frequency with 609 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.393048930944016

intercept                      2.345890
global_aoa                     0.045457
global_clustering             -0.031667
global_frequency              -0.461495
global_letters_count           0.137902
global_orthographic_density    0.120582
global_synonyms_count          0.448356
rel_aoa                        0.053204
rel_clustering                 0.144242
rel_frequency                  0.931519
rel_letters_count             -0.183532
rel_orthographic_density      -0.129485
rel_synonyms_count            -0.569856
dtype: float64

Regressing rel frequency with 609 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4809117604616871

intercept                                                -34.347864
global_aoa                                                 0.605012
global_clustering                                         -6.246620
global_frequency                                           0.286469
global_letters_count                                       1.860699
global_orthographic_density                                7.255983
global_synonyms_count                                      9.835468
rel_aoa                                                    1.192516
rel_clustering                                             8.258826
rel_frequency                                             -1.115456
rel_letters_count                                         -1.829569
rel_orthographic_density                                  -0.368011
rel_synonyms_count                                        -6.606251
global_aoa * global_clustering                             0.258791
global_aoa * global_frequency                              0.087222
global_aoa * global_letters_count                          0.015840
global_aoa * global_orthographic_density                   0.068128
global_aoa * global_synonyms_count                        -0.248010
global_aoa * rel_aoa                                      -0.014158
global_aoa * rel_clustering                               -0.263579
global_aoa * rel_frequency                                 0.053210
global_aoa * rel_letters_count                             0.030670
global_aoa * rel_orthographic_density                     -0.207683
global_aoa * rel_synonyms_count                            0.260415
global_clustering * global_frequency                       0.088992
global_clustering * global_letters_count                   0.159601
global_clustering * global_orthographic_density            1.223483
global_clustering * global_synonyms_count                  0.540761
global_clustering * rel_aoa                               -0.227072
global_clustering * rel_clustering                         0.133520
global_clustering * rel_frequency                         -0.230076
global_clustering * rel_letters_count                     -0.003043
global_clustering * rel_orthographic_density              -0.644806
global_clustering * rel_synonyms_count                    -0.370721
global_frequency * global_letters_count                   -0.114333
global_frequency * global_orthographic_density            -0.086468
global_frequency * global_synonyms_count                  -0.573994
global_frequency * rel_aoa                                -0.194417
global_frequency * rel_clustering                         -0.118255
global_frequency * rel_frequency                           0.055515
global_frequency * rel_letters_count                       0.119777
global_frequency * rel_orthographic_density               -0.236793
global_frequency * rel_synonyms_count                      0.438164
global_letters_count * global_orthographic_density         0.072959
global_letters_count * global_synonyms_count               0.180928
global_letters_count * rel_aoa                            -0.058704
global_letters_count * rel_clustering                     -0.316928
global_letters_count * rel_frequency                      -0.025912
global_letters_count * rel_letters_count                   0.032225
global_letters_count * rel_orthographic_density            0.017767
global_letters_count * rel_synonyms_count                 -0.474261
global_orthographic_density * global_synonyms_count        0.252785
global_orthographic_density * rel_aoa                     -0.114549
global_orthographic_density * rel_clustering              -1.196371
global_orthographic_density * rel_frequency               -0.008583
global_orthographic_density * rel_letters_count            0.077960
global_orthographic_density * rel_orthographic_density     0.054375
global_orthographic_density * rel_synonyms_count          -0.037434
global_synonyms_count * rel_aoa                            0.269803
global_synonyms_count * rel_clustering                    -1.026888
global_synonyms_count * rel_frequency                      0.346105
global_synonyms_count * rel_letters_count                  0.275920
global_synonyms_count * rel_orthographic_density           0.339363
global_synonyms_count * rel_synonyms_count                -0.059588
rel_aoa * rel_clustering                                   0.099921
rel_aoa * rel_frequency                                    0.062805
rel_aoa * rel_letters_count                                0.019094
rel_aoa * rel_orthographic_density                         0.119023
rel_aoa * rel_synonyms_count                              -0.168754
rel_clustering * rel_frequency                             0.219882
rel_clustering * rel_letters_count                         0.140015
rel_clustering * rel_orthographic_density                  0.660464
rel_clustering * rel_synonyms_count                        1.219103
rel_frequency * rel_letters_count                          0.006125
rel_frequency * rel_orthographic_density                   0.178708
rel_frequency * rel_synonyms_count                        -0.124701
rel_letters_count * rel_orthographic_density               0.045916
rel_letters_count * rel_synonyms_count                    -0.068240
rel_orthographic_density * rel_synonyms_count             -0.651354
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 565 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.19139830506455813

intercept                      5.105384
global_aoa                     0.390833
global_clustering              0.038403
global_frequency              -0.026923
global_letters_count           0.025135
global_orthographic_density   -0.178582
global_synonyms_count          0.057076
dtype: float64

Regressing global aoa with 565 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.2214333087567271

intercept                                              9.298254
global_aoa                                             0.825377
global_clustering                                      0.214761
global_frequency                                       0.069873
global_letters_count                                  -0.849158
global_orthographic_density                           -4.031308
global_synonyms_count                                 -2.072774
global_aoa * global_clustering                         0.041490
global_aoa * global_frequency                         -0.044204
global_aoa * global_letters_count                      0.019265
global_aoa * global_orthographic_density               0.063165
global_aoa * global_synonyms_count                    -0.018955
global_clustering * global_frequency                   0.050784
global_clustering * global_letters_count              -0.041880
global_clustering * global_orthographic_density       -0.391421
global_clustering * global_synonyms_count             -0.528054
global_frequency * global_letters_count                0.057550
global_frequency * global_orthographic_density         0.141453
global_frequency * global_synonyms_count              -0.114748
global_letters_count * global_orthographic_density    -0.056899
global_letters_count * global_synonyms_count           0.002450
global_orthographic_density * global_synonyms_count    0.151615
dtype: float64

Regressing rel aoa with 565 measures, no interactions
           ^^^^^^^
R^2 = 0.0678952897488978

intercept                      0.813199
global_aoa                     0.174962
global_clustering              0.116709
global_frequency              -0.096094
global_letters_count           0.058135
global_orthographic_density    0.027109
global_synonyms_count         -0.001114
dtype: float64

Regressing rel aoa with 565 measures, with interactions
           ^^^^^^^
R^2 = 0.11811073078286216

intercept                                              1.071320
global_aoa                                             1.669151
global_clustering                                      0.558256
global_frequency                                       0.083799
global_letters_count                                  -0.878332
global_orthographic_density                           -2.924725
global_synonyms_count                                 -0.741151
global_aoa * global_clustering                         0.106543
global_aoa * global_frequency                         -0.081555
global_aoa * global_letters_count                     -0.032807
global_aoa * global_orthographic_density               0.029616
global_aoa * global_synonyms_count                    -0.008694
global_clustering * global_frequency                   0.018353
global_clustering * global_letters_count              -0.107085
global_clustering * global_orthographic_density       -0.414535
global_clustering * global_synonyms_count             -0.472372
global_frequency * global_letters_count                0.073170
global_frequency * global_orthographic_density         0.081519
global_frequency * global_synonyms_count              -0.223603
global_letters_count * global_orthographic_density    -0.093002
global_letters_count * global_synonyms_count          -0.034510
global_orthographic_density * global_synonyms_count    0.176714
dtype: float64

Regressing global aoa with 565 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.0984118386068984

intercept                   6.799396
rel_aoa                     0.205942
rel_clustering              0.179137
rel_frequency               0.093853
rel_letters_count          -0.002312
rel_orthographic_density   -0.522313
rel_synonyms_count          0.109088
dtype: float64

Regressing global aoa with 565 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.160108624268134

intercept                                        6.693246
rel_aoa                                         -0.099218
rel_clustering                                   0.229911
rel_frequency                                    0.109146
rel_letters_count                                0.090340
rel_orthographic_density                        -0.357041
rel_synonyms_count                               0.389927
rel_aoa * rel_clustering                         0.163234
rel_aoa * rel_frequency                         -0.112174
rel_aoa * rel_letters_count                      0.001742
rel_aoa * rel_orthographic_density               0.084286
rel_aoa * rel_synonyms_count                    -0.008098
rel_clustering * rel_frequency                   0.116916
rel_clustering * rel_letters_count               0.009178
rel_clustering * rel_orthographic_density       -0.183921
rel_clustering * rel_synonyms_count             -0.553668
rel_frequency * rel_letters_count                0.023982
rel_frequency * rel_orthographic_density         0.079630
rel_frequency * rel_synonyms_count              -0.115396
rel_letters_count * rel_orthographic_density     0.014857
rel_letters_count * rel_synonyms_count           0.080958
rel_orthographic_density * rel_synonyms_count    0.682570
dtype: float64

Regressing rel aoa with 565 measures, no interactions
           ^^^^^^^
R^2 = 0.28475110276758064

intercept                   0.469942
rel_aoa                     0.553270
rel_clustering              0.015290
rel_frequency              -0.055666
rel_letters_count           0.006858
rel_orthographic_density    0.038755
rel_synonyms_count          0.034713
dtype: float64

Regressing rel aoa with 565 measures, with interactions
           ^^^^^^^
R^2 = 0.3175760564533403

intercept                                        0.619068
rel_aoa                                          0.473297
rel_clustering                                  -0.112177
rel_frequency                                    0.032841
rel_letters_count                                0.006404
rel_orthographic_density                         0.359374
rel_synonyms_count                               0.293482
rel_aoa * rel_clustering                         0.069385
rel_aoa * rel_frequency                         -0.048337
rel_aoa * rel_letters_count                     -0.049335
rel_aoa * rel_orthographic_density              -0.031068
rel_aoa * rel_synonyms_count                     0.000274
rel_clustering * rel_frequency                   0.003495
rel_clustering * rel_letters_count              -0.039720
rel_clustering * rel_orthographic_density       -0.208399
rel_clustering * rel_synonyms_count             -0.376746
rel_frequency * rel_letters_count               -0.001497
rel_frequency * rel_orthographic_density         0.062460
rel_frequency * rel_synonyms_count              -0.073631
rel_letters_count * rel_orthographic_density    -0.051047
rel_letters_count * rel_synonyms_count           0.029907
rel_orthographic_density * rel_synonyms_count    0.422732
dtype: float64

Regressing global aoa with 565 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.20618438075284662

intercept                      4.681468
global_aoa                     0.468075
global_clustering              0.036994
global_frequency              -0.098164
global_letters_count           0.134727
global_orthographic_density    0.025013
global_synonyms_count         -0.225781
rel_aoa                       -0.123991
rel_clustering                 0.012308
rel_frequency                  0.071599
rel_letters_count             -0.113844
rel_orthographic_density      -0.182201
rel_synonyms_count             0.311203
dtype: float64

Regressing global aoa with 565 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.3536216511866521

intercept                                                 90.007287
global_aoa                                                -0.394137
global_clustering                                         10.957295
global_frequency                                          -2.875137
global_letters_count                                      -4.996187
global_orthographic_density                              -18.897225
global_synonyms_count                                    -11.902075
rel_aoa                                                    0.098614
rel_clustering                                            -8.222546
rel_frequency                                              3.598513
rel_letters_count                                          3.110634
rel_orthographic_density                                  13.056371
rel_synonyms_count                                         0.035866
global_aoa * global_clustering                            -0.125733
global_aoa * global_frequency                             -0.075716
global_aoa * global_letters_count                          0.104930
global_aoa * global_orthographic_density                   0.155530
global_aoa * global_synonyms_count                        -0.076248
global_aoa * rel_aoa                                       0.004226
global_aoa * rel_clustering                                0.017372
global_aoa * rel_frequency                                -0.028351
global_aoa * rel_letters_count                            -0.070814
global_aoa * rel_orthographic_density                     -0.054003
global_aoa * rel_synonyms_count                            0.308761
global_clustering * global_frequency                      -0.227725
global_clustering * global_letters_count                  -0.114922
global_clustering * global_orthographic_density           -2.746006
global_clustering * global_synonyms_count                 -1.603176
global_clustering * rel_aoa                                0.222865
global_clustering * rel_clustering                         0.071571
global_clustering * rel_frequency                          0.480019
global_clustering * rel_letters_count                     -0.144446
global_clustering * rel_orthographic_density               2.048148
global_clustering * rel_synonyms_count                     0.323020
global_frequency * global_letters_count                    0.368812
global_frequency * global_orthographic_density             0.176054
global_frequency * global_synonyms_count                   0.186384
global_frequency * rel_aoa                                 0.176895
global_frequency * rel_clustering                          0.090258
global_frequency * rel_frequency                          -0.009871
global_frequency * rel_letters_count                      -0.353419
global_frequency * rel_orthographic_density               -0.077192
global_frequency * rel_synonyms_count                      0.164350
global_letters_count * global_orthographic_density        -0.023643
global_letters_count * global_synonyms_count              -0.096465
global_letters_count * rel_aoa                            -0.086935
global_letters_count * rel_clustering                      0.307848
global_letters_count * rel_frequency                      -0.118567
global_letters_count * rel_letters_count                   0.031388
global_letters_count * rel_orthographic_density            0.021441
global_letters_count * rel_synonyms_count                  0.273287
global_orthographic_density * global_synonyms_count       -0.041671
global_orthographic_density * rel_aoa                     -0.157876
global_orthographic_density * rel_clustering               2.411302
global_orthographic_density * rel_frequency                0.150747
global_orthographic_density * rel_letters_count            0.047325
global_orthographic_density * rel_orthographic_density     0.261527
global_orthographic_density * rel_synonyms_count          -0.858772
global_synonyms_count * rel_aoa                           -0.421292
global_synonyms_count * rel_clustering                     1.565240
global_synonyms_count * rel_frequency                     -0.591566
global_synonyms_count * rel_letters_count                 -0.617610
global_synonyms_count * rel_orthographic_density          -1.100774
global_synonyms_count * rel_synonyms_count                 0.176364
rel_aoa * rel_clustering                                   0.060139
rel_aoa * rel_frequency                                   -0.135251
rel_aoa * rel_letters_count                                0.017819
rel_aoa * rel_orthographic_density                         0.107668
rel_aoa * rel_synonyms_count                               0.215139
rel_clustering * rel_frequency                            -0.262616
rel_clustering * rel_letters_count                        -0.046375
rel_clustering * rel_orthographic_density                 -1.827053
rel_clustering * rel_synonyms_count                       -1.184241
rel_frequency * rel_letters_count                          0.153907
rel_frequency * rel_orthographic_density                  -0.072931
rel_frequency * rel_synonyms_count                         0.071448
rel_letters_count * rel_orthographic_density               0.022351
rel_letters_count * rel_synonyms_count                     0.365355
rel_orthographic_density * rel_synonyms_count              2.302797
dtype: float64

Regressing rel aoa with 565 measures, no interactions
           ^^^^^^^
R^2 = 0.33607935868805916

intercept                      3.816291
global_aoa                    -0.396051
global_clustering              0.023583
global_frequency              -0.101884
global_letters_count           0.085693
global_orthographic_density   -0.028133
global_synonyms_count         -0.110503
rel_aoa                        0.830702
rel_clustering                 0.056881
rel_frequency                  0.042403
rel_letters_count             -0.051601
rel_orthographic_density      -0.125117
rel_synonyms_count             0.114861
dtype: float64

Regressing rel aoa with 565 measures, with interactions
           ^^^^^^^
R^2 = 0.4536230791575908

intercept                                                 76.455325
global_aoa                                                -1.167346
global_clustering                                          9.715342
global_frequency                                          -3.068724
global_letters_count                                      -3.331625
global_orthographic_density                              -14.489518
global_synonyms_count                                     -7.771518
rel_aoa                                                    1.169291
rel_clustering                                            -6.373530
rel_frequency                                              3.127874
rel_letters_count                                          2.812575
rel_orthographic_density                                  12.522347
rel_synonyms_count                                        -0.910600
global_aoa * global_clustering                            -0.163447
global_aoa * global_frequency                             -0.090943
global_aoa * global_letters_count                          0.050744
global_aoa * global_orthographic_density                   0.160555
global_aoa * global_synonyms_count                         0.112768
global_aoa * rel_aoa                                      -0.010260
global_aoa * rel_clustering                                0.083579
global_aoa * rel_frequency                                -0.022751
global_aoa * rel_letters_count                            -0.047585
global_aoa * rel_orthographic_density                     -0.114366
global_aoa * rel_synonyms_count                            0.042210
global_clustering * global_frequency                      -0.319848
global_clustering * global_letters_count                  -0.012800
global_clustering * global_orthographic_density           -1.996050
global_clustering * global_synonyms_count                 -1.365262
global_clustering * rel_aoa                                0.209690
global_clustering * rel_clustering                        -0.016441
global_clustering * rel_frequency                          0.435758
global_clustering * rel_letters_count                     -0.093025
global_clustering * rel_orthographic_density               1.557566
global_clustering * rel_synonyms_count                     0.416993
global_frequency * global_letters_count                    0.306018
global_frequency * global_orthographic_density             0.176996
global_frequency * global_synonyms_count                  -0.039324
global_frequency * rel_aoa                                 0.162187
global_frequency * rel_clustering                          0.080416
global_frequency * rel_frequency                           0.010728
global_frequency * rel_letters_count                      -0.296276
global_frequency * rel_orthographic_density               -0.235209
global_frequency * rel_synonyms_count                      0.297867
global_letters_count * global_orthographic_density        -0.016712
global_letters_count * global_synonyms_count              -0.323436
global_letters_count * rel_aoa                            -0.069674
global_letters_count * rel_clustering                      0.101766
global_letters_count * rel_frequency                      -0.100797
global_letters_count * rel_letters_count                   0.023538
global_letters_count * rel_orthographic_density           -0.039200
global_letters_count * rel_synonyms_count                  0.455406
global_orthographic_density * global_synonyms_count       -0.082399
global_orthographic_density * rel_aoa                     -0.139327
global_orthographic_density * rel_clustering               1.804132
global_orthographic_density * rel_frequency                0.075561
global_orthographic_density * rel_letters_count           -0.019601
global_orthographic_density * rel_orthographic_density     0.162677
global_orthographic_density * rel_synonyms_count          -0.604489
global_synonyms_count * rel_aoa                           -0.475534
global_synonyms_count * rel_clustering                     1.159370
global_synonyms_count * rel_frequency                     -0.297119
global_synonyms_count * rel_letters_count                 -0.262129
global_synonyms_count * rel_orthographic_density          -0.907601
global_synonyms_count * rel_synonyms_count                 0.149970
rel_aoa * rel_clustering                                   0.005129
rel_aoa * rel_frequency                                   -0.108117
rel_aoa * rel_letters_count                                0.009243
rel_aoa * rel_orthographic_density                         0.063329
rel_aoa * rel_synonyms_count                               0.317655
rel_clustering * rel_frequency                            -0.155985
rel_clustering * rel_letters_count                         0.018438
rel_clustering * rel_orthographic_density                 -1.468626
rel_clustering * rel_synonyms_count                       -1.063794
rel_frequency * rel_letters_count                          0.122724
rel_frequency * rel_orthographic_density                   0.054238
rel_frequency * rel_synonyms_count                        -0.127551
rel_letters_count * rel_orthographic_density               0.134140
rel_letters_count * rel_synonyms_count                     0.043716
rel_orthographic_density * rel_synonyms_count              1.778085
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17330686430765696

intercept                     -2.517914
global_aoa                    -0.023776
global_clustering              0.406621
global_frequency              -0.057034
global_letters_count          -0.022784
global_orthographic_density   -0.031893
global_synonyms_count         -0.106328
dtype: float64

Regressing global clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.25517745222000443

intercept                                             -4.547827
global_aoa                                             0.392317
global_clustering                                     -0.190823
global_frequency                                      -0.368164
global_letters_count                                   0.088954
global_orthographic_density                            0.329331
global_synonyms_count                                 -1.592748
global_aoa * global_clustering                         0.082869
global_aoa * global_frequency                         -0.004131
global_aoa * global_letters_count                      0.011660
global_aoa * global_orthographic_density               0.017149
global_aoa * global_synonyms_count                     0.018790
global_clustering * global_frequency                  -0.037712
global_clustering * global_letters_count               0.051830
global_clustering * global_orthographic_density        0.081134
global_clustering * global_synonyms_count             -0.173056
global_frequency * global_letters_count                0.012952
global_frequency * global_orthographic_density         0.018464
global_frequency * global_synonyms_count               0.017123
global_letters_count * global_orthographic_density    -0.036024
global_letters_count * global_synonyms_count           0.032468
global_orthographic_density * global_synonyms_count   -0.004325
dtype: float64

Regressing rel clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.13173815162038727

intercept                      3.156688
global_aoa                    -0.018951
global_clustering              0.354356
global_frequency              -0.041100
global_letters_count          -0.024263
global_orthographic_density   -0.031267
global_synonyms_count         -0.108351
dtype: float64

Regressing rel clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.20667201864315154

intercept                                              2.290160
global_aoa                                             0.273374
global_clustering                                     -0.181763
global_frequency                                      -0.471406
global_letters_count                                   0.018358
global_orthographic_density                            0.612535
global_synonyms_count                                 -1.559298
global_aoa * global_clustering                         0.076108
global_aoa * global_frequency                          0.002877
global_aoa * global_letters_count                      0.015658
global_aoa * global_orthographic_density               0.017838
global_aoa * global_synonyms_count                     0.010007
global_clustering * global_frequency                  -0.048213
global_clustering * global_letters_count               0.055219
global_clustering * global_orthographic_density        0.124241
global_clustering * global_synonyms_count             -0.154892
global_frequency * global_letters_count                0.017525
global_frequency * global_orthographic_density         0.009510
global_frequency * global_synonyms_count               0.004361
global_letters_count * global_orthographic_density    -0.026888
global_letters_count * global_synonyms_count           0.067121
global_orthographic_density * global_synonyms_count    0.023266
dtype: float64

Regressing global clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11266095428968814

intercept                  -5.895185
rel_aoa                     0.004256
rel_clustering              0.346082
rel_frequency              -0.018107
rel_letters_count          -0.019316
rel_orthographic_density   -0.003774
rel_synonyms_count         -0.124445
dtype: float64

Regressing global clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17523360185494663

intercept                                       -5.753557
rel_aoa                                         -0.011581
rel_clustering                                   0.114961
rel_frequency                                    0.023778
rel_letters_count                               -0.097837
rel_orthographic_density                         0.006678
rel_synonyms_count                              -0.237214
rel_aoa * rel_clustering                         0.066902
rel_aoa * rel_frequency                         -0.005091
rel_aoa * rel_letters_count                     -0.000175
rel_aoa * rel_orthographic_density               0.025890
rel_aoa * rel_synonyms_count                     0.010990
rel_clustering * rel_frequency                  -0.034880
rel_clustering * rel_letters_count               0.069168
rel_clustering * rel_orthographic_density        0.072226
rel_clustering * rel_synonyms_count             -0.091637
rel_frequency * rel_letters_count               -0.010500
rel_frequency * rel_orthographic_density         0.009039
rel_frequency * rel_synonyms_count              -0.029552
rel_letters_count * rel_orthographic_density    -0.019290
rel_letters_count * rel_synonyms_count           0.027757
rel_orthographic_density * rel_synonyms_count    0.039817
dtype: float64

Regressing rel clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2698044394982232

intercept                   0.205122
rel_aoa                    -0.013243
rel_clustering              0.552688
rel_frequency              -0.007132
rel_letters_count          -0.009572
rel_orthographic_density    0.008598
rel_synonyms_count         -0.089296
dtype: float64

Regressing rel clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.32510710430781464

intercept                                        0.317760
rel_aoa                                         -0.012305
rel_clustering                                   0.367219
rel_frequency                                    0.019281
rel_letters_count                               -0.082752
rel_orthographic_density                        -0.025893
rel_synonyms_count                              -0.234255
rel_aoa * rel_clustering                         0.053996
rel_aoa * rel_frequency                         -0.001241
rel_aoa * rel_letters_count                     -0.002377
rel_aoa * rel_orthographic_density               0.019276
rel_aoa * rel_synonyms_count                    -0.003049
rel_clustering * rel_frequency                  -0.038398
rel_clustering * rel_letters_count               0.072227
rel_clustering * rel_orthographic_density        0.118223
rel_clustering * rel_synonyms_count             -0.183810
rel_frequency * rel_letters_count               -0.010237
rel_frequency * rel_orthographic_density        -0.006449
rel_frequency * rel_synonyms_count              -0.045502
rel_letters_count * rel_orthographic_density    -0.016241
rel_letters_count * rel_synonyms_count           0.042280
rel_orthographic_density * rel_synonyms_count    0.037788
dtype: float64

Regressing global clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1881776543945609

intercept                     -1.037244
global_aoa                    -0.043931
global_clustering              0.429133
global_frequency              -0.129101
global_letters_count          -0.076187
global_orthographic_density   -0.107689
global_synonyms_count         -0.049441
rel_aoa                        0.024062
rel_clustering                -0.019453
rel_frequency                  0.083528
rel_letters_count              0.054752
rel_orthographic_density       0.076907
rel_synonyms_count            -0.075732
dtype: float64

Regressing global clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3551430439079043

intercept                                                 10.141671
global_aoa                                                -0.090860
global_clustering                                          3.483029
global_frequency                                          -0.987922
global_letters_count                                       0.444818
global_orthographic_density                                1.801503
global_synonyms_count                                     -1.905610
rel_aoa                                                   -0.238220
rel_clustering                                            -4.689488
rel_frequency                                              0.266573
rel_letters_count                                         -1.152446
rel_orthographic_density                                  -3.287966
rel_synonyms_count                                        -0.169457
global_aoa * global_clustering                            -0.049440
global_aoa * global_frequency                              0.010029
global_aoa * global_letters_count                         -0.021911
global_aoa * global_orthographic_density                  -0.143169
global_aoa * global_synonyms_count                        -0.047679
global_aoa * rel_aoa                                       0.013535
global_aoa * rel_clustering                                0.099198
global_aoa * rel_frequency                                -0.011138
global_aoa * rel_letters_count                             0.039902
global_aoa * rel_orthographic_density                      0.157123
global_aoa * rel_synonyms_count                            0.187343
global_clustering * global_frequency                      -0.192606
global_clustering * global_letters_count                  -0.027476
global_clustering * global_orthographic_density           -0.197864
global_clustering * global_synonyms_count                 -0.431326
global_clustering * rel_aoa                                0.018869
global_clustering * rel_clustering                        -0.082053
global_clustering * rel_frequency                          0.090155
global_clustering * rel_letters_count                     -0.014198
global_clustering * rel_orthographic_density               0.072095
global_clustering * rel_synonyms_count                     0.535183
global_frequency * global_letters_count                   -0.024388
global_frequency * global_orthographic_density            -0.143896
global_frequency * global_synonyms_count                   0.136531
global_frequency * rel_aoa                                -0.011219
global_frequency * rel_clustering                          0.172212
global_frequency * rel_frequency                          -0.002295
global_frequency * rel_letters_count                       0.042229
global_frequency * rel_orthographic_density                0.195137
global_frequency * rel_synonyms_count                      0.038618
global_letters_count * global_orthographic_density        -0.094717
global_letters_count * global_synonyms_count              -0.211544
global_letters_count * rel_aoa                             0.036988
global_letters_count * rel_clustering                      0.233670
global_letters_count * rel_frequency                       0.067024
global_letters_count * rel_letters_count                   0.007418
global_letters_count * rel_orthographic_density            0.095966
global_letters_count * rel_synonyms_count                  0.152322
global_orthographic_density * global_synonyms_count       -0.536885
global_orthographic_density * rel_aoa                      0.131001
global_orthographic_density * rel_clustering               0.252977
global_orthographic_density * rel_frequency                0.111100
global_orthographic_density * rel_letters_count            0.105241
global_orthographic_density * rel_orthographic_density     0.067316
global_orthographic_density * rel_synonyms_count           0.519578
global_synonyms_count * rel_aoa                            0.051228
global_synonyms_count * rel_clustering                    -0.095768
global_synonyms_count * rel_frequency                     -0.066381
global_synonyms_count * rel_letters_count                  0.176447
global_synonyms_count * rel_orthographic_density           0.228200
global_synonyms_count * rel_synonyms_count                 0.070481
rel_aoa * rel_clustering                                  -0.000445
rel_aoa * rel_frequency                                    0.010513
rel_aoa * rel_letters_count                               -0.051875
rel_aoa * rel_orthographic_density                        -0.103433
rel_aoa * rel_synonyms_count                              -0.163021
rel_clustering * rel_frequency                            -0.152095
rel_clustering * rel_letters_count                        -0.091337
rel_clustering * rel_orthographic_density                  0.016496
rel_clustering * rel_synonyms_count                       -0.137694
rel_frequency * rel_letters_count                         -0.081543
rel_frequency * rel_orthographic_density                  -0.131686
rel_frequency * rel_synonyms_count                        -0.086458
rel_letters_count * rel_orthographic_density              -0.095254
rel_letters_count * rel_synonyms_count                    -0.108520
rel_orthographic_density * rel_synonyms_count             -0.179651
dtype: float64

Regressing rel clustering with 510 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3267998873047939

intercept                     -0.436949
global_aoa                    -0.035386
global_clustering             -0.414826
global_frequency              -0.108777
global_letters_count          -0.071847
global_orthographic_density   -0.085650
global_synonyms_count         -0.049706
rel_aoa                        0.015516
rel_clustering                 0.892033
rel_frequency                  0.073403
rel_letters_count              0.054453
rel_orthographic_density       0.052856
rel_synonyms_count            -0.066549
dtype: float64

Regressing rel clustering with 510 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.46407016259061473

intercept                                                 7.591661
global_aoa                                               -0.131126
global_clustering                                         1.696368
global_frequency                                         -0.819784
global_letters_count                                      0.249872
global_orthographic_density                               2.165339
global_synonyms_count                                    -1.493908
rel_aoa                                                  -0.075619
rel_clustering                                           -3.047435
rel_frequency                                             0.285212
rel_letters_count                                        -0.879116
rel_orthographic_density                                 -2.981356
rel_synonyms_count                                       -0.831931
global_aoa * global_clustering                           -0.039191
global_aoa * global_frequency                             0.007568
global_aoa * global_letters_count                        -0.006966
global_aoa * global_orthographic_density                 -0.115710
global_aoa * global_synonyms_count                       -0.031576
global_aoa * rel_aoa                                      0.013728
global_aoa * rel_clustering                               0.083627
global_aoa * rel_frequency                               -0.013718
global_aoa * rel_letters_count                            0.030429
global_aoa * rel_orthographic_density                     0.138779
global_aoa * rel_synonyms_count                           0.141753
global_clustering * global_frequency                     -0.144688
global_clustering * global_letters_count                  0.001656
global_clustering * global_orthographic_density          -0.066558
global_clustering * global_synonyms_count                -0.363054
global_clustering * rel_aoa                               0.006477
global_clustering * rel_clustering                       -0.113140
global_clustering * rel_frequency                         0.074023
global_clustering * rel_letters_count                    -0.013737
global_clustering * rel_orthographic_density              0.046616
global_clustering * rel_synonyms_count                    0.408264
global_frequency * global_letters_count                   0.001751
global_frequency * global_orthographic_density           -0.119388
global_frequency * global_synonyms_count                  0.085172
global_frequency * rel_aoa                               -0.016245
global_frequency * rel_clustering                         0.140379
global_frequency * rel_frequency                         -0.001608
global_frequency * rel_letters_count                      0.021953
global_frequency * rel_orthographic_density               0.168262
global_frequency * rel_synonyms_count                     0.078638
global_letters_count * global_orthographic_density       -0.104009
global_letters_count * global_synonyms_count             -0.174742
global_letters_count * rel_aoa                            0.014038
global_letters_count * rel_clustering                     0.182357
global_letters_count * rel_frequency                      0.047348
global_letters_count * rel_letters_count                  0.006971
global_letters_count * rel_orthographic_density           0.093349
global_letters_count * rel_synonyms_count                 0.158305
global_orthographic_density * global_synonyms_count      -0.420174
global_orthographic_density * rel_aoa                     0.096210
global_orthographic_density * rel_clustering              0.118416
global_orthographic_density * rel_frequency               0.096387
global_orthographic_density * rel_letters_count           0.118707
global_orthographic_density * rel_orthographic_density    0.055813
global_orthographic_density * rel_synonyms_count          0.393632
global_synonyms_count * rel_aoa                           0.032121
global_synonyms_count * rel_clustering                   -0.131923
global_synonyms_count * rel_frequency                    -0.055771
global_synonyms_count * rel_letters_count                 0.145467
global_synonyms_count * rel_orthographic_density          0.120126
global_synonyms_count * rel_synonyms_count                0.056769
rel_aoa * rel_clustering                                 -0.001927
rel_aoa * rel_frequency                                   0.017553
rel_aoa * rel_letters_count                              -0.036626
rel_aoa * rel_orthographic_density                       -0.083753
rel_aoa * rel_synonyms_count                             -0.126938
rel_clustering * rel_frequency                           -0.144406
rel_clustering * rel_letters_count                       -0.066831
rel_clustering * rel_orthographic_density                 0.039184
rel_clustering * rel_synonyms_count                      -0.052906
rel_frequency * rel_letters_count                        -0.065293
rel_frequency * rel_orthographic_density                 -0.117658
rel_frequency * rel_synonyms_count                       -0.101169
rel_letters_count * rel_orthographic_density             -0.098859
rel_letters_count * rel_synonyms_count                   -0.118851
rel_orthographic_density * rel_synonyms_count            -0.104267
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1713151320987324

intercept                      2.446281
global_aoa                    -0.001142
global_clustering             -0.157662
global_frequency               0.093614
global_letters_count           0.402701
global_orthographic_density   -0.139561
global_synonyms_count         -0.359267
dtype: float64

Regressing global letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19809651843073442

intercept                                              4.380445
global_aoa                                             0.005046
global_clustering                                     -1.053606
global_frequency                                       0.109877
global_letters_count                                  -0.554307
global_orthographic_density                           -2.642408
global_synonyms_count                                 -1.508426
global_aoa * global_clustering                         0.104121
global_aoa * global_frequency                          0.063118
global_aoa * global_letters_count                      0.003722
global_aoa * global_orthographic_density              -0.000675
global_aoa * global_synonyms_count                     0.067937
global_clustering * global_frequency                   0.132449
global_clustering * global_letters_count              -0.154444
global_clustering * global_orthographic_density       -0.147281
global_clustering * global_synonyms_count              0.235321
global_frequency * global_letters_count                0.003282
global_frequency * global_orthographic_density         0.198147
global_frequency * global_synonyms_count               0.114793
global_letters_count * global_orthographic_density    -0.046770
global_letters_count * global_synonyms_count           0.118386
global_orthographic_density * global_synonyms_count    0.270126
dtype: float64

Regressing rel letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.10287666731398448

intercept                     -0.334681
global_aoa                    -0.046120
global_clustering             -0.153415
global_frequency               0.032783
global_letters_count           0.336031
global_orthographic_density   -0.043435
global_synonyms_count         -0.453793
dtype: float64

Regressing rel letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12792986587906563

intercept                                             -4.053557
global_aoa                                             0.514061
global_clustering                                     -1.772502
global_frequency                                       0.380264
global_letters_count                                  -0.510836
global_orthographic_density                           -2.572537
global_synonyms_count                                 -1.924468
global_aoa * global_clustering                         0.159049
global_aoa * global_frequency                          0.044306
global_aoa * global_letters_count                     -0.013410
global_aoa * global_orthographic_density               0.020913
global_aoa * global_synonyms_count                     0.068804
global_clustering * global_frequency                   0.173948
global_clustering * global_letters_count              -0.142443
global_clustering * global_orthographic_density       -0.158303
global_clustering * global_synonyms_count              0.039266
global_frequency * global_letters_count                0.016136
global_frequency * global_orthographic_density         0.195445
global_frequency * global_synonyms_count               0.017904
global_letters_count * global_orthographic_density    -0.078645
global_letters_count * global_synonyms_count           0.117053
global_orthographic_density * global_synonyms_count    0.281444
dtype: float64

Regressing global letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13490163264927824

intercept                   5.708935
rel_aoa                    -0.109648
rel_clustering              0.043974
rel_frequency               0.124693
rel_letters_count           0.315432
rel_orthographic_density   -0.363750
rel_synonyms_count         -0.226005
dtype: float64

Regressing global letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15318251896183277

intercept                                        5.606865
rel_aoa                                         -0.240767
rel_clustering                                   0.164113
rel_frequency                                    0.149694
rel_letters_count                                0.429272
rel_orthographic_density                        -0.367954
rel_synonyms_count                              -0.099240
rel_aoa * rel_clustering                         0.081765
rel_aoa * rel_frequency                         -0.016816
rel_aoa * rel_letters_count                     -0.016047
rel_aoa * rel_orthographic_density              -0.072940
rel_aoa * rel_synonyms_count                     0.056172
rel_clustering * rel_frequency                   0.003407
rel_clustering * rel_letters_count              -0.074411
rel_clustering * rel_orthographic_density        0.027695
rel_clustering * rel_synonyms_count              0.311901
rel_frequency * rel_letters_count                0.001749
rel_frequency * rel_orthographic_density         0.033318
rel_frequency * rel_synonyms_count               0.093112
rel_letters_count * rel_orthographic_density     0.052110
rel_letters_count * rel_synonyms_count           0.179167
rel_orthographic_density * rel_synonyms_count    0.448362
dtype: float64

Regressing rel letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1955443448891214

intercept                   1.187384
rel_aoa                    -0.094487
rel_clustering             -0.043143
rel_frequency              -0.076661
rel_letters_count           0.483879
rel_orthographic_density   -0.044325
rel_synonyms_count         -0.268160
dtype: float64

Regressing rel letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2236896144233963

intercept                                        0.965528
rel_aoa                                         -0.186210
rel_clustering                                   0.146766
rel_frequency                                   -0.117066
rel_letters_count                                0.685240
rel_orthographic_density                         0.016315
rel_synonyms_count                              -0.078320
rel_aoa * rel_clustering                         0.087112
rel_aoa * rel_frequency                          0.002430
rel_aoa * rel_letters_count                     -0.052447
rel_aoa * rel_orthographic_density              -0.155958
rel_aoa * rel_synonyms_count                     0.101363
rel_clustering * rel_frequency                   0.045172
rel_clustering * rel_letters_count              -0.046808
rel_clustering * rel_orthographic_density        0.054944
rel_clustering * rel_synonyms_count              0.280195
rel_frequency * rel_letters_count                0.027363
rel_frequency * rel_orthographic_density         0.052596
rel_frequency * rel_synonyms_count               0.096569
rel_letters_count * rel_orthographic_density     0.056445
rel_letters_count * rel_synonyms_count           0.148402
rel_orthographic_density * rel_synonyms_count    0.422623
dtype: float64

Regressing global letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1934857935809252

intercept                     -2.336964
global_aoa                     0.122635
global_clustering             -0.473283
global_frequency               0.174034
global_letters_count           0.563111
global_orthographic_density    0.112444
global_synonyms_count         -0.642544
rel_aoa                       -0.183687
rel_clustering                 0.360158
rel_frequency                 -0.112118
rel_letters_count             -0.167570
rel_orthographic_density      -0.239506
rel_synonyms_count             0.350062
dtype: float64

Regressing global letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.31665856766364453

intercept                                                -25.162270
global_aoa                                                 3.156662
global_clustering                                         -4.685306
global_frequency                                           1.518827
global_letters_count                                      -1.129975
global_orthographic_density                               -3.804525
global_synonyms_count                                      4.933874
rel_aoa                                                   -5.262951
rel_clustering                                            -0.757940
rel_frequency                                             -0.542928
rel_letters_count                                         -0.021549
rel_orthographic_density                                  -2.689180
rel_synonyms_count                                        -8.498060
global_aoa * global_clustering                             0.337983
global_aoa * global_frequency                             -0.061966
global_aoa * global_letters_count                         -0.089420
global_aoa * global_orthographic_density                   0.081892
global_aoa * global_synonyms_count                        -0.176185
global_aoa * rel_aoa                                       0.017329
global_aoa * rel_clustering                               -0.302220
global_aoa * rel_frequency                                 0.059299
global_aoa * rel_letters_count                             0.087478
global_aoa * rel_orthographic_density                      0.008943
global_aoa * rel_synonyms_count                            0.165225
global_clustering * global_frequency                       0.380217
global_clustering * global_letters_count                  -0.049709
global_clustering * global_orthographic_density           -0.487784
global_clustering * global_synonyms_count                 -0.125689
global_clustering * rel_aoa                               -0.015009
global_clustering * rel_clustering                        -0.053655
global_clustering * rel_frequency                         -0.109855
global_clustering * rel_letters_count                     -0.325795
global_clustering * rel_orthographic_density               0.037647
global_clustering * rel_synonyms_count                    -0.324223
global_frequency * global_letters_count                    0.258865
global_frequency * global_orthographic_density             0.245266
global_frequency * global_synonyms_count                  -0.234764
global_frequency * rel_aoa                                 0.313618
global_frequency * rel_clustering                         -0.011509
global_frequency * rel_frequency                          -0.046979
global_frequency * rel_letters_count                      -0.300806
global_frequency * rel_orthographic_density                0.093536
global_frequency * rel_synonyms_count                      0.235276
global_letters_count * global_orthographic_density        -0.311706
global_letters_count * global_synonyms_count              -0.247637
global_letters_count * rel_aoa                             0.288953
global_letters_count * rel_clustering                      0.338324
global_letters_count * rel_frequency                      -0.077834
global_letters_count * rel_letters_count                   0.037220
global_letters_count * rel_orthographic_density            0.174047
global_letters_count * rel_synonyms_count                  0.533808
global_orthographic_density * global_synonyms_count       -0.596968
global_orthographic_density * rel_aoa                      0.072303
global_orthographic_density * rel_clustering               0.413864
global_orthographic_density * rel_frequency               -0.005763
global_orthographic_density * rel_letters_count            0.222709
global_orthographic_density * rel_orthographic_density     0.260975
global_orthographic_density * rel_synonyms_count           0.562356
global_synonyms_count * rel_aoa                            0.131241
global_synonyms_count * rel_clustering                     0.318705
global_synonyms_count * rel_frequency                      0.070171
global_synonyms_count * rel_letters_count                 -0.495882
global_synonyms_count * rel_orthographic_density          -0.249319
global_synonyms_count * rel_synonyms_count                -0.192803
rel_aoa * rel_clustering                                   0.156158
rel_aoa * rel_frequency                                   -0.256469
rel_aoa * rel_letters_count                               -0.309261
rel_aoa * rel_orthographic_density                        -0.158319
rel_aoa * rel_synonyms_count                               0.113902
rel_clustering * rel_frequency                            -0.118528
rel_clustering * rel_letters_count                        -0.051599
rel_clustering * rel_orthographic_density                  0.076186
rel_clustering * rel_synonyms_count                        0.454312
rel_frequency * rel_letters_count                          0.118850
rel_frequency * rel_orthographic_density                  -0.085682
rel_frequency * rel_synonyms_count                         0.033772
rel_letters_count * rel_orthographic_density               0.051027
rel_letters_count * rel_synonyms_count                     0.286127
rel_orthographic_density * rel_synonyms_count              0.777714
dtype: float64

Regressing rel letters_count with 609 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.24957104629250249

intercept                     -2.471324
global_aoa                     0.094983
global_clustering             -0.471208
global_frequency               0.163135
global_letters_count          -0.354166
global_orthographic_density    0.101509
global_synonyms_count         -0.657001
rel_aoa                       -0.143752
rel_clustering                 0.352876
rel_frequency                 -0.130775
rel_letters_count              0.767625
rel_orthographic_density      -0.248074
rel_synonyms_count             0.356424
dtype: float64

Regressing rel letters_count with 609 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3625274571867614

intercept                                                -32.193335
global_aoa                                                 2.240588
global_clustering                                         -6.751899
global_frequency                                           1.541670
global_letters_count                                      -0.814242
global_orthographic_density                               -3.351596
global_synonyms_count                                      5.006043
rel_aoa                                                   -4.074224
rel_clustering                                             0.660396
rel_frequency                                             -0.686266
rel_letters_count                                         -0.183617
rel_orthographic_density                                  -3.268569
rel_synonyms_count                                        -9.754856
global_aoa * global_clustering                             0.285601
global_aoa * global_frequency                             -0.024320
global_aoa * global_letters_count                         -0.045604
global_aoa * global_orthographic_density                   0.065293
global_aoa * global_synonyms_count                        -0.182992
global_aoa * rel_aoa                                       0.020580
global_aoa * rel_clustering                               -0.216304
global_aoa * rel_frequency                                 0.030031
global_aoa * rel_letters_count                             0.047932
global_aoa * rel_orthographic_density                      0.020653
global_aoa * rel_synonyms_count                            0.135084
global_clustering * global_frequency                       0.438055
global_clustering * global_letters_count                   0.180782
global_clustering * global_orthographic_density           -0.162671
global_clustering * global_synonyms_count                 -0.017666
global_clustering * rel_aoa                                0.038947
global_clustering * rel_clustering                        -0.072376
global_clustering * rel_frequency                         -0.181904
global_clustering * rel_letters_count                     -0.543246
global_clustering * rel_orthographic_density              -0.303542
global_clustering * rel_synonyms_count                    -0.540451
global_frequency * global_letters_count                    0.221547
global_frequency * global_orthographic_density             0.350331
global_frequency * global_synonyms_count                  -0.204272
global_frequency * rel_aoa                                 0.255962
global_frequency * rel_clustering                         -0.085035
global_frequency * rel_frequency                          -0.044907
global_frequency * rel_letters_count                      -0.260555
global_frequency * rel_orthographic_density               -0.021387
global_frequency * rel_synonyms_count                      0.239697
global_letters_count * global_orthographic_density        -0.203547
global_letters_count * global_synonyms_count              -0.154188
global_letters_count * rel_aoa                             0.229427
global_letters_count * rel_clustering                      0.128710
global_letters_count * rel_frequency                      -0.068510
global_letters_count * rel_letters_count                   0.030326
global_letters_count * rel_orthographic_density            0.091957
global_letters_count * rel_synonyms_count                  0.473130
global_orthographic_density * global_synonyms_count       -0.604347
global_orthographic_density * rel_aoa                      0.106978
global_orthographic_density * rel_clustering               0.235366
global_orthographic_density * rel_frequency               -0.097321
global_orthographic_density * rel_letters_count            0.090913
global_orthographic_density * rel_orthographic_density     0.232005
global_orthographic_density * rel_synonyms_count           0.649238
global_synonyms_count * rel_aoa                            0.181301
global_synonyms_count * rel_clustering                     0.331472
global_synonyms_count * rel_frequency                      0.055641
global_synonyms_count * rel_letters_count                 -0.569262
global_synonyms_count * rel_orthographic_density          -0.133004
global_synonyms_count * rel_synonyms_count                -0.184834
rel_aoa * rel_clustering                                   0.040684
rel_aoa * rel_frequency                                   -0.206142
rel_aoa * rel_letters_count                               -0.258000
rel_aoa * rel_orthographic_density                        -0.180687
rel_aoa * rel_synonyms_count                               0.108875
rel_clustering * rel_frequency                            -0.047470
rel_clustering * rel_letters_count                         0.164598
rel_clustering * rel_orthographic_density                  0.281950
rel_clustering * rel_synonyms_count                        0.558130
rel_frequency * rel_letters_count                          0.113559
rel_frequency * rel_orthographic_density                  -0.001508
rel_frequency * rel_synonyms_count                         0.010852
rel_letters_count * rel_orthographic_density               0.149620
rel_letters_count * rel_synonyms_count                     0.335412
rel_orthographic_density * rel_synonyms_count              0.600736
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15183455542099833

intercept                      1.143313
global_aoa                    -0.027439
global_clustering              0.068574
global_frequency              -0.015180
global_letters_count          -0.020560
global_orthographic_density   -0.012299
global_synonyms_count          0.349898
dtype: float64

Regressing global synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16576136222799154

intercept                                              1.138154
global_aoa                                             0.049828
global_clustering                                      0.328755
global_frequency                                       0.122088
global_letters_count                                  -0.066537
global_orthographic_density                            0.015788
global_synonyms_count                                  0.561106
global_aoa * global_clustering                         0.005388
global_aoa * global_frequency                         -0.007110
global_aoa * global_letters_count                      0.000299
global_aoa * global_orthographic_density               0.005887
global_aoa * global_synonyms_count                     0.020882
global_clustering * global_frequency                  -0.007934
global_clustering * global_letters_count              -0.032016
global_clustering * global_orthographic_density       -0.032167
global_clustering * global_synonyms_count              0.054419
global_frequency * global_letters_count               -0.015701
global_frequency * global_orthographic_density        -0.026754
global_frequency * global_synonyms_count              -0.004763
global_letters_count * global_orthographic_density    -0.004956
global_letters_count * global_synonyms_count          -0.002992
global_orthographic_density * global_synonyms_count    0.003527
dtype: float64

Regressing rel synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.10769703520733298

intercept                      0.731604
global_aoa                    -0.025059
global_clustering              0.055613
global_frequency              -0.011082
global_letters_count          -0.018465
global_orthographic_density   -0.021448
global_synonyms_count          0.290985
dtype: float64

Regressing rel synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.12306231580975202

intercept                                              1.075610
global_aoa                                             0.012212
global_clustering                                      0.332151
global_frequency                                       0.059371
global_letters_count                                  -0.049978
global_orthographic_density                            0.032673
global_synonyms_count                                  0.673911
global_aoa * global_clustering                         0.002766
global_aoa * global_frequency                         -0.002844
global_aoa * global_letters_count                     -0.000804
global_aoa * global_orthographic_density               0.005050
global_aoa * global_synonyms_count                     0.014165
global_clustering * global_frequency                  -0.013005
global_clustering * global_letters_count              -0.029483
global_clustering * global_orthographic_density       -0.017024
global_clustering * global_synonyms_count              0.075270
global_frequency * global_letters_count               -0.015265
global_frequency * global_orthographic_density        -0.021468
global_frequency * global_synonyms_count              -0.004176
global_letters_count * global_orthographic_density     0.002312
global_letters_count * global_synonyms_count          -0.000828
global_orthographic_density * global_synonyms_count   -0.012058
dtype: float64

Regressing global synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1243201238076147

intercept                   0.391692
rel_aoa                    -0.006507
rel_clustering              0.008280
rel_frequency              -0.025895
rel_letters_count          -0.025266
rel_orthographic_density    0.014583
rel_synonyms_count          0.330850
dtype: float64

Regressing global synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1506700576385015

intercept                                        0.436817
rel_aoa                                         -0.038782
rel_clustering                                  -0.063988
rel_frequency                                   -0.008560
rel_letters_count                               -0.072376
rel_orthographic_density                         0.010137
rel_synonyms_count                               0.320664
rel_aoa * rel_clustering                         0.006022
rel_aoa * rel_frequency                         -0.008035
rel_aoa * rel_letters_count                      0.012516
rel_aoa * rel_orthographic_density               0.020526
rel_aoa * rel_synonyms_count                     0.027510
rel_clustering * rel_frequency                  -0.015300
rel_clustering * rel_letters_count              -0.000646
rel_clustering * rel_orthographic_density       -0.026912
rel_clustering * rel_synonyms_count              0.045047
rel_frequency * rel_letters_count               -0.007615
rel_frequency * rel_orthographic_density        -0.007039
rel_frequency * rel_synonyms_count               0.021539
rel_letters_count * rel_orthographic_density    -0.012829
rel_letters_count * rel_synonyms_count           0.004036
rel_orthographic_density * rel_synonyms_count   -0.012510
dtype: float64

Regressing rel synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.23254080895582285

intercept                   0.064257
rel_aoa                    -0.017937
rel_clustering              0.050293
rel_frequency              -0.016428
rel_letters_count          -0.022260
rel_orthographic_density   -0.012582
rel_synonyms_count          0.462948
dtype: float64

Regressing rel synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.2523188532697641

intercept                                        0.104680
rel_aoa                                         -0.038831
rel_clustering                                  -0.028311
rel_frequency                                    0.007932
rel_letters_count                               -0.057547
rel_orthographic_density                        -0.041093
rel_synonyms_count                               0.542182
rel_aoa * rel_clustering                         0.010535
rel_aoa * rel_frequency                         -0.001789
rel_aoa * rel_letters_count                      0.007938
rel_aoa * rel_orthographic_density               0.004933
rel_aoa * rel_synonyms_count                     0.008840
rel_clustering * rel_frequency                  -0.020297
rel_clustering * rel_letters_count              -0.000685
rel_clustering * rel_orthographic_density       -0.014393
rel_clustering * rel_synonyms_count              0.046832
rel_frequency * rel_letters_count               -0.013214
rel_frequency * rel_orthographic_density        -0.008511
rel_frequency * rel_synonyms_count               0.037457
rel_letters_count * rel_orthographic_density     0.004086
rel_letters_count * rel_synonyms_count           0.002771
rel_orthographic_density * rel_synonyms_count   -0.000826
dtype: float64

Regressing global synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16071502308507912

intercept                      1.312018
global_aoa                    -0.037163
global_clustering              0.150443
global_frequency               0.015655
global_letters_count           0.015602
global_orthographic_density   -0.046590
global_synonyms_count          0.270836
rel_aoa                        0.013958
rel_clustering                -0.095368
rel_frequency                 -0.033690
rel_letters_count             -0.039408
rel_orthographic_density       0.039047
rel_synonyms_count             0.085773
dtype: float64

Regressing global synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.26848713890883824

intercept                                                 9.707692
global_aoa                                               -0.135712
global_clustering                                         2.690342
global_frequency                                         -0.329999
global_letters_count                                     -0.066763
global_orthographic_density                               0.721871
global_synonyms_count                                     6.330672
rel_aoa                                                  -0.446393
rel_clustering                                           -2.187646
rel_frequency                                             0.032138
rel_letters_count                                        -0.482762
rel_orthographic_density                                 -1.060377
rel_synonyms_count                                       -5.127450
global_aoa * global_clustering                           -0.035108
global_aoa * global_frequency                             0.003077
global_aoa * global_letters_count                        -0.016308
global_aoa * global_orthographic_density                 -0.037432
global_aoa * global_synonyms_count                       -0.026016
global_aoa * rel_aoa                                     -0.009081
global_aoa * rel_clustering                               0.070970
global_aoa * rel_frequency                               -0.005423
global_aoa * rel_letters_count                            0.013561
global_aoa * rel_orthographic_density                     0.045602
global_aoa * rel_synonyms_count                           0.029853
global_clustering * global_frequency                     -0.133675
global_clustering * global_letters_count                 -0.113256
global_clustering * global_orthographic_density          -0.262155
global_clustering * global_synonyms_count                 0.658833
global_clustering * rel_aoa                              -0.071050
global_clustering * rel_clustering                        0.014680
global_clustering * rel_frequency                         0.067013
global_clustering * rel_letters_count                     0.021961
global_clustering * rel_orthographic_density              0.198730
global_clustering * rel_synonyms_count                   -0.544856
global_frequency * global_letters_count                  -0.023777
global_frequency * global_orthographic_density           -0.162731
global_frequency * global_synonyms_count                 -0.084226
global_frequency * rel_aoa                               -0.001099
global_frequency * rel_clustering                         0.135675
global_frequency * rel_frequency                          0.001037
global_frequency * rel_letters_count                      0.012869
global_frequency * rel_orthographic_density               0.160043
global_frequency * rel_synonyms_count                     0.066331
global_letters_count * global_orthographic_density       -0.098941
global_letters_count * global_synonyms_count             -0.145168
global_letters_count * rel_aoa                            0.015073
global_letters_count * rel_clustering                     0.045781
global_letters_count * rel_frequency                      0.010905
global_letters_count * rel_letters_count                 -0.001675
global_letters_count * rel_orthographic_density           0.090606
global_letters_count * rel_synonyms_count                 0.184110
global_orthographic_density * global_synonyms_count      -0.003561
global_orthographic_density * rel_aoa                    -0.020414
global_orthographic_density * rel_clustering              0.094119
global_orthographic_density * rel_frequency               0.117982
global_orthographic_density * rel_letters_count           0.146796
global_orthographic_density * rel_orthographic_density   -0.015753
global_orthographic_density * rel_synonyms_count         -0.060087
global_synonyms_count * rel_aoa                           0.077959
global_synonyms_count * rel_clustering                   -0.358887
global_synonyms_count * rel_frequency                     0.216997
global_synonyms_count * rel_letters_count                 0.053673
global_synonyms_count * rel_orthographic_density         -0.116993
global_synonyms_count * rel_synonyms_count                0.115991
rel_aoa * rel_clustering                                  0.041756
rel_aoa * rel_frequency                                  -0.011390
rel_aoa * rel_letters_count                               0.002135
rel_aoa * rel_orthographic_density                        0.017903
rel_aoa * rel_synonyms_count                             -0.058002
rel_clustering * rel_frequency                           -0.066230
rel_clustering * rel_letters_count                       -0.004027
rel_clustering * rel_orthographic_density                -0.079822
rel_clustering * rel_synonyms_count                       0.262359
rel_frequency * rel_letters_count                        -0.008913
rel_frequency * rel_orthographic_density                 -0.129861
rel_frequency * rel_synonyms_count                       -0.182298
rel_letters_count * rel_orthographic_density             -0.159765
rel_letters_count * rel_synonyms_count                   -0.068340
rel_orthographic_density * rel_synonyms_count             0.213600
dtype: float64

Regressing rel synonyms_count with 594 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.3047148303056578

intercept                      0.996423
global_aoa                    -0.030442
global_clustering              0.123409
global_frequency               0.017873
global_letters_count           0.016830
global_orthographic_density   -0.033831
global_synonyms_count         -0.608601
rel_aoa                        0.010151
rel_clustering                -0.073699
rel_frequency                 -0.031691
rel_letters_count             -0.037166
rel_orthographic_density       0.017756
rel_synonyms_count             1.027281
dtype: float64

Regressing rel synonyms_count with 594 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.39894965916493297

intercept                                                 9.546130
global_aoa                                               -0.070141
global_clustering                                         2.607870
global_frequency                                         -0.396489
global_letters_count                                     -0.073834
global_orthographic_density                               0.656745
global_synonyms_count                                     4.900638
rel_aoa                                                  -0.346487
rel_clustering                                           -2.213601
rel_frequency                                            -0.013860
rel_letters_count                                        -0.471168
rel_orthographic_density                                 -0.779922
rel_synonyms_count                                       -3.841513
global_aoa * global_clustering                           -0.031411
global_aoa * global_frequency                             0.000845
global_aoa * global_letters_count                        -0.016919
global_aoa * global_orthographic_density                 -0.040726
global_aoa * global_synonyms_count                       -0.039290
global_aoa * rel_aoa                                     -0.007259
global_aoa * rel_clustering                               0.058530
global_aoa * rel_frequency                               -0.004090
global_aoa * rel_letters_count                            0.017558
global_aoa * rel_orthographic_density                     0.052688
global_aoa * rel_synonyms_count                           0.054543
global_clustering * global_frequency                     -0.136718
global_clustering * global_letters_count                 -0.110065
global_clustering * global_orthographic_density          -0.236808
global_clustering * global_synonyms_count                 0.568173
global_clustering * rel_aoa                              -0.055113
global_clustering * rel_clustering                        0.012926
global_clustering * rel_frequency                         0.057775
global_clustering * rel_letters_count                     0.034363
global_clustering * rel_orthographic_density              0.225553
global_clustering * rel_synonyms_count                   -0.472905
global_frequency * global_letters_count                  -0.023159
global_frequency * global_orthographic_density           -0.143515
global_frequency * global_synonyms_count                 -0.078785
global_frequency * rel_aoa                               -0.002170
global_frequency * rel_clustering                         0.139150
global_frequency * rel_frequency                         -0.001424
global_frequency * rel_letters_count                      0.021451
global_frequency * rel_orthographic_density               0.149878
global_frequency * rel_synonyms_count                     0.070131
global_letters_count * global_orthographic_density       -0.080555
global_letters_count * global_synonyms_count             -0.138036
global_letters_count * rel_aoa                            0.014779
global_letters_count * rel_clustering                     0.062415
global_letters_count * rel_frequency                      0.018189
global_letters_count * rel_letters_count                 -0.003305
global_letters_count * rel_orthographic_density           0.068517
global_letters_count * rel_synonyms_count                 0.166928
global_orthographic_density * global_synonyms_count      -0.003966
global_orthographic_density * rel_aoa                    -0.015743
global_orthographic_density * rel_clustering              0.097064
global_orthographic_density * rel_frequency               0.113153
global_orthographic_density * rel_letters_count           0.124017
global_orthographic_density * rel_orthographic_density   -0.011098
global_orthographic_density * rel_synonyms_count         -0.058954
global_synonyms_count * rel_aoa                           0.070187
global_synonyms_count * rel_clustering                   -0.311793
global_synonyms_count * rel_frequency                     0.223026
global_synonyms_count * rel_letters_count                 0.067629
global_synonyms_count * rel_orthographic_density         -0.126831
global_synonyms_count * rel_synonyms_count                0.115249
rel_aoa * rel_clustering                                  0.036660
rel_aoa * rel_frequency                                  -0.008205
rel_aoa * rel_letters_count                              -0.003006
rel_aoa * rel_orthographic_density                        0.011335
rel_aoa * rel_synonyms_count                             -0.072415
rel_clustering * rel_frequency                           -0.061134
rel_clustering * rel_letters_count                       -0.033020
rel_clustering * rel_orthographic_density                -0.132693
rel_clustering * rel_synonyms_count                       0.217065
rel_frequency * rel_letters_count                        -0.024768
rel_frequency * rel_orthographic_density                 -0.127606
rel_frequency * rel_synonyms_count                       -0.195062
rel_letters_count * rel_orthographic_density             -0.127231
rel_letters_count * rel_synonyms_count                   -0.082096
rel_orthographic_density * rel_synonyms_count             0.212365
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.21230863796378818

intercept                      1.182306
global_aoa                    -0.019218
global_clustering              0.014444
global_frequency              -0.024743
global_letters_count          -0.013350
global_orthographic_density    0.404556
global_synonyms_count          0.088273
dtype: float64

Regressing global orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24320609923281333

intercept                                              2.027013
global_aoa                                            -0.237205
global_clustering                                     -0.091988
global_frequency                                      -0.055561
global_letters_count                                  -0.130128
global_orthographic_density                            0.552350
global_synonyms_count                                  0.307361
global_aoa * global_clustering                        -0.013233
global_aoa * global_frequency                         -0.008093
global_aoa * global_letters_count                      0.020643
global_aoa * global_orthographic_density               0.083220
global_aoa * global_synonyms_count                    -0.027597
global_clustering * global_frequency                  -0.007992
global_clustering * global_letters_count               0.024128
global_clustering * global_orthographic_density        0.075590
global_clustering * global_synonyms_count              0.083352
global_frequency * global_letters_count                0.011123
global_frequency * global_orthographic_density        -0.031379
global_frequency * global_synonyms_count               0.049126
global_letters_count * global_orthographic_density     0.006805
global_letters_count * global_synonyms_count           0.000993
global_orthographic_density * global_synonyms_count    0.005955
dtype: float64

Regressing rel orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1606310305198546

intercept                     -1.279486
global_aoa                    -0.007826
global_clustering             -0.004328
global_frequency              -0.011164
global_letters_count          -0.008002
global_orthographic_density    0.346649
global_synonyms_count          0.106317
dtype: float64

Regressing rel orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18918136929916904

intercept                                              1.517643
global_aoa                                            -0.428780
global_clustering                                     -0.196053
global_frequency                                      -0.190973
global_letters_count                                  -0.317173
global_orthographic_density                            0.192228
global_synonyms_count                                 -0.039511
global_aoa * global_clustering                        -0.009682
global_aoa * global_frequency                          0.003640
global_aoa * global_letters_count                      0.037462
global_aoa * global_orthographic_density               0.088206
global_aoa * global_synonyms_count                    -0.002371
global_clustering * global_frequency                  -0.005827
global_clustering * global_letters_count               0.023347
global_clustering * global_orthographic_density        0.104155
global_clustering * global_synonyms_count              0.088199
global_frequency * global_letters_count                0.016069
global_frequency * global_orthographic_density         0.005751
global_frequency * global_synonyms_count               0.064964
global_letters_count * global_orthographic_density     0.020482
global_letters_count * global_synonyms_count           0.013045
global_orthographic_density * global_synonyms_count    0.010074
dtype: float64

Regressing global orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18508117236505006

intercept                   1.515803
rel_aoa                    -0.021045
rel_clustering             -0.025577
rel_frequency              -0.039210
rel_letters_count           0.005966
rel_orthographic_density    0.434949
rel_synonyms_count          0.077205
dtype: float64

Regressing global orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20635513132280559

intercept                                        1.503145
rel_aoa                                          0.051811
rel_clustering                                   0.075094
rel_frequency                                   -0.046791
rel_letters_count                               -0.002003
rel_orthographic_density                         0.355600
rel_synonyms_count                               0.280923
rel_aoa * rel_clustering                         0.029589
rel_aoa * rel_frequency                          0.011587
rel_aoa * rel_letters_count                     -0.001394
rel_aoa * rel_orthographic_density               0.048067
rel_aoa * rel_synonyms_count                    -0.000378
rel_clustering * rel_frequency                   0.011305
rel_clustering * rel_letters_count              -0.014554
rel_clustering * rel_orthographic_density        0.070196
rel_clustering * rel_synonyms_count             -0.025382
rel_frequency * rel_letters_count               -0.009587
rel_frequency * rel_orthographic_density        -0.017176
rel_frequency * rel_synonyms_count               0.013951
rel_letters_count * rel_orthographic_density     0.003218
rel_letters_count * rel_synonyms_count          -0.035843
rel_orthographic_density * rel_synonyms_count    0.053322
dtype: float64

Regressing rel orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2727079334236584

intercept                  -0.519535
rel_aoa                    -0.010944
rel_clustering             -0.018025
rel_frequency               0.010285
rel_letters_count           0.024715
rel_orthographic_density    0.529489
rel_synonyms_count          0.056458
dtype: float64

Regressing rel orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.29107998214796804

intercept                                       -0.447821
rel_aoa                                          0.059865
rel_clustering                                   0.030559
rel_frequency                                    0.043597
rel_letters_count                                0.004134
rel_orthographic_density                         0.472621
rel_synonyms_count                               0.152971
rel_aoa * rel_clustering                         0.033577
rel_aoa * rel_frequency                          0.013027
rel_aoa * rel_letters_count                      0.010622
rel_aoa * rel_orthographic_density               0.071366
rel_aoa * rel_synonyms_count                    -0.006593
rel_clustering * rel_frequency                   0.001676
rel_clustering * rel_letters_count               0.002810
rel_clustering * rel_orthographic_density        0.089878
rel_clustering * rel_synonyms_count             -0.036390
rel_frequency * rel_letters_count               -0.013449
rel_frequency * rel_orthographic_density         0.014031
rel_frequency * rel_synonyms_count               0.002394
rel_letters_count * rel_orthographic_density     0.017016
rel_letters_count * rel_synonyms_count          -0.025611
rel_orthographic_density * rel_synonyms_count    0.008830
dtype: float64

Regressing global orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2252647529308175

intercept                      3.028678
global_aoa                     0.001532
global_clustering              0.135718
global_frequency              -0.056574
global_letters_count          -0.152308
global_orthographic_density    0.267160
global_synonyms_count          0.100120
rel_aoa                       -0.022238
rel_clustering                -0.135559
rel_frequency                  0.040037
rel_letters_count              0.149410
rel_orthographic_density       0.151244
rel_synonyms_count            -0.007787
dtype: float64

Regressing global orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3355106307666611

intercept                                                 3.904949
global_aoa                                               -1.907673
global_clustering                                        -0.321767
global_frequency                                          0.153339
global_letters_count                                      0.634923
global_orthographic_density                               0.377215
global_synonyms_count                                    -2.317046
rel_aoa                                                   0.959723
rel_clustering                                            0.896092
rel_frequency                                            -0.560006
rel_letters_count                                        -0.353686
rel_orthographic_density                                 -1.319969
rel_synonyms_count                                        7.003461
global_aoa * global_clustering                           -0.175233
global_aoa * global_frequency                             0.024783
global_aoa * global_letters_count                         0.059891
global_aoa * global_orthographic_density                  0.135209
global_aoa * global_synonyms_count                        0.177207
global_aoa * rel_aoa                                      0.015373
global_aoa * rel_clustering                               0.158616
global_aoa * rel_frequency                               -0.022089
global_aoa * rel_letters_count                           -0.036355
global_aoa * rel_orthographic_density                    -0.057201
global_aoa * rel_synonyms_count                          -0.226437
global_clustering * global_frequency                      0.042939
global_clustering * global_letters_count                  0.115864
global_clustering * global_orthographic_density           0.077628
global_clustering * global_synonyms_count                 0.110738
global_clustering * rel_aoa                              -0.030819
global_clustering * rel_clustering                       -0.102557
global_clustering * rel_frequency                        -0.173229
global_clustering * rel_letters_count                     0.061580
global_clustering * rel_orthographic_density              0.041752
global_clustering * rel_synonyms_count                    0.387353
global_frequency * global_letters_count                  -0.028479
global_frequency * global_orthographic_density           -0.021515
global_frequency * global_synonyms_count                  0.143565
global_frequency * rel_aoa                               -0.059465
global_frequency * rel_clustering                        -0.051692
global_frequency * rel_frequency                          0.017029
global_frequency * rel_letters_count                      0.098445
global_frequency * rel_orthographic_density               0.094073
global_frequency * rel_synonyms_count                    -0.179442
global_letters_count * global_orthographic_density       -0.083471
global_letters_count * global_synonyms_count              0.024815
global_letters_count * rel_aoa                           -0.078264
global_letters_count * rel_clustering                    -0.186308
global_letters_count * rel_frequency                     -0.054889
global_letters_count * rel_letters_count                 -0.004041
global_letters_count * rel_orthographic_density           0.218292
global_letters_count * rel_synonyms_count                -0.087901
global_orthographic_density * global_synonyms_count       0.373325
global_orthographic_density * rel_aoa                    -0.089947
global_orthographic_density * rel_clustering             -0.330884
global_orthographic_density * rel_frequency              -0.119549
global_orthographic_density * rel_letters_count           0.009519
global_orthographic_density * rel_orthographic_density    0.020649
global_orthographic_density * rel_synonyms_count         -0.563932
global_synonyms_count * rel_aoa                          -0.064624
global_synonyms_count * rel_clustering                   -0.466973
global_synonyms_count * rel_frequency                     0.084193
global_synonyms_count * rel_letters_count                 0.103563
global_synonyms_count * rel_orthographic_density         -0.197650
global_synonyms_count * rel_synonyms_count               -0.052951
rel_aoa * rel_clustering                                  0.044429
rel_aoa * rel_frequency                                   0.060949
rel_aoa * rel_letters_count                               0.055316
rel_aoa * rel_orthographic_density                        0.114020
rel_aoa * rel_synonyms_count                              0.039409
rel_clustering * rel_frequency                            0.164152
rel_clustering * rel_letters_count                        0.023867
rel_clustering * rel_orthographic_density                 0.277787
rel_clustering * rel_synonyms_count                      -0.004933
rel_frequency * rel_letters_count                        -0.016676
rel_frequency * rel_orthographic_density                 -0.001246
rel_frequency * rel_synonyms_count                       -0.031992
rel_letters_count * rel_orthographic_density             -0.139332
rel_letters_count * rel_synonyms_count                   -0.042732
rel_orthographic_density * rel_synonyms_count             0.455379
dtype: float64

Regressing rel orthographic_density with 530 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3169295649552263

intercept                      2.272444
global_aoa                     0.003676
global_clustering              0.127670
global_frequency              -0.046062
global_letters_count          -0.118499
global_orthographic_density   -0.500778
global_synonyms_count          0.128505
rel_aoa                       -0.022773
rel_clustering                -0.117516
rel_frequency                  0.039173
rel_letters_count              0.108039
rel_orthographic_density       0.961957
rel_synonyms_count            -0.048554
dtype: float64

Regressing rel orthographic_density with 530 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.41047224253628145

intercept                                                 3.346320
global_aoa                                               -1.287112
global_clustering                                        -0.188827
global_frequency                                          0.022583
global_letters_count                                      0.238350
global_orthographic_density                              -0.472724
global_synonyms_count                                    -2.108467
rel_aoa                                                   0.540334
rel_clustering                                            0.432584
rel_frequency                                            -0.644098
rel_letters_count                                         0.043833
rel_orthographic_density                                 -0.035732
rel_synonyms_count                                        6.664813
global_aoa * global_clustering                           -0.130963
global_aoa * global_frequency                             0.010803
global_aoa * global_letters_count                         0.048471
global_aoa * global_orthographic_density                  0.075634
global_aoa * global_synonyms_count                        0.120282
global_aoa * rel_aoa                                      0.012447
global_aoa * rel_clustering                               0.111056
global_aoa * rel_frequency                               -0.008553
global_aoa * rel_letters_count                           -0.017044
global_aoa * rel_orthographic_density                     0.005617
global_aoa * rel_synonyms_count                          -0.153043
global_clustering * global_frequency                      0.022603
global_clustering * global_letters_count                  0.106220
global_clustering * global_orthographic_density           0.029823
global_clustering * global_synonyms_count                 0.069898
global_clustering * rel_aoa                              -0.044989
global_clustering * rel_clustering                       -0.085381
global_clustering * rel_frequency                        -0.143385
global_clustering * rel_letters_count                     0.101667
global_clustering * rel_orthographic_density              0.171251
global_clustering * rel_synonyms_count                    0.428810
global_frequency * global_letters_count                   0.003842
global_frequency * global_orthographic_density           -0.033235
global_frequency * global_synonyms_count                  0.137312
global_frequency * rel_aoa                               -0.046848
global_frequency * rel_clustering                        -0.020725
global_frequency * rel_frequency                          0.014240
global_frequency * rel_letters_count                      0.068745
global_frequency * rel_orthographic_density               0.113800
global_frequency * rel_synonyms_count                    -0.191035
global_letters_count * global_orthographic_density       -0.008622
global_letters_count * global_synonyms_count              0.054833
global_letters_count * rel_aoa                           -0.054432
global_letters_count * rel_clustering                    -0.157519
global_letters_count * rel_frequency                     -0.046033
global_letters_count * rel_letters_count                 -0.005180
global_letters_count * rel_orthographic_density           0.153340
global_letters_count * rel_synonyms_count                -0.110897
global_orthographic_density * global_synonyms_count       0.309206
global_orthographic_density * rel_aoa                    -0.047923
global_orthographic_density * rel_clustering             -0.177101
global_orthographic_density * rel_frequency              -0.042369
global_orthographic_density * rel_letters_count          -0.014102
global_orthographic_density * rel_orthographic_density    0.046808
global_orthographic_density * rel_synonyms_count         -0.432798
global_synonyms_count * rel_aoa                          -0.032203
global_synonyms_count * rel_clustering                   -0.274192
global_synonyms_count * rel_frequency                     0.099196
global_synonyms_count * rel_letters_count                 0.026544
global_synonyms_count * rel_orthographic_density         -0.207932
global_synonyms_count * rel_synonyms_count               -0.091682
rel_aoa * rel_clustering                                  0.066355
rel_aoa * rel_frequency                                   0.046487
rel_aoa * rel_letters_count                               0.032878
rel_aoa * rel_orthographic_density                        0.075485
rel_aoa * rel_synonyms_count                             -0.008448
rel_clustering * rel_frequency                            0.136032
rel_clustering * rel_letters_count                       -0.036107
rel_clustering * rel_orthographic_density                 0.033122
rel_clustering * rel_synonyms_count                      -0.181537
rel_frequency * rel_letters_count                        -0.029615
rel_frequency * rel_orthographic_density                 -0.077730
rel_frequency * rel_synonyms_count                       -0.047645
rel_letters_count * rel_orthographic_density             -0.127625
rel_letters_count * rel_synonyms_count                    0.016103
rel_orthographic_density * rel_synonyms_count             0.366219
dtype: float64