Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also write each figure to the path built
# from `settings.FIGURE`; when False figures are only shown inline.
SAVE_FIGURES = False
# Subset of features used for the "paper-*" figures.
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
# NOTE(review): not referenced in the part of the notebook visible here;
# presumably the number of PCA components used further down -- TODO confirm.
N_COMPONENTS = 3
# Number of bins tried first when binning source feature values; the plotting
# functions retry with fewer bins if binning raises a ValueError.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Model whose mined substitutions are analysed in this notebook.
model = Model(time=Time.continuous, source=Source.majority,
              past=Past.last_bin, durl=Durl.exclude_past, max_distance=1)
data = []

# Collect the ids only; each substitution is then loaded in its own
# session in the loop below.
with session_scope() as session:
    substitutions = session.query(Substitution.id).filter(
        Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Build one record per (substitution, feature) pair.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Feature values of the source and destination words, first as
            # they are, then relative to the sentence (median).
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                feature=feature,
                source=source,
                source_rel=source_rel,
                destination=destination,
                destination_rel=destination_rel,
                # Null-hypothesis values: the feature average, and the
                # feature average restricted to synonyms of the source word.
                h0=substitution.feature_average(feature),
                h0_rel=substitution.feature_average(
                    feature, sentence_relative='median'),
                h0n=substitution.feature_average(
                    feature, source_synonyms=True),
                h0n_rel=substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median')))

original_variations = pd.DataFrame(data)
del data


Got 1150 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.last_bin, durl=Durl.exclude_past, max_distance=1)
100% (1150 of 1150) |######################| Elapsed Time: 0:00:43 Time: 0:00:43

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values: these are averaged per cluster and are the
# ones blanked out by the AoA crop below.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# First average the rows sharing a (destination, occurrence, position,
# feature), then average per (cluster, feature).
# The columns are selected with a *list*: selecting them with a bare tuple
# (`[...]['a', 'b']`) is deprecated and rejected by recent pandas. The
# grouping key 'feature' is not re-selected since `as_index=False` already
# returns it as a column (and it is not a numeric column to average).
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Features sorted by the docstring of their transformed-feature function.
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted first with absolute (global) feature values, then with sentence-relative feature values; in each case we use fixed-width bins first, then quantile bins.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of `h0`
    (row ``H_0``) and to the mean of `h0n` (row ``H_00``) in that bin. A cell
    shows ``*``, ``**`` or ``***`` when the null mean falls outside the
    Student-t confidence interval around the observed mean at the .05, .01
    or .001 level respectively, and ``ns.`` otherwise.

    name -- title printed above the table
    bins -- bin index (0-based) of each observation; observations whose bin
        matches no index (e.g. NaN) are ignored
    h0, h0n -- null-hypothesis values of each observation
    values -- observed values, aligned with `bins`
    """
    alphas = (.05, .01, .001)
    # `bins.max()` is a float when the bins contain NaN; make it usable as a
    # count.
    bin_count = int(bins.max()) + 1

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # The observed means and the CI half-widths only depend on `values`, so
    # compute them once instead of once per null hypothesis.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        # Standard error of the mean: the sample standard deviation
        # (ddof=1) divided by sqrt(n). Dividing by sqrt(n - 1) on top of
        # ddof=1 would apply the small-sample correction twice.
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    observed = bin_values[:, np.newaxis]
    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])
        expected = bin_nulls[:, np.newaxis]
        differences = (observed < expected - cis) | (observed > expected + cis)

        print(null_name + ' |', end='')
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level at which the difference holds.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the mean destination feature against the binned source feature.

    Designed to be used through `FacetGrid.map_dataframe`, hence the
    keyword-only interface. On the current axes it draws the observed curve
    with its 95% confidence band, the two null curves (`h0` and `h0n`) and
    the `y = x` line, then prints the per-bin significance table.

    Recognised keyword arguments:

    data -- DataFrame with columns `source`, `destination`, `h0` and `h0n`
        (or their `_rel` counterparts if `relative` is true), plus the
        `feature_field` column
    color -- colour of the curves (default 'blue')
    relative -- use the `*_rel` columns (default False)
    quantiles -- bin with `pd.qcut` instead of fixed-width `pd.cut`
        (default False)
    feature_field -- column whose first value is used as the title of the
        printed significance table (default 'feature')

    Raises ValueError if the source values cannot be binned at all.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when binning fails
    # (presumably because of non-unique bin edges -- TODO confirm).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail with an obscure NameError.
        raise ValueError("Could not bin source values into {} bins or fewer"
                         .format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample standard deviation (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    null_color = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=null_color, alpha=0.2)
    plt.plot(middles, h0s, '--', color=null_color,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=null_color,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=null_color,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently, so that null curves exceeding the range on both sides
    # stay visible.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function`, three per row.

    data -- DataFrame holding a `feature_field` column
    features -- features to plot, in facet order
    filename -- name used to build the output path when `SAVE_FIGURES` is set
    plot_function -- function called through `FacetGrid.map_dataframe`
    xlabel, ylabel -- axis labels applied to every facet
    feature_field -- column the grid is split (and coloured) by
    plot_kws -- extra keyword arguments forwarded to `plot_function`
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    selected = data[data[feature_field].isin(features)]
    # NOTE(review): recent seaborn versions name the `size` argument
    # `height` -- confirm against the installed version before changing it.
    g = sb.FacetGrid(data=selected,
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; replace it
        # with the docstring of the corresponding transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias of one feature on `ax`.

    The bias is the per-bin mean destination value minus the per-bin mean
    of `h0n`, divided by the absolute mean of the per-bin `h0` values. Bins
    are laid out evenly on [0, 1] so that features can be overlaid.

    ax -- axes to draw on
    data -- DataFrame for a single feature, with columns `feature`,
        `source`, `destination`, `h0`, `h0n` (or the `_rel` counterparts)
    color -- colour of the curve
    ci -- also draw the 95% confidence band
    relative -- use the `*_rel` columns
    quantiles -- bin with `pd.qcut` instead of fixed-width `pd.cut`

    Raises ValueError if the source values cannot be binned at all.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when binning fails
    # (presumably because of non-unique bin edges -- TODO confirm).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail with an obscure NameError.
        raise ValueError("Could not bin source values into {} bins or fewer"
                         .format(BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample standard deviation (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): for sentence-relative values the mean of H0 may be close
    # to zero, which would blow up this normalisation -- confirm.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay the curves of several features on a single figure.

    data -- DataFrame holding a `feature` column
    features -- features to plot, one curve (and palette colour) each
    filename -- name used to build the output path when `SAVE_FIGURES` is set
    palette_name -- seaborn palette the colours are taken from
    plot_function -- called as `plot_function(ax, feature_data, color=...,
        **plot_kws)` for each feature, with rows containing NaN dropped
    title, xlabel, ylabel -- figure title and axis labels
    plot_kws -- extra keyword arguments forwarded to `plot_function`

    Returns the axes, so that callers can adjust them further.
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature, color in zip(features, palette):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | ns. | ns. | **  |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | **  |
H_00 | *** | **  | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *   | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *   | *** | *** | *   |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | **  |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias of all features overlaid, absolute values, fixed-width bins, no CIs.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False));



In [13]:
# Paper features, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *   | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | ns. |

In [14]:
# Bias of the paper features overlaid, absolute values, fixed-width bins;
# the y axis is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, absolute values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *   | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | **  | **  | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *   | *** | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | ns. | ns. | **  |

In [16]:
# Bias of all features overlaid, absolute values, quantile bins, no CIs.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False, quantiles=True));



In [17]:
# Paper features, absolute values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *   | *** | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | **  |

In [18]:
# Bias of the paper features overlaid, absolute values, quantile bins;
# the y axis is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws=dict(quantiles=True),
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *   | ns. | ns. | **  |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *   | *   | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | *   | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | *   |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | **  |
H_00 | **  | ns. | ns. | ns. |

In [20]:
# Bias of all features overlaid, sentence-relative values, fixed-width bins,
# no CIs.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(ci=False, relative=True));



In [21]:
# Paper features, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | *   | ns. | ns. |

In [22]:
# Bias of the paper features overlaid, sentence-relative values, fixed-width
# bins; the y axis is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws=dict(relative=True),
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True, quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *   | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | **  | ns. | ns. | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | ns. | ns. | ns. |

In [24]:
# Bias of all features overlaid, sentence-relative values, quantile bins,
# no CIs.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(ci=False, relative=True, quantiles=True));



In [25]:
# Paper features, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True, quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | ns. |

In [26]:
# Bias of the paper features overlaid, sentence-relative values, quantile
# bins. Uses the 'deep' palette like the three other paper overlays, so that
# a given feature keeps the same colour across the paper figures.
plot_overlay(data=variations,
             features=PAPER_FEATURES,
             filename='paper-variations_bias-quantilebins_sentencerel',
             palette_name='deep',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(relative=True, quantiles=True));


3 Streamplots

We'd like to see how absolute and relative feature values interact upon substitution. In particular, we want to know which effect dominates: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but that gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) space.

    Meant to be used through `FacetGrid.map_dataframe`, which provides the
    `data` and `color` keyword arguments.

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : matplotlib color, optional
        Base color (desaturated) of the two reference lines; defaults
        to 'blue'.
    bin_count : int, optional
        Number of fixed-width bins along each axis; defaults to 4.

    For each cell of the `bin_count` x `bin_count` grid over
    (source, source_rel), the arrow components are the mean displacement
    (destination - source, destination_rel - source_rel) of the
    substitutions whose source falls in that cell, and the color is the mean
    norm of those displacements. Cells without any substitution yield NaN.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.get('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Per-substitution displacement and its norm, computed once instead of
    # once per grid cell.
    delta = dest - source
    delta_rel = dest_rel - source_rel
    norm = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute bin values. Arrays are indexed [y, x], the layout that
    # plt.streamplot expects for its u/v grids.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            cell = (x_bins == x) & (y_bins == y)
            u_values[y, x] = delta[cell].mean()
            v_values[y, x] = delta_rel[cell].mean()
            strength[y, x] = norm[cell].mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: relative value 0, i.e. the sentence average.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the h0 value of the first row of `data`.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature, laid out three per row, each with its own axes.
g = sb.FacetGrid(data=variations, col='feature', hue='feature',
                 col_order=ordered_features, hue_order=ordered_features,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        # Replace the raw feature name by the docstring of its
        # transformed-feature function.
        feature_doc = Substitution._transformed_feature(ax.get_title()).__doc__
        ax.set_title(feature_doc)
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', hue='feature',
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        # Replace the raw feature name by the docstring of its
        # transformed-feature function.
        feature_doc = Substitution._transformed_feature(ax.get_title()).__doc__
        ax.set_title(feature_doc)
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit a PCA on the per-cluster feature variations, keeping only the clusters
# for which every feature has a variation value.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance each one explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Loadings of the first N_COMPONENTS components (last expression, so that it
# is displayed as the cell output).
pd.DataFrame(
    pca.components_[:N_COMPONENTS],
    index=['Component-{}'.format(i) for i in range(N_COMPONENTS)],
    columns=pcafeatures,
)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.53042892  0.18104279  0.07913708  0.06786873  0.03697332  0.02788193
  0.0206366   0.01860622  0.01583829  0.01092013  0.00649223]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 0.496992 -0.270738 0.090738 -0.227000 -0.221398 0.436484 -0.232202 -0.272162 0.389509 -0.281082 0.147148 -0.006038
Component-1 0.343778 -0.390886 0.156088 -0.292271 -0.224462 -0.413586 0.168769 -0.291347 -0.444570 0.257901 -0.140484 0.023374
Component-2 0.414526 -0.071759 -0.057576 0.021356 0.899897 -0.022039 -0.028583 -0.051746 -0.035438 0.043382 -0.010351 -0.046988

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Collect one record per (substitution, component): source and destination
# component values, plus the two component averages ('h0', and 'h0n' which is
# restricted with source_synonyms=True). Each substitution is loaded in its
# own database session.
rows = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            rows.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True)
            ))

original_component_variations = pd.DataFrame(rows)
del rows


100% (1150 of 1150) |######################| Elapsed Time: 0:00:54 Time: 0:00:54

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Two-stage averaging: first over the substitutions that share the same
# (destination, occurrence, position, component), then per cluster and
# component.
# The column selection is passed as a list: indexing a groupby with a bare
# tuple of column names is deprecated in pandas 1.0 and rejected from
# pandas 2.0. The selected columns themselves are unchanged.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()
)

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per PCA component: appearing-word value against
# disappearing-word value.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False, aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *** | ns. | ns. | *   |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | *   | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | *   | **  | *** | *   |

4.2 On a subset of relevant features


In [34]:
# Subset of features on which the PCA of this section is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit a PCA on the per-cluster variations of the relevant features only,
# keeping the clusters for which all of them have a variation value.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance each one explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of every retained component (last expression, so that it is
# displayed as the cell output).
pd.DataFrame(
    pca.components_,
    index=['Component-{}'.format(i) for i in range(pca.n_components_)],
    columns=pcafeatures,
)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.63854591  0.21830627]

Out[35]:
aoa frequency letters_count
Component-0 0.764777 -0.385333 0.516367
Component-1 -0.379317 0.378551 0.844285

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Collect one record per (substitution, component): source and destination
# component values, plus the two component averages ('h0', and 'h0n' which is
# restricted with source_synonyms=True). Each substitution is loaded in its
# own database session.
rows = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            rows.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True)
            ))

original_component_variations = pd.DataFrame(rows)
del rows


100% (1150 of 1150) |######################| Elapsed Time: 0:00:09 Time: 0:00:09

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Two-stage averaging: first over the substitutions that share the same
# (destination, occurrence, position, component), then per cluster and
# component.
# The column selection is passed as a list: indexing a groupby with a bare
# tuple of column names is deprecated in pandas 1.0 and rejected from
# pandas 2.0. The selected columns themselves are unchanged.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()
)

Plot the actual variations of components


In [38]:
# One panel per PCA component: appearing-word value against
# disappearing-word value.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False, aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | ns. |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up each transformed-feature function once, instead of once per
# (word, feature) pair. Called without argument it returns the collection of
# words it covers; called with a word it returns that word's value.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined.
# Start from every word covered by at least one of these features...
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# ...then build one row per word and one column per feature. The count below
# relies on dropna(), i.e. presumably a feature returns a missing value for
# a word it does not cover -- TODO confirm.
words_list = list(words)
data = dict((feature, [tfeature(word) for word in words_list])
            for feature, tfeature in tfeatures)
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 729 (cluster-unique) substitutions, but the PCA is in fact computed on 550 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# (label, binning function) pairs to try; only fixed-width binning is
# enabled, the quantile variant is left commented out.
cuts = [
    ('fixed bins', pd.cut),
    # ('quantiles', pd.qcut),
]
# (label, column-name suffix) pairs for global and sentence-relative values.
rels = [
    ('global', ''),
    ('sentence-relative', '_rel'),
]

def star_level(p):
    """Return the 3-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes NaN, since every comparison is then false).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and each enabled type of binning (only fixed-width bins at the moment; quantile bins are commented out in `cuts`). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# One-way ANOVA of feature2's destination value across bins of feature1's
# source value, for every pair of features and every combination of
# global/sentence-relative values.

# Pivot each value column once into a (cluster_id x feature) table; these
# tables do not depend on the loop variables below.
pivots = {
    column: variations.pivot(index='cluster_id', columns='feature',
                             values=column)
    for column in [kind + suffix
                   for kind in ('source', 'destination')
                   for _label, suffix in rels]
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]
            for (rel2_label, rel2) in rels:
                destination = pivots['destination' + rel2][feature2]

                # Compute binning, falling back to fewer bins when the
                # binning function rejects the requested count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: report it rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
    * global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way ANOVA of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of features and every
# combination of global/sentence-relative values.

# Pivot each value column once into a (cluster_id x feature) table; these
# tables do not depend on the loop variables below.
pivots = {
    column: variations.pivot(index='cluster_id', columns='feature',
                             values=column)
    for column in [kind + suffix
                   for kind in ('source', 'destination')
                   for _label, suffix in rels]
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]
            for (rel2_label, rel2) in rels:
                # Despite its name, this holds feature2's variation.
                destination = pivots['destination' + rel2][feature2]\
                    - pivots['source' + rel2][feature2]

                # Compute binning, falling back to fewer bins when the
                # binning function rejects the requested count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: report it rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {} (could not bin source values)'
                          .format(rel1_label, rel2_label))
                    continue

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions through this lens (that is, at the interaction of pairs of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" boolean to a (label, column-name suffix) pair.
# NOTE: this rebinds `rels`, which the ANOVA section defines as a list of
# (label, suffix) pairs; re-running the ANOVA cells after this one requires
# re-running the cell that defines the list first.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Linearly regress a target feature's destination value on source features.

    The result is printed (number of measures, R^2 and coefficients); nothing
    is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with at least the columns `cluster_id`, `feature`,
        `source`, `source_rel`, `destination` and `destination_rel`. Both the
        source and the destination values are read from this frame.
    features : iterable of str
        Feature names used as regressors; they are sorted before use.
    target : str
        Feature whose destination value is the regressand.
    source_rel : bool or 'both'
        `False` regresses on the `source` column, `True` on `source_rel`,
        and `'both'` on the two sets of columns together.
    dest_rel : bool
        `False` predicts the `destination` column, `True` predicts
        `destination_rel`.
    interactions : bool
        If true, add pairwise products of the regressors (degree-2,
        interaction-only polynomial features).

    Raises
    ------
    ValueError
        If `source_rel` is not `True`, `False` or `'both'`, or if `dest_rel`
        is not a bool.
    """
    if source_rel not in [True, False, 'both']:
        raise ValueError("source_rel must be True, False or 'both', got {!r}"
                         .format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}".format(dest_rel))
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]

    features = tuple(sorted(features))
    # Column keys into the pivoted source table, and the matching
    # human-readable regressor names (same order).
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]

    # Get source and destination values.
    # One row per cluster, one column per (source kind, feature); clusters
    # missing any requested regressor are dropped.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination from `data` (not from a notebook global), and
    # align it on the clusters kept above. `reindex` yields NaN for clusters
    # without a target row, which `dropna` then removes, instead of raising
    # a KeyError as `.loc` does on recent pandas.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .reindex(source.index)[target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the regressors it multiplies;
        # the all-zero powers row is the bias column, i.e. the intercept.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    # With interactions the bias column already plays the intercept's role,
    # so the model must not fit a second one.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # `Series.append` was removed in pandas 2.0; `pd.concat` gives the
        # same result on every pandas version.
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For every paper feature taken as target, run the regression for each
# combination of source relativeness (global / relative / both) and
# destination relativeness, first without and then with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    combinations = product([False, True, 'both'],
                           [False, True],
                           [False, True])
    for source_rel, dest_rel, with_interactions in combinations:
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=with_interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11988854708304109

intercept                      3.809587
global_aoa                     0.083983
global_clustering              0.043883
global_frequency               0.463145
global_letters_count           0.058487
global_orthographic_density    0.128289
global_synonyms_count         -0.109480
dtype: float64

Regressing global frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.15191950175779256

intercept                                             -16.949642
global_aoa                                              0.662014
global_clustering                                      -1.848897
global_frequency                                        2.403301
global_letters_count                                    0.993853
global_orthographic_density                             3.425309
global_synonyms_count                                   0.825941
global_aoa * global_clustering                          0.010463
global_aoa * global_frequency                          -0.046892
global_aoa * global_letters_count                      -0.018836
global_aoa * global_orthographic_density               -0.028232
global_aoa * global_synonyms_count                      0.036573
global_clustering * global_frequency                    0.156988
global_clustering * global_letters_count                0.026974
global_clustering * global_orthographic_density         0.197243
global_clustering * global_synonyms_count               0.142715
global_frequency * global_letters_count                -0.072419
global_frequency * global_orthographic_density         -0.238864
global_frequency * global_synonyms_count               -0.016037
global_letters_count * global_orthographic_density      0.035632
global_letters_count * global_synonyms_count           -0.043202
global_orthographic_density * global_synonyms_count     0.003333
dtype: float64

Regressing rel frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.07421295594398379

intercept                     -6.900193
global_aoa                     0.050084
global_clustering              0.085016
global_frequency               0.390089
global_letters_count           0.126365
global_orthographic_density    0.039007
global_synonyms_count         -0.107603
dtype: float64

Regressing rel frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.09681726963584547

intercept                                             -26.076740
global_aoa                                              0.460260
global_clustering                                      -1.991287
global_frequency                                        2.526543
global_letters_count                                    0.751103
global_orthographic_density                             1.354939
global_synonyms_count                                   0.069002
global_aoa * global_clustering                         -0.032975
global_aoa * global_frequency                          -0.091915
global_aoa * global_letters_count                       0.019294
global_aoa * global_orthographic_density                0.051606
global_aoa * global_synonyms_count                      0.023543
global_clustering * global_frequency                    0.192399
global_clustering * global_letters_count                0.085925
global_clustering * global_orthographic_density         0.037906
global_clustering * global_synonyms_count               0.091908
global_frequency * global_letters_count                -0.034662
global_frequency * global_orthographic_density         -0.189150
global_frequency * global_synonyms_count                0.011712
global_letters_count * global_orthographic_density      0.036959
global_letters_count * global_synonyms_count            0.004565
global_orthographic_density * global_synonyms_count    -0.023228
dtype: float64

Regressing global frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06289590505070453

intercept                   9.478980
rel_aoa                     0.076604
rel_clustering             -0.093171
rel_frequency               0.275135
rel_letters_count           0.067605
rel_orthographic_density    0.118766
rel_synonyms_count         -0.216010
dtype: float64

Regressing global frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09713067555340638

intercept                                        9.323895
rel_aoa                                          0.178496
rel_clustering                                   0.024393
rel_frequency                                    0.190291
rel_letters_count                                0.071330
rel_orthographic_density                        -0.237660
rel_synonyms_count                              -0.011491
rel_aoa * rel_clustering                         0.002093
rel_aoa * rel_frequency                          0.015518
rel_aoa * rel_letters_count                     -0.027461
rel_aoa * rel_orthographic_density              -0.013075
rel_aoa * rel_synonyms_count                     0.001794
rel_clustering * rel_frequency                   0.040303
rel_clustering * rel_letters_count               0.097334
rel_clustering * rel_orthographic_density        0.265367
rel_clustering * rel_synonyms_count              0.251537
rel_frequency * rel_letters_count               -0.008928
rel_frequency * rel_orthographic_density        -0.102721
rel_frequency * rel_synonyms_count               0.073275
rel_letters_count * rel_orthographic_density     0.032305
rel_letters_count * rel_synonyms_count          -0.207544
rel_orthographic_density * rel_synonyms_count   -0.393851
dtype: float64

Regressing rel frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.28270729308265086

intercept                  -1.261048
rel_aoa                     0.037501
rel_clustering              0.168730
rel_frequency               0.661103
rel_letters_count          -0.008447
rel_orthographic_density   -0.082476
rel_synonyms_count         -0.216672
dtype: float64

Regressing rel frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.32332978419730085

intercept                                       -1.509630
rel_aoa                                         -0.042486
rel_clustering                                   0.035899
rel_frequency                                    0.581996
rel_letters_count                                0.013611
rel_orthographic_density                        -0.518436
rel_synonyms_count                              -0.176321
rel_aoa * rel_clustering                        -0.036515
rel_aoa * rel_frequency                         -0.075858
rel_aoa * rel_letters_count                      0.004802
rel_aoa * rel_orthographic_density               0.110623
rel_aoa * rel_synonyms_count                     0.074121
rel_clustering * rel_frequency                   0.013976
rel_clustering * rel_letters_count               0.086710
rel_clustering * rel_orthographic_density        0.008294
rel_clustering * rel_synonyms_count              0.303992
rel_frequency * rel_letters_count               -0.008213
rel_frequency * rel_orthographic_density        -0.173022
rel_frequency * rel_synonyms_count               0.058164
rel_letters_count * rel_orthographic_density     0.013183
rel_letters_count * rel_synonyms_count          -0.092321
rel_orthographic_density * rel_synonyms_count   -0.093774
dtype: float64

Regressing global frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.12808703763187135

intercept                      3.251995
global_aoa                     0.072879
global_clustering              0.160953
global_frequency               0.527759
global_letters_count           0.103279
global_orthographic_density    0.256526
global_synonyms_count          0.339634
rel_aoa                        0.016092
rel_clustering                -0.145028
rel_frequency                 -0.070555
rel_letters_count             -0.046543
rel_orthographic_density      -0.145210
rel_synonyms_count            -0.526989
dtype: float64

Regressing global frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.29362252747953554

intercept                                                -38.747403
global_aoa                                                 1.855764
global_clustering                                         -4.932409
global_frequency                                           1.700485
global_letters_count                                       1.936658
global_orthographic_density                               10.366804
global_synonyms_count                                     12.378797
rel_aoa                                                    2.270647
rel_clustering                                            -1.504630
rel_frequency                                             -1.552374
rel_letters_count                                          0.935759
rel_orthographic_density                                  -0.441427
rel_synonyms_count                                       -11.300487
global_aoa * global_clustering                             0.165611
global_aoa * global_frequency                              0.017813
global_aoa * global_letters_count                         -0.068412
global_aoa * global_orthographic_density                  -0.159727
global_aoa * global_synonyms_count                        -0.592754
global_aoa * rel_aoa                                      -0.036657
global_aoa * rel_clustering                               -0.144292
global_aoa * rel_frequency                                 0.087375
global_aoa * rel_letters_count                             0.027423
global_aoa * rel_orthographic_density                      0.040094
global_aoa * rel_synonyms_count                            0.581732
global_clustering * global_frequency                       0.072150
global_clustering * global_letters_count                  -0.156791
global_clustering * global_orthographic_density            1.598773
global_clustering * global_synonyms_count                  1.030941
global_clustering * rel_aoa                               -0.044155
global_clustering * rel_clustering                         0.180808
global_clustering * rel_frequency                         -0.152159
global_clustering * rel_letters_count                      0.476564
global_clustering * rel_orthographic_density              -0.686882
global_clustering * rel_synonyms_count                    -0.398967
global_frequency * global_letters_count                   -0.301870
global_frequency * global_orthographic_density             0.014519
global_frequency * global_synonyms_count                  -0.374817
global_frequency * rel_aoa                                -0.264253
global_frequency * rel_clustering                          0.508385
global_frequency * rel_frequency                          -0.046528
global_frequency * rel_letters_count                       0.238195
global_frequency * rel_orthographic_density               -0.337954
global_frequency * rel_synonyms_count                      0.440252
global_letters_count * global_orthographic_density         0.220447
global_letters_count * global_synonyms_count               0.541798
global_letters_count * rel_aoa                            -0.022266
global_letters_count * rel_clustering                      0.183977
global_letters_count * rel_frequency                       0.107159
global_letters_count * rel_letters_count                  -0.000877
global_letters_count * rel_orthographic_density           -0.222968
global_letters_count * rel_synonyms_count                 -0.479360
global_orthographic_density * global_synonyms_count       -0.714001
global_orthographic_density * rel_aoa                      0.216221
global_orthographic_density * rel_clustering              -1.290083
global_orthographic_density * rel_frequency                0.100958
global_orthographic_density * rel_letters_count           -0.202205
global_orthographic_density * rel_orthographic_density    -0.046734
global_orthographic_density * rel_synonyms_count           1.531137
global_synonyms_count * rel_aoa                            0.138630
global_synonyms_count * rel_clustering                    -0.971057
global_synonyms_count * rel_frequency                      0.034340
global_synonyms_count * rel_letters_count                 -0.125701
global_synonyms_count * rel_orthographic_density           0.590777
global_synonyms_count * rel_synonyms_count                 0.143230
rel_aoa * rel_clustering                                  -0.040103
rel_aoa * rel_frequency                                    0.089465
rel_aoa * rel_letters_count                                0.063114
rel_aoa * rel_orthographic_density                        -0.089097
rel_aoa * rel_synonyms_count                              -0.169658
rel_clustering * rel_frequency                            -0.340259
rel_clustering * rel_letters_count                        -0.401569
rel_clustering * rel_orthographic_density                  0.363223
rel_clustering * rel_synonyms_count                        0.855538
rel_frequency * rel_letters_count                         -0.053537
rel_frequency * rel_orthographic_density                   0.072874
rel_frequency * rel_synonyms_count                         0.068577
rel_letters_count * rel_orthographic_density               0.232078
rel_letters_count * rel_synonyms_count                     0.092980
rel_orthographic_density * rel_synonyms_count             -1.454993
dtype: float64

Regressing rel frequency with 394 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3404879904417327

intercept                      3.341755
global_aoa                     0.072045
global_clustering              0.276161
global_frequency              -0.402357
global_letters_count           0.101565
global_orthographic_density    0.232495
global_synonyms_count          0.252270
rel_aoa                       -0.006829
rel_clustering                -0.193322
rel_frequency                  0.900656
rel_letters_count             -0.052083
rel_orthographic_density      -0.114892
rel_synonyms_count            -0.427423
dtype: float64

Regressing rel frequency with 394 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4539421881930371

intercept                                                -41.462842
global_aoa                                                 2.084592
global_clustering                                         -5.581950
global_frequency                                           0.743618
global_letters_count                                       1.948545
global_orthographic_density                               10.067547
global_synonyms_count                                     10.901073
rel_aoa                                                    1.652097
rel_clustering                                             0.011954
rel_frequency                                             -0.609224
rel_letters_count                                          0.389186
rel_orthographic_density                                  -1.153949
rel_synonyms_count                                        -9.962115
global_aoa * global_clustering                             0.164837
global_aoa * global_frequency                              0.007520
global_aoa * global_letters_count                         -0.091807
global_aoa * global_orthographic_density                  -0.188466
global_aoa * global_synonyms_count                        -0.543070
global_aoa * rel_aoa                                      -0.037934
global_aoa * rel_clustering                               -0.170895
global_aoa * rel_frequency                                 0.097401
global_aoa * rel_letters_count                             0.079904
global_aoa * rel_orthographic_density                      0.101526
global_aoa * rel_synonyms_count                            0.536094
global_clustering * global_frequency                       0.122958
global_clustering * global_letters_count                  -0.026358
global_clustering * global_orthographic_density            1.491992
global_clustering * global_synonyms_count                  0.802242
global_clustering * rel_aoa                               -0.089152
global_clustering * rel_clustering                         0.143898
global_clustering * rel_frequency                         -0.159971
global_clustering * rel_letters_count                      0.391934
global_clustering * rel_orthographic_density              -0.577710
global_clustering * rel_synonyms_count                    -0.236831
global_frequency * global_letters_count                   -0.191956
global_frequency * global_orthographic_density             0.021292
global_frequency * global_synonyms_count                  -0.358339
global_frequency * rel_aoa                                -0.232880
global_frequency * rel_clustering                          0.368758
global_frequency * rel_frequency                          -0.032276
global_frequency * rel_letters_count                       0.165134
global_frequency * rel_orthographic_density               -0.289214
global_frequency * rel_synonyms_count                      0.411214
global_letters_count * global_orthographic_density         0.150510
global_letters_count * global_synonyms_count               0.425653
global_letters_count * rel_aoa                            -0.026791
global_letters_count * rel_clustering                      0.075100
global_letters_count * rel_frequency                       0.040237
global_letters_count * rel_letters_count                   0.014209
global_letters_count * rel_orthographic_density           -0.124946
global_letters_count * rel_synonyms_count                 -0.394160
global_orthographic_density * global_synonyms_count       -0.727242
global_orthographic_density * rel_aoa                      0.206735
global_orthographic_density * rel_clustering              -1.147583
global_orthographic_density * rel_frequency                0.128455
global_orthographic_density * rel_letters_count           -0.029863
global_orthographic_density * rel_orthographic_density     0.018265
global_orthographic_density * rel_synonyms_count           1.500549
global_synonyms_count * rel_aoa                            0.131360
global_synonyms_count * rel_clustering                    -0.800549
global_synonyms_count * rel_frequency                     -0.030173
global_synonyms_count * rel_letters_count                 -0.092083
global_synonyms_count * rel_orthographic_density           0.520988
global_synonyms_count * rel_synonyms_count                 0.154858
rel_aoa * rel_clustering                                   0.027178
rel_aoa * rel_frequency                                    0.048318
rel_aoa * rel_letters_count                                0.032964
rel_aoa * rel_orthographic_density                        -0.130301
rel_aoa * rel_synonyms_count                              -0.162496
rel_clustering * rel_frequency                            -0.246035
rel_clustering * rel_letters_count                        -0.325703
rel_clustering * rel_orthographic_density                  0.216954
rel_clustering * rel_synonyms_count                        0.751237
rel_frequency * rel_letters_count                         -0.012904
rel_frequency * rel_orthographic_density                   0.003580
rel_frequency * rel_synonyms_count                         0.124626
rel_letters_count * rel_orthographic_density               0.104753
rel_letters_count * rel_synonyms_count                     0.097782
rel_orthographic_density * rel_synonyms_count             -1.319883
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 359 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.1437800438591058

intercept                      5.253935
global_aoa                     0.281145
global_clustering             -0.233922
global_frequency              -0.167489
global_letters_count           0.072056
global_orthographic_density   -0.232251
global_synonyms_count         -0.042253
dtype: float64

Regressing global aoa with 359 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.19055039114416983

intercept                                              2.734401
global_aoa                                             0.840473
global_clustering                                     -1.393679
global_frequency                                      -1.083200
global_letters_count                                   0.848606
global_orthographic_density                           -1.813714
global_synonyms_count                                 -2.186114
global_aoa * global_clustering                         0.099643
global_aoa * global_frequency                         -0.004552
global_aoa * global_letters_count                      0.010859
global_aoa * global_orthographic_density              -0.010322
global_aoa * global_synonyms_count                     0.042819
global_clustering * global_frequency                  -0.089888
global_clustering * global_letters_count               0.209122
global_clustering * global_orthographic_density       -0.049794
global_clustering * global_synonyms_count              0.047576
global_frequency * global_letters_count                0.034702
global_frequency * global_orthographic_density         0.119877
global_frequency * global_synonyms_count               0.173405
global_letters_count * global_orthographic_density     0.028969
global_letters_count * global_synonyms_count           0.037251
global_orthographic_density * global_synonyms_count    0.240869
dtype: float64

Regressing rel aoa with 359 measures, no interactions
           ^^^^^^^
R^2 = 0.06175304088096001

intercept                      0.272302
global_aoa                     0.166180
global_clustering             -0.240620
global_frequency              -0.243465
global_letters_count           0.023783
global_orthographic_density    0.050087
global_synonyms_count         -0.050456
dtype: float64

Regressing rel aoa with 359 measures, with interactions
           ^^^^^^^
R^2 = 0.11300428232770554

intercept                                              2.906153
global_aoa                                             1.435000
global_clustering                                     -0.042714
global_frequency                                      -1.513417
global_letters_count                                   0.031236
global_orthographic_density                           -1.720538
global_synonyms_count                                 -1.387724
global_aoa * global_clustering                         0.214634
global_aoa * global_frequency                          0.020291
global_aoa * global_letters_count                     -0.040915
global_aoa * global_orthographic_density               0.002613
global_aoa * global_synonyms_count                     0.093821
global_clustering * global_frequency                  -0.144799
global_clustering * global_letters_count              -0.045452
global_clustering * global_orthographic_density       -0.125697
global_clustering * global_synonyms_count              0.084003
global_frequency * global_letters_count                0.016220
global_frequency * global_orthographic_density         0.153743
global_frequency * global_synonyms_count               0.129128
global_letters_count * global_orthographic_density    -0.076301
global_letters_count * global_synonyms_count          -0.050537
global_orthographic_density * global_synonyms_count    0.247296
dtype: float64

Regressing global aoa with 359 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.0703580433486064

intercept                   6.685036
rel_aoa                     0.121890
rel_clustering              0.233788
rel_frequency               0.072368
rel_letters_count          -0.063255
rel_orthographic_density   -0.587374
rel_synonyms_count         -0.232031
dtype: float64

Regressing global aoa with 359 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.14135783667802881

intercept                                        6.669157
rel_aoa                                         -0.241589
rel_clustering                                  -0.069809
rel_frequency                                    0.146078
rel_letters_count                               -0.064043
rel_orthographic_density                        -0.481996
rel_synonyms_count                               0.041665
rel_aoa * rel_clustering                         0.104034
rel_aoa * rel_frequency                         -0.105249
rel_aoa * rel_letters_count                      0.078211
rel_aoa * rel_orthographic_density               0.156246
rel_aoa * rel_synonyms_count                     0.045364
rel_clustering * rel_frequency                   0.015033
rel_clustering * rel_letters_count               0.113728
rel_clustering * rel_orthographic_density       -0.057271
rel_clustering * rel_synonyms_count             -0.083393
rel_frequency * rel_letters_count               -0.007698
rel_frequency * rel_orthographic_density         0.082273
rel_frequency * rel_synonyms_count               0.020825
rel_letters_count * rel_orthographic_density     0.049144
rel_letters_count * rel_synonyms_count           0.210823
rel_orthographic_density * rel_synonyms_count    0.812227
dtype: float64

Regressing rel aoa with 359 measures, no interactions
           ^^^^^^^
R^2 = 0.22623019563267358

intercept                   0.495689
rel_aoa                     0.519912
rel_clustering             -0.136578
rel_frequency              -0.094381
rel_letters_count          -0.057006
rel_orthographic_density    0.001609
rel_synonyms_count         -0.171368
dtype: float64

Regressing rel aoa with 359 measures, with interactions
           ^^^^^^^
R^2 = 0.26615675329799326

intercept                                        0.787144
rel_aoa                                          0.465974
rel_clustering                                  -0.222806
rel_frequency                                    0.073990
rel_letters_count                               -0.153135
rel_orthographic_density                         0.395954
rel_synonyms_count                               0.010776
rel_aoa * rel_clustering                         0.069060
rel_aoa * rel_frequency                         -0.020370
rel_aoa * rel_letters_count                     -0.036752
rel_aoa * rel_orthographic_density              -0.046793
rel_aoa * rel_synonyms_count                    -0.095606
rel_clustering * rel_frequency                  -0.013883
rel_clustering * rel_letters_count               0.020177
rel_clustering * rel_orthographic_density        0.032539
rel_clustering * rel_synonyms_count             -0.030792
rel_frequency * rel_letters_count               -0.031669
rel_frequency * rel_orthographic_density         0.140500
rel_frequency * rel_synonyms_count               0.034604
rel_letters_count * rel_orthographic_density    -0.028238
rel_letters_count * rel_synonyms_count           0.031082
rel_orthographic_density * rel_synonyms_count    0.127183
dtype: float64

Regressing global aoa with 359 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.19090496789723665

intercept                     -0.557325
global_aoa                     0.344567
global_clustering             -0.904335
global_frequency              -0.243984
global_letters_count           0.473390
global_orthographic_density    0.000701
global_synonyms_count          0.545691
rel_aoa                       -0.076554
rel_clustering                 0.719715
rel_frequency                  0.058019
rel_letters_count             -0.482598
rel_orthographic_density      -0.230954
rel_synonyms_count            -0.679063
dtype: float64

Regressing global aoa with 359 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.40893415249940746

intercept                                                 85.986073
global_aoa                                                -1.299566
global_clustering                                          8.175673
global_frequency                                          -1.529772
global_letters_count                                      -7.456612
global_orthographic_density                              -25.007443
global_synonyms_count                                    -26.029078
rel_aoa                                                   -0.497616
rel_clustering                                             2.134235
rel_frequency                                              4.023629
rel_letters_count                                          5.993760
rel_orthographic_density                                  17.194245
rel_synonyms_count                                         8.637617
global_aoa * global_clustering                             0.307267
global_aoa * global_frequency                              0.124071
global_aoa * global_letters_count                          0.264959
global_aoa * global_orthographic_density                   0.356239
global_aoa * global_synonyms_count                         0.363697
global_aoa * rel_aoa                                       0.026750
global_aoa * rel_clustering                               -0.316824
global_aoa * rel_frequency                                -0.124826
global_aoa * rel_letters_count                            -0.235424
global_aoa * rel_orthographic_density                     -0.436561
global_aoa * rel_synonyms_count                           -0.106588
global_clustering * global_frequency                      -0.014416
global_clustering * global_letters_count                  -0.421300
global_clustering * global_orthographic_density           -3.669478
global_clustering * global_synonyms_count                 -2.075086
global_clustering * rel_aoa                               -0.157945
global_clustering * rel_clustering                         0.018368
global_clustering * rel_frequency                          0.407021
global_clustering * rel_letters_count                      0.035896
global_clustering * rel_orthographic_density               2.630752
global_clustering * rel_synonyms_count                     0.674435
global_frequency * global_letters_count                    0.270171
global_frequency * global_orthographic_density            -0.162869
global_frequency * global_synonyms_count                   0.488395
global_frequency * rel_aoa                                 0.037029
global_frequency * rel_clustering                         -0.721650
global_frequency * rel_frequency                          -0.000425
global_frequency * rel_letters_count                      -0.339008
global_frequency * rel_orthographic_density                0.214618
global_frequency * rel_synonyms_count                      0.025067
global_letters_count * global_orthographic_density         0.379874
global_letters_count * global_synonyms_count               0.674055
global_letters_count * rel_aoa                            -0.205543
global_letters_count * rel_clustering                      0.379727
global_letters_count * rel_frequency                      -0.081542
global_letters_count * rel_letters_count                   0.033577
global_letters_count * rel_orthographic_density           -0.202971
global_letters_count * rel_synonyms_count                 -0.073500
global_orthographic_density * global_synonyms_count        1.663784
global_orthographic_density * rel_aoa                     -0.087950
global_orthographic_density * rel_clustering               2.675607
global_orthographic_density * rel_frequency               -0.093541
global_orthographic_density * rel_letters_count           -0.561289
global_orthographic_density * rel_orthographic_density     0.262336
global_orthographic_density * rel_synonyms_count          -2.058332
global_synonyms_count * rel_aoa                           -0.140624
global_synonyms_count * rel_clustering                     2.402544
global_synonyms_count * rel_frequency                     -0.766149
global_synonyms_count * rel_letters_count                 -1.216139
global_synonyms_count * rel_orthographic_density          -1.503980
global_synonyms_count * rel_synonyms_count                 0.100689
rel_aoa * rel_clustering                                   0.324801
rel_aoa * rel_frequency                                   -0.047282
rel_aoa * rel_letters_count                                0.130827
rel_aoa * rel_orthographic_density                         0.069398
rel_aoa * rel_synonyms_count                              -0.015152
rel_clustering * rel_frequency                             0.190086
rel_clustering * rel_letters_count                         0.132674
rel_clustering * rel_orthographic_density                 -1.150224
rel_clustering * rel_synonyms_count                       -1.437300
rel_frequency * rel_letters_count                          0.083696
rel_frequency * rel_orthographic_density                   0.174889
rel_frequency * rel_synonyms_count                         0.231714
rel_letters_count * rel_orthographic_density               0.559970
rel_letters_count * rel_synonyms_count                     0.632855
rel_orthographic_density * rel_synonyms_count              2.282247
dtype: float64

Regressing rel aoa with 359 measures, no interactions
           ^^^^^^^
R^2 = 0.29727832928774234

intercept                      0.797462
global_aoa                    -0.453695
global_clustering             -0.624398
global_frequency              -0.238219
global_letters_count           0.254667
global_orthographic_density   -0.084287
global_synonyms_count          0.504352
rel_aoa                        0.839252
rel_clustering                 0.520334
rel_frequency                  0.064946
rel_letters_count             -0.254964
rel_orthographic_density      -0.136367
rel_synonyms_count            -0.683377
dtype: float64

Regressing rel aoa with 359 measures, with interactions
           ^^^^^^^
R^2 = 0.4712118614191564

intercept                                                 92.267569
global_aoa                                                -3.166015
global_clustering                                         10.003661
global_frequency                                          -3.321952
global_letters_count                                      -5.639174
global_orthographic_density                              -18.065750
global_synonyms_count                                    -21.617487
rel_aoa                                                    0.565075
rel_clustering                                            -0.633230
rel_frequency                                              4.544293
rel_letters_count                                          5.430448
rel_orthographic_density                                  12.578175
rel_synonyms_count                                         9.469546
global_aoa * global_clustering                             0.044357
global_aoa * global_frequency                              0.096427
global_aoa * global_letters_count                          0.210170
global_aoa * global_orthographic_density                   0.303156
global_aoa * global_synonyms_count                         0.709212
global_aoa * rel_aoa                                       0.017895
global_aoa * rel_clustering                               -0.099000
global_aoa * rel_frequency                                -0.125409
global_aoa * rel_letters_count                            -0.220707
global_aoa * rel_orthographic_density                     -0.425826
global_aoa * rel_synonyms_count                           -0.558315
global_clustering * global_frequency                      -0.314482
global_clustering * global_letters_count                  -0.259854
global_clustering * global_orthographic_density           -2.722392
global_clustering * global_synonyms_count                 -1.522635
global_clustering * rel_aoa                               -0.159693
global_clustering * rel_clustering                         0.038666
global_clustering * rel_frequency                          0.514521
global_clustering * rel_letters_count                      0.094219
global_clustering * rel_orthographic_density               1.887178
global_clustering * rel_synonyms_count                     0.847305
global_frequency * global_letters_count                    0.252789
global_frequency * global_orthographic_density            -0.151392
global_frequency * global_synonyms_count                   0.469897
global_frequency * rel_aoa                                 0.033912
global_frequency * rel_clustering                         -0.313768
global_frequency * rel_frequency                          -0.000485
global_frequency * rel_letters_count                      -0.310077
global_frequency * rel_orthographic_density                0.136082
global_frequency * rel_synonyms_count                     -0.001138
global_letters_count * global_orthographic_density         0.178671
global_letters_count * global_synonyms_count               0.218043
global_letters_count * rel_aoa                            -0.176692
global_letters_count * rel_clustering                      0.200819
global_letters_count * rel_frequency                      -0.071149
global_letters_count * rel_letters_count                   0.053614
global_letters_count * rel_orthographic_density           -0.011727
global_letters_count * rel_synonyms_count                  0.303680
global_orthographic_density * global_synonyms_count        1.307822
global_orthographic_density * rel_aoa                     -0.118453
global_orthographic_density * rel_clustering               1.925755
global_orthographic_density * rel_frequency               -0.044843
global_orthographic_density * rel_letters_count           -0.318125
global_orthographic_density * rel_orthographic_density     0.249681
global_orthographic_density * rel_synonyms_count          -1.441649
global_synonyms_count * rel_aoa                           -0.261480
global_synonyms_count * rel_clustering                     1.521204
global_synonyms_count * rel_frequency                     -0.532373
global_synonyms_count * rel_letters_count                 -0.723642
global_synonyms_count * rel_orthographic_density          -1.223528
global_synonyms_count * rel_synonyms_count                -0.036711
rel_aoa * rel_clustering                                   0.291448
rel_aoa * rel_frequency                                   -0.025232
rel_aoa * rel_letters_count                                0.099576
rel_aoa * rel_orthographic_density                         0.097293
rel_aoa * rel_synonyms_count                               0.127507
rel_clustering * rel_frequency                             0.042704
rel_clustering * rel_letters_count                         0.139688
rel_clustering * rel_orthographic_density                 -0.643816
rel_clustering * rel_synonyms_count                       -1.241876
rel_frequency * rel_letters_count                          0.074832
rel_frequency * rel_orthographic_density                   0.158832
rel_frequency * rel_synonyms_count                         0.045730
rel_letters_count * rel_orthographic_density               0.350043
rel_letters_count * rel_synonyms_count                     0.120628
rel_orthographic_density * rel_synonyms_count              1.410765
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06491143991215753

intercept                     -3.963960
global_aoa                    -0.022170
global_clustering              0.197436
global_frequency              -0.052393
global_letters_count           0.005545
global_orthographic_density   -0.027040
global_synonyms_count         -0.074569
dtype: float64

Regressing global clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2013752016139495

intercept                                              1.666954
global_aoa                                             0.302547
global_clustering                                      0.661796
global_frequency                                      -1.159007
global_letters_count                                  -0.164300
global_orthographic_density                            0.280379
global_synonyms_count                                 -2.137407
global_aoa * global_clustering                         0.039908
global_aoa * global_frequency                         -0.019142
global_aoa * global_letters_count                      0.017822
global_aoa * global_orthographic_density              -0.023814
global_aoa * global_synonyms_count                     0.027015
global_clustering * global_frequency                  -0.144357
global_clustering * global_letters_count               0.066296
global_clustering * global_orthographic_density        0.181686
global_clustering * global_synonyms_count             -0.214599
global_frequency * global_letters_count                0.046340
global_frequency * global_orthographic_density         0.099285
global_frequency * global_synonyms_count               0.031172
global_letters_count * global_orthographic_density    -0.005506
global_letters_count * global_synonyms_count           0.062989
global_orthographic_density * global_synonyms_count   -0.009648
dtype: float64

Regressing rel clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.0333445128125065

intercept                      1.664190
global_aoa                    -0.009918
global_clustering              0.133470
global_frequency              -0.035176
global_letters_count          -0.011276
global_orthographic_density   -0.008470
global_synonyms_count         -0.096828
dtype: float64

Regressing rel clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.13124985183926374

intercept                                              8.726558
global_aoa                                             0.120192
global_clustering                                      0.933107
global_frequency                                      -1.052035
global_letters_count                                  -0.284423
global_orthographic_density                            0.422433
global_synonyms_count                                 -1.635828
global_aoa * global_clustering                         0.032194
global_aoa * global_frequency                         -0.009752
global_aoa * global_letters_count                      0.026063
global_aoa * global_orthographic_density              -0.007143
global_aoa * global_synonyms_count                     0.001827
global_clustering * global_frequency                  -0.149182
global_clustering * global_letters_count               0.031699
global_clustering * global_orthographic_density        0.162728
global_clustering * global_synonyms_count             -0.186489
global_frequency * global_letters_count                0.027684
global_frequency * global_orthographic_density         0.057675
global_frequency * global_synonyms_count               0.005494
global_letters_count * global_orthographic_density     0.001525
global_letters_count * global_synonyms_count           0.069853
global_orthographic_density * global_synonyms_count   -0.011361
dtype: float64

Regressing global clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.032291875057349984

intercept                  -5.803064
rel_aoa                     0.014370
rel_clustering              0.171026
rel_frequency               0.002220
rel_letters_count          -0.017667
rel_orthographic_density   -0.018944
rel_synonyms_count         -0.091159
dtype: float64

Regressing global clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12268567460402846

intercept                                       -5.704130
rel_aoa                                         -0.049724
rel_clustering                                  -0.066404
rel_frequency                                    0.073738
rel_letters_count                               -0.028919
rel_orthographic_density                         0.040327
rel_synonyms_count                              -0.202695
rel_aoa * rel_clustering                         0.062032
rel_aoa * rel_frequency                         -0.009269
rel_aoa * rel_letters_count                     -0.010205
rel_aoa * rel_orthographic_density              -0.029457
rel_aoa * rel_synonyms_count                    -0.001944
rel_clustering * rel_frequency                  -0.091804
rel_clustering * rel_letters_count               0.007172
rel_clustering * rel_orthographic_density        0.025664
rel_clustering * rel_synonyms_count             -0.196091
rel_frequency * rel_letters_count               -0.008744
rel_frequency * rel_orthographic_density         0.008778
rel_frequency * rel_synonyms_count              -0.018047
rel_letters_count * rel_orthographic_density    -0.010196
rel_letters_count * rel_synonyms_count           0.068843
rel_orthographic_density * rel_synonyms_count    0.043570
dtype: float64

Regressing rel clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11487067981455712

intercept                   0.330690
rel_aoa                     0.003789
rel_clustering              0.359053
rel_frequency               0.014514
rel_letters_count          -0.009731
rel_orthographic_density    0.022804
rel_synonyms_count         -0.058416
dtype: float64

Regressing rel clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.19959100330173607

intercept                                        0.416817
rel_aoa                                         -0.012606
rel_clustering                                   0.074511
rel_frequency                                    0.073842
rel_letters_count                               -0.013756
rel_orthographic_density                        -0.000961
rel_synonyms_count                              -0.106508
rel_aoa * rel_clustering                         0.032676
rel_aoa * rel_frequency                          0.002617
rel_aoa * rel_letters_count                     -0.017242
rel_aoa * rel_orthographic_density              -0.047434
rel_aoa * rel_synonyms_count                    -0.005364
rel_clustering * rel_frequency                  -0.113255
rel_clustering * rel_letters_count               0.034770
rel_clustering * rel_orthographic_density        0.074138
rel_clustering * rel_synonyms_count             -0.213448
rel_frequency * rel_letters_count               -0.005854
rel_frequency * rel_orthographic_density        -0.008958
rel_frequency * rel_synonyms_count              -0.008219
rel_letters_count * rel_orthographic_density     0.002018
rel_letters_count * rel_synonyms_count           0.081101
rel_orthographic_density * rel_synonyms_count    0.089911
dtype: float64

Regressing global clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0943420772364656

intercept                     -2.372741
global_aoa                    -0.055096
global_clustering              0.254901
global_frequency              -0.113669
global_letters_count           0.010473
global_orthographic_density   -0.161089
global_synonyms_count         -0.033914
rel_aoa                        0.044836
rel_clustering                -0.045610
rel_frequency                  0.077270
rel_letters_count             -0.012968
rel_orthographic_density       0.148677
rel_synonyms_count            -0.062396
dtype: float64

Regressing global clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3834913826945405

intercept                                                 18.283898
global_aoa                                                 1.153611
global_clustering                                          3.945426
global_frequency                                          -2.429942
global_letters_count                                      -0.483867
global_orthographic_density                                0.350964
global_synonyms_count                                     -4.536896
rel_aoa                                                   -1.069427
rel_clustering                                            -3.427363
rel_frequency                                              0.836184
rel_letters_count                                         -0.678813
rel_orthographic_density                                  -1.859523
rel_synonyms_count                                        -0.090567
global_aoa * global_clustering                             0.037614
global_aoa * global_frequency                             -0.034313
global_aoa * global_letters_count                         -0.047474
global_aoa * global_orthographic_density                  -0.248181
global_aoa * global_synonyms_count                         0.105144
global_aoa * rel_aoa                                       0.010561
global_aoa * rel_clustering                               -0.038353
global_aoa * rel_frequency                                 0.021530
global_aoa * rel_letters_count                             0.087538
global_aoa * rel_orthographic_density                      0.240589
global_aoa * rel_synonyms_count                            0.026515
global_clustering * global_frequency                      -0.335540
global_clustering * global_letters_count                   0.017781
global_clustering * global_orthographic_density           -0.093081
global_clustering * global_synonyms_count                 -0.852494
global_clustering * rel_aoa                               -0.183811
global_clustering * rel_clustering                         0.053019
global_clustering * rel_frequency                          0.167399
global_clustering * rel_letters_count                     -0.020799
global_clustering * rel_orthographic_density               0.023508
global_clustering * rel_synonyms_count                     0.945115
global_frequency * global_letters_count                    0.086367
global_frequency * global_orthographic_density             0.059088
global_frequency * global_synonyms_count                   0.201467
global_frequency * rel_aoa                                -0.038466
global_frequency * rel_clustering                          0.203241
global_frequency * rel_frequency                           0.010999
global_frequency * rel_letters_count                      -0.001019
global_frequency * rel_orthographic_density                0.063784
global_frequency * rel_synonyms_count                      0.080325
global_letters_count * global_orthographic_density         0.052184
global_letters_count * global_synonyms_count              -0.307644
global_letters_count * rel_aoa                             0.022101
global_letters_count * rel_clustering                      0.135779
global_letters_count * rel_frequency                       0.004075
global_letters_count * rel_letters_count                   0.004753
global_letters_count * rel_orthographic_density           -0.069808
global_letters_count * rel_synonyms_count                  0.634202
global_orthographic_density * global_synonyms_count       -0.985946
global_orthographic_density * rel_aoa                      0.151118
global_orthographic_density * rel_clustering               0.405004
global_orthographic_density * rel_frequency                0.057687
global_orthographic_density * rel_letters_count           -0.046398
global_orthographic_density * rel_orthographic_density    -0.063506
global_orthographic_density * rel_synonyms_count           0.855161
global_synonyms_count * rel_aoa                           -0.015152
global_synonyms_count * rel_clustering                     0.462394
global_synonyms_count * rel_frequency                     -0.169406
global_synonyms_count * rel_letters_count                 -0.003524
global_synonyms_count * rel_orthographic_density           0.550080
global_synonyms_count * rel_synonyms_count                -0.049932
rel_aoa * rel_clustering                                   0.192363
rel_aoa * rel_frequency                                    0.039081
rel_aoa * rel_letters_count                               -0.068505
rel_aoa * rel_orthographic_density                        -0.212561
rel_aoa * rel_synonyms_count                              -0.100005
rel_clustering * rel_frequency                            -0.165259
rel_clustering * rel_letters_count                        -0.036083
rel_clustering * rel_orthographic_density                 -0.091674
rel_clustering * rel_synonyms_count                       -0.825731
rel_frequency * rel_letters_count                         -0.053368
rel_frequency * rel_orthographic_density                  -0.111581
rel_frequency * rel_synonyms_count                        -0.122873
rel_letters_count * rel_orthographic_density               0.053242
rel_letters_count * rel_synonyms_count                    -0.178346
rel_orthographic_density * rel_synonyms_count             -0.199844
dtype: float64

Regressing rel clustering with 320 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2056361612362234

intercept                     -1.357737
global_aoa                    -0.043947
global_clustering             -0.539173
global_frequency              -0.096148
global_letters_count          -0.006670
global_orthographic_density   -0.131149
global_synonyms_count         -0.113898
rel_aoa                        0.040092
rel_clustering                 0.819978
rel_frequency                  0.067500
rel_letters_count             -0.007243
rel_orthographic_density       0.099179
rel_synonyms_count             0.033116
dtype: float64

Regressing rel clustering with 320 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.4395261478845478

intercept                                                 16.407465
global_aoa                                                 0.654963
global_clustering                                          2.053184
global_frequency                                          -1.949683
global_letters_count                                      -0.955844
global_orthographic_density                               -0.288178
global_synonyms_count                                     -2.004130
rel_aoa                                                   -0.743103
rel_clustering                                            -1.496488
rel_frequency                                              0.712236
rel_letters_count                                         -0.327677
rel_orthographic_density                                  -1.665316
rel_synonyms_count                                        -1.453297
global_aoa * global_clustering                             0.051335
global_aoa * global_frequency                             -0.010293
global_aoa * global_letters_count                         -0.003928
global_aoa * global_orthographic_density                  -0.169762
global_aoa * global_synonyms_count                         0.055472
global_aoa * rel_aoa                                       0.009323
global_aoa * rel_clustering                               -0.061340
global_aoa * rel_frequency                                 0.003012
global_aoa * rel_letters_count                             0.048196
global_aoa * rel_orthographic_density                      0.180561
global_aoa * rel_synonyms_count                            0.031034
global_clustering * global_frequency                      -0.235237
global_clustering * global_letters_count                  -0.023482
global_clustering * global_orthographic_density           -0.090407
global_clustering * global_synonyms_count                 -0.505090
global_clustering * rel_aoa                               -0.188562
global_clustering * rel_clustering                         0.045382
global_clustering * rel_frequency                          0.111095
global_clustering * rel_letters_count                     -0.011400
global_clustering * rel_orthographic_density              -0.014953
global_clustering * rel_synonyms_count                     0.609529
global_frequency * global_letters_count                    0.081688
global_frequency * global_orthographic_density             0.081372
global_frequency * global_synonyms_count                   0.136967
global_frequency * rel_aoa                                -0.048659
global_frequency * rel_clustering                          0.134472
global_frequency * rel_frequency                           0.012934
global_frequency * rel_letters_count                      -0.011376
global_frequency * rel_orthographic_density                0.034295
global_frequency * rel_synonyms_count                      0.090999
global_letters_count * global_orthographic_density         0.027792
global_letters_count * global_synonyms_count              -0.268311
global_letters_count * rel_aoa                            -0.008083
global_letters_count * rel_clustering                      0.131243
global_letters_count * rel_frequency                      -0.006816
global_letters_count * rel_letters_count                   0.010223
global_letters_count * rel_orthographic_density           -0.017305
global_letters_count * rel_synonyms_count                  0.564349
global_orthographic_density * global_synonyms_count       -0.928371
global_orthographic_density * rel_aoa                      0.105200
global_orthographic_density * rel_clustering               0.331065
global_orthographic_density * rel_frequency                0.006308
global_orthographic_density * rel_letters_count           -0.033544
global_orthographic_density * rel_orthographic_density    -0.078660
global_orthographic_density * rel_synonyms_count           0.752651
global_synonyms_count * rel_aoa                           -0.070327
global_synonyms_count * rel_clustering                     0.282601
global_synonyms_count * rel_frequency                     -0.175532
global_synonyms_count * rel_letters_count                 -0.007823
global_synonyms_count * rel_orthographic_density           0.403986
global_synonyms_count * rel_synonyms_count                -0.045043
rel_aoa * rel_clustering                                   0.167475
rel_aoa * rel_frequency                                    0.038499
rel_aoa * rel_letters_count                               -0.045976
rel_aoa * rel_orthographic_density                        -0.180162
rel_aoa * rel_synonyms_count                              -0.016982
rel_clustering * rel_frequency                            -0.117394
rel_clustering * rel_letters_count                         0.000502
rel_clustering * rel_orthographic_density                 -0.055692
rel_clustering * rel_synonyms_count                       -0.628738
rel_frequency * rel_letters_count                         -0.037775
rel_frequency * rel_orthographic_density                  -0.092957
rel_frequency * rel_synonyms_count                        -0.067257
rel_letters_count * rel_orthographic_density               0.017262
rel_letters_count * rel_synonyms_count                    -0.174617
rel_orthographic_density * rel_synonyms_count             -0.100748
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13032504696472014

intercept                      5.737162
global_aoa                    -0.068491
global_clustering              0.012953
global_frequency              -0.055762
global_letters_count           0.307228
global_orthographic_density   -0.204065
global_synonyms_count         -0.245055
dtype: float64

Regressing global letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17333459888548164

intercept                                             -1.545861
global_aoa                                             0.050521
global_clustering                                     -2.731626
global_frequency                                      -0.699220
global_letters_count                                   0.829238
global_orthographic_density                           -1.131887
global_synonyms_count                                  1.428274
global_aoa * global_clustering                         0.094716
global_aoa * global_frequency                          0.095556
global_aoa * global_letters_count                     -0.040990
global_aoa * global_orthographic_density              -0.101789
global_aoa * global_synonyms_count                     0.023154
global_clustering * global_frequency                   0.114986
global_clustering * global_letters_count               0.087378
global_clustering * global_orthographic_density        0.158931
global_clustering * global_synonyms_count              0.469008
global_frequency * global_letters_count                0.031072
global_frequency * global_orthographic_density         0.285222
global_frequency * global_synonyms_count               0.158697
global_letters_count * global_orthographic_density     0.011608
global_letters_count * global_synonyms_count          -0.050361
global_orthographic_density * global_synonyms_count   -0.088838
dtype: float64

Regressing rel letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06455831345259

intercept                      2.815997
global_aoa                    -0.066114
global_clustering             -0.052000
global_frequency              -0.121950
global_letters_count           0.167526
global_orthographic_density   -0.184845
global_synonyms_count         -0.289036
dtype: float64

Regressing rel letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11991963114397663

intercept                                             -10.011331
global_aoa                                              0.816259
global_clustering                                      -3.434353
global_frequency                                       -0.715191
global_letters_count                                    0.810699
global_orthographic_density                            -0.037953
global_synonyms_count                                   0.399420
global_aoa * global_clustering                          0.205503
global_aoa * global_frequency                           0.119137
global_aoa * global_letters_count                      -0.086898
global_aoa * global_orthographic_density               -0.161741
global_aoa * global_synonyms_count                      0.037520
global_clustering * global_frequency                    0.144534
global_clustering * global_letters_count                0.020290
global_clustering * global_orthographic_density         0.244931
global_clustering * global_synonyms_count               0.235428
global_frequency * global_letters_count                 0.020071
global_frequency * global_orthographic_density          0.300748
global_frequency * global_synonyms_count                0.159528
global_letters_count * global_orthographic_density     -0.038519
global_letters_count * global_synonyms_count           -0.106557
global_orthographic_density * global_synonyms_count    -0.160405
dtype: float64

Regressing global letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1004651133658877

intercept                   5.529361
rel_aoa                    -0.120126
rel_clustering              0.219427
rel_frequency               0.027351
rel_letters_count           0.169952
rel_orthographic_density   -0.405867
rel_synonyms_count         -0.248795
dtype: float64

Regressing global letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12313951352278718

intercept                                        5.687609
rel_aoa                                         -0.167285
rel_clustering                                   0.174537
rel_frequency                                    0.141947
rel_letters_count                                0.173507
rel_orthographic_density                        -0.420446
rel_synonyms_count                              -0.212971
rel_aoa * rel_clustering                        -0.066369
rel_aoa * rel_frequency                         -0.036236
rel_aoa * rel_letters_count                      0.006243
rel_aoa * rel_orthographic_density              -0.000699
rel_aoa * rel_synonyms_count                    -0.010317
rel_clustering * rel_frequency                  -0.007689
rel_clustering * rel_letters_count               0.097656
rel_clustering * rel_orthographic_density        0.222909
rel_clustering * rel_synonyms_count              0.344868
rel_frequency * rel_letters_count               -0.017805
rel_frequency * rel_orthographic_density         0.075941
rel_frequency * rel_synonyms_count               0.033998
rel_letters_count * rel_orthographic_density     0.074442
rel_letters_count * rel_synonyms_count           0.086334
rel_orthographic_density * rel_synonyms_count    0.269906
dtype: float64

Regressing rel letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17316180267766312

intercept                   0.971415
rel_aoa                    -0.097924
rel_clustering              0.061139
rel_frequency              -0.203997
rel_letters_count           0.345257
rel_orthographic_density   -0.104833
rel_synonyms_count         -0.217306
dtype: float64

Regressing rel letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2030425176067535

intercept                                        1.077112
rel_aoa                                         -0.053215
rel_clustering                                   0.162027
rel_frequency                                   -0.156262
rel_letters_count                                0.418512
rel_orthographic_density                         0.009727
rel_synonyms_count                              -0.000420
rel_aoa * rel_clustering                         0.003568
rel_aoa * rel_frequency                          0.009397
rel_aoa * rel_letters_count                     -0.062206
rel_aoa * rel_orthographic_density              -0.138273
rel_aoa * rel_synonyms_count                    -0.013530
rel_clustering * rel_frequency                   0.069334
rel_clustering * rel_letters_count               0.138994
rel_clustering * rel_orthographic_density        0.324762
rel_clustering * rel_synonyms_count              0.329232
rel_frequency * rel_letters_count                0.009008
rel_frequency * rel_orthographic_density         0.125866
rel_frequency * rel_synonyms_count               0.091401
rel_letters_count * rel_orthographic_density     0.071545
rel_letters_count * rel_synonyms_count           0.007144
rel_orthographic_density * rel_synonyms_count    0.047906
dtype: float64

Regressing global letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15721644492106146

intercept                     -1.271086
global_aoa                     0.005973
global_clustering             -0.583592
global_frequency               0.017053
global_letters_count           0.611973
global_orthographic_density    0.123289
global_synonyms_count         -0.070321
rel_aoa                       -0.102514
rel_clustering                 0.627606
rel_frequency                 -0.117493
rel_letters_count             -0.343072
rel_orthographic_density      -0.351472
rel_synonyms_count            -0.169474
dtype: float64

Regressing global letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3581391008788797

intercept                                                -29.890892
global_aoa                                                 0.579710
global_clustering                                         -8.923597
global_frequency                                           2.219875
global_letters_count                                      -2.190715
global_orthographic_density                               -6.075488
global_synonyms_count                                     -8.328890
rel_aoa                                                   -4.023562
rel_clustering                                            10.075561
rel_frequency                                             -0.713481
rel_letters_count                                          2.893152
rel_orthographic_density                                  -2.273741
rel_synonyms_count                                         5.053271
global_aoa * global_clustering                             0.440540
global_aoa * global_frequency                              0.137324
global_aoa * global_letters_count                          0.086470
global_aoa * global_orthographic_density                   0.138029
global_aoa * global_synonyms_count                        -0.015733
global_aoa * rel_aoa                                       0.002471
global_aoa * rel_clustering                               -0.325065
global_aoa * rel_frequency                                -0.097881
global_aoa * rel_letters_count                            -0.130770
global_aoa * rel_orthographic_density                     -0.187197
global_aoa * rel_synonyms_count                            0.012571
global_clustering * global_frequency                       0.702953
global_clustering * global_letters_count                   0.167278
global_clustering * global_orthographic_density           -0.960466
global_clustering * global_synonyms_count                 -0.817246
global_clustering * rel_aoa                               -0.179969
global_clustering * rel_clustering                        -0.053074
global_clustering * rel_frequency                         -0.280966
global_clustering * rel_letters_count                     -0.307855
global_clustering * rel_orthographic_density               0.405643
global_clustering * rel_synonyms_count                     0.153927
global_frequency * global_letters_count                    0.397154
global_frequency * global_orthographic_density             0.079024
global_frequency * global_synonyms_count                   0.124012
global_frequency * rel_aoa                                 0.224092
global_frequency * rel_clustering                         -0.675850
global_frequency * rel_frequency                           0.022704
global_frequency * rel_letters_count                      -0.492210
global_frequency * rel_orthographic_density                0.328219
global_frequency * rel_synonyms_count                     -0.052437
global_letters_count * global_orthographic_density        -0.331137
global_letters_count * global_synonyms_count               0.450087
global_letters_count * rel_aoa                             0.168554
global_letters_count * rel_clustering                     -0.236579
global_letters_count * rel_frequency                      -0.090836
global_letters_count * rel_letters_count                   0.049427
global_letters_count * rel_orthographic_density            0.364558
global_letters_count * rel_synonyms_count                 -0.269337
global_orthographic_density * global_synonyms_count        0.375979
global_orthographic_density * rel_aoa                     -0.182150
global_orthographic_density * rel_clustering               0.032819
global_orthographic_density * rel_frequency               -0.301965
global_orthographic_density * rel_letters_count            0.289486
global_orthographic_density * rel_orthographic_density     0.227520
global_orthographic_density * rel_synonyms_count          -1.350456
global_synonyms_count * rel_aoa                            0.272453
global_synonyms_count * rel_clustering                     1.495635
global_synonyms_count * rel_frequency                     -0.049120
global_synonyms_count * rel_letters_count                 -0.854930
global_synonyms_count * rel_orthographic_density          -0.352755
global_synonyms_count * rel_synonyms_count                -0.464532
rel_aoa * rel_clustering                                   0.178822
rel_aoa * rel_frequency                                   -0.228853
rel_aoa * rel_letters_count                               -0.209004
rel_aoa * rel_orthographic_density                         0.121872
rel_aoa * rel_synonyms_count                              -0.174900
rel_clustering * rel_frequency                             0.476436
rel_clustering * rel_letters_count                         0.562858
rel_clustering * rel_orthographic_density                  0.918307
rel_clustering * rel_synonyms_count                       -0.284772
rel_frequency * rel_letters_count                          0.170069
rel_frequency * rel_orthographic_density                   0.124374
rel_frequency * rel_synonyms_count                        -0.006848
rel_letters_count * rel_orthographic_density              -0.110451
rel_letters_count * rel_synonyms_count                     0.466675
rel_orthographic_density * rel_synonyms_count              1.279539
dtype: float64

Regressing rel letters_count with 394 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.21659424257739568

intercept                     -1.181566
global_aoa                    -0.004889
global_clustering             -0.569133
global_frequency              -0.012780
global_letters_count          -0.331141
global_orthographic_density    0.146035
global_synonyms_count         -0.037892
rel_aoa                       -0.093954
rel_clustering                 0.593840
rel_frequency                 -0.123942
rel_letters_count              0.620495
rel_orthographic_density      -0.389105
rel_synonyms_count            -0.194463
dtype: float64

Regressing rel letters_count with 394 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3904663688223984

intercept                                                -39.507274
global_aoa                                                 0.799075
global_clustering                                        -10.647704
global_frequency                                           1.941107
global_letters_count                                      -1.689524
global_orthographic_density                               -3.616238
global_synonyms_count                                     -7.568360
rel_aoa                                                   -3.439694
rel_clustering                                             8.922114
rel_frequency                                             -0.560598
rel_letters_count                                          2.867710
rel_orthographic_density                                  -3.733525
rel_synonyms_count                                         4.389096
global_aoa * global_clustering                             0.405755
global_aoa * global_frequency                              0.118832
global_aoa * global_letters_count                          0.054929
global_aoa * global_orthographic_density                   0.071551
global_aoa * global_synonyms_count                        -0.001518
global_aoa * rel_aoa                                       0.005497
global_aoa * rel_clustering                               -0.265152
global_aoa * rel_frequency                                -0.091958
global_aoa * rel_letters_count                            -0.112876
global_aoa * rel_orthographic_density                     -0.132488
global_aoa * rel_synonyms_count                           -0.032142
global_clustering * global_frequency                       0.687485
global_clustering * global_letters_count                   0.349191
global_clustering * global_orthographic_density           -0.348523
global_clustering * global_synonyms_count                 -0.674100
global_clustering * rel_aoa                               -0.141412
global_clustering * rel_clustering                        -0.113370
global_clustering * rel_frequency                         -0.271503
global_clustering * rel_letters_count                     -0.461104
global_clustering * rel_orthographic_density              -0.159394
global_clustering * rel_synonyms_count                    -0.023700
global_frequency * global_letters_count                    0.367090
global_frequency * global_orthographic_density             0.212495
global_frequency * global_synonyms_count                   0.227040
global_frequency * rel_aoa                                 0.200769
global_frequency * rel_clustering                         -0.554767
global_frequency * rel_frequency                           0.018071
global_frequency * rel_letters_count                      -0.476729
global_frequency * rel_orthographic_density                0.154497
global_frequency * rel_synonyms_count                     -0.174458
global_letters_count * global_orthographic_density        -0.268787
global_letters_count * global_synonyms_count               0.300991
global_letters_count * rel_aoa                             0.138162
global_letters_count * rel_clustering                     -0.295392
global_letters_count * rel_frequency                      -0.083034
global_letters_count * rel_letters_count                   0.038890
global_letters_count * rel_orthographic_density            0.257959
global_letters_count * rel_synonyms_count                 -0.135587
global_orthographic_density * global_synonyms_count        0.147813
global_orthographic_density * rel_aoa                     -0.142405
global_orthographic_density * rel_clustering              -0.306357
global_orthographic_density * rel_frequency               -0.341413
global_orthographic_density * rel_letters_count            0.269241
global_orthographic_density * rel_orthographic_density     0.227964
global_orthographic_density * rel_synonyms_count          -1.027569
global_synonyms_count * rel_aoa                            0.296443
global_synonyms_count * rel_clustering                     1.378110
global_synonyms_count * rel_frequency                     -0.119999
global_synonyms_count * rel_letters_count                 -0.762286
global_synonyms_count * rel_orthographic_density          -0.182673
global_synonyms_count * rel_synonyms_count                -0.453265
rel_aoa * rel_clustering                                   0.107600
rel_aoa * rel_frequency                                   -0.187982
rel_aoa * rel_letters_count                               -0.165881
rel_aoa * rel_orthographic_density                         0.107586
rel_aoa * rel_synonyms_count                              -0.159493
rel_clustering * rel_frequency                             0.351525
rel_clustering * rel_letters_count                         0.602119
rel_clustering * rel_orthographic_density                  1.177019
rel_clustering * rel_synonyms_count                       -0.150285
rel_frequency * rel_letters_count                          0.181966
rel_frequency * rel_orthographic_density                   0.191336
rel_frequency * rel_synonyms_count                         0.098425
rel_letters_count * rel_orthographic_density              -0.051700
rel_letters_count * rel_synonyms_count                     0.384399
rel_orthographic_density * rel_synonyms_count              1.016369
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08514126031391922

intercept                      0.775771
global_aoa                    -0.006724
global_clustering              0.036243
global_frequency               0.008091
global_letters_count          -0.045195
global_orthographic_density   -0.023181
global_synonyms_count          0.235617
dtype: float64

Regressing global synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12447141984212863

intercept                                              3.831718
global_aoa                                            -0.148853
global_clustering                                      0.405811
global_frequency                                      -0.185880
global_letters_count                                  -0.204132
global_orthographic_density                           -0.381037
global_synonyms_count                                  0.283075
global_aoa * global_clustering                        -0.010413
global_aoa * global_frequency                         -0.008008
global_aoa * global_letters_count                      0.014212
global_aoa * global_orthographic_density               0.046548
global_aoa * global_synonyms_count                     0.044962
global_clustering * global_frequency                  -0.035665
global_clustering * global_letters_count              -0.001794
global_clustering * global_orthographic_density       -0.007477
global_clustering * global_synonyms_count              0.143124
global_frequency * global_letters_count                0.006658
global_frequency * global_orthographic_density         0.004343
global_frequency * global_synonyms_count               0.036606
global_letters_count * global_orthographic_density    -0.012130
global_letters_count * global_synonyms_count           0.001123
global_orthographic_density * global_synonyms_count    0.107523
dtype: float64

Regressing rel synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.06532465085689598

intercept                      0.400958
global_aoa                    -0.008588
global_clustering              0.026999
global_frequency               0.011151
global_letters_count          -0.039362
global_orthographic_density   -0.022275
global_synonyms_count          0.194227
dtype: float64

Regressing rel synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.10493810758656519

intercept                                              5.227025
global_aoa                                            -0.250664
global_clustering                                      0.590385
global_frequency                                      -0.337613
global_letters_count                                  -0.262695
global_orthographic_density                           -0.509192
global_synonyms_count                                  0.691803
global_aoa * global_clustering                        -0.014462
global_aoa * global_frequency                         -0.000424
global_aoa * global_letters_count                      0.016502
global_aoa * global_orthographic_density               0.052298
global_aoa * global_synonyms_count                     0.011637
global_clustering * global_frequency                  -0.051584
global_clustering * global_letters_count              -0.006575
global_clustering * global_orthographic_density       -0.009162
global_clustering * global_synonyms_count              0.161533
global_frequency * global_letters_count                0.008049
global_frequency * global_orthographic_density         0.010983
global_frequency * global_synonyms_count               0.024658
global_letters_count * global_orthographic_density    -0.003160
global_letters_count * global_synonyms_count           0.006139
global_orthographic_density * global_synonyms_count    0.073341
dtype: float64

Regressing global synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09299334246310498

intercept                   0.490423
rel_aoa                     0.038112
rel_clustering             -0.052551
rel_frequency               0.003368
rel_letters_count          -0.054048
rel_orthographic_density    0.003481
rel_synonyms_count          0.235804
dtype: float64

Regressing global synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13594910942653882

intercept                                        0.569634
rel_aoa                                          0.043329
rel_clustering                                  -0.180154
rel_frequency                                    0.038264
rel_letters_count                               -0.089758
rel_orthographic_density                         0.030076
rel_synonyms_count                               0.313173
rel_aoa * rel_clustering                        -0.033180
rel_aoa * rel_frequency                          0.000360
rel_aoa * rel_letters_count                      0.020653
rel_aoa * rel_orthographic_density               0.033092
rel_aoa * rel_synonyms_count                     0.057183
rel_clustering * rel_frequency                  -0.048274
rel_clustering * rel_letters_count               0.007646
rel_clustering * rel_orthographic_density       -0.002390
rel_clustering * rel_synonyms_count              0.144004
rel_frequency * rel_letters_count               -0.002697
rel_frequency * rel_orthographic_density         0.013407
rel_frequency * rel_synonyms_count               0.033682
rel_letters_count * rel_orthographic_density    -0.008784
rel_letters_count * rel_synonyms_count          -0.004497
rel_orthographic_density * rel_synonyms_count    0.072259
dtype: float64

Regressing rel synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.17276339489253534

intercept                   0.175798
rel_aoa                     0.022825
rel_clustering             -0.018611
rel_frequency               0.012414
rel_letters_count          -0.045556
rel_orthographic_density   -0.014366
rel_synonyms_count          0.379706
dtype: float64

Regressing rel synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21852678351156885

intercept                                        0.245913
rel_aoa                                          0.017178
rel_clustering                                  -0.172362
rel_frequency                                    0.049000
rel_letters_count                               -0.075548
rel_orthographic_density                        -0.001894
rel_synonyms_count                               0.489238
rel_aoa * rel_clustering                        -0.019401
rel_aoa * rel_frequency                          0.001594
rel_aoa * rel_letters_count                      0.023645
rel_aoa * rel_orthographic_density               0.031399
rel_aoa * rel_synonyms_count                     0.033432
rel_clustering * rel_frequency                  -0.057910
rel_clustering * rel_letters_count               0.015300
rel_clustering * rel_orthographic_density        0.017299
rel_clustering * rel_synonyms_count              0.125470
rel_frequency * rel_letters_count               -0.002860
rel_frequency * rel_orthographic_density         0.014971
rel_frequency * rel_synonyms_count               0.034868
rel_letters_count * rel_orthographic_density    -0.000248
rel_letters_count * rel_synonyms_count           0.011893
rel_orthographic_density * rel_synonyms_count    0.114319
dtype: float64

Regressing global synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13144419622845238

intercept                      2.410436
global_aoa                    -0.061897
global_clustering              0.248265
global_frequency               0.012065
global_letters_count           0.000393
global_orthographic_density   -0.087526
global_synonyms_count          0.095604
rel_aoa                        0.074819
rel_clustering                -0.227802
rel_frequency                  0.004769
rel_letters_count             -0.053239
rel_orthographic_density       0.059674
rel_synonyms_count             0.141995
dtype: float64

Regressing global synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3090793394881727

intercept                                                 12.103572
global_aoa                                                 0.870944
global_clustering                                          3.388177
global_frequency                                          -0.836936
global_letters_count                                       0.118111
global_orthographic_density                                1.782083
global_synonyms_count                                      3.039463
rel_aoa                                                   -0.338944
rel_clustering                                            -3.440105
rel_frequency                                              0.314815
rel_letters_count                                         -0.898740
rel_orthographic_density                                   0.592186
rel_synonyms_count                                        -3.419927
global_aoa * global_clustering                            -0.049636
global_aoa * global_frequency                             -0.075137
global_aoa * global_letters_count                         -0.063730
global_aoa * global_orthographic_density                  -0.068687
global_aoa * global_synonyms_count                         0.156323
global_aoa * rel_aoa                                      -0.002719
global_aoa * rel_clustering                                0.073698
global_aoa * rel_frequency                                 0.076254
global_aoa * rel_letters_count                             0.073314
global_aoa * rel_orthographic_density                      0.110774
global_aoa * rel_synonyms_count                           -0.122061
global_clustering * global_frequency                      -0.214281
global_clustering * global_letters_count                  -0.017834
global_clustering * global_orthographic_density           -0.215262
global_clustering * global_synonyms_count                  0.634656
global_clustering * rel_aoa                               -0.043127
global_clustering * rel_clustering                         0.095380
global_clustering * rel_frequency                          0.153474
global_clustering * rel_letters_count                     -0.039643
global_clustering * rel_orthographic_density               0.327944
global_clustering * rel_synonyms_count                    -0.664183
global_frequency * global_letters_count                    0.050774
global_frequency * global_orthographic_density            -0.170062
global_frequency * global_synonyms_count                   0.135711
global_frequency * rel_aoa                                 0.018981
global_frequency * rel_clustering                          0.257274
global_frequency * rel_frequency                           0.000591
global_frequency * rel_letters_count                      -0.042693
global_frequency * rel_orthographic_density                0.043622
global_frequency * rel_synonyms_count                     -0.124498
global_letters_count * global_orthographic_density        -0.197135
global_letters_count * global_synonyms_count              -0.190560
global_letters_count * rel_aoa                             0.023210
global_letters_count * rel_clustering                      0.028957
global_letters_count * rel_frequency                      -0.039085
global_letters_count * rel_letters_count                  -0.002064
global_letters_count * rel_orthographic_density            0.062692
global_letters_count * rel_synonyms_count                  0.221442
global_orthographic_density * global_synonyms_count       -0.122451
global_orthographic_density * rel_aoa                     -0.072361
global_orthographic_density * rel_clustering               0.131900
global_orthographic_density * rel_frequency                0.195828
global_orthographic_density * rel_letters_count            0.318844
global_orthographic_density * rel_orthographic_density     0.043619
global_orthographic_density * rel_synonyms_count           0.136899
global_synonyms_count * rel_aoa                           -0.073747
global_synonyms_count * rel_clustering                    -0.371237
global_synonyms_count * rel_frequency                      0.125465
global_synonyms_count * rel_letters_count                  0.133362
global_synonyms_count * rel_orthographic_density           0.004675
global_synonyms_count * rel_synonyms_count                 0.093074
rel_aoa * rel_clustering                                  -0.015440
rel_aoa * rel_frequency                                   -0.027947
rel_aoa * rel_letters_count                               -0.000876
rel_aoa * rel_orthographic_density                         0.075507
rel_aoa * rel_synonyms_count                               0.087175
rel_clustering * rel_frequency                            -0.217735
rel_clustering * rel_letters_count                        -0.004444
rel_clustering * rel_orthographic_density                 -0.249834
rel_clustering * rel_synonyms_count                        0.416973
rel_frequency * rel_letters_count                          0.025966
rel_frequency * rel_orthographic_density                  -0.070372
rel_frequency * rel_synonyms_count                        -0.106574
rel_letters_count * rel_orthographic_density              -0.166712
rel_letters_count * rel_synonyms_count                    -0.142047
rel_orthographic_density * rel_synonyms_count              0.054389
dtype: float64

Regressing rel synonyms_count with 384 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.28050874198367703

intercept                      1.928402
global_aoa                    -0.054786
global_clustering              0.211266
global_frequency               0.010512
global_letters_count           0.012226
global_orthographic_density   -0.042678
global_synonyms_count         -0.700208
rel_aoa                        0.061726
rel_clustering                -0.197385
rel_frequency                  0.004740
rel_letters_count             -0.049821
rel_orthographic_density       0.021303
rel_synonyms_count             1.020926
dtype: float64

Regressing rel synonyms_count with 384 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.4322750167937489

intercept                                                 8.673109
global_aoa                                                0.816371
global_clustering                                         2.783738
global_frequency                                         -0.658754
global_letters_count                                      0.109408
global_orthographic_density                               2.265012
global_synonyms_count                                     1.918338
rel_aoa                                                  -0.179448
rel_clustering                                           -3.090228
rel_frequency                                             0.171957
rel_letters_count                                        -0.812049
rel_orthographic_density                                  0.163699
rel_synonyms_count                                       -2.369126
global_aoa * global_clustering                           -0.035934
global_aoa * global_frequency                            -0.062307
global_aoa * global_letters_count                        -0.056558
global_aoa * global_orthographic_density                 -0.084149
global_aoa * global_synonyms_count                        0.142107
global_aoa * rel_aoa                                     -0.003819
global_aoa * rel_clustering                               0.056647
global_aoa * rel_frequency                                0.062315
global_aoa * rel_letters_count                            0.066531
global_aoa * rel_orthographic_density                     0.118808
global_aoa * rel_synonyms_count                          -0.098536
global_clustering * global_frequency                     -0.179846
global_clustering * global_letters_count                 -0.027088
global_clustering * global_orthographic_density          -0.131119
global_clustering * global_synonyms_count                 0.510742
global_clustering * rel_aoa                              -0.027269
global_clustering * rel_clustering                        0.076013
global_clustering * rel_frequency                         0.117045
global_clustering * rel_letters_count                    -0.014057
global_clustering * rel_orthographic_density              0.309328
global_clustering * rel_synonyms_count                   -0.528314
global_frequency * global_letters_count                   0.039087
global_frequency * global_orthographic_density           -0.166106
global_frequency * global_synonyms_count                  0.092644
global_frequency * rel_aoa                                0.008356
global_frequency * rel_clustering                         0.218818
global_frequency * rel_frequency                         -0.004167
global_frequency * rel_letters_count                     -0.025572
global_frequency * rel_orthographic_density               0.065133
global_frequency * rel_synonyms_count                    -0.056450
global_letters_count * global_orthographic_density       -0.164405
global_letters_count * global_synonyms_count             -0.165876
global_letters_count * rel_aoa                            0.016483
global_letters_count * rel_clustering                     0.065746
global_letters_count * rel_frequency                     -0.026655
global_letters_count * rel_letters_count                 -0.001177
global_letters_count * rel_orthographic_density           0.055007
global_letters_count * rel_synonyms_count                 0.188407
global_orthographic_density * global_synonyms_count      -0.110719
global_orthographic_density * rel_aoa                    -0.041050
global_orthographic_density * rel_clustering              0.102444
global_orthographic_density * rel_frequency               0.188850
global_orthographic_density * rel_letters_count           0.282475
global_orthographic_density * rel_orthographic_density    0.047105
global_orthographic_density * rel_synonyms_count          0.103801
global_synonyms_count * rel_aoa                          -0.055323
global_synonyms_count * rel_clustering                   -0.331489
global_synonyms_count * rel_frequency                     0.148850
global_synonyms_count * rel_letters_count                 0.113134
global_synonyms_count * rel_orthographic_density          0.000981
global_synonyms_count * rel_synonyms_count                0.101880
rel_aoa * rel_clustering                                 -0.013237
rel_aoa * rel_frequency                                  -0.018711
rel_aoa * rel_letters_count                               0.000612
rel_aoa * rel_orthographic_density                        0.036983
rel_aoa * rel_synonyms_count                              0.047144
rel_clustering * rel_frequency                           -0.181881
rel_clustering * rel_letters_count                       -0.050335
rel_clustering * rel_orthographic_density                -0.263459
rel_clustering * rel_synonyms_count                       0.353978
rel_frequency * rel_letters_count                         0.007899
rel_frequency * rel_orthographic_density                 -0.087224
rel_frequency * rel_synonyms_count                       -0.148800
rel_letters_count * rel_orthographic_density             -0.139330
rel_letters_count * rel_synonyms_count                   -0.115928
rel_orthographic_density * rel_synonyms_count             0.078611
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12900034321874332

intercept                      1.142177
global_aoa                    -0.008186
global_clustering             -0.005621
global_frequency               0.002352
global_letters_count          -0.041013
global_orthographic_density    0.277169
global_synonyms_count          0.087870
dtype: float64

Regressing global orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17597137422351317

intercept                                              0.607233
global_aoa                                             0.074388
global_clustering                                      0.228791
global_frequency                                       0.408316
global_letters_count                                  -0.368977
global_orthographic_density                            0.493250
global_synonyms_count                                  0.755212
global_aoa * global_clustering                        -0.008013
global_aoa * global_frequency                         -0.022478
global_aoa * global_letters_count                      0.003794
global_aoa * global_orthographic_density               0.064603
global_aoa * global_synonyms_count                    -0.097827
global_clustering * global_frequency                   0.029095
global_clustering * global_letters_count              -0.078507
global_clustering * global_orthographic_density        0.003418
global_clustering * global_synonyms_count              0.053992
global_frequency * global_letters_count               -0.012536
global_frequency * global_orthographic_density        -0.040694
global_frequency * global_synonyms_count               0.071077
global_letters_count * global_orthographic_density    -0.035686
global_letters_count * global_synonyms_count          -0.025503
global_orthographic_density * global_synonyms_count   -0.175029
dtype: float64

Regressing rel orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11660688902132033

intercept                     -1.587521
global_aoa                     0.001125
global_clustering             -0.045757
global_frequency               0.018260
global_letters_count          -0.023992
global_orthographic_density    0.265122
global_synonyms_count          0.083673
dtype: float64

Regressing rel orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15705538548638742

intercept                                              2.118686
global_aoa                                            -0.123880
global_clustering                                      0.649313
global_frequency                                       0.235214
global_letters_count                                  -0.680472
global_orthographic_density                            0.046889
global_synonyms_count                                  0.391737
global_aoa * global_clustering                        -0.009603
global_aoa * global_frequency                         -0.019735
global_aoa * global_letters_count                      0.027837
global_aoa * global_orthographic_density               0.081681
global_aoa * global_synonyms_count                    -0.084494
global_clustering * global_frequency                  -0.002981
global_clustering * global_letters_count              -0.100451
global_clustering * global_orthographic_density        0.005029
global_clustering * global_synonyms_count              0.001331
global_frequency * global_letters_count               -0.014223
global_frequency * global_orthographic_density        -0.022435
global_frequency * global_synonyms_count               0.010119
global_letters_count * global_orthographic_density    -0.015696
global_letters_count * global_synonyms_count           0.040195
global_orthographic_density * global_synonyms_count   -0.079355
dtype: float64

Regressing global orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11021963777009913

intercept                   1.580192
rel_aoa                    -0.016605
rel_clustering             -0.037760
rel_frequency               0.003845
rel_letters_count           0.016314
rel_orthographic_density    0.328079
rel_synonyms_count          0.146322
dtype: float64

Regressing global orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14965821993394046

intercept                                        1.533375
rel_aoa                                          0.052070
rel_clustering                                   0.197450
rel_frequency                                   -0.015616
rel_letters_count                                0.042216
rel_orthographic_density                         0.414722
rel_synonyms_count                               0.381132
rel_aoa * rel_clustering                         0.083680
rel_aoa * rel_frequency                          0.017071
rel_aoa * rel_letters_count                     -0.007428
rel_aoa * rel_orthographic_density               0.050164
rel_aoa * rel_synonyms_count                    -0.067226
rel_clustering * rel_frequency                   0.011535
rel_clustering * rel_letters_count              -0.094911
rel_clustering * rel_orthographic_density        0.056811
rel_clustering * rel_synonyms_count             -0.096304
rel_frequency * rel_letters_count                0.008531
rel_frequency * rel_orthographic_density         0.018799
rel_frequency * rel_synonyms_count               0.063714
rel_letters_count * rel_orthographic_density    -0.039964
rel_letters_count * rel_synonyms_count          -0.015328
rel_orthographic_density * rel_synonyms_count   -0.079911
dtype: float64

Regressing rel orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2017204351822768

intercept                  -0.443724
rel_aoa                    -0.011588
rel_clustering             -0.039824
rel_frequency               0.057563
rel_letters_count           0.028408
rel_orthographic_density    0.424569
rel_synonyms_count          0.093663
dtype: float64

Regressing rel orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2236502068573698

intercept                                       -0.420580
rel_aoa                                          0.021570
rel_clustering                                   0.097125
rel_frequency                                    0.083292
rel_letters_count                                0.037872
rel_orthographic_density                         0.515091
rel_synonyms_count                               0.183602
rel_aoa * rel_clustering                         0.068148
rel_aoa * rel_frequency                          0.004533
rel_aoa * rel_letters_count                      0.009055
rel_aoa * rel_orthographic_density               0.078929
rel_aoa * rel_synonyms_count                    -0.047185
rel_clustering * rel_frequency                  -0.015210
rel_clustering * rel_letters_count              -0.083165
rel_clustering * rel_orthographic_density        0.034214
rel_clustering * rel_synonyms_count             -0.077499
rel_frequency * rel_letters_count               -0.000247
rel_frequency * rel_orthographic_density         0.031875
rel_frequency * rel_synonyms_count               0.025411
rel_letters_count * rel_orthographic_density    -0.025898
rel_letters_count * rel_synonyms_count           0.005823
rel_orthographic_density * rel_synonyms_count   -0.030656
dtype: float64

Regressing global orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1631065423853555

intercept                      4.756647
global_aoa                     0.009866
global_clustering              0.174263
global_frequency              -0.127111
global_letters_count          -0.271968
global_orthographic_density    0.215315
global_synonyms_count         -0.114760
rel_aoa                       -0.029931
rel_clustering                -0.198716
rel_frequency                  0.148193
rel_letters_count              0.263393
rel_orthographic_density       0.085459
rel_synonyms_count             0.243339
dtype: float64

Regressing global orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.338259732073833

intercept                                                -26.406299
global_aoa                                                 1.441285
global_clustering                                         -3.018503
global_frequency                                           2.067061
global_letters_count                                       0.388103
global_orthographic_density                                2.889907
global_synonyms_count                                      8.413024
rel_aoa                                                   -1.531953
rel_clustering                                             1.726222
rel_frequency                                             -2.204932
rel_letters_count                                         -1.789801
rel_orthographic_density                                  -2.822141
rel_synonyms_count                                         1.357482
global_aoa * global_clustering                             0.067188
global_aoa * global_frequency                             -0.068964
global_aoa * global_letters_count                         -0.071858
global_aoa * global_orthographic_density                   0.068144
global_aoa * global_synonyms_count                        -0.266721
global_aoa * rel_aoa                                       0.014287
global_aoa * rel_clustering                               -0.188489
global_aoa * rel_frequency                                 0.033279
global_aoa * rel_letters_count                             0.097668
global_aoa * rel_orthographic_density                      0.018154
global_aoa * rel_synonyms_count                            0.117185
global_clustering * global_frequency                       0.226243
global_clustering * global_letters_count                  -0.211601
global_clustering * global_orthographic_density            0.403219
global_clustering * global_synonyms_count                  0.381598
global_clustering * rel_aoa                               -0.306653
global_clustering * rel_clustering                         0.129763
global_clustering * rel_frequency                         -0.409385
global_clustering * rel_letters_count                      0.101495
global_clustering * rel_orthographic_density              -0.215327
global_clustering * rel_synonyms_count                     0.768325
global_frequency * global_letters_count                   -0.110282
global_frequency * global_orthographic_density            -0.000941
global_frequency * global_synonyms_count                  -0.300091
global_frequency * rel_aoa                                 0.005976
global_frequency * rel_clustering                         -0.030807
global_frequency * rel_frequency                          -0.020970
global_frequency * rel_letters_count                       0.186747
global_frequency * rel_orthographic_density                0.116346
global_frequency * rel_synonyms_count                      0.325994
global_letters_count * global_orthographic_density        -0.167291
global_letters_count * global_synonyms_count              -0.348255
global_letters_count * rel_aoa                            -0.028727
global_letters_count * rel_clustering                      0.397554
global_letters_count * rel_frequency                       0.006139
global_letters_count * rel_letters_count                  -0.023772
global_letters_count * rel_orthographic_density            0.156394
global_letters_count * rel_synonyms_count                  0.048540
global_orthographic_density * global_synonyms_count       -0.038988
global_orthographic_density * rel_aoa                     -0.187655
global_orthographic_density * rel_clustering              -0.368988
global_orthographic_density * rel_frequency               -0.119782
global_orthographic_density * rel_letters_count            0.037945
global_orthographic_density * rel_orthographic_density    -0.165680
global_orthographic_density * rel_synonyms_count          -0.189168
global_synonyms_count * rel_aoa                            0.147856
global_synonyms_count * rel_clustering                    -0.871529
global_synonyms_count * rel_frequency                      0.188088
global_synonyms_count * rel_letters_count                  0.467857
global_synonyms_count * rel_orthographic_density          -0.168640
global_synonyms_count * rel_synonyms_count                -0.124610
rel_aoa * rel_clustering                                   0.385814
rel_aoa * rel_frequency                                    0.034633
rel_aoa * rel_letters_count                               -0.004549
rel_aoa * rel_orthographic_density                         0.154895
rel_aoa * rel_synonyms_count                              -0.135288
rel_clustering * rel_frequency                             0.225094
rel_clustering * rel_letters_count                        -0.335270
rel_clustering * rel_orthographic_density                  0.271442
rel_clustering * rel_synonyms_count                       -0.178217
rel_frequency * rel_letters_count                         -0.065065
rel_frequency * rel_orthographic_density                   0.015705
rel_frequency * rel_synonyms_count                        -0.090005
rel_letters_count * rel_orthographic_density              -0.185229
rel_letters_count * rel_synonyms_count                    -0.164402
rel_orthographic_density * rel_synonyms_count              0.407781
dtype: float64

Regressing rel orthographic_density with 327 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.246466308498545

intercept                      3.598853
global_aoa                     0.014780
global_clustering              0.179191
global_frequency              -0.090141
global_letters_count          -0.211058
global_orthographic_density   -0.520806
global_synonyms_count         -0.080734
rel_aoa                       -0.032772
rel_clustering                -0.187237
rel_frequency                  0.129155
rel_letters_count              0.192989
rel_orthographic_density       0.876526
rel_synonyms_count             0.181740
dtype: float64

Regressing rel orthographic_density with 327 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.39900579858479046

intercept                                                -14.716483
global_aoa                                                 1.548886
global_clustering                                         -0.098061
global_frequency                                           1.680529
global_letters_count                                      -0.268282
global_orthographic_density                                2.112941
global_synonyms_count                                      5.960616
rel_aoa                                                   -1.991359
rel_clustering                                            -1.044955
rel_frequency                                             -2.121289
rel_letters_count                                         -1.309024
rel_orthographic_density                                  -2.162980
rel_synonyms_count                                         2.486904
global_aoa * global_clustering                             0.045815
global_aoa * global_frequency                             -0.081147
global_aoa * global_letters_count                         -0.074043
global_aoa * global_orthographic_density                   0.018930
global_aoa * global_synonyms_count                        -0.270379
global_aoa * rel_aoa                                       0.014358
global_aoa * rel_clustering                               -0.167496
global_aoa * rel_frequency                                 0.044688
global_aoa * rel_letters_count                             0.107252
global_aoa * rel_orthographic_density                      0.061941
global_aoa * rel_synonyms_count                            0.164270
global_clustering * global_frequency                       0.093708
global_clustering * global_letters_count                  -0.298025
global_clustering * global_orthographic_density            0.023336
global_clustering * global_synonyms_count                  0.051272
global_clustering * rel_aoa                               -0.307839
global_clustering * rel_clustering                         0.126891
global_clustering * rel_frequency                         -0.292578
global_clustering * rel_letters_count                      0.188324
global_clustering * rel_orthographic_density               0.122125
global_clustering * rel_synonyms_count                     0.948089
global_frequency * global_letters_count                   -0.091842
global_frequency * global_orthographic_density            -0.179734
global_frequency * global_synonyms_count                  -0.309624
global_frequency * rel_aoa                                 0.020161
global_frequency * rel_clustering                          0.067099
global_frequency * rel_frequency                          -0.015259
global_frequency * rel_letters_count                       0.177209
global_frequency * rel_orthographic_density                0.277514
global_frequency * rel_synonyms_count                      0.315775
global_letters_count * global_orthographic_density        -0.150989
global_letters_count * global_synonyms_count              -0.187013
global_letters_count * rel_aoa                             0.001073
global_letters_count * rel_clustering                      0.503390
global_letters_count * rel_frequency                       0.029729
global_letters_count * rel_letters_count                  -0.021507
global_letters_count * rel_orthographic_density            0.166872
global_letters_count * rel_synonyms_count                 -0.066686
global_orthographic_density * global_synonyms_count       -0.089491
global_orthographic_density * rel_aoa                     -0.115470
global_orthographic_density * rel_clustering               0.078470
global_orthographic_density * rel_frequency                0.077247
global_orthographic_density * rel_letters_count            0.025272
global_orthographic_density * rel_orthographic_density    -0.132668
global_orthographic_density * rel_synonyms_count          -0.113249
global_synonyms_count * rel_aoa                            0.154109
global_synonyms_count * rel_clustering                    -0.404885
global_synonyms_count * rel_frequency                      0.187165
global_synonyms_count * rel_letters_count                  0.314872
global_synonyms_count * rel_orthographic_density          -0.078538
global_synonyms_count * rel_synonyms_count                -0.149275
rel_aoa * rel_clustering                                   0.379134
rel_aoa * rel_frequency                                    0.019287
rel_aoa * rel_letters_count                               -0.031419
rel_aoa * rel_orthographic_density                         0.085492
rel_aoa * rel_synonyms_count                              -0.174931
rel_clustering * rel_frequency                             0.144407
rel_clustering * rel_letters_count                        -0.438882
rel_clustering * rel_orthographic_density                 -0.136979
rel_clustering * rel_synonyms_count                       -0.535767
rel_frequency * rel_letters_count                         -0.094855
rel_frequency * rel_orthographic_density                  -0.153094
rel_frequency * rel_synonyms_count                        -0.117099
rel_letters_count * rel_orthographic_density              -0.187538
rel_letters_count * rel_synonyms_count                    -0.034906
rel_orthographic_density * rel_synonyms_count              0.324663
dtype: float64