Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also write each figure to disk.
SAVE_FIGURES = False

# Features shown in the "paper-*" figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# Not used in the visible part of this notebook; presumably the number of
# PCA components used further down -- TODO confirm.
N_COMPONENTS = 3

# Number of bins the plotting helpers try first when binning source features.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution-mining model whose substitutions are analysed in this notebook.
model = Model(time=Time.continuous, source=Source.majority, past=Past.all, durl=Durl.all, max_distance=2)
# Accumulates one record (dict) per (substitution, feature) pair.
data = []

# First pass: only collect the ids of the substitutions mined with `model`.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

# Second pass: reload each substitution in its own session and record its
# feature values.
# NOTE(review): one session per substitution is presumably meant to keep
# each session small -- confirm.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            # Feature values of the source and destination words, first as
            # returned by default, then with sentence_relative='median'
            # (presumably relative to the sentence median -- verify in
            # Substitution.features).
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis reference values: h0* use feature_average
                # with default arguments, h0n* restrict it with
                # source_synonyms=True.
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

# One row per (substitution, feature); the raw list is no longer needed.
original_variations = pd.DataFrame(data)
del data


Got 24139 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.all, durl=Durl.all, max_distance=2)
100% (24139 of 24139) |####################| Elapsed Time: 0:05:40 Time: 0:05:40

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Numeric columns that are averaged per cluster (and cropped below).
# 'feature' is deliberately not listed: it is a grouping key, and
# `as_index=False` already restores it as a column in the result.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average in two stages: first over rows sharing the same destination,
# occurrence, position and feature, then over all of those within a cluster,
# so that each cluster contributes a single point per feature.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas 1.0 and an error in pandas 2.0.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)\
    [value_columns].mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Order the features alphabetically by the docstring of their transformed
# feature function (the same docstring is used for titles and legend labels).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a table of per-bin significance of `values` against two nulls.

    For each bin, a Student-t confidence interval around the mean of
    `values` is computed at the .05, .01 and .001 levels. The bin gets one,
    two or three stars when the bin mean of the null lies outside the
    corresponding interval, and 'ns.' when it lies inside all of them.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like of int
        Zero-based bin index of each observation.
    h0, h0n : array-like
        Null-hypothesis values of each observation (rows 'H_0 ' and 'H_00').
    values : array-like
        Observed values, aligned with `bins`.

    Returns
    -------
    None
        The table is written to standard output.
    """
    # `bins.max()` is a float when the bins carry missing values.
    bin_count = int(bins.max()) + 1
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and CI half-widths do not depend on the null hypothesis,
    # so they are computed once.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        # Standard error of the mean: the ddof=1 standard deviation already
        # carries Bessel's correction, so it is divided by sqrt(n), not by
        # sqrt(n - 1).
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        print(null_name + ' |', end='')
        # differences[i, j] is True when the null mean of bin i lies outside
        # the CI at level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level at which the difference is significant.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot destination feature values against binned source feature values.

    Written to be passed to `FacetGrid.map_dataframe` (see `plot_grid`).
    Draws on the current matplotlib axes: the bin means of the destination
    values with a 95% confidence band, the bin means of the two null
    references (`h0` and `h0n`) and the `y = x` line. It then prints the
    per-bin significance table via `print_significance`.

    Keyword arguments
    -----------------
    data : DataFrame
        Must hold the columns 'source', 'destination', 'h0' and 'h0n' (or
        their '_rel' variants when `relative` is true) and `feature_field`.
    color : matplotlib colour, default 'blue'
    relative : bool, default False
        Use the '_rel' columns instead of the plain ones.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.
    feature_field : str, default 'feature'
        Column whose first value names the significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail explicitly instead of hitting a NameError on `bins` below.
        raise ValueError("Could not bin '{}' into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation divided by sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    # NOTE(review): because of the `elif`, at most one of the two limits is
    # widened even when the nulls overflow on both sides -- confirm intended.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one facet per feature with `plot_function`, three per row.

    Rows of `data` whose `feature_field` is not in `features` are left out.
    `plot_function` is called through `FacetGrid.map_dataframe` with
    `plot_kws` as extra keyword arguments. Every facet that ends up with a
    legend gets a framed legend and is titled with the docstring of the
    corresponding transformed feature. The figure is saved under `filename`
    when `SAVE_FIGURES` is set.
    """
    selected = data[data[feature_field].map(lambda f: f in features)]
    grid = sb.FacetGrid(data=selected,
                        sharex=False, sharey=False,
                        col=feature_field, hue=feature_field,
                        col_order=features, hue_order=features,
                        col_wrap=3, aspect=1.5, size=3)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if legend:
            # Only decorate axes on which something was plotted.
            legend_frame = legend.get_frame()
            legend_frame.set_facecolor('#f2f2f2')
            legend_frame.set_edgecolor('#000000')
            # The facet title currently holds the raw feature name; swap it
            # for the feature's human-readable docstring.
            feature_name = ax.get_title()
            ax.set_title(
                Substitution._transformed_feature(feature_name).__doc__)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        grid.fig.savefig(target, bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias of one feature on `ax`.

    The source values are binned. For each bin, the difference between the
    mean destination value and the mean `h0n` reference is divided by the
    absolute mean of the per-bin `h0` references and drawn against bin
    positions spread evenly over [0, 1].

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : DataFrame
        Rows of a single feature, with the columns 'feature', 'source',
        'destination', 'h0' and 'h0n' (or the '_rel' variants of the last
        four when `relative` is true).
    color : matplotlib colour
    ci : bool, default True
        Also draw the 95% confidence band.
    relative : bool, default False
        Use the '_rel' columns instead of the plain ones.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail explicitly instead of hitting a NameError on `bins` below.
        raise ValueError("Could not bin '{}' into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation divided by sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): `scale` is zero when the h0 bin means average to zero,
    # which makes the normalisation below divide by zero -- confirm this
    # cannot happen for the sentence-relative columns.
    scale = abs(h0s.mean())
    ax.plot(np.linspace(0, 1, bin_count),
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(np.linspace(0, 1, bin_count),
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one curve per feature on a single set of axes.

    For every feature, the matching rows of `data` (with rows containing
    missing values dropped) are handed to `plot_function` together with the
    axes, a colour taken from `palette_name` and `plot_kws`. The figure is
    saved under `filename` when `SAVE_FIGURES` is set.

    Returns the axes so that callers can adjust them further.
    """
    colors = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))

    for feature, feature_color in zip(features, colors):
        feature_rows = data[data.feature == feature].dropna()
        plot_function(ax, feature_rows, color=feature_color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        fig.savefig(target, bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, global feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | **  |
H_00 | *** | *** | *** | **  |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | **  | *   |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | *   | *** | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay for all features, fixed-width bins, without confidence bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:
# Paper features only, fixed-width bins, global feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | **  |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | *   | *** | *** |

In [14]:
# Bias overlay for the paper features, fixed-width bins; the y axis is
# clamped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, global feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | ns. |
H_00 | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *** |

In [16]:
# Bias overlay for all features, quantile bins, without confidence bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:
# Paper features only, quantile bins, global feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

In [18]:
# Bias overlay for the paper features, quantile bins; the y axis is clamped
# by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | *   |
H_00 | ns. | *** | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | ns. | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | **  | ns. | *   |

In [20]:
# Sentence-relative bias overlay for all features, fixed-width bins, without
# confidence bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:
# Paper features only, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | ns. | *   |

In [22]:
# Sentence-relative bias overlay for the paper features, fixed-width bins;
# the y axis is clamped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | *   | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | *   | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | *   | *   |

In [24]:
# Sentence-relative bias overlay for all features, quantile bins, without
# confidence bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:
# Paper features only, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | *   | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | *** |

In [26]:
# Sentence-relative bias overlay for the paper features, quantile bins.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True, 'quantiles': True},
);


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw the substitution stream plot of one feature on the current axes.

    Written to be used through `FacetGrid.map_dataframe`, which hands over
    the facet's rows as `data` and the hue colour as `color`.

    Source words are binned on a regular grid in the (absolute value,
    sentence-relative value) plane. For each grid cell we compute the mean
    displacement from source to destination (direction of the arrows) and the
    mean length of the individual displacements (colour of the arrows).

    Keyword arguments
    -----------------
    data : DataFrame with columns `source`, `source_rel`, `destination`,
        `destination_rel` and `h0`.
    color : colour used (desaturated) for the two reference lines;
        defaults to 'blue'.
    bin_count : number of bins along each axis; defaults to 4.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.get('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning: equal-width bins on both axes, and the bin centres
    # that serve as grid coordinates for the stream plot.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values.
    # The H_0 line uses the first row's `h0`; this assumes `h0` is constant
    # within the data of one facet -- TODO confirm.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask once and reuse it for all three statistics.
            in_cell = (x_bins == x) & (y_bins == y)
            delta_abs = dest[in_cell] - source[in_cell]
            delta_rel = dest_rel[in_cell] - source_rel[in_cell]
            # Arrays are indexed [row, column], i.e. [y, x]. Empty cells
            # yield NaN means.
            u_values[y, x] = delta_abs.mean()
            v_values[y, x] = delta_rel.mean()
            strength[y, x] = np.sqrt(delta_abs ** 2 + delta_rel ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: relative value 0, i.e. the sentence average.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the H_0 value.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature, wrapped three to a row, each facet with its
# own axis ranges.
g = sb.FacetGrid(data=variations, col='feature', hue='feature',
                 col_order=ordered_features, hue_order=ordered_features,
                 col_wrap=3, sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes without a legend had nothing plotted on them and are left
        # untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # `set_titles` put the feature name in the title; replace it with
        # the docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
# `isin` does the membership test in one vectorised call instead of running
# a Python lambda on every row.
g = sb.FacetGrid(data=variations[variations['feature']
                                 .isin(PAPER_FEATURES)],
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    # `set_titles` put the feature name in the title; replace it with the
    # docstring of the corresponding transformed feature.
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations.pivot(index='cluster_id',
                                 columns='feature', values='variation')
# Select the columns explicitly in `pcafeatures` order: the loadings in
# `pca.components_` are labelled with `pcafeatures` below (and `pcafeatures`
# is passed along with `pca` in later cells), so both must follow the same
# feature order. A missing feature fails loudly here with a KeyError.
# Only clusters where every feature variation is defined are kept.
pcavariations = pcavariations[list(pcafeatures)].dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show the results.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Loadings of the first N_COMPONENTS components, one row per component.
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)])


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.5355341   0.17225699  0.07994875  0.07395224  0.03307962  0.03057057
  0.01890364  0.01827549  0.01646021  0.00893067]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.454236 0.274943 -0.085500 0.242540 0.238240 -0.442913 0.212097 0.282220 -0.415452 0.276729 -0.158735 0.001017
Component-1 -0.321733 0.387562 -0.140448 0.291426 0.284236 0.433281 -0.165669 0.304603 0.420028 -0.220967 0.166155 -0.004438
Component-2 -0.798795 -0.400866 0.160057 -0.141445 -0.347165 0.109200 -0.010967 -0.091953 0.037972 -0.102617 0.007322 0.050591

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one record per (substitution, component) pair for the first
# N_COMPONENTS components of the PCA fitted above.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution; everything read from
    # `substitution` below happens inside that scope.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            # Values of the source and destination words along this component.
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                # Component averages used as null-hypothesis references:
                # `h0` over the default word set, `h0n` restricted with
                # `source_synonyms=True` (presumably H_0 and H_00 of the
                # text -- TODO confirm against `component_average`).
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
# The list of records is no longer needed once the frame is built.
del data


100% (24139 of 24139) |####################| Elapsed Time: 0:04:07 Time: 0:04:07

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Two-stage averaging: first over the rows sharing the same
# (destination, occurrence, position, component), then over each cluster,
# so that every cluster contributes a single point per component.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas 1.0 and removed in pandas 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One facet per PCA component, each with its own axis ranges.
g = sb.FacetGrid(data=component_variations, col='component', hue='component',
                 col_wrap=3, sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Axes without a legend had nothing plotted on them and are left
        # untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | ns. | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | **  | ns. | ns. |

4.2 On a subset of relevant features


In [34]:
# Features kept for the reduced PCA of this section; the next cell sorts them
# into `pcafeatures`.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature']
                           .map(lambda f: f in pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show 
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)])


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.68360133  0.18037225]

Out[35]:
aoa frequency letters_count
Component-0 -0.733639 0.392457 -0.554754
Component-1 0.361148 -0.466351 -0.807520

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one record per (substitution, component) pair for every component of
# the PCA fitted above on the relevant features.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution; everything read from
    # `substitution` below happens inside that scope.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            # Values of the source and destination words along this component.
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                # Component averages used as null-hypothesis references:
                # `h0` over the default word set, `h0n` restricted with
                # `source_synonyms=True` (presumably H_0 and H_00 of the
                # text -- TODO confirm against `component_average`).
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
# The list of records is no longer needed once the frame is built.
del data


100% (24139 of 24139) |####################| Elapsed Time: 0:02:31 Time: 0:02:31

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Two-stage averaging: first over the rows sharing the same
# (destination, occurrence, position, component), then over each cluster,
# so that every cluster contributes a single point per component.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas 1.0 and removed in pandas 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One facet per PCA component, each with its own axis ranges.
g = sb.FacetGrid(data=component_variations, col='component', hue='component',
                 col_wrap=3, sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Axes without a legend had nothing plotted on them and are left
        # untouched.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up the transformed-feature accessor once per feature instead of once
# per (word, feature) pair. Called without argument it gives the words the
# feature is based on; called with a word it gives that word's value.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all `relevant_features` defined:
# gather every word known to at least one feature, tabulate each feature's
# value for each word, and count the rows left by `dropna()`.
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# Freeze the iteration order so that every column lines up with `words_list`.
words_list = list(words)
data = {feature: [tfeature(word) for word in words_list]
        for feature, tfeature in tfeatures}
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 2206 (cluster-unique) substitutions, but the PCA is in fact computed on 1766 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to try, as (label, binning function) pairs. Only
# fixed-width bins are active; quantile binning -- ('quantiles', pd.qcut) --
# is currently disabled.
cuts = [('fixed bins', pd.cut)]
# Kinds of feature value, as (label, column-name suffix) pairs: the suffix is
# appended to 'source' / 'destination' to pick the column to read.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since every comparison with
    NaN is false).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width only for now; quantile binning is commented out in the cell above). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# One-way ANOVA of feature2's destination value across bins of feature1's
# source value, for every pair of paper features and every combination of
# global / sentence-relative values.

# Pivot each value column once (cluster_id x feature); the loops below only
# pick columns out of these tables instead of re-pivoting `variations` on
# every iteration.
pivots = {kind + rel: variations.pivot(index='cluster_id', columns='feature',
                                       values=kind + rel)
          for kind in ('source', 'destination')
          for (rel_label, rel) in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It depends only on the source values, so it
            # is done once per (cut, rel1) rather than once per rel2. Fall
            # back to fewer bins when the requested count cannot be cut.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # No bin count worked: say so instead of silently reusing
                # the bins computed for a previous combination.
                for (rel2_label, rel2) in rels:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                continue

            for (rel2_label, rel2) in rels:
                destination = pivots['destination' + rel2][feature2]
                # Both series are indexed by cluster_id, so the bin mask
                # lines up with the destination values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way ANOVA of feature2's variation (destination - source) across bins of
# feature1's source value, for every pair of paper features and every
# combination of global / sentence-relative values.

# Pivot each value column once (cluster_id x feature); the loops below only
# pick columns out of these tables instead of re-pivoting `variations` on
# every iteration.
pivots = {kind + rel: variations.pivot(index='cluster_id', columns='feature',
                                       values=kind + rel)
          for kind in ('source', 'destination')
          for (rel_label, rel) in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It depends only on the source values, so it
            # is done once per (cut, rel1) rather than once per rel2. Fall
            # back to fewer bins when the requested count cannot be cut.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # No bin count worked: say so instead of silently reusing
                # the bins computed for a previous combination.
                for (rel2_label, rel2) in rels:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                continue

            for (rel2_label, rel2) in rels:
                # Variation of feature2 upon substitution: destination
                # value minus source value, both taken with the same rel2.
                destination = pivots['destination' + rel2][feature2] \
                    - pivots['source' + rel2][feature2]
                # Both series are indexed by cluster_id, so the bin mask
                # lines up with the variation values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" flag to (prefix used in regression feature names,
# suffix appended to the 'source' / 'destination' column names).
# NOTE: this rebinds `rels`, which the ANOVA section defines as a list of
# (label, suffix) pairs. Re-running the ANOVA cells after this one requires
# re-running the cell that defines that list first.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For every paper feature taken as the regression target, run each
# combination of source relativeness (global / relative / both) and
# destination relativeness (global / relative), first without and then with
# interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product((False, True, 'both'),
                                        (False, True)):
        for with_interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=source_rel, dest_rel=dest_rel,
                    interactions=with_interactions)
            print()


----------------------------------------------------------------------
Regressing global frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06535392707748533

intercept                      5.431354
global_aoa                     0.026782
global_clustering             -0.052555
global_frequency               0.324253
global_letters_count          -0.018604
global_orthographic_density   -0.032071
global_synonyms_count          0.038869
dtype: float64

Regressing global frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08351774994452299

intercept                                              13.856203
global_aoa                                             -0.192925
global_clustering                                       1.213931
global_frequency                                        0.320376
global_letters_count                                   -1.217232
global_orthographic_density                            -0.815564
global_synonyms_count                                   0.337306
global_aoa * global_clustering                          0.009489
global_aoa * global_frequency                           0.006986
global_aoa * global_letters_count                       0.034177
global_aoa * global_orthographic_density                0.005172
global_aoa * global_synonyms_count                     -0.002502
global_clustering * global_frequency                   -0.016996
global_clustering * global_letters_count               -0.187647
global_clustering * global_orthographic_density        -0.108947
global_clustering * global_synonyms_count               0.249594
global_frequency * global_letters_count                -0.022242
global_frequency * global_orthographic_density         -0.040928
global_frequency * global_synonyms_count                0.104202
global_letters_count * global_orthographic_density      0.090701
global_letters_count * global_synonyms_count            0.006508
global_orthographic_density * global_synonyms_count     0.141222
dtype: float64

Regressing rel frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03245801250145941

intercept                     -6.750461
global_aoa                     0.057401
global_clustering             -0.047441
global_frequency               0.268732
global_letters_count           0.057713
global_orthographic_density    0.026576
global_synonyms_count          0.158886
dtype: float64

Regressing rel frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.051219517387034785

intercept                                              1.297663
global_aoa                                             0.033930
global_clustering                                      0.641268
global_frequency                                       0.028213
global_letters_count                                  -1.314310
global_orthographic_density                           -0.929082
global_synonyms_count                                 -0.732573
global_aoa * global_clustering                         0.048225
global_aoa * global_frequency                         -0.000264
global_aoa * global_letters_count                      0.046948
global_aoa * global_orthographic_density               0.012519
global_aoa * global_synonyms_count                     0.040017
global_clustering * global_frequency                  -0.016815
global_clustering * global_letters_count              -0.152645
global_clustering * global_orthographic_density       -0.023319
global_clustering * global_synonyms_count              0.258280
global_frequency * global_letters_count                0.006907
global_frequency * global_orthographic_density         0.016419
global_frequency * global_synonyms_count               0.174982
global_letters_count * global_orthographic_density     0.101553
global_letters_count * global_synonyms_count           0.040283
global_orthographic_density * global_synonyms_count    0.232945
dtype: float64

Regressing global frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.04077425519878353

intercept                   9.392604
rel_aoa                     0.042108
rel_clustering             -0.176907
rel_frequency               0.198111
rel_letters_count          -0.019778
rel_orthographic_density   -0.025379
rel_synonyms_count         -0.025616
dtype: float64

Regressing global frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.054107583176946084

intercept                                        9.260666
rel_aoa                                          0.025928
rel_clustering                                  -0.057991
rel_frequency                                    0.181069
rel_letters_count                                0.059495
rel_orthographic_density                        -0.133986
rel_synonyms_count                               0.517217
rel_aoa * rel_clustering                         0.030206
rel_aoa * rel_frequency                          0.016321
rel_aoa * rel_letters_count                      0.019252
rel_aoa * rel_orthographic_density              -0.006397
rel_aoa * rel_synonyms_count                     0.032849
rel_clustering * rel_frequency                  -0.007555
rel_clustering * rel_letters_count              -0.099420
rel_clustering * rel_orthographic_density       -0.061183
rel_clustering * rel_synonyms_count              0.261761
rel_frequency * rel_letters_count               -0.006647
rel_frequency * rel_orthographic_density        -0.018275
rel_frequency * rel_synonyms_count               0.154415
rel_letters_count * rel_orthographic_density     0.056545
rel_letters_count * rel_synonyms_count          -0.025970
rel_orthographic_density * rel_synonyms_count    0.121245
dtype: float64

Regressing rel frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.21823686741373605

intercept                  -1.655794
rel_aoa                     0.033790
rel_clustering              0.043646
rel_frequency               0.568483
rel_letters_count          -0.105381
rel_orthographic_density   -0.187298
rel_synonyms_count          0.073719
dtype: float64

Regressing rel frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.23387949565317

intercept                                       -1.708934
rel_aoa                                         -0.034348
rel_clustering                                  -0.002409
rel_frequency                                    0.607073
rel_letters_count                               -0.060677
rel_orthographic_density                        -0.373878
rel_synonyms_count                               0.491374
rel_aoa * rel_clustering                        -0.023851
rel_aoa * rel_frequency                         -0.027286
rel_aoa * rel_letters_count                      0.034840
rel_aoa * rel_orthographic_density               0.068549
rel_aoa * rel_synonyms_count                     0.134146
rel_clustering * rel_frequency                  -0.069838
rel_clustering * rel_letters_count              -0.154580
rel_clustering * rel_orthographic_density       -0.239457
rel_clustering * rel_synonyms_count              0.157032
rel_frequency * rel_letters_count               -0.020233
rel_frequency * rel_orthographic_density        -0.061064
rel_frequency * rel_synonyms_count               0.122214
rel_letters_count * rel_orthographic_density     0.043417
rel_letters_count * rel_synonyms_count          -0.009258
rel_orthographic_density * rel_synonyms_count    0.206013
dtype: float64

Regressing global frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06975084761836958

intercept                      5.375451
global_aoa                    -0.007593
global_clustering              0.135919
global_frequency               0.370954
global_letters_count           0.113983
global_orthographic_density    0.119562
global_synonyms_count          0.108171
rel_aoa                        0.048512
rel_clustering                -0.225701
rel_frequency                 -0.051697
rel_letters_count             -0.141242
rel_orthographic_density      -0.165919
rel_synonyms_count            -0.094725
dtype: float64

Regressing global frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.13023395013453798

intercept                                                 1.756665
global_aoa                                               -0.171193
global_clustering                                        -2.485434
global_frequency                                          0.831843
global_letters_count                                     -2.892940
global_orthographic_density                               1.559431
global_synonyms_count                                    -3.224766
rel_aoa                                                   0.108079
rel_clustering                                            5.698582
rel_frequency                                             0.152194
rel_letters_count                                         2.003613
rel_orthographic_density                                 -0.288332
rel_synonyms_count                                        5.414214
global_aoa * global_clustering                            0.296394
global_aoa * global_frequency                             0.098366
global_aoa * global_letters_count                         0.145574
global_aoa * global_orthographic_density                  0.177801
global_aoa * global_synonyms_count                       -0.151229
global_aoa * rel_aoa                                     -0.010089
global_aoa * rel_clustering                              -0.277219
global_aoa * rel_frequency                               -0.034553
global_aoa * rel_letters_count                           -0.128005
global_aoa * rel_orthographic_density                    -0.233411
global_aoa * rel_synonyms_count                           0.050024
global_clustering * global_frequency                      0.146306
global_clustering * global_letters_count                 -0.270037
global_clustering * global_orthographic_density           0.325695
global_clustering * global_synonyms_count                 0.189511
global_clustering * rel_aoa                              -0.386322
global_clustering * rel_clustering                        0.062739
global_clustering * rel_frequency                        -0.112062
global_clustering * rel_letters_count                     0.171738
global_clustering * rel_orthographic_density              0.029972
global_clustering * rel_synonyms_count                    0.421822
global_frequency * global_letters_count                   0.019591
global_frequency * global_orthographic_density           -0.089526
global_frequency * global_synonyms_count                  0.007410
global_frequency * rel_aoa                               -0.189750
global_frequency * rel_clustering                        -0.185809
global_frequency * rel_frequency                         -0.013486
global_frequency * rel_letters_count                      0.001063
global_frequency * rel_orthographic_density               0.152178
global_frequency * rel_synonyms_count                     0.086585
global_letters_count * global_orthographic_density       -0.001556
global_letters_count * global_synonyms_count              0.627555
global_letters_count * rel_aoa                           -0.097922
global_letters_count * rel_clustering                    -0.070259
global_letters_count * rel_frequency                     -0.122098
global_letters_count * rel_letters_count                  0.003704
global_letters_count * rel_orthographic_density           0.139854
global_letters_count * rel_synonyms_count                -0.491117
global_orthographic_density * global_synonyms_count       1.080267
global_orthographic_density * rel_aoa                    -0.004448
global_orthographic_density * rel_clustering             -0.811230
global_orthographic_density * rel_frequency              -0.023529
global_orthographic_density * rel_letters_count          -0.045252
global_orthographic_density * rel_orthographic_density   -0.163700
global_orthographic_density * rel_synonyms_count         -0.628910
global_synonyms_count * rel_aoa                           0.014718
global_synonyms_count * rel_clustering                   -0.453431
global_synonyms_count * rel_frequency                    -0.202353
global_synonyms_count * rel_letters_count                -0.584688
global_synonyms_count * rel_orthographic_density         -1.223050
global_synonyms_count * rel_synonyms_count                0.051004
rel_aoa * rel_clustering                                  0.334397
rel_aoa * rel_frequency                                   0.088368
rel_aoa * rel_letters_count                               0.123554
rel_aoa * rel_orthographic_density                        0.064614
rel_aoa * rel_synonyms_count                              0.089702
rel_clustering * rel_frequency                            0.111666
rel_clustering * rel_letters_count                       -0.035017
rel_clustering * rel_orthographic_density                 0.274581
rel_clustering * rel_synonyms_count                       0.210436
rel_frequency * rel_letters_count                         0.073148
rel_frequency * rel_orthographic_density                 -0.073764
rel_frequency * rel_synonyms_count                        0.293893
rel_letters_count * rel_orthographic_density             -0.049731
rel_letters_count * rel_synonyms_count                    0.477350
rel_orthographic_density * rel_synonyms_count             0.948159
dtype: float64

Regressing rel frequency with 1391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.30021430023381573

intercept                      4.610044
global_aoa                    -0.009223
global_clustering              0.173981
global_frequency              -0.562998
global_letters_count           0.158144
global_orthographic_density    0.185921
global_synonyms_count          0.101270
rel_aoa                        0.030324
rel_clustering                -0.224811
rel_frequency                  0.908693
rel_letters_count             -0.175924
rel_orthographic_density      -0.218624
rel_synonyms_count            -0.074748
dtype: float64

Regressing rel frequency with 1391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3436718812058627

intercept                                                -4.312035
global_aoa                                                0.100553
global_clustering                                        -2.861774
global_frequency                                          0.112156
global_letters_count                                     -2.421092
global_orthographic_density                               2.552016
global_synonyms_count                                    -3.356270
rel_aoa                                                   0.017722
rel_clustering                                            6.315617
rel_frequency                                             0.912294
rel_letters_count                                         1.757242
rel_orthographic_density                                 -0.570808
rel_synonyms_count                                        5.350275
global_aoa * global_clustering                            0.282205
global_aoa * global_frequency                             0.077502
global_aoa * global_letters_count                         0.130379
global_aoa * global_orthographic_density                  0.156106
global_aoa * global_synonyms_count                       -0.142996
global_aoa * rel_aoa                                     -0.008643
global_aoa * rel_clustering                              -0.266373
global_aoa * rel_frequency                               -0.013200
global_aoa * rel_letters_count                           -0.121769
global_aoa * rel_orthographic_density                    -0.223679
global_aoa * rel_synonyms_count                           0.053592
global_clustering * global_frequency                      0.158111
global_clustering * global_letters_count                 -0.215680
global_clustering * global_orthographic_density           0.413108
global_clustering * global_synonyms_count                 0.149213
global_clustering * rel_aoa                              -0.358717
global_clustering * rel_clustering                        0.062229
global_clustering * rel_frequency                        -0.098791
global_clustering * rel_letters_count                     0.158299
global_clustering * rel_orthographic_density             -0.015466
global_clustering * rel_synonyms_count                    0.473786
global_frequency * global_letters_count                   0.024597
global_frequency * global_orthographic_density           -0.097178
global_frequency * global_synonyms_count                 -0.002777
global_frequency * rel_aoa                               -0.168607
global_frequency * rel_clustering                        -0.222233
global_frequency * rel_frequency                          0.004280
global_frequency * rel_letters_count                      0.012130
global_frequency * rel_orthographic_density               0.133320
global_frequency * rel_synonyms_count                     0.116592
global_letters_count * global_orthographic_density       -0.017124
global_letters_count * global_synonyms_count              0.701484
global_letters_count * rel_aoa                           -0.098448
global_letters_count * rel_clustering                    -0.124894
global_letters_count * rel_frequency                     -0.111228
global_letters_count * rel_letters_count                  0.002250
global_letters_count * rel_orthographic_density           0.149742
global_letters_count * rel_synonyms_count                -0.542126
global_orthographic_density * global_synonyms_count       0.921631
global_orthographic_density * rel_aoa                    -0.011171
global_orthographic_density * rel_clustering             -0.841888
global_orthographic_density * rel_frequency              -0.026427
global_orthographic_density * rel_letters_count          -0.061671
global_orthographic_density * rel_orthographic_density   -0.154413
global_orthographic_density * rel_synonyms_count         -0.537740
global_synonyms_count * rel_aoa                           0.018803
global_synonyms_count * rel_clustering                   -0.428068
global_synonyms_count * rel_frequency                    -0.218663
global_synonyms_count * rel_letters_count                -0.645167
global_synonyms_count * rel_orthographic_density         -0.976980
global_synonyms_count * rel_synonyms_count                0.066894
rel_aoa * rel_clustering                                  0.318642
rel_aoa * rel_frequency                                   0.070384
rel_aoa * rel_letters_count                               0.123497
rel_aoa * rel_orthographic_density                        0.062753
rel_aoa * rel_synonyms_count                              0.082508
rel_clustering * rel_frequency                            0.130442
rel_clustering * rel_letters_count                       -0.007343
rel_clustering * rel_orthographic_density                 0.295174
rel_clustering * rel_synonyms_count                       0.181629
rel_frequency * rel_letters_count                         0.055494
rel_frequency * rel_orthographic_density                 -0.031072
rel_frequency * rel_synonyms_count                        0.289786
rel_letters_count * rel_orthographic_density             -0.036389
rel_letters_count * rel_synonyms_count                    0.537574
rel_orthographic_density * rel_synonyms_count             0.788422
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 1264 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.0785141519698066

intercept                      5.796359
global_aoa                     0.251215
global_clustering              0.030581
global_frequency              -0.056697
global_letters_count           0.077343
global_orthographic_density   -0.039276
global_synonyms_count         -0.180029
dtype: float64

Regressing global aoa with 1264 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.09837046434854979

intercept                                             -8.069168
global_aoa                                             0.486136
global_clustering                                     -2.820999
global_frequency                                       0.544377
global_letters_count                                   1.026079
global_orthographic_density                           -0.313623
global_synonyms_count                                 -2.379752
global_aoa * global_clustering                         0.051694
global_aoa * global_frequency                         -0.014935
global_aoa * global_letters_count                      0.020822
global_aoa * global_orthographic_density               0.047687
global_aoa * global_synonyms_count                     0.034471
global_clustering * global_frequency                   0.122279
global_clustering * global_letters_count               0.243275
global_clustering * global_orthographic_density        0.093394
global_clustering * global_synonyms_count             -0.484928
global_frequency * global_letters_count                0.030821
global_frequency * global_orthographic_density         0.052209
global_frequency * global_synonyms_count              -0.137026
global_letters_count * global_orthographic_density    -0.012655
global_letters_count * global_synonyms_count           0.061845
global_orthographic_density * global_synonyms_count    0.011301
dtype: float64

Regressing rel aoa with 1264 measures, no interactions
           ^^^^^^^
R^2 = 0.016752822133851097

intercept                      0.839706
global_aoa                     0.108409
global_clustering              0.013461
global_frequency              -0.060468
global_letters_count           0.022769
global_orthographic_density    0.063656
global_synonyms_count         -0.116764
dtype: float64

Regressing rel aoa with 1264 measures, with interactions
           ^^^^^^^
R^2 = 0.03741461504292476

intercept                                             -12.433603
global_aoa                                              0.760725
global_clustering                                      -1.590433
global_frequency                                        0.911157
global_letters_count                                    0.766854
global_orthographic_density                             0.220381
global_synonyms_count                                  -0.779533
global_aoa * global_clustering                          0.045938
global_aoa * global_frequency                          -0.030885
global_aoa * global_letters_count                      -0.023312
global_aoa * global_orthographic_density                0.007252
global_aoa * global_synonyms_count                      0.019441
global_clustering * global_frequency                    0.112829
global_clustering * global_letters_count                0.096358
global_clustering * global_orthographic_density        -0.098156
global_clustering * global_synonyms_count              -0.432152
global_frequency * global_letters_count                 0.004498
global_frequency * global_orthographic_density         -0.044390
global_frequency * global_synonyms_count               -0.206348
global_letters_count * global_orthographic_density     -0.078143
global_letters_count * global_synonyms_count           -0.002952
global_orthographic_density * global_synonyms_count    -0.069081
dtype: float64

Regressing global aoa with 1264 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.028974690443609674

intercept                   6.681452
rel_aoa                     0.060475
rel_clustering              0.212837
rel_frequency               0.034422
rel_letters_count           0.015611
rel_orthographic_density   -0.318191
rel_synonyms_count         -0.252024
dtype: float64

Regressing global aoa with 1264 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.048791371530799155

intercept                                        6.657707
rel_aoa                                         -0.086903
rel_clustering                                   0.143855
rel_frequency                                    0.015815
rel_letters_count                               -0.003005
rel_orthographic_density                        -0.461287
rel_synonyms_count                              -0.584392
rel_aoa * rel_clustering                         0.012473
rel_aoa * rel_frequency                         -0.060917
rel_aoa * rel_letters_count                      0.028002
rel_aoa * rel_orthographic_density               0.072326
rel_aoa * rel_synonyms_count                     0.035431
rel_clustering * rel_frequency                   0.073409
rel_clustering * rel_letters_count               0.191101
rel_clustering * rel_orthographic_density        0.104418
rel_clustering * rel_synonyms_count             -0.483343
rel_frequency * rel_letters_count                0.012750
rel_frequency * rel_orthographic_density        -0.007606
rel_frequency * rel_synonyms_count              -0.136712
rel_letters_count * rel_orthographic_density     0.028885
rel_letters_count * rel_synonyms_count           0.028280
rel_orthographic_density * rel_synonyms_count    0.034081
dtype: float64

Regressing rel aoa with 1264 measures, no interactions
           ^^^^^^^
R^2 = 0.14216169355006814

intercept                   0.840071
rel_aoa                     0.457586
rel_clustering             -0.084173
rel_frequency              -0.062140
rel_letters_count          -0.013591
rel_orthographic_density    0.143606
rel_synonyms_count         -0.210776
dtype: float64

Regressing rel aoa with 1264 measures, with interactions
           ^^^^^^^
R^2 = 0.15461532934424382

intercept                                        0.950546
rel_aoa                                          0.492698
rel_clustering                                  -0.160498
rel_frequency                                   -0.072069
rel_letters_count                                0.002923
rel_orthographic_density                         0.421541
rel_synonyms_count                              -0.514834
rel_aoa * rel_clustering                         0.017537
rel_aoa * rel_frequency                          0.008745
rel_aoa * rel_letters_count                      0.010021
rel_aoa * rel_orthographic_density               0.037177
rel_aoa * rel_synonyms_count                    -0.005929
rel_clustering * rel_frequency                   0.057631
rel_clustering * rel_letters_count               0.186033
rel_clustering * rel_orthographic_density        0.200481
rel_clustering * rel_synonyms_count             -0.290711
rel_frequency * rel_letters_count                0.044746
rel_frequency * rel_orthographic_density         0.134473
rel_frequency * rel_synonyms_count              -0.107094
rel_letters_count * rel_orthographic_density    -0.012810
rel_letters_count * rel_synonyms_count           0.031609
rel_orthographic_density * rel_synonyms_count   -0.008606
dtype: float64

Regressing global aoa with 1264 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.09934622074674071

intercept                      4.167879
global_aoa                     0.387642
global_clustering             -0.023108
global_frequency              -0.052902
global_letters_count           0.224248
global_orthographic_density   -0.064116
global_synonyms_count          0.092939
rel_aoa                       -0.219782
rel_clustering                 0.069571
rel_frequency                 -0.006377
rel_letters_count             -0.162609
rel_orthographic_density       0.070638
rel_synonyms_count            -0.332240
dtype: float64

Regressing global aoa with 1264 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.14991349639370477

intercept                                                 25.279311
global_aoa                                                 1.773417
global_clustering                                          2.449353
global_frequency                                           0.155337
global_letters_count                                      -2.381600
global_orthographic_density                              -11.500036
global_synonyms_count                                     -4.259304
rel_aoa                                                   -0.385860
rel_clustering                                            -6.302478
rel_frequency                                              0.658639
rel_letters_count                                          2.632069
rel_orthographic_density                                   9.769300
rel_synonyms_count                                        -1.668385
global_aoa * global_clustering                            -0.068028
global_aoa * global_frequency                             -0.156413
global_aoa * global_letters_count                         -0.064216
global_aoa * global_orthographic_density                   0.121184
global_aoa * global_synonyms_count                        -0.416290
global_aoa * rel_aoa                                       0.017183
global_aoa * rel_clustering                                0.087510
global_aoa * rel_frequency                                 0.099988
global_aoa * rel_letters_count                             0.084925
global_aoa * rel_orthographic_density                     -0.030046
global_aoa * rel_synonyms_count                            0.482722
global_clustering * global_frequency                       0.120521
global_clustering * global_letters_count                  -0.094433
global_clustering * global_orthographic_density           -1.336841
global_clustering * global_synonyms_count                 -0.916404
global_clustering * rel_aoa                                0.212150
global_clustering * rel_clustering                         0.061901
global_clustering * rel_frequency                          0.057795
global_clustering * rel_letters_count                      0.080794
global_clustering * rel_orthographic_density               0.695609
global_clustering * rel_synonyms_count                     0.196571
global_frequency * global_letters_count                    0.208062
global_frequency * global_orthographic_density             0.256099
global_frequency * global_synonyms_count                  -0.164375
global_frequency * rel_aoa                                 0.165437
global_frequency * rel_clustering                          0.154058
global_frequency * rel_frequency                          -0.004488
global_frequency * rel_letters_count                      -0.222004
global_frequency * rel_orthographic_density               -0.388055
global_frequency * rel_synonyms_count                      0.213495
global_letters_count * global_orthographic_density        -0.062062
global_letters_count * global_synonyms_count               0.515618
global_letters_count * rel_aoa                            -0.029763
global_letters_count * rel_clustering                      0.426754
global_letters_count * rel_frequency                      -0.054849
global_letters_count * rel_letters_count                  -0.017748
global_letters_count * rel_orthographic_density           -0.246705
global_letters_count * rel_synonyms_count                 -0.348229
global_orthographic_density * global_synonyms_count        0.073578
global_orthographic_density * rel_aoa                     -0.182376
global_orthographic_density * rel_clustering               1.163898
global_orthographic_density * rel_frequency               -0.209401
global_orthographic_density * rel_letters_count           -0.035095
global_orthographic_density * rel_orthographic_density     0.054816
global_orthographic_density * rel_synonyms_count          -0.358378
global_synonyms_count * rel_aoa                            0.380459
global_synonyms_count * rel_clustering                     0.982361
global_synonyms_count * rel_frequency                     -0.145263
global_synonyms_count * rel_letters_count                 -0.364479
global_synonyms_count * rel_orthographic_density           0.074182
global_synonyms_count * rel_synonyms_count                 0.063650
rel_aoa * rel_clustering                                  -0.158858
rel_aoa * rel_frequency                                   -0.106427
rel_aoa * rel_letters_count                                0.007104
rel_aoa * rel_orthographic_density                         0.151898
rel_aoa * rel_synonyms_count                              -0.382565
rel_clustering * rel_frequency                            -0.200942
rel_clustering * rel_letters_count                        -0.150303
rel_clustering * rel_orthographic_density                 -0.284878
rel_clustering * rel_synonyms_count                       -0.773902
rel_frequency * rel_letters_count                          0.090720
rel_frequency * rel_orthographic_density                   0.422907
rel_frequency * rel_synonyms_count                        -0.022317
rel_letters_count * rel_orthographic_density               0.302969
rel_letters_count * rel_synonyms_count                     0.283952
rel_orthographic_density * rel_synonyms_count              0.348010
dtype: float64

Regressing rel aoa with 1264 measures, no interactions
           ^^^^^^^
R^2 = 0.18244427077345074

intercept                      1.844097
global_aoa                    -0.357214
global_clustering             -0.013288
global_frequency               0.030727
global_letters_count           0.157849
global_orthographic_density   -0.088116
global_synonyms_count          0.238287
rel_aoa                        0.708165
rel_clustering                 0.044966
rel_frequency                 -0.079664
rel_letters_count             -0.142442
rel_orthographic_density       0.062293
rel_synonyms_count            -0.465412
dtype: float64

Regressing rel aoa with 1264 measures, with interactions
           ^^^^^^^
R^2 = 0.22920325700203936

intercept                                                 3.530362
global_aoa                                                0.270630
global_clustering                                        -1.166786
global_frequency                                          0.923819
global_letters_count                                     -0.813086
global_orthographic_density                              -9.152791
global_synonyms_count                                    -1.745713
rel_aoa                                                   1.078020
rel_clustering                                           -1.555307
rel_frequency                                            -0.068398
rel_letters_count                                         1.994229
rel_orthographic_density                                  9.230311
rel_synonyms_count                                       -2.109053
global_aoa * global_clustering                           -0.092906
global_aoa * global_frequency                            -0.126881
global_aoa * global_letters_count                        -0.043050
global_aoa * global_orthographic_density                  0.160072
global_aoa * global_synonyms_count                       -0.204382
global_aoa * rel_aoa                                     -0.003404
global_aoa * rel_clustering                               0.105156
global_aoa * rel_frequency                                0.061128
global_aoa * rel_letters_count                            0.016161
global_aoa * rel_orthographic_density                    -0.190178
global_aoa * rel_synonyms_count                           0.267979
global_clustering * global_frequency                      0.238488
global_clustering * global_letters_count                  0.121527
global_clustering * global_orthographic_density          -0.695486
global_clustering * global_synonyms_count                -0.799434
global_clustering * rel_aoa                               0.202641
global_clustering * rel_clustering                        0.064691
global_clustering * rel_frequency                        -0.098315
global_clustering * rel_letters_count                    -0.123273
global_clustering * rel_orthographic_density              0.119052
global_clustering * rel_synonyms_count                    0.148736
global_frequency * global_letters_count                   0.159421
global_frequency * global_orthographic_density            0.323168
global_frequency * global_synonyms_count                 -0.282761
global_frequency * rel_aoa                                0.130830
global_frequency * rel_clustering                        -0.029836
global_frequency * rel_frequency                          0.005215
global_frequency * rel_letters_count                     -0.222019
global_frequency * rel_orthographic_density              -0.557736
global_frequency * rel_synonyms_count                     0.222792
global_letters_count * global_orthographic_density        0.046474
global_letters_count * global_synonyms_count              0.178948
global_letters_count * rel_aoa                           -0.031386
global_letters_count * rel_clustering                     0.147332
global_letters_count * rel_frequency                     -0.063050
global_letters_count * rel_letters_count                 -0.012372
global_letters_count * rel_orthographic_density          -0.222834
global_letters_count * rel_synonyms_count                -0.160544
global_orthographic_density * global_synonyms_count       0.053418
global_orthographic_density * rel_aoa                    -0.179870
global_orthographic_density * rel_clustering              0.491206
global_orthographic_density * rel_frequency              -0.328966
global_orthographic_density * rel_letters_count          -0.175802
global_orthographic_density * rel_orthographic_density    0.000069
global_orthographic_density * rel_synonyms_count         -0.269739
global_synonyms_count * rel_aoa                           0.236075
global_synonyms_count * rel_clustering                    0.964267
global_synonyms_count * rel_frequency                     0.035051
global_synonyms_count * rel_letters_count                -0.099891
global_synonyms_count * rel_orthographic_density          0.042846
global_synonyms_count * rel_synonyms_count                0.048073
rel_aoa * rel_clustering                                 -0.139184
rel_aoa * rel_frequency                                  -0.068212
rel_aoa * rel_letters_count                               0.039932
rel_aoa * rel_orthographic_density                        0.224863
rel_aoa * rel_synonyms_count                             -0.249592
rel_clustering * rel_frequency                            0.031435
rel_clustering * rel_letters_count                        0.048954
rel_clustering * rel_orthographic_density                 0.181429
rel_clustering * rel_synonyms_count                      -0.800738
rel_frequency * rel_letters_count                         0.138920
rel_frequency * rel_orthographic_density                  0.583602
rel_frequency * rel_synonyms_count                       -0.126454
rel_letters_count * rel_orthographic_density              0.290449
rel_letters_count * rel_synonyms_count                    0.120666
rel_orthographic_density * rel_synonyms_count             0.195907
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.026806848045603626

intercept                     -4.881799
global_aoa                    -0.010429
global_clustering              0.125231
global_frequency              -0.027960
global_letters_count           0.019972
global_orthographic_density    0.010929
global_synonyms_count          0.030507
dtype: float64

Regressing global clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05713918596117561

intercept                                             -3.658960
global_aoa                                             0.126351
global_clustering                                      0.390832
global_frequency                                      -0.493029
global_letters_count                                   0.315022
global_orthographic_density                            0.467987
global_synonyms_count                                  0.061132
global_aoa * global_clustering                         0.010922
global_aoa * global_frequency                          0.004750
global_aoa * global_letters_count                     -0.013162
global_aoa * global_orthographic_density              -0.019852
global_aoa * global_synonyms_count                    -0.016796
global_clustering * global_frequency                  -0.063393
global_clustering * global_letters_count               0.031215
global_clustering * global_orthographic_density        0.064602
global_clustering * global_synonyms_count             -0.077313
global_frequency * global_letters_count                0.002888
global_frequency * global_orthographic_density         0.029080
global_frequency * global_synonyms_count               0.019206
global_letters_count * global_orthographic_density    -0.030208
global_letters_count * global_synonyms_count          -0.055949
global_orthographic_density * global_synonyms_count   -0.167780
dtype: float64

Regressing rel clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.018459876289414923

intercept                      0.927167
global_aoa                    -0.004302
global_clustering              0.104801
global_frequency              -0.012387
global_letters_count           0.028643
global_orthographic_density    0.026236
global_synonyms_count          0.004154
dtype: float64

Regressing rel clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.040050765656561405

intercept                                              3.228358
global_aoa                                             0.051474
global_clustering                                      0.486033
global_frequency                                      -0.429621
global_letters_count                                   0.063025
global_orthographic_density                            0.446445
global_synonyms_count                                  0.563887
global_aoa * global_clustering                         0.009063
global_aoa * global_frequency                          0.009462
global_aoa * global_letters_count                     -0.006934
global_aoa * global_orthographic_density              -0.023346
global_aoa * global_synonyms_count                    -0.043648
global_clustering * global_frequency                  -0.054127
global_clustering * global_letters_count              -0.003037
global_clustering * global_orthographic_density        0.052006
global_clustering * global_synonyms_count             -0.001159
global_frequency * global_letters_count                0.002058
global_frequency * global_orthographic_density         0.019120
global_frequency * global_synonyms_count               0.002749
global_letters_count * global_orthographic_density    -0.015520
global_letters_count * global_synonyms_count          -0.018618
global_orthographic_density * global_synonyms_count   -0.150660
dtype: float64

Regressing global clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.008977964027807928

intercept                  -5.843128
rel_aoa                    -0.006858
rel_clustering              0.094792
rel_frequency               0.000722
rel_letters_count           0.010067
rel_orthographic_density    0.012264
rel_synonyms_count          0.017577
dtype: float64

Regressing global clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.019597304747148492

intercept                                       -5.823924
rel_aoa                                         -0.021167
rel_clustering                                   0.092858
rel_frequency                                    0.004891
rel_letters_count                               -0.015467
rel_orthographic_density                         0.052285
rel_synonyms_count                              -0.113957
rel_aoa * rel_clustering                         0.027615
rel_aoa * rel_frequency                         -0.003499
rel_aoa * rel_letters_count                     -0.009226
rel_aoa * rel_orthographic_density              -0.015811
rel_aoa * rel_synonyms_count                    -0.015674
rel_clustering * rel_frequency                  -0.001088
rel_clustering * rel_letters_count              -0.003875
rel_clustering * rel_orthographic_density        0.010581
rel_clustering * rel_synonyms_count             -0.022953
rel_frequency * rel_letters_count               -0.002396
rel_frequency * rel_orthographic_density        -0.003099
rel_frequency * rel_synonyms_count              -0.023669
rel_letters_count * rel_orthographic_density    -0.023023
rel_letters_count * rel_synonyms_count          -0.013956
rel_orthographic_density * rel_synonyms_count   -0.115071
dtype: float64

Regressing rel clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10819022323539995

intercept                   0.285483
rel_aoa                    -0.020575
rel_clustering              0.331619
rel_frequency               0.014304
rel_letters_count           0.028073
rel_orthographic_density    0.028937
rel_synonyms_count          0.033967
dtype: float64

Regressing rel clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11954780565568235

intercept                                        0.281854
rel_aoa                                         -0.037619
rel_clustering                                   0.331439
rel_frequency                                    0.012056
rel_letters_count                                0.015484
rel_orthographic_density                         0.032486
rel_synonyms_count                              -0.126647
rel_aoa * rel_clustering                         0.025356
rel_aoa * rel_frequency                         -0.000451
rel_aoa * rel_letters_count                     -0.012136
rel_aoa * rel_orthographic_density              -0.034559
rel_aoa * rel_synonyms_count                    -0.021533
rel_clustering * rel_frequency                  -0.007680
rel_clustering * rel_letters_count              -0.002880
rel_clustering * rel_orthographic_density        0.033983
rel_clustering * rel_synonyms_count             -0.033083
rel_frequency * rel_letters_count               -0.002845
rel_frequency * rel_orthographic_density        -0.012852
rel_frequency * rel_synonyms_count              -0.033337
rel_letters_count * rel_orthographic_density    -0.012397
rel_letters_count * rel_synonyms_count           0.005046
rel_orthographic_density * rel_synonyms_count   -0.079295
dtype: float64

Regressing global clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03954670138043748

intercept                     -3.772480
global_aoa                    -0.008026
global_clustering              0.194020
global_frequency              -0.062407
global_letters_count           0.010475
global_orthographic_density   -0.088107
global_synonyms_count          0.062489
rel_aoa                       -0.007094
rel_clustering                -0.078306
rel_frequency                  0.039431
rel_letters_count              0.011062
rel_orthographic_density       0.115172
rel_synonyms_count            -0.045437
dtype: float64

Regressing global clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.13371824979698355

intercept                                                 12.161938
global_aoa                                                 0.009522
global_clustering                                          3.395683
global_frequency                                          -1.556144
global_letters_count                                       0.955086
global_orthographic_density                               -0.688030
global_synonyms_count                                     -1.761444
rel_aoa                                                   -0.025277
rel_clustering                                            -4.134737
rel_frequency                                              0.297358
rel_letters_count                                         -0.621240
rel_orthographic_density                                   0.589269
rel_synonyms_count                                         0.580819
global_aoa * global_clustering                            -0.101533
global_aoa * global_frequency                             -0.019693
global_aoa * global_letters_count                         -0.070161
global_aoa * global_orthographic_density                  -0.069630
global_aoa * global_synonyms_count                         0.077924
global_aoa * rel_aoa                                       0.009389
global_aoa * rel_clustering                                0.139461
global_aoa * rel_frequency                                 0.033679
global_aoa * rel_letters_count                             0.066617
global_aoa * rel_orthographic_density                      0.048168
global_aoa * rel_synonyms_count                           -0.074748
global_clustering * global_frequency                      -0.248359
global_clustering * global_letters_count                   0.057654
global_clustering * global_orthographic_density           -0.076779
global_clustering * global_synonyms_count                 -0.274769
global_clustering * rel_aoa                                0.023301
global_clustering * rel_clustering                        -0.098568
global_clustering * rel_frequency                          0.108112
global_clustering * rel_letters_count                     -0.022212
global_clustering * rel_orthographic_density               0.039431
global_clustering * rel_synonyms_count                     0.239220
global_frequency * global_letters_count                   -0.010511
global_frequency * global_orthographic_density             0.071507
global_frequency * global_synonyms_count                   0.125036
global_frequency * rel_aoa                                 0.006491
global_frequency * rel_clustering                          0.202207
global_frequency * rel_frequency                           0.021975
global_frequency * rel_letters_count                       0.023074
global_frequency * rel_orthographic_density               -0.044660
global_frequency * rel_synonyms_count                     -0.029562
global_letters_count * global_orthographic_density         0.008645
global_letters_count * global_synonyms_count              -0.139798
global_letters_count * rel_aoa                            -0.004827
global_letters_count * rel_clustering                      0.059731
global_letters_count * rel_frequency                       0.025264
global_letters_count * rel_letters_count                  -0.002377
global_letters_count * rel_orthographic_density           -0.030728
global_letters_count * rel_synonyms_count                  0.178562
global_orthographic_density * global_synonyms_count       -0.388591
global_orthographic_density * rel_aoa                      0.048027
global_orthographic_density * rel_clustering               0.152081
global_orthographic_density * rel_frequency               -0.031289
global_orthographic_density * rel_letters_count           -0.073582
global_orthographic_density * rel_orthographic_density    -0.010554
global_orthographic_density * rel_synonyms_count           0.245261
global_synonyms_count * rel_aoa                           -0.035546
global_synonyms_count * rel_clustering                     0.061585
global_synonyms_count * rel_frequency                     -0.061329
global_synonyms_count * rel_letters_count                 -0.075959
global_synonyms_count * rel_orthographic_density           0.128762
global_synonyms_count * rel_synonyms_count                -0.058092
rel_aoa * rel_clustering                                  -0.008228
rel_aoa * rel_frequency                                    0.008048
rel_aoa * rel_letters_count                               -0.013938
rel_aoa * rel_orthographic_density                        -0.066612
rel_aoa * rel_synonyms_count                               0.013714
rel_clustering * rel_frequency                            -0.101816
rel_clustering * rel_letters_count                        -0.069257
rel_clustering * rel_orthographic_density                 -0.021080
rel_clustering * rel_synonyms_count                       -0.065988
rel_frequency * rel_letters_count                         -0.031808
rel_frequency * rel_orthographic_density                   0.028916
rel_frequency * rel_synonyms_count                        -0.029422
rel_letters_count * rel_orthographic_density               0.059238
rel_letters_count * rel_synonyms_count                     0.003443
rel_orthographic_density * rel_synonyms_count             -0.108336
dtype: float64

Regressing rel clustering with 1131 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.19085378024214816

intercept                     -2.455473
global_aoa                     0.001008
global_clustering             -0.557873
global_frequency              -0.049804
global_letters_count          -0.002838
global_orthographic_density   -0.067210
global_synonyms_count          0.005682
rel_aoa                       -0.015941
rel_clustering                 0.796105
rel_frequency                  0.037117
rel_letters_count              0.028062
rel_orthographic_density       0.092653
rel_synonyms_count             0.012876
dtype: float64

Regressing rel clustering with 1131 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.26330227916560967

intercept                                                 10.705195
global_aoa                                                 0.070967
global_clustering                                          1.862596
global_frequency                                          -1.135061
global_letters_count                                       0.345775
global_orthographic_density                               -0.935651
global_synonyms_count                                     -1.856661
rel_aoa                                                    0.161140
rel_clustering                                            -2.682966
rel_frequency                                              0.214393
rel_letters_count                                         -0.466450
rel_orthographic_density                                   0.589360
rel_synonyms_count                                         0.908236
global_aoa * global_clustering                            -0.060813
global_aoa * global_frequency                             -0.006924
global_aoa * global_letters_count                         -0.054205
global_aoa * global_orthographic_density                  -0.065836
global_aoa * global_synonyms_count                         0.033117
global_aoa * rel_aoa                                       0.006924
global_aoa * rel_clustering                                0.092171
global_aoa * rel_frequency                                 0.027198
global_aoa * rel_letters_count                             0.061069
global_aoa * rel_orthographic_density                      0.057462
global_aoa * rel_synonyms_count                           -0.045957
global_clustering * global_frequency                      -0.166092
global_clustering * global_letters_count                  -0.011839
global_clustering * global_orthographic_density           -0.063739
global_clustering * global_synonyms_count                 -0.275284
global_clustering * rel_aoa                                0.017019
global_clustering * rel_clustering                        -0.117004
global_clustering * rel_frequency                          0.073734
global_clustering * rel_letters_count                     -0.010751
global_clustering * rel_orthographic_density               0.002390
global_clustering * rel_synonyms_count                     0.273154
global_frequency * global_letters_count                   -0.013362
global_frequency * global_orthographic_density             0.084729
global_frequency * global_synonyms_count                   0.080203
global_frequency * rel_aoa                                -0.011195
global_frequency * rel_clustering                          0.141262
global_frequency * rel_frequency                           0.020066
global_frequency * rel_letters_count                       0.026347
global_frequency * rel_orthographic_density               -0.065002
global_frequency * rel_synonyms_count                      0.004356
global_letters_count * global_orthographic_density         0.034829
global_letters_count * global_synonyms_count              -0.062911
global_letters_count * rel_aoa                            -0.012912
global_letters_count * rel_clustering                      0.115042
global_letters_count * rel_frequency                       0.016230
global_letters_count * rel_letters_count                  -0.002264
global_letters_count * rel_orthographic_density           -0.052995
global_letters_count * rel_synonyms_count                  0.121746
global_orthographic_density * global_synonyms_count       -0.214449
global_orthographic_density * rel_aoa                      0.042774
global_orthographic_density * rel_clustering               0.122249
global_orthographic_density * rel_frequency               -0.049291
global_orthographic_density * rel_letters_count           -0.087085
global_orthographic_density * rel_orthographic_density     0.021069
global_orthographic_density * rel_synonyms_count           0.110170
global_synonyms_count * rel_aoa                           -0.021733
global_synonyms_count * rel_clustering                     0.131918
global_synonyms_count * rel_frequency                     -0.033596
global_synonyms_count * rel_letters_count                 -0.081386
global_synonyms_count * rel_orthographic_density           0.015254
global_synonyms_count * rel_synonyms_count                -0.049530
rel_aoa * rel_clustering                                  -0.001937
rel_aoa * rel_frequency                                    0.014084
rel_aoa * rel_letters_count                               -0.008481
rel_aoa * rel_orthographic_density                        -0.067663
rel_aoa * rel_synonyms_count                               0.015479
rel_clustering * rel_frequency                            -0.075357
rel_clustering * rel_letters_count                        -0.078764
rel_clustering * rel_orthographic_density                  0.008334
rel_clustering * rel_synonyms_count                       -0.154749
rel_frequency * rel_letters_count                         -0.024930
rel_frequency * rel_orthographic_density                   0.049810
rel_frequency * rel_synonyms_count                        -0.053184
rel_letters_count * rel_orthographic_density               0.092064
rel_letters_count * rel_synonyms_count                    -0.008127
rel_orthographic_density * rel_synonyms_count             -0.036575
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0689121087128981

intercept                      4.497080
global_aoa                     0.045508
global_clustering             -0.053070
global_frequency               0.039398
global_letters_count           0.198910
global_orthographic_density   -0.160461
global_synonyms_count         -0.327523
dtype: float64

Regressing global letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08120237179489409

intercept                                             -11.123102
global_aoa                                              0.675567
global_clustering                                      -3.116671
global_frequency                                        0.717361
global_letters_count                                    0.677205
global_orthographic_density                            -0.110781
global_synonyms_count                                   0.907897
global_aoa * global_clustering                          0.116771
global_aoa * global_frequency                          -0.008301
global_aoa * global_letters_count                       0.011752
global_aoa * global_orthographic_density                0.067305
global_aoa * global_synonyms_count                     -0.096909
global_clustering * global_frequency                    0.140530
global_clustering * global_letters_count                0.124518
global_clustering * global_orthographic_density         0.183284
global_clustering * global_synonyms_count               0.029370
global_frequency * global_letters_count                 0.016586
global_frequency * global_orthographic_density          0.074170
global_frequency * global_synonyms_count               -0.082971
global_letters_count * global_orthographic_density     -0.016460
global_letters_count * global_synonyms_count            0.075890
global_orthographic_density * global_synonyms_count    -0.110688
dtype: float64

Regressing rel letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03380467751975125

intercept                      1.588079
global_aoa                    -0.003679
global_clustering             -0.041521
global_frequency               0.026726
global_letters_count           0.139577
global_orthographic_density   -0.120918
global_synonyms_count         -0.372163
dtype: float64

Regressing rel letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04573555658921469

intercept                                             -11.583950
global_aoa                                              0.588391
global_clustering                                      -2.273652
global_frequency                                        0.851217
global_letters_count                                    0.529793
global_orthographic_density                            -0.637277
global_synonyms_count                                   0.954340
global_aoa * global_clustering                          0.096642
global_aoa * global_frequency                          -0.001978
global_aoa * global_letters_count                      -0.009871
global_aoa * global_orthographic_density                0.058832
global_aoa * global_synonyms_count                     -0.103957
global_clustering * global_frequency                    0.140812
global_clustering * global_letters_count                0.049396
global_clustering * global_orthographic_density        -0.000756
global_clustering * global_synonyms_count              -0.002729
global_frequency * global_letters_count                -0.002201
global_frequency * global_orthographic_density          0.039237
global_frequency * global_synonyms_count               -0.118752
global_letters_count * global_orthographic_density     -0.043158
global_letters_count * global_synonyms_count            0.083769
global_orthographic_density * global_synonyms_count    -0.074107
dtype: float64

Regressing global letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.055722568554947394

intercept                   5.844392
rel_aoa                    -0.040119
rel_clustering              0.121771
rel_frequency               0.060265
rel_letters_count           0.159872
rel_orthographic_density   -0.286335
rel_synonyms_count         -0.353155
dtype: float64

Regressing global letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06434332300448475

intercept                                        5.777986
rel_aoa                                         -0.110161
rel_clustering                                   0.070876
rel_frequency                                    0.063180
rel_letters_count                                0.249565
rel_orthographic_density                        -0.327800
rel_synonyms_count                              -0.721218
rel_aoa * rel_clustering                         0.066555
rel_aoa * rel_frequency                         -0.035198
rel_aoa * rel_letters_count                     -0.003144
rel_aoa * rel_orthographic_density               0.030851
rel_aoa * rel_synonyms_count                    -0.082023
rel_clustering * rel_frequency                  -0.004819
rel_clustering * rel_letters_count               0.052264
rel_clustering * rel_orthographic_density        0.103673
rel_clustering * rel_synonyms_count             -0.145500
rel_frequency * rel_letters_count                0.020656
rel_frequency * rel_orthographic_density         0.017150
rel_frequency * rel_synonyms_count              -0.081521
rel_letters_count * rel_orthographic_density     0.029331
rel_letters_count * rel_synonyms_count           0.072554
rel_orthographic_density * rel_synonyms_count   -0.052094
dtype: float64

Regressing rel letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.10666311375556492

intercept                   1.588443
rel_aoa                    -0.023591
rel_clustering             -0.014794
rel_frequency              -0.116581
rel_letters_count           0.353541
rel_orthographic_density    0.037956
rel_synonyms_count         -0.366954
dtype: float64

Regressing rel letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12000612438718805

intercept                                        1.495682
rel_aoa                                         -0.024964
rel_clustering                                   0.008170
rel_frequency                                   -0.129670
rel_letters_count                                0.547261
rel_orthographic_density                         0.153543
rel_synonyms_count                              -0.660205
rel_aoa * rel_clustering                         0.099581
rel_aoa * rel_frequency                         -0.008016
rel_aoa * rel_letters_count                     -0.042187
rel_aoa * rel_orthographic_density              -0.044923
rel_aoa * rel_synonyms_count                    -0.114518
rel_clustering * rel_frequency                   0.029305
rel_clustering * rel_letters_count               0.068648
rel_clustering * rel_orthographic_density        0.167683
rel_clustering * rel_synonyms_count             -0.108774
rel_frequency * rel_letters_count                0.043093
rel_frequency * rel_orthographic_density         0.089153
rel_frequency * rel_synonyms_count              -0.050075
rel_letters_count * rel_orthographic_density     0.051903
rel_letters_count * rel_synonyms_count           0.115164
rel_orthographic_density * rel_synonyms_count   -0.000509
dtype: float64

Regressing global letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0770498675298048

intercept                      1.256374
global_aoa                     0.124737
global_clustering             -0.367480
global_frequency               0.093961
global_letters_count           0.242417
global_orthographic_density   -0.190538
global_synonyms_count         -0.063096
rel_aoa                       -0.126312
rel_clustering                 0.369964
rel_frequency                 -0.067679
rel_letters_count             -0.047546
rel_orthographic_density       0.043941
rel_synonyms_count            -0.300021
dtype: float64

Regressing global letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12206985463889408

intercept                                                 5.131245
global_aoa                                                1.129105
global_clustering                                         0.873416
global_frequency                                          0.461826
global_letters_count                                     -0.941337
global_orthographic_density                              -2.043427
global_synonyms_count                                     2.248524
rel_aoa                                                  -0.894620
rel_clustering                                           -7.691005
rel_frequency                                             0.958366
rel_letters_count                                         1.795028
rel_orthographic_density                                 -0.724193
rel_synonyms_count                                       -2.235544
global_aoa * global_clustering                            0.080548
global_aoa * global_frequency                            -0.035455
global_aoa * global_letters_count                        -0.041209
global_aoa * global_orthographic_density                  0.007686
global_aoa * global_synonyms_count                       -0.238917
global_aoa * rel_aoa                                      0.026441
global_aoa * rel_clustering                              -0.033534
global_aoa * rel_frequency                               -0.034458
global_aoa * rel_letters_count                            0.026509
global_aoa * rel_orthographic_density                     0.080455
global_aoa * rel_synonyms_count                           0.189814
global_clustering * global_frequency                      0.038552
global_clustering * global_letters_count                 -0.238299
global_clustering * global_orthographic_density          -0.306732
global_clustering * global_synonyms_count                -0.819098
global_clustering * rel_aoa                               0.178358
global_clustering * rel_clustering                       -0.103666
global_clustering * rel_frequency                         0.155386
global_clustering * rel_letters_count                     0.239882
global_clustering * rel_orthographic_density              0.121963
global_clustering * rel_synonyms_count                    0.612503
global_frequency * global_letters_count                   0.018379
global_frequency * global_orthographic_density            0.054941
global_frequency * global_synonyms_count                 -0.402784
global_frequency * rel_aoa                                0.147665
global_frequency * rel_clustering                         0.248809
global_frequency * rel_frequency                         -0.023736
global_frequency * rel_letters_count                     -0.077621
global_frequency * rel_orthographic_density               0.029924
global_frequency * rel_synonyms_count                     0.287563
global_letters_count * global_orthographic_density       -0.108967
global_letters_count * global_synonyms_count              0.050584
global_letters_count * rel_aoa                            0.065074
global_letters_count * rel_clustering                     0.682150
global_letters_count * rel_frequency                      0.096469
global_letters_count * rel_letters_count                  0.029272
global_letters_count * rel_orthographic_density           0.054847
global_letters_count * rel_synonyms_count                -0.047114
global_orthographic_density * global_synonyms_count      -0.818689
global_orthographic_density * rel_aoa                    -0.062714
global_orthographic_density * rel_clustering              0.638847
global_orthographic_density * rel_frequency              -0.063269
global_orthographic_density * rel_letters_count           0.077853
global_orthographic_density * rel_orthographic_density    0.075774
global_orthographic_density * rel_synonyms_count          0.632474
global_synonyms_count * rel_aoa                           0.127071
global_synonyms_count * rel_clustering                    1.258829
global_synonyms_count * rel_frequency                     0.271883
global_synonyms_count * rel_letters_count                 0.042056
global_synonyms_count * rel_orthographic_density          1.019782
global_synonyms_count * rel_synonyms_count                0.017858
rel_aoa * rel_clustering                                 -0.093145
rel_aoa * rel_frequency                                  -0.081960
rel_aoa * rel_letters_count                              -0.097358
rel_aoa * rel_orthographic_density                        0.022988
rel_aoa * rel_synonyms_count                             -0.136278
rel_clustering * rel_frequency                           -0.383688
rel_clustering * rel_letters_count                       -0.540826
rel_clustering * rel_orthographic_density                -0.226096
rel_clustering * rel_synonyms_count                      -1.139972
rel_frequency * rel_letters_count                        -0.014656
rel_frequency * rel_orthographic_density                  0.064805
rel_frequency * rel_synonyms_count                       -0.242335
rel_letters_count * rel_orthographic_density              0.022450
rel_letters_count * rel_synonyms_count                    0.052640
rel_orthographic_density * rel_synonyms_count            -0.873566
dtype: float64

Regressing rel letters_count with 1391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16028296056137922

intercept                      0.348212
global_aoa                     0.107068
global_clustering             -0.321774
global_frequency               0.135265
global_letters_count          -0.588489
global_orthographic_density   -0.158599
global_synonyms_count         -0.041327
rel_aoa                       -0.090747
rel_clustering                 0.322686
rel_frequency                 -0.117236
rel_letters_count              0.802946
rel_orthographic_density       0.000903
rel_synonyms_count            -0.290566
dtype: float64

Regressing rel letters_count with 1391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2019757688316225

intercept                                                 0.352906
global_aoa                                                1.056627
global_clustering                                         0.106844
global_frequency                                          0.376243
global_letters_count                                     -1.459444
global_orthographic_density                              -1.360210
global_synonyms_count                                     4.202190
rel_aoa                                                  -0.841230
rel_clustering                                           -6.336209
rel_frequency                                             0.790443
rel_letters_count                                         2.352449
rel_orthographic_density                                 -2.016341
rel_synonyms_count                                       -3.637012
global_aoa * global_clustering                            0.111291
global_aoa * global_frequency                            -0.017048
global_aoa * global_letters_count                        -0.026650
global_aoa * global_orthographic_density                 -0.002749
global_aoa * global_synonyms_count                       -0.225408
global_aoa * rel_aoa                                      0.019279
global_aoa * rel_clustering                              -0.066757
global_aoa * rel_frequency                               -0.045082
global_aoa * rel_letters_count                            0.026415
global_aoa * rel_orthographic_density                     0.105180
global_aoa * rel_synonyms_count                           0.182779
global_clustering * global_frequency                      0.040990
global_clustering * global_letters_count                 -0.174301
global_clustering * global_orthographic_density          -0.214588
global_clustering * global_synonyms_count                -0.619379
global_clustering * rel_aoa                               0.115855
global_clustering * rel_clustering                       -0.127635
global_clustering * rel_frequency                         0.117308
global_clustering * rel_letters_count                     0.153943
global_clustering * rel_orthographic_density             -0.035056
global_clustering * rel_synonyms_count                    0.309135
global_frequency * global_letters_count                   0.029127
global_frequency * global_orthographic_density            0.067025
global_frequency * global_synonyms_count                 -0.435658
global_frequency * rel_aoa                                0.120616
global_frequency * rel_clustering                         0.219482
global_frequency * rel_frequency                         -0.028625
global_frequency * rel_letters_count                     -0.096230
global_frequency * rel_orthographic_density               0.034954
global_frequency * rel_synonyms_count                     0.237497
global_letters_count * global_orthographic_density       -0.135338
global_letters_count * global_synonyms_count             -0.033576
global_letters_count * rel_aoa                            0.046844
global_letters_count * rel_clustering                     0.591653
global_letters_count * rel_frequency                      0.084377
global_letters_count * rel_letters_count                  0.009134
global_letters_count * rel_orthographic_density           0.077186
global_letters_count * rel_synonyms_count                -0.005973
global_orthographic_density * global_synonyms_count      -0.789755
global_orthographic_density * rel_aoa                    -0.042439
global_orthographic_density * rel_clustering              0.404886
global_orthographic_density * rel_frequency              -0.061303
global_orthographic_density * rel_letters_count           0.116736
global_orthographic_density * rel_orthographic_density    0.072893
global_orthographic_density * rel_synonyms_count          0.600813
global_synonyms_count * rel_aoa                           0.117605
global_synonyms_count * rel_clustering                    0.957541
global_synonyms_count * rel_frequency                     0.319818
global_synonyms_count * rel_letters_count                 0.081771
global_synonyms_count * rel_orthographic_density          0.926807
global_synonyms_count * rel_synonyms_count               -0.017707
rel_aoa * rel_clustering                                 -0.038924
rel_aoa * rel_frequency                                  -0.062247
rel_aoa * rel_letters_count                              -0.084378
rel_aoa * rel_orthographic_density                        0.017182
rel_aoa * rel_synonyms_count                             -0.131352
rel_clustering * rel_frequency                           -0.350280
rel_clustering * rel_letters_count                       -0.461764
rel_clustering * rel_orthographic_density                -0.001234
rel_clustering * rel_synonyms_count                      -0.777453
rel_frequency * rel_letters_count                         0.000678
rel_frequency * rel_orthographic_density                  0.038534
rel_frequency * rel_synonyms_count                       -0.221920
rel_letters_count * rel_orthographic_density             -0.035426
rel_letters_count * rel_synonyms_count                    0.045798
rel_orthographic_density * rel_synonyms_count            -0.735726
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.021581353351464605

intercept                      0.675346
global_aoa                    -0.008731
global_clustering              0.027961
global_frequency              -0.013924
global_letters_count          -0.001346
global_orthographic_density    0.011053
global_synonyms_count          0.136983
dtype: float64

Regressing global synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04212876191216908

intercept                                              3.099601
global_aoa                                            -0.004784
global_clustering                                      0.592629
global_frequency                                      -0.048042
global_letters_count                                  -0.094749
global_orthographic_density                           -0.503462
global_synonyms_count                                 -0.050448
global_aoa * global_clustering                        -0.017797
global_aoa * global_frequency                         -0.008424
global_aoa * global_letters_count                     -0.005520
global_aoa * global_orthographic_density              -0.005636
global_aoa * global_synonyms_count                     0.028381
global_clustering * global_frequency                  -0.020639
global_clustering * global_letters_count              -0.020415
global_clustering * global_orthographic_density       -0.103704
global_clustering * global_synonyms_count              0.055381
global_frequency * global_letters_count                0.000011
global_frequency * global_orthographic_density        -0.014023
global_frequency * global_synonyms_count              -0.004230
global_letters_count * global_orthographic_density     0.004590
global_letters_count * global_synonyms_count           0.029008
global_orthographic_density * global_synonyms_count    0.145661
dtype: float64

Regressing rel synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.015033345148772415

intercept                      0.340704
global_aoa                    -0.005879
global_clustering              0.028400
global_frequency              -0.010847
global_letters_count          -0.000242
global_orthographic_density   -0.003480
global_synonyms_count          0.106677
dtype: float64

Regressing rel synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.03724718459797449

intercept                                              2.900584
global_aoa                                             0.033251
global_clustering                                      0.574868
global_frequency                                      -0.113439
global_letters_count                                  -0.106528
global_orthographic_density                           -0.480218
global_synonyms_count                                 -0.036656
global_aoa * global_clustering                        -0.013194
global_aoa * global_frequency                         -0.005454
global_aoa * global_letters_count                     -0.010184
global_aoa * global_orthographic_density              -0.009804
global_aoa * global_synonyms_count                     0.021188
global_clustering * global_frequency                  -0.022562
global_clustering * global_letters_count              -0.023217
global_clustering * global_orthographic_density       -0.089816
global_clustering * global_synonyms_count              0.052820
global_frequency * global_letters_count                0.003819
global_frequency * global_orthographic_density        -0.001449
global_frequency * global_synonyms_count              -0.009712
global_letters_count * global_orthographic_density    -0.002985
global_letters_count * global_synonyms_count           0.037283
global_orthographic_density * global_synonyms_count    0.139075
dtype: float64

Regressing global synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.01656176012096844

intercept                   0.382873
rel_aoa                     0.001384
rel_clustering             -0.005602
rel_frequency              -0.003918
rel_letters_count           0.008893
rel_orthographic_density    0.041609
rel_synonyms_count          0.137279
dtype: float64

Regressing global synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03568245124824443

intercept                                        0.398072
rel_aoa                                         -0.028972
rel_clustering                                  -0.067570
rel_frequency                                    0.001056
rel_letters_count                               -0.003616
rel_orthographic_density                         0.026583
rel_synonyms_count                               0.109553
rel_aoa * rel_clustering                        -0.025652
rel_aoa * rel_frequency                         -0.011175
rel_aoa * rel_letters_count                      0.000776
rel_aoa * rel_orthographic_density              -0.007283
rel_aoa * rel_synonyms_count                     0.024659
rel_clustering * rel_frequency                  -0.000555
rel_clustering * rel_letters_count               0.000878
rel_clustering * rel_orthographic_density       -0.082784
rel_clustering * rel_synonyms_count              0.072089
rel_frequency * rel_letters_count               -0.003010
rel_frequency * rel_orthographic_density        -0.012234
rel_frequency * rel_synonyms_count              -0.009415
rel_letters_count * rel_orthographic_density     0.001393
rel_letters_count * rel_synonyms_count           0.022267
rel_orthographic_density * rel_synonyms_count    0.108579
dtype: float64

Regressing rel synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07379299260538053

intercept                   0.052642
rel_aoa                    -0.010987
rel_clustering              0.039595
rel_frequency               0.003318
rel_letters_count           0.006230
rel_orthographic_density    0.005846
rel_synonyms_count          0.275979
dtype: float64

Regressing rel synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.09070286799226768

intercept                                        0.075083
rel_aoa                                         -0.028009
rel_clustering                                  -0.029012
rel_frequency                                    0.011986
rel_letters_count                               -0.006695
rel_orthographic_density                         0.007925
rel_synonyms_count                               0.317587
rel_aoa * rel_clustering                        -0.008079
rel_aoa * rel_frequency                         -0.008548
rel_aoa * rel_letters_count                     -0.005036
rel_aoa * rel_orthographic_density              -0.008208
rel_aoa * rel_synonyms_count                     0.009465
rel_clustering * rel_frequency                  -0.006602
rel_clustering * rel_letters_count               0.001072
rel_clustering * rel_orthographic_density       -0.059335
rel_clustering * rel_synonyms_count              0.073718
rel_frequency * rel_letters_count               -0.003469
rel_frequency * rel_orthographic_density        -0.009177
rel_frequency * rel_synonyms_count               0.003391
rel_letters_count * rel_orthographic_density    -0.004125
rel_letters_count * rel_synonyms_count           0.025408
rel_orthographic_density * rel_synonyms_count    0.135006
dtype: float64

Regressing global synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02886008797698958

intercept                      1.817350
global_aoa                    -0.011744
global_clustering              0.095843
global_frequency              -0.038004
global_letters_count          -0.049999
global_orthographic_density   -0.080463
global_synonyms_count          0.098731
rel_aoa                        0.006787
rel_clustering                -0.078639
rel_frequency                  0.028958
rel_letters_count              0.052753
rel_orthographic_density       0.101790
rel_synonyms_count             0.042353
dtype: float64

Regressing global synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08853630656455169

intercept                                                 8.570182
global_aoa                                               -0.137485
global_clustering                                         1.671655
global_frequency                                         -0.402359
global_letters_count                                      0.051077
global_orthographic_density                              -0.569239
global_synonyms_count                                     1.080835
rel_aoa                                                   0.009541
rel_clustering                                           -0.897905
rel_frequency                                            -0.138619
rel_letters_count                                        -0.177034
rel_orthographic_density                                  0.294730
rel_synonyms_count                                       -2.468648
global_aoa * global_clustering                           -0.024727
global_aoa * global_frequency                             0.013987
global_aoa * global_letters_count                        -0.013442
global_aoa * global_orthographic_density                 -0.066744
global_aoa * global_synonyms_count                        0.022918
global_aoa * rel_aoa                                      0.002576
global_aoa * rel_clustering                               0.016026
global_aoa * rel_frequency                               -0.022211
global_aoa * rel_letters_count                            0.007449
global_aoa * rel_orthographic_density                     0.069404
global_aoa * rel_synonyms_count                           0.035972
global_clustering * global_frequency                     -0.084186
global_clustering * global_letters_count                 -0.054324
global_clustering * global_orthographic_density          -0.160647
global_clustering * global_synonyms_count                 0.136645
global_clustering * rel_aoa                               0.005330
global_clustering * rel_clustering                        0.020810
global_clustering * rel_frequency                         0.014972
global_clustering * rel_letters_count                    -0.008827
global_clustering * rel_orthographic_density              0.082524
global_clustering * rel_synonyms_count                   -0.126650
global_frequency * global_letters_count                  -0.033474
global_frequency * global_orthographic_density           -0.027263
global_frequency * global_synonyms_count                  0.029071
global_frequency * rel_aoa                               -0.011216
global_frequency * rel_clustering                         0.046493
global_frequency * rel_frequency                          0.001843
global_frequency * rel_letters_count                      0.011635
global_frequency * rel_orthographic_density               0.012221
global_frequency * rel_synonyms_count                     0.018366
global_letters_count * global_orthographic_density        0.063495
global_letters_count * global_synonyms_count             -0.016656
global_letters_count * rel_aoa                            0.000092
global_letters_count * rel_clustering                     0.024891
global_letters_count * rel_frequency                      0.056505
global_letters_count * rel_letters_count                 -0.006066
global_letters_count * rel_orthographic_density          -0.055758
global_letters_count * rel_synonyms_count                 0.088749
global_orthographic_density * global_synonyms_count      -0.137451
global_orthographic_density * rel_aoa                     0.056143
global_orthographic_density * rel_clustering              0.102916
global_orthographic_density * rel_frequency               0.043779
global_orthographic_density * rel_letters_count          -0.030152
global_orthographic_density * rel_orthographic_density   -0.031570
global_orthographic_density * rel_synonyms_count          0.321473
global_synonyms_count * rel_aoa                          -0.000420
global_synonyms_count * rel_clustering                    0.009912
global_synonyms_count * rel_frequency                     0.044945
global_synonyms_count * rel_letters_count                 0.003874
global_synonyms_count * rel_orthographic_density          0.191134
global_synonyms_count * rel_synonyms_count                0.115865
rel_aoa * rel_clustering                                 -0.021824
rel_aoa * rel_frequency                                   0.011541
rel_aoa * rel_letters_count                               0.004040
rel_aoa * rel_orthographic_density                       -0.070212
rel_aoa * rel_synonyms_count                             -0.033243
rel_clustering * rel_frequency                            0.014018
rel_clustering * rel_letters_count                        0.025528
rel_clustering * rel_orthographic_density                -0.118431
rel_clustering * rel_synonyms_count                       0.025234
rel_frequency * rel_letters_count                        -0.034243
rel_frequency * rel_orthographic_density                 -0.048515
rel_frequency * rel_synonyms_count                       -0.096216
rel_letters_count * rel_orthographic_density             -0.002417
rel_letters_count * rel_synonyms_count                   -0.020914
rel_orthographic_density * rel_synonyms_count            -0.191120
dtype: float64

Regressing rel synonyms_count with 1344 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.14702727859274956

intercept                      1.318166
global_aoa                    -0.006881
global_clustering              0.062097
global_frequency              -0.040596
global_letters_count          -0.035220
global_orthographic_density   -0.022731
global_synonyms_count         -0.590460
rel_aoa                       -0.001003
rel_clustering                -0.047557
rel_frequency                  0.033013
rel_letters_count              0.038424
rel_orthographic_density       0.033894
rel_synonyms_count             0.835138
dtype: float64

Regressing rel synonyms_count with 1344 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.20688766982081408

intercept                                                 4.885406
global_aoa                                               -0.137816
global_clustering                                         0.989528
global_frequency                                         -0.286059
global_letters_count                                      0.151046
global_orthographic_density                              -0.120780
global_synonyms_count                                     0.944879
rel_aoa                                                   0.188613
rel_clustering                                           -0.394189
rel_frequency                                            -0.156506
rel_letters_count                                        -0.323933
rel_orthographic_density                                 -0.101441
rel_synonyms_count                                       -2.252288
global_aoa * global_clustering                           -0.019115
global_aoa * global_frequency                             0.014106
global_aoa * global_letters_count                        -0.009077
global_aoa * global_orthographic_density                 -0.061069
global_aoa * global_synonyms_count                        0.037304
global_aoa * rel_aoa                                      0.001883
global_aoa * rel_clustering                               0.014191
global_aoa * rel_frequency                               -0.019988
global_aoa * rel_letters_count                            0.005948
global_aoa * rel_orthographic_density                     0.060184
global_aoa * rel_synonyms_count                           0.022847
global_clustering * global_frequency                     -0.054367
global_clustering * global_letters_count                 -0.027461
global_clustering * global_orthographic_density          -0.082540
global_clustering * global_synonyms_count                 0.101234
global_clustering * rel_aoa                               0.004772
global_clustering * rel_clustering                        0.011214
global_clustering * rel_frequency                        -0.009507
global_clustering * rel_letters_count                    -0.019168
global_clustering * rel_orthographic_density              0.029897
global_clustering * rel_synonyms_count                   -0.067450
global_frequency * global_letters_count                  -0.025640
global_frequency * global_orthographic_density           -0.014134
global_frequency * global_synonyms_count                 -0.004090
global_frequency * rel_aoa                               -0.016504
global_frequency * rel_clustering                         0.029986
global_frequency * rel_frequency                          0.001972
global_frequency * rel_letters_count                      0.015868
global_frequency * rel_orthographic_density               0.012545
global_frequency * rel_synonyms_count                     0.062293
global_letters_count * global_orthographic_density        0.044037
global_letters_count * global_synonyms_count             -0.077388
global_letters_count * rel_aoa                           -0.008598
global_letters_count * rel_clustering                    -0.002454
global_letters_count * rel_frequency                      0.040953
global_letters_count * rel_letters_count                 -0.004715
global_letters_count * rel_orthographic_density          -0.039420
global_letters_count * rel_synonyms_count                 0.159939
global_orthographic_density * global_synonyms_count      -0.257312
global_orthographic_density * rel_aoa                     0.023290
global_orthographic_density * rel_clustering              0.000890
global_orthographic_density * rel_frequency               0.014256
global_orthographic_density * rel_letters_count          -0.012614
global_orthographic_density * rel_orthographic_density   -0.035100
global_orthographic_density * rel_synonyms_count          0.450222
global_synonyms_count * rel_aoa                          -0.035951
global_synonyms_count * rel_clustering                    0.031812
global_synonyms_count * rel_frequency                     0.052618
global_synonyms_count * rel_letters_count                 0.028529
global_synonyms_count * rel_orthographic_density          0.234367
global_synonyms_count * rel_synonyms_count                0.126585
rel_aoa * rel_clustering                                 -0.023131
rel_aoa * rel_frequency                                   0.013542
rel_aoa * rel_letters_count                               0.008512
rel_aoa * rel_orthographic_density                       -0.035544
rel_aoa * rel_synonyms_count                             -0.007994
rel_clustering * rel_frequency                            0.022102
rel_clustering * rel_letters_count                        0.033125
rel_clustering * rel_orthographic_density                -0.035961
rel_clustering * rel_synonyms_count                      -0.022957
rel_frequency * rel_letters_count                        -0.029604
rel_frequency * rel_orthographic_density                 -0.024864
rel_frequency * rel_synonyms_count                       -0.106755
rel_letters_count * rel_orthographic_density             -0.014213
rel_letters_count * rel_synonyms_count                   -0.057328
rel_orthographic_density * rel_synonyms_count            -0.249463
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08396711518855171

intercept                      2.105903
global_aoa                    -0.052596
global_clustering              0.027605
global_frequency              -0.028423
global_letters_count          -0.041740
global_orthographic_density    0.158901
global_synonyms_count          0.056807
dtype: float64

Regressing global orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09933004993393202

intercept                                              8.017406
global_aoa                                            -0.313452
global_clustering                                      1.177082
global_frequency                                      -0.145278
global_letters_count                                  -0.428173
global_orthographic_density                            0.115134
global_synonyms_count                                  0.019758
global_aoa * global_clustering                        -0.038216
global_aoa * global_frequency                         -0.003988
global_aoa * global_letters_count                      0.007073
global_aoa * global_orthographic_density               0.020536
global_aoa * global_synonyms_count                     0.009980
global_clustering * global_frequency                  -0.040309
global_clustering * global_letters_count              -0.072792
global_clustering * global_orthographic_density       -0.082033
global_clustering * global_synonyms_count              0.053555
global_frequency * global_letters_count               -0.005132
global_frequency * global_orthographic_density        -0.045307
global_frequency * global_synonyms_count               0.019296
global_letters_count * global_orthographic_density    -0.032853
global_letters_count * global_synonyms_count          -0.006202
global_orthographic_density * global_synonyms_count    0.109408
dtype: float64

Regressing rel orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.062156032036011825

intercept                     -0.131560
global_aoa                    -0.036616
global_clustering              0.022890
global_frequency              -0.030157
global_letters_count          -0.054983
global_orthographic_density    0.104770
global_synonyms_count          0.058622
dtype: float64

Regressing rel orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07522750464500971

intercept                                              6.112838
global_aoa                                            -0.141795
global_clustering                                      1.335540
global_frequency                                      -0.168609
global_letters_count                                  -0.500644
global_orthographic_density                           -0.028422
global_synonyms_count                                 -0.326910
global_aoa * global_clustering                        -0.027921
global_aoa * global_frequency                         -0.010260
global_aoa * global_letters_count                      0.002730
global_aoa * global_orthographic_density               0.013222
global_aoa * global_synonyms_count                     0.004509
global_clustering * global_frequency                  -0.051846
global_clustering * global_letters_count              -0.090647
global_clustering * global_orthographic_density       -0.083180
global_clustering * global_synonyms_count             -0.013495
global_frequency * global_letters_count               -0.008123
global_frequency * global_orthographic_density        -0.036229
global_frequency * global_synonyms_count               0.012034
global_letters_count * global_orthographic_density    -0.022828
global_letters_count * global_synonyms_count           0.007673
global_orthographic_density * global_synonyms_count    0.087062
dtype: float64

Regressing global orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05557568116404054

intercept                   1.501150
rel_aoa                    -0.002314
rel_clustering             -0.036887
rel_frequency              -0.035062
rel_letters_count          -0.015448
rel_orthographic_density    0.226678
rel_synonyms_count          0.096287
dtype: float64

Regressing global orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07107695105754408

intercept                                        1.525574
rel_aoa                                          0.074069
rel_clustering                                  -0.068014
rel_frequency                                   -0.047086
rel_letters_count                               -0.021454
rel_orthographic_density                         0.323889
rel_synonyms_count                               0.279741
rel_aoa * rel_clustering                         0.017694
rel_aoa * rel_frequency                          0.029443
rel_aoa * rel_letters_count                     -0.001969
rel_aoa * rel_orthographic_density               0.005729
rel_aoa * rel_synonyms_count                     0.030042
rel_clustering * rel_frequency                  -0.019999
rel_clustering * rel_letters_count              -0.012418
rel_clustering * rel_orthographic_density        0.023032
rel_clustering * rel_synonyms_count              0.078381
rel_frequency * rel_letters_count                0.007861
rel_frequency * rel_orthographic_density         0.021197
rel_frequency * rel_synonyms_count               0.034090
rel_letters_count * rel_orthographic_density    -0.028090
rel_letters_count * rel_synonyms_count           0.006820
rel_orthographic_density * rel_synonyms_count    0.149844
dtype: float64

Regressing rel orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09186254243623793

intercept                  -0.641304
rel_aoa                     0.005643
rel_clustering             -0.021661
rel_frequency               0.001161
rel_letters_count          -0.023849
rel_orthographic_density    0.271918
rel_synonyms_count          0.083263
dtype: float64

Regressing rel orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10239894711783726

intercept                                       -0.556252
rel_aoa                                          0.072439
rel_clustering                                  -0.092587
rel_frequency                                    0.028256
rel_letters_count                               -0.056030
rel_orthographic_density                         0.346976
rel_synonyms_count                               0.224151
rel_aoa * rel_clustering                         0.007207
rel_aoa * rel_frequency                          0.016337
rel_aoa * rel_letters_count                     -0.001292
rel_aoa * rel_orthographic_density               0.024362
rel_aoa * rel_synonyms_count                     0.033446
rel_clustering * rel_frequency                  -0.039499
rel_clustering * rel_letters_count              -0.029938
rel_clustering * rel_orthographic_density       -0.015943
rel_clustering * rel_synonyms_count              0.041810
rel_frequency * rel_letters_count               -0.005134
rel_frequency * rel_orthographic_density         0.012738
rel_frequency * rel_synonyms_count               0.022880
rel_letters_count * rel_orthographic_density    -0.027239
rel_letters_count * rel_synonyms_count          -0.006000
rel_orthographic_density * rel_synonyms_count    0.105469
dtype: float64

Regressing global orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10022242008772264

intercept                      3.554887
global_aoa                    -0.088293
global_clustering              0.113616
global_frequency              -0.067929
global_letters_count          -0.134624
global_orthographic_density    0.215263
global_synonyms_count         -0.087436
rel_aoa                        0.063032
rel_clustering                -0.103155
rel_frequency                  0.046214
rel_letters_count              0.096470
rel_orthographic_density      -0.079505
rel_synonyms_count             0.170088
dtype: float64

Regressing global orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14932475432090586

intercept                                                 5.230874
global_aoa                                               -0.828762
global_clustering                                         0.914985
global_frequency                                          0.089826
global_letters_count                                     -0.003346
global_orthographic_density                               1.488294
global_synonyms_count                                     2.163104
rel_aoa                                                   0.181028
rel_clustering                                            1.819380
rel_frequency                                             0.004888
rel_letters_count                                        -0.244873
rel_orthographic_density                                 -1.229119
rel_synonyms_count                                       -0.006392
global_aoa * global_clustering                           -0.100444
global_aoa * global_frequency                            -0.020922
global_aoa * global_letters_count                         0.041028
global_aoa * global_orthographic_density                  0.065571
global_aoa * global_synonyms_count                        0.090351
global_aoa * rel_aoa                                     -0.005294
global_aoa * rel_clustering                               0.073012
global_aoa * rel_frequency                                0.008498
global_aoa * rel_letters_count                           -0.011111
global_aoa * rel_orthographic_density                    -0.028448
global_aoa * rel_synonyms_count                          -0.116799
global_clustering * global_frequency                     -0.042037
global_clustering * global_letters_count                  0.003246
global_clustering * global_orthographic_density           0.034450
global_clustering * global_synonyms_count                 0.207907
global_clustering * rel_aoa                              -0.053050
global_clustering * rel_clustering                       -0.003821
global_clustering * rel_frequency                        -0.044337
global_clustering * rel_letters_count                     0.042128
global_clustering * rel_orthographic_density              0.067132
global_clustering * rel_synonyms_count                   -0.108508
global_frequency * global_letters_count                  -0.011303
global_frequency * global_orthographic_density           -0.112727
global_frequency * global_synonyms_count                 -0.076547
global_frequency * rel_aoa                               -0.008030
global_frequency * rel_clustering                        -0.033965
global_frequency * rel_frequency                          0.007400
global_frequency * rel_letters_count                      0.056755
global_frequency * rel_orthographic_density               0.107580
global_frequency * rel_synonyms_count                    -0.007192
global_letters_count * global_orthographic_density       -0.080129
global_letters_count * global_synonyms_count             -0.313919
global_letters_count * rel_aoa                           -0.035231
global_letters_count * rel_clustering                    -0.273508
global_letters_count * rel_frequency                     -0.076036
global_letters_count * rel_letters_count                  0.000025
global_letters_count * rel_orthographic_density           0.147861
global_letters_count * rel_synonyms_count                 0.256535
global_orthographic_density * global_synonyms_count       0.351985
global_orthographic_density * rel_aoa                    -0.049805
global_orthographic_density * rel_clustering             -0.315013
global_orthographic_density * rel_frequency              -0.043406
global_orthographic_density * rel_letters_count           0.003461
global_orthographic_density * rel_orthographic_density    0.068321
global_orthographic_density * rel_synonyms_count         -0.340344
global_synonyms_count * rel_aoa                          -0.088228
global_synonyms_count * rel_clustering                   -0.252344
global_synonyms_count * rel_frequency                     0.124836
global_synonyms_count * rel_letters_count                 0.229564
global_synonyms_count * rel_orthographic_density         -0.505482
global_synonyms_count * rel_synonyms_count               -0.070986
rel_aoa * rel_clustering                                  0.070953
rel_aoa * rel_frequency                                   0.032825
rel_aoa * rel_letters_count                               0.027926
rel_aoa * rel_orthographic_density                        0.041751
rel_aoa * rel_synonyms_count                              0.130467
rel_clustering * rel_frequency                            0.100412
rel_clustering * rel_letters_count                        0.150762
rel_clustering * rel_orthographic_density                 0.112083
rel_clustering * rel_synonyms_count                       0.215331
rel_frequency * rel_letters_count                         0.027898
rel_frequency * rel_orthographic_density                  0.008516
rel_frequency * rel_synonyms_count                       -0.041582
rel_letters_count * rel_orthographic_density             -0.055719
rel_letters_count * rel_synonyms_count                   -0.186940
rel_orthographic_density * rel_synonyms_count             0.609788
dtype: float64

Regressing rel orthographic_density with 1139 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12796211093314958

intercept                      2.493273
global_aoa                    -0.073199
global_clustering              0.093712
global_frequency              -0.052148
global_letters_count          -0.092884
global_orthographic_density   -0.527101
global_synonyms_count         -0.057124
rel_aoa                        0.051841
rel_clustering                -0.082337
rel_frequency                  0.040885
rel_letters_count              0.055786
rel_orthographic_density       0.728300
rel_synonyms_count             0.134951
dtype: float64

Regressing rel orthographic_density with 1139 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17686459230979779

intercept                                                 8.280735
global_aoa                                               -0.575919
global_clustering                                         1.377424
global_frequency                                         -0.263942
global_letters_count                                     -0.320286
global_orthographic_density                               0.408636
global_synonyms_count                                     1.239239
rel_aoa                                                   0.176053
rel_clustering                                            0.904676
rel_frequency                                             0.162863
rel_letters_count                                        -0.034716
rel_orthographic_density                                 -0.009379
rel_synonyms_count                                        0.730360
global_aoa * global_clustering                           -0.066870
global_aoa * global_frequency                            -0.013684
global_aoa * global_letters_count                         0.043080
global_aoa * global_orthographic_density                  0.018883
global_aoa * global_synonyms_count                        0.078392
global_aoa * rel_aoa                                     -0.003854
global_aoa * rel_clustering                               0.046613
global_aoa * rel_frequency                                0.007337
global_aoa * rel_letters_count                           -0.015478
global_aoa * rel_orthographic_density                     0.014606
global_aoa * rel_synonyms_count                          -0.105094
global_clustering * global_frequency                     -0.085792
global_clustering * global_letters_count                 -0.011843
global_clustering * global_orthographic_density           0.010459
global_clustering * global_synonyms_count                 0.136705
global_clustering * rel_aoa                              -0.059316
global_clustering * rel_clustering                       -0.005776
global_clustering * rel_frequency                        -0.013538
global_clustering * rel_letters_count                     0.047439
global_clustering * rel_orthographic_density              0.143826
global_clustering * rel_synonyms_count                   -0.016413
global_frequency * global_letters_count                  -0.003607
global_frequency * global_orthographic_density           -0.101276
global_frequency * global_synonyms_count                 -0.032996
global_frequency * rel_aoa                               -0.015180
global_frequency * rel_clustering                         0.002836
global_frequency * rel_frequency                          0.007670
global_frequency * rel_letters_count                      0.049746
global_frequency * rel_orthographic_density               0.108235
global_frequency * rel_synonyms_count                    -0.030019
global_letters_count * global_orthographic_density        0.007737
global_letters_count * global_synonyms_count             -0.251925
global_letters_count * rel_aoa                           -0.036395
global_letters_count * rel_clustering                    -0.221044
global_letters_count * rel_frequency                     -0.066934
global_letters_count * rel_letters_count                  0.002024
global_letters_count * rel_orthographic_density           0.096898
global_letters_count * rel_synonyms_count                 0.215260
global_orthographic_density * global_synonyms_count       0.293873
global_orthographic_density * rel_aoa                    -0.042249
global_orthographic_density * rel_clustering             -0.154459
global_orthographic_density * rel_frequency              -0.037149
global_orthographic_density * rel_letters_count          -0.067961
global_orthographic_density * rel_orthographic_density    0.101246
global_orthographic_density * rel_synonyms_count         -0.288624
global_synonyms_count * rel_aoa                          -0.088913
global_synonyms_count * rel_clustering                   -0.144513
global_synonyms_count * rel_frequency                     0.099072
global_synonyms_count * rel_letters_count                 0.166050
global_synonyms_count * rel_orthographic_density         -0.456996
global_synonyms_count * rel_synonyms_count               -0.069820
rel_aoa * rel_clustering                                  0.065549
rel_aoa * rel_frequency                                   0.029690
rel_aoa * rel_letters_count                               0.024887
rel_aoa * rel_orthographic_density                        0.028045
rel_aoa * rel_synonyms_count                              0.121211
rel_clustering * rel_frequency                            0.066842
rel_clustering * rel_letters_count                        0.112018
rel_clustering * rel_orthographic_density                -0.089291
rel_clustering * rel_synonyms_count                       0.073714
rel_frequency * rel_letters_count                         0.016875
rel_frequency * rel_orthographic_density                  0.000177
rel_frequency * rel_synonyms_count                       -0.032303
rel_letters_count * rel_orthographic_density             -0.017926
rel_letters_count * rel_synonyms_count                   -0.136846
rel_orthographic_density * rel_synonyms_count             0.556938
dtype: float64