Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, plot_grid / plot_overlay also write each figure to disk
# (path built from settings.FIGURE and the figure's filename).
SAVE_FIGURES = False

# Features shown in the "paper-*" figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not used in the visible part of this notebook; presumably the
# number of PCA components for a later section -- confirm.
N_COMPONENTS = 3

# Number of bins tried first when binning source feature values; the plotting
# functions fall back to fewer bins when binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution-mining model whose substitutions are analysed below.
model = Model(time=Time.continuous, source=Source.majority,
              past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
data = []

# First pass: only fetch the ids of the substitutions mined with `model`.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Second pass: one short-lived session per substitution, producing one record
# per (substitution, feature) pair.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Feature values of the source and destination words, first
            # absolute, then relative to the sentence median.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')
            record = dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                feature=feature,
                source=source,
                source_rel=source_rel,
                destination=destination,
                destination_rel=destination_rel,
                # Null hypotheses: overall feature average (h0) and average
                # over synonyms of the source word (h0n).
                h0=substitution.feature_average(feature),
                h0_rel=substitution.feature_average(
                    feature, sentence_relative='median'),
                h0n=substitution.feature_average(
                    feature, source_synonyms=True),
                h0n_rel=substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            )
            data.append(record)

original_variations = pd.DataFrame(data)
del data


Got 1731 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
100% (1731 of 1731) |######################| Elapsed Time: 0:00:54 Time: 0:00:54

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values: source and destination words, and the two
# null hypotheses (h0, h0n), each in absolute and sentence-relative form.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average twice: first over rows sharing the same destination, occurrence,
# position and feature, then over each cluster, so that every cluster
# contributes a single point per feature.
# The columns are selected with a *list*: indexing a groupby with a bare tuple
# of names is deprecated/removed in recent pandas. The grouping key 'feature'
# is not part of the selection; `as_index=False` already returns it as a
# column.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# (Only the value columns are blanked; 'variation' is left untouched.)
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
def _feature_doc(feature):
    """Return the docstring of the transformed feature (its display name)."""
    return Substitution._transformed_feature(feature).__doc__


# All features, sorted by the docstring of their transformed version, which
# is also the text used for plot titles and legend labels.
ordered_features = sorted(Substitution.__features__, key=_feature_doc)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared with the mean of each null
    series (`h0` printed as ``H_0``, `h0n` printed as ``H_00``). A bin gets
    ``*``, ``**`` or ``***`` when the null mean lies outside the confidence
    interval around the bin mean at alpha = .05, .01 or .001 respectively,
    and ``ns.`` otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        Zero-based bin label of each observation (NaN labels are ignored).
    h0, h0n : pandas.Series
        Null-hypothesis values, aligned with `values`.
    values : pandas.Series
        Observed values, aligned with `bins`.
    """
    # `bins.max()` is a float when the labels contain NaNs (pandas upcasts
    # the series); `range` and `np.zeros` need a plain int.
    bin_count = int(bins.max()) + 1
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and CI half-widths depend only on `values`, not on the null
    # being tested, so compute them once.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        sample_std = values[indices].std(ddof=1)
        for j, alpha in enumerate(alphas):
            # NOTE(review): std(ddof=1) / sqrt(n - 1) is slightly wider than
            # the usual standard error std(ddof=1) / sqrt(n). Kept as is so
            # results stay comparable with plot_variation / plot_bias.
            cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                         * sample_std
                         / np.sqrt(n - 1))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        # differences[i, j] is True when the bin mean is outside
        # null +/- CI at significance level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))

        print(null_name + ' |', end='')
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level reached decides the number of stars;
                # pad to three characters to keep columns aligned.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot binned destination feature values against source feature values.

    Meant to be used as a plotting callback (``plot_grid`` passes it to
    ``FacetGrid.map_dataframe``): it draws on the current matplotlib axes and
    prints a significance table through ``print_significance``.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows for one feature, with columns ``source``, ``destination``,
        ``h0``, ``h0n`` (or their ``*_rel`` variants) and `feature_field`.
    color : optional
        Line colour (default ``'blue'``).
    relative : bool, optional
        Use the sentence-relative ``*_rel`` columns (default False).
    quantiles : bool, optional
        Bin with ``pd.qcut`` instead of fixed-width ``pd.cut`` (default
        False).
    feature_field : str, optional
        Column holding the feature name, used as the title of the
        significance table (default ``'feature'``).

    Raises
    ------
    ValueError
        If the source values cannot be binned with any bin count between
        ``BIN_COUNT`` and 1.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # each time the cut function rejects the binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError('could not bin the source values into between '
                         '1 and {} bins'.format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% confidence half-width of the bin mean.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    muted = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=muted, alpha=0.2)
    plt.plot(middles, h0s, '--', color=muted,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=muted,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=muted,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. The two sides are checked
    # independently so that the null curves are never clipped at the top
    # just because the bottom also needed extending.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one facet per feature with `plot_function`, three per row.

    Rows of `data` whose `feature_field` is not in `features` are left out.
    `plot_function` is called through ``FacetGrid.map_dataframe`` with
    `plot_kws` as extra keyword arguments. Every facet that ends up with a
    legend gets a styled legend frame and has its title replaced by the
    docstring of the corresponding transformed feature. The figure is saved
    under `filename` when ``SAVE_FIGURES`` is set.
    """
    selected = data[data[feature_field].isin(features)]
    grid = sb.FacetGrid(data=selected,
                        col=feature_field, hue=feature_field,
                        col_order=features, hue_order=features,
                        col_wrap=3, sharex=False, sharey=False,
                        aspect=1.5, size=3)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if legend:
            # Only axes on which something was plotted are decorated.
            frame = legend.get_frame()
            frame.set_facecolor('#f2f2f2')
            frame.set_edgecolor('#000000')
            feature_doc = (Substitution._transformed_feature(ax.get_title())
                           .__doc__)
            ax.set_title(feature_doc)

    if SAVE_FIGURES:
        grid.fig.savefig(settings.FIGURE.format(filename),
                         bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias of one feature on `ax`.

    The bias is the binned mean destination value minus the binned mean of
    the synonym-based null (``h0n``), divided by the absolute mean of the
    binned ``h0`` values. Bins are spread evenly over ``[0, 1]`` on the x
    axis.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature, with columns ``feature``, ``source``,
        ``destination``, ``h0`` and ``h0n`` (or their ``*_rel`` variants).
    color :
        Line colour.
    ci : bool
        Also draw the 95% confidence band.
    relative : bool
        Use the sentence-relative ``*_rel`` columns.
    quantiles : bool
        Bin with ``pd.qcut`` instead of fixed-width ``pd.cut``.

    Raises
    ------
    ValueError
        If the source values cannot be binned with any bin count between
        ``BIN_COUNT`` and 1.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # each time the cut function rejects the binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError('could not bin the source values into between '
                         '1 and {} bins'.format(BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% confidence half-width of the bin mean.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    # An empty fixed-width bin has a NaN mean; ignore it when computing the
    # normalisation scale so that one empty bin does not blank the whole
    # curve (identical to the plain mean when no bin is empty).
    scale = abs(np.nanmean(h0s))
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one curve per feature on a single set of axes.

    For each feature, the matching rows of `data` (with rows containing
    missing values dropped) are handed to `plot_function` together with the
    axes, a colour from the `palette_name` palette and `plot_kws`. The
    figure is saved under `filename` when ``SAVE_FIGURES`` is set.

    Returns the axes, so callers can adjust them further.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = sb.color_palette(palette_name, len(features))

    for color, feature in zip(colors, features):
        feature_rows = data[data.feature == feature].dropna()
        plot_function(ax, feature_rows, color=color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | *   | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *   | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | **  | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay for all features, fixed-width bins, absolute values, no CIs.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:
# Paper features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *   | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | **  | **  |

In [14]:
# Bias overlay for the paper features, fixed-width bins, absolute values;
# the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | *** | **  | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | *** |
H_00 | *** | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

In [16]:
# Bias overlay for all features, quantile bins, absolute values, no CIs.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:
# Paper features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

In [18]:
# Bias overlay for the paper features, quantile bins, absolute values;
# the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | **  | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | *** |
H_00 | ns. | *** | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | *   | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
# Bias overlay for all features, fixed-width bins, sentence-relative values,
# no CIs.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:
# Paper features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | *   | *   | ns. |

In [22]:
# Bias overlay for the paper features, fixed-width bins, sentence-relative
# values; the y axis is cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | **  | *   | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | ns. | ns. | ns. |

In [24]:
# Bias overlay for all features, quantile bins, sentence-relative values,
# no CIs.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:
# Paper features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | *** |

In [26]:
# Bias overlay for the paper features, quantile bins, sentence-relative
# values. Uses the 'deep' palette like the other three paper bias figures
# (this cell previously used 'husl', the palette of the all-features plots).
# NOTE(review): the other paper bias figures also crop the y axis with
# .set_ylim(...); no limits are set here -- confirm whether that is intended.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True, 'quantiles': True},
);


3 Streamplots

We'd like to see how absolute and relative feature values interact. In particular, we want to know which effect dominates: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that looks huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitution moves in (absolute, relative) space.

    Source words are binned on a regular `bin_count` x `bin_count` grid over
    their (`source`, `source_rel`) values. In each grid cell the arrow is
    the mean displacement towards the destination word
    (`destination - source`, `destination_rel - source_rel`), and the color
    is the mean Euclidean length of the individual displacements.

    Two reference lines are added: a horizontal one at y = 0 and a vertical
    one at the first value of the `h0` column.

    Keyword arguments:
    data -- DataFrame with columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0' (required).
    color -- base color of the reference lines (default 'blue').
    bin_count -- number of bins along each axis (default 4).

    Grid cells containing no substitution get NaN values (mean of an empty
    selection).
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    # Optional override; defaults to the previously hard-coded value.
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_column = x_bins == x
        for y in range(bin_count):
            # Build the cell mask and the displacements once, then reuse
            # them for the three statistics below.
            in_cell = in_column & (y_bins == y)
            du = dest[in_cell] - source[in_cell]
            dv = dest_rel[in_cell] - source_rel[in_cell]
            # Arrays are indexed [row, column], i.e. [y, x].
            u_values[y, x] = du.mean()
            v_values[y, x] = dv.mean()
            strength[y, x] = np.sqrt(du ** 2 + dv ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream panel per feature, three panels per row, each with its own
# axis ranges.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features, col_wrap=3,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title currently holds the feature name; replace it with the
        # docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream panels, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', col_order=PAPER_FEATURES, col_wrap=3,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title currently holds the feature name; replace it with the
        # docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on the per-cluster variations of all features, keeping only
# the clusters for which every feature variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance each one explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the loadings of the components we plot further down.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             index=list(map('Component-{}'.format, range(N_COMPONENTS))),
             columns=pcafeatures)


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.540878    0.18810011  0.06843222  0.06444056  0.03699766  0.02741058
  0.01949961  0.01779278  0.0161193   0.00948889]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 0.476135 -0.231608 0.080664 -0.212569 -0.195800 0.462955 -0.225974 -0.261994 0.429900 -0.289643 0.163943 0.002578
Component-1 -0.391168 0.379674 -0.175635 0.300296 0.262364 0.398662 -0.146972 0.302979 0.414143 -0.222074 0.139371 -0.019782
Component-2 0.223666 -0.232268 -0.066063 -0.047645 0.925944 -0.015178 -0.025363 -0.151063 -0.012605 0.064659 0.013629 -0.061633

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one row per (substitution, component) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identification fields, identical for all rows of this substitution.
        shared = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }
        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(shared,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (1731 of 1731) |######################| Elapsed Time: 0:00:58 Time: 0:00:58

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows sharing a destination
# (same destination_id, occurrence, position and component), then over
# all the destinations of a cluster, so that each cluster contributes a
# single point per component.
# The columns are selected with a list (double brackets): indexing a
# groupby with several bare keys is an implicit tuple, which newer pandas
# versions reject.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per PCA component, each with its own axis ranges.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Only style axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *** | *** | ns. | **  |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

4.2 On a subset of relevant features


In [34]:
# Features the PCA of this subsection is restricted to.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit the PCA on the per-cluster variations of the relevant features,
# keeping only the clusters for which all of them are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance each one explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the loadings of every component.
pd.DataFrame(pca.components_,
             index=list(map('Component-{}'.format,
                            range(pca.n_components_))),
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.65754448  0.21201271]

Out[35]:
aoa frequency letters_count
Component-0 0.759097 -0.394938 0.517490
Component-1 -0.388738 0.362630 0.846984

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one row per (substitution, component) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identification fields, identical for all rows of this substitution.
        shared = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }
        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(shared,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (1731 of 1731) |######################| Elapsed Time: 0:00:13 Time: 0:00:13

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows sharing a destination
# (same destination_id, occurrence, position and component), then over
# all the destinations of a cluster, so that each cluster contributes a
# single point per component.
# The columns are selected with a list (double brackets): indexing a
# groupby with several bare keys is an implicit tuple, which newer pandas
# versions reject.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One panel per PCA component, each with its own axis ranges.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only style axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *   | ns. |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up each transformed-feature function once instead of once per word.
# As used below, calling it with no argument yields the words the feature
# covers, and calling it with a word yields that word's feature value.
# NOTE(review): assumes repeated _transformed_feature(feature) lookups
# return equivalent callables -- confirm.
tfeatures = {feature: Substitution._transformed_feature(feature)
             for feature in relevant_features}

for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeatures[feature]())))

# Compute the number of words that have all relevant_features defined:
# gather every word covered by at least one feature...
words = set()
for tfeature in tfeatures.values():
    words.update(tfeature())

# ... then tabulate each word's value for every feature; rows with a
# missing value are dropped when counting below.
words_list = list(words)
wordsdf = pd.DataFrame({feature: [tfeatures[feature](word)
                                  for word in words_list]
                        for feature in relevant_features})
wordsdf['words'] = words_list
del words_list

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1044 (cluster-unique) substitutions, but the PCA is in fact computed on 793 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# (label, binning function) pairs to test. Quantile binning is currently
# disabled; re-enable it by adding ('quantiles', pd.qcut) to this list.
cuts = [('fixed bins', pd.cut)]
# (label, column-name suffix) pairs selecting global vs. sentence-relative
# values. NOTE: `rels` is rebound to a dict in the Regression section
# further down, so re-run this cell before re-running the Anova cells.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the 3-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (including when `p` is NaN, since every comparison is then
    false).
    """
    # Thresholds are tried from the strictest to the loosest.
    for threshold, marker in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# Pivot every source/destination column once, up front: the pivots do not
# depend on the feature pair, so rebuilding them in the innermost loop is
# wasted work. Keys are `variations` column names ('source',
# 'source_rel', 'destination', 'destination_rel').
pivoted = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for _, rel in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivoted['source' + rel1][feature1]
                destination = pivoted['destination' + rel2][feature2]

                # Compute binning, retrying with fewer bins on failure.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        continue
                else:
                    # No bin count worked: report it rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # One-way Anova of the destination values across the
                # source bins.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
   ** global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot every source/destination column once, up front: the pivots do not
# depend on the feature pair, so rebuilding them in the innermost loop is
# wasted work. Keys are `variations` column names ('source',
# 'source_rel', 'destination', 'destination_rel').
pivoted = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for _, rel in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivoted['source' + rel1][feature1]
                # Variation of feature2, i.e. destination - source.
                variation = (pivoted['destination' + rel2][feature2]
                             - pivoted['source' + rel2][feature2])

                # Compute binning, retrying with fewer bins on failure.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        continue
                else:
                    # No bin count worked: report it rather than testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # One-way Anova of the variations across the source bins.
                _, p = stats.f_oneway(*[variation[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps the boolean "relative?" flag to a (label, column-name suffix) pair.
# NOTE: this rebinds `rels`, which the Anova cells above iterate over as a
# list of pairs; re-run the cell defining that list before re-running them.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Linearly regress a destination feature on source features and print
    the fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table with at least the columns `cluster_id`, `feature`,
        `source`, `source_rel`, `destination` and `destination_rel`
        (only the ones selected by `source_rel` / `dest_rel` are read).
    features : iterable of str
        Names of the features used as regressors; they are sorted before use.
    target : str
        Name of the feature whose destination value is regressed.
    source_rel : bool or 'both'
        `False` uses the `source` column, `True` uses `source_rel`,
        `'both'` uses the two sets of regressors together.
    dest_rel : bool
        `False` regresses `destination`, `True` regresses `destination_rel`.
    interactions : bool
        If true, add pairwise interaction terms (degree-2,
        interaction-only polynomial features) to the regressors.

    Returns
    -------
    None
        The number of measures, R^2 and coefficients are printed.

    Raises
    ------
    ValueError
        If `source_rel` is not a bool or `'both'`, or if `dest_rel` is not
        a bool.
    """
    # Check types explicitly: a plain membership test would let 0/1 through
    # (they compare equal to False/True) and then treat them as 'both'.
    if not (isinstance(source_rel, bool) or
            (isinstance(source_rel, str) and source_rel == 'both')):
        raise ValueError("source_rel must be True, False or 'both', "
                         "got {!r}".format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}"
                         .format(dest_rel))
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]

    features = tuple(sorted(features))
    # (column, feature) keys into the pivoted table, and matching
    # human-readable regressor names, in the same order.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]

    # Get source and destination values.
    # One row per cluster_id, one column per (source column, feature);
    # rows missing any regressor are dropped.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination values from `data` (not from a notebook global),
    # restricted to the clusters that have a full set of regressors.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    # Re-align the regressors on the clusters that have a destination value.
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the regressors it multiplies;
        # the all-zero powers row is the bias column, named 'intercept'.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    # With interactions the bias column already plays the intercept's role,
    # so the estimator must not fit a second one.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # Prepend the separately-fitted intercept. `Series.append` no longer
        # exists in pandas >= 2.0, so concatenate instead.
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# Run every regression variant for each paper feature: all combinations of
# source relativeness, destination relativeness, and with/without
# interaction terms (the interaction flag varies fastest).
for target in PAPER_FEATURES:
    print('-' * 70)
    variants = product([False, True, 'both'], [False, True], [False, True])
    for source_rel, dest_rel, with_interactions in variants:
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=with_interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11946026522497301

intercept                      4.244737
global_aoa                     0.089478
global_clustering              0.032972
global_frequency               0.472450
global_letters_count          -0.021912
global_orthographic_density    0.007935
global_synonyms_count         -0.011685
dtype: float64

Regressing global frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.15294793701154907

intercept                                             -14.674892
global_aoa                                              0.816777
global_clustering                                      -1.554803
global_frequency                                        2.684220
global_letters_count                                    0.336310
global_orthographic_density                             1.515451
global_synonyms_count                                   1.081930
global_aoa * global_clustering                          0.040810
global_aoa * global_frequency                          -0.059187
global_aoa * global_letters_count                      -0.009349
global_aoa * global_orthographic_density                0.060609
global_aoa * global_synonyms_count                     -0.034906
global_clustering * global_frequency                    0.189840
global_clustering * global_letters_count               -0.069279
global_clustering * global_orthographic_density        -0.010625
global_clustering * global_synonyms_count               0.188545
global_frequency * global_letters_count                -0.075217
global_frequency * global_orthographic_density         -0.223463
global_frequency * global_synonyms_count               -0.001249
global_letters_count * global_orthographic_density      0.013597
global_letters_count * global_synonyms_count            0.014187
global_orthographic_density * global_synonyms_count     0.072463
dtype: float64

Regressing rel frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.06654346451220594

intercept                     -6.973505
global_aoa                     0.086974
global_clustering              0.034719
global_frequency               0.385455
global_letters_count           0.061633
global_orthographic_density   -0.014795
global_synonyms_count          0.010157
dtype: float64

Regressing rel frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.09516803274083296

intercept                                             -23.921397
global_aoa                                              0.515733
global_clustering                                      -1.413791
global_frequency                                        2.472078
global_letters_count                                    0.519920
global_orthographic_density                             1.186418
global_synonyms_count                                   0.210705
global_aoa * global_clustering                         -0.020363
global_aoa * global_frequency                          -0.082965
global_aoa * global_letters_count                       0.012301
global_aoa * global_orthographic_density                0.081232
global_aoa * global_synonyms_count                      0.015964
global_clustering * global_frequency                    0.164700
global_clustering * global_letters_count                0.009164
global_clustering * global_orthographic_density         0.012412
global_clustering * global_synonyms_count               0.157712
global_frequency * global_letters_count                -0.057148
global_frequency * global_orthographic_density         -0.205132
global_frequency * global_synonyms_count                0.030215
global_letters_count * global_orthographic_density      0.035384
global_letters_count * global_synonyms_count            0.031993
global_orthographic_density * global_synonyms_count     0.046879
dtype: float64

Regressing global frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.050058084106590406

intercept                   9.494721
rel_aoa                     0.088058
rel_clustering             -0.137067
rel_frequency               0.249723
rel_letters_count           0.012905
rel_orthographic_density    0.031439
rel_synonyms_count         -0.029389
dtype: float64

Regressing global frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.0873170551825615

intercept                                        9.313228
rel_aoa                                          0.218046
rel_clustering                                   0.070398
rel_frequency                                    0.171403
rel_letters_count                                0.071462
rel_orthographic_density                        -0.257524
rel_synonyms_count                               0.242895
rel_aoa * rel_clustering                         0.047130
rel_aoa * rel_frequency                          0.020724
rel_aoa * rel_letters_count                     -0.019130
rel_aoa * rel_orthographic_density               0.049618
rel_aoa * rel_synonyms_count                    -0.099221
rel_clustering * rel_frequency                   0.045270
rel_clustering * rel_letters_count               0.027549
rel_clustering * rel_orthographic_density        0.204712
rel_clustering * rel_synonyms_count              0.188153
rel_frequency * rel_letters_count               -0.006793
rel_frequency * rel_orthographic_density        -0.069142
rel_frequency * rel_synonyms_count               0.084867
rel_letters_count * rel_orthographic_density     0.043267
rel_letters_count * rel_synonyms_count          -0.133401
rel_orthographic_density * rel_synonyms_count   -0.263549
dtype: float64

Regressing rel frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.24390801135528795

intercept                  -1.349209
rel_aoa                     0.039790
rel_clustering              0.146700
rel_frequency               0.627829
rel_letters_count          -0.026855
rel_orthographic_density   -0.142875
rel_synonyms_count         -0.033085
dtype: float64

Regressing rel frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2787382515387319

intercept                                       -1.594596
rel_aoa                                          0.009612
rel_clustering                                   0.124246
rel_frequency                                    0.563577
rel_letters_count                                0.063035
rel_orthographic_density                        -0.480848
rel_synonyms_count                               0.059842
rel_aoa * rel_clustering                        -0.029095
rel_aoa * rel_frequency                         -0.066400
rel_aoa * rel_letters_count                     -0.004742
rel_aoa * rel_orthographic_density               0.117403
rel_aoa * rel_synonyms_count                     0.009969
rel_clustering * rel_frequency                  -0.007408
rel_clustering * rel_letters_count               0.003979
rel_clustering * rel_orthographic_density       -0.011453
rel_clustering * rel_synonyms_count              0.247690
rel_frequency * rel_letters_count                0.006350
rel_frequency * rel_orthographic_density        -0.121996
rel_frequency * rel_synonyms_count               0.056131
rel_letters_count * rel_orthographic_density     0.016562
rel_letters_count * rel_synonyms_count          -0.072392
rel_orthographic_density * rel_synonyms_count   -0.074038
dtype: float64

Regressing global frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.12674410488594867

intercept                      4.076756
global_aoa                     0.077583
global_clustering              0.205716
global_frequency               0.558613
global_letters_count          -0.022699
global_orthographic_density    0.120328
global_synonyms_count         -0.004991
rel_aoa                        0.018518
rel_clustering                -0.206587
rel_frequency                 -0.098038
rel_letters_count              0.011772
rel_orthographic_density      -0.132107
rel_synonyms_count            -0.008648
dtype: float64

Regressing global frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.25860527591922555

intercept                                                -15.178782
global_aoa                                                 1.291976
global_clustering                                         -6.023314
global_frequency                                           1.256942
global_letters_count                                      -0.467503
global_orthographic_density                               -0.234584
global_synonyms_count                                    -10.483213
rel_aoa                                                    3.000252
rel_clustering                                             1.611332
rel_frequency                                              0.204488
rel_letters_count                                          1.385880
rel_orthographic_density                                   7.995800
rel_synonyms_count                                        12.732924
global_aoa * global_clustering                             0.205937
global_aoa * global_frequency                              0.000841
global_aoa * global_letters_count                          0.020047
global_aoa * global_orthographic_density                   0.019125
global_aoa * global_synonyms_count                        -0.290835
global_aoa * rel_aoa                                      -0.041803
global_aoa * rel_clustering                               -0.099658
global_aoa * rel_frequency                                 0.024645
global_aoa * rel_letters_count                            -0.080368
global_aoa * rel_orthographic_density                     -0.151117
global_aoa * rel_synonyms_count                            0.204052
global_clustering * global_frequency                       0.307043
global_clustering * global_letters_count                  -0.021773
global_clustering * global_orthographic_density            0.996707
global_clustering * global_synonyms_count                  0.033287
global_clustering * rel_aoa                               -0.049548
global_clustering * rel_clustering                         0.188224
global_clustering * rel_frequency                         -0.184678
global_clustering * rel_letters_count                      0.064656
global_clustering * rel_orthographic_density              -0.473478
global_clustering * rel_synonyms_count                     0.010299
global_frequency * global_letters_count                   -0.097334
global_frequency * global_orthographic_density             0.440744
global_frequency * global_synonyms_count                   0.661211
global_frequency * rel_aoa                                -0.215202
global_frequency * rel_clustering                          0.253681
global_frequency * rel_frequency                          -0.026198
global_frequency * rel_letters_count                       0.037516
global_frequency * rel_orthographic_density               -0.781606
global_frequency * rel_synonyms_count                     -0.894164
global_letters_count * global_orthographic_density         0.319059
global_letters_count * global_synonyms_count               0.851262
global_letters_count * rel_aoa                            -0.095968
global_letters_count * rel_clustering                     -0.100750
global_letters_count * rel_frequency                      -0.031977
global_letters_count * rel_letters_count                   0.025973
global_letters_count * rel_orthographic_density           -0.378957
global_letters_count * rel_synonyms_count                 -0.843500
global_orthographic_density * global_synonyms_count        0.266833
global_orthographic_density * rel_aoa                     -0.113538
global_orthographic_density * rel_clustering              -1.386439
global_orthographic_density * rel_frequency               -0.378757
global_orthographic_density * rel_letters_count           -0.244583
global_orthographic_density * rel_orthographic_density    -0.162999
global_orthographic_density * rel_synonyms_count           0.291839
global_synonyms_count * rel_aoa                           -0.190363
global_synonyms_count * rel_clustering                     0.318975
global_synonyms_count * rel_frequency                     -0.748029
global_synonyms_count * rel_letters_count                 -0.670003
global_synonyms_count * rel_orthographic_density          -0.628140
global_synonyms_count * rel_synonyms_count                 0.070596
rel_aoa * rel_clustering                                  -0.051174
rel_aoa * rel_frequency                                    0.100558
rel_aoa * rel_letters_count                                0.129717
rel_aoa * rel_orthographic_density                         0.242609
rel_aoa * rel_synonyms_count                               0.176950
rel_clustering * rel_frequency                            -0.215258
rel_clustering * rel_letters_count                        -0.009746
rel_clustering * rel_orthographic_density                  0.755934
rel_clustering * rel_synonyms_count                       -0.038287
rel_frequency * rel_letters_count                          0.050964
rel_frequency * rel_orthographic_density                   0.487771
rel_frequency * rel_synonyms_count                         1.080300
rel_letters_count * rel_orthographic_density               0.366784
rel_letters_count * rel_synonyms_count                     0.657025
rel_orthographic_density * rel_synonyms_count             -0.015246
dtype: float64

Regressing rel frequency with 599 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.28735536753711455

intercept                      2.912346
global_aoa                     0.087627
global_clustering              0.257689
global_frequency              -0.355092
global_letters_count           0.043650
global_orthographic_density    0.213378
global_synonyms_count          0.042446
rel_aoa                       -0.017004
rel_clustering                -0.203324
rel_frequency                  0.836718
rel_letters_count             -0.046001
rel_orthographic_density      -0.213697
rel_synonyms_count            -0.062547
dtype: float64

Regressing rel frequency with 599 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.38562908383063216

intercept                                                -21.347224
global_aoa                                                 1.933860
global_clustering                                         -6.394154
global_frequency                                           0.334015
global_letters_count                                      -0.872794
global_orthographic_density                                2.135242
global_synonyms_count                                     -9.738693
rel_aoa                                                    2.310814
rel_clustering                                             3.120730
rel_frequency                                              1.130307
rel_letters_count                                          1.745638
rel_orthographic_density                                   5.980031
rel_synonyms_count                                        12.262364
global_aoa * global_clustering                             0.225266
global_aoa * global_frequency                             -0.033811
global_aoa * global_letters_count                          0.009087
global_aoa * global_orthographic_density                  -0.003339
global_aoa * global_synonyms_count                        -0.256740
global_aoa * rel_aoa                                      -0.037369
global_aoa * rel_clustering                               -0.169602
global_aoa * rel_frequency                                 0.072890
global_aoa * rel_letters_count                            -0.045323
global_aoa * rel_orthographic_density                     -0.092497
global_aoa * rel_synonyms_count                            0.161107
global_clustering * global_frequency                       0.297422
global_clustering * global_letters_count                   0.002459
global_clustering * global_orthographic_density            1.149656
global_clustering * global_synonyms_count                  0.014618
global_clustering * rel_aoa                               -0.106358
global_clustering * rel_clustering                         0.203104
global_clustering * rel_frequency                         -0.162318
global_clustering * rel_letters_count                      0.107281
global_clustering * rel_orthographic_density              -0.518823
global_clustering * rel_synonyms_count                     0.119117
global_frequency * global_letters_count                   -0.019826
global_frequency * global_orthographic_density             0.366093
global_frequency * global_synonyms_count                   0.572158
global_frequency * rel_aoa                                -0.184636
global_frequency * rel_clustering                          0.178871
global_frequency * rel_frequency                          -0.033220
global_frequency * rel_letters_count                      -0.001984
global_frequency * rel_orthographic_density               -0.700871
global_frequency * rel_synonyms_count                     -0.805005
global_letters_count * global_orthographic_density         0.265740
global_letters_count * global_synonyms_count               0.933020
global_letters_count * rel_aoa                            -0.114836
global_letters_count * rel_clustering                     -0.116311
global_letters_count * rel_frequency                      -0.092411
global_letters_count * rel_letters_count                   0.019091
global_letters_count * rel_orthographic_density           -0.301342
global_letters_count * rel_synonyms_count                 -0.848307
global_orthographic_density * global_synonyms_count        0.215288
global_orthographic_density * rel_aoa                     -0.140486
global_orthographic_density * rel_clustering              -1.483791
global_orthographic_density * rel_frequency               -0.290729
global_orthographic_density * rel_letters_count           -0.188835
global_orthographic_density * rel_orthographic_density    -0.129252
global_orthographic_density * rel_synonyms_count           0.315692
global_synonyms_count * rel_aoa                           -0.156302
global_synonyms_count * rel_clustering                     0.379824
global_synonyms_count * rel_frequency                     -0.658915
global_synonyms_count * rel_letters_count                 -0.815927
global_synonyms_count * rel_orthographic_density          -0.488688
global_synonyms_count * rel_synonyms_count                 0.093820
rel_aoa * rel_clustering                                   0.021898
rel_aoa * rel_frequency                                    0.047392
rel_aoa * rel_letters_count                                0.117653
rel_aoa * rel_orthographic_density                         0.213622
rel_aoa * rel_synonyms_count                               0.149115
rel_clustering * rel_frequency                            -0.198808
rel_clustering * rel_letters_count                        -0.050826
rel_clustering * rel_orthographic_density                  0.765596
rel_clustering * rel_synonyms_count                       -0.126947
rel_frequency * rel_letters_count                          0.083452
rel_frequency * rel_orthographic_density                   0.419183
rel_frequency * rel_synonyms_count                         0.970990
rel_letters_count * rel_orthographic_density               0.288899
rel_letters_count * rel_synonyms_count                     0.734475
rel_orthographic_density * rel_synonyms_count             -0.113715
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 535 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.14717917664336855

intercept                      3.836451
global_aoa                     0.320303
global_clustering             -0.262039
global_frequency              -0.139296
global_letters_count           0.144424
global_orthographic_density   -0.041994
global_synonyms_count         -0.058148
dtype: float64

Regressing global aoa with 535 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.18963080534002152

intercept                                             -0.404739
global_aoa                                             0.680419
global_clustering                                     -1.526945
global_frequency                                      -0.723803
global_letters_count                                   1.160922
global_orthographic_density                           -1.559556
global_synonyms_count                                 -1.617177
global_aoa * global_clustering                         0.103384
global_aoa * global_frequency                          0.035339
global_aoa * global_letters_count                     -0.006555
global_aoa * global_orthographic_density              -0.044104
global_aoa * global_synonyms_count                     0.079924
global_clustering * global_frequency                  -0.035211
global_clustering * global_letters_count               0.154027
global_clustering * global_orthographic_density       -0.087408
global_clustering * global_synonyms_count             -0.001326
global_frequency * global_letters_count               -0.012670
global_frequency * global_orthographic_density         0.125455
global_frequency * global_synonyms_count               0.122820
global_letters_count * global_orthographic_density     0.007610
global_letters_count * global_synonyms_count          -0.040948
global_orthographic_density * global_synonyms_count    0.089066
dtype: float64

Regressing rel aoa with 535 measures, no interactions
           ^^^^^^^
R^2 = 0.0568522335459124

intercept                     -0.593039
global_aoa                     0.182915
global_clustering             -0.194259
global_frequency              -0.177945
global_letters_count           0.071622
global_orthographic_density    0.149226
global_synonyms_count         -0.004593
dtype: float64

Regressing rel aoa with 535 measures, with interactions
           ^^^^^^^
R^2 = 0.1025536150543387

intercept                                             -1.075824
global_aoa                                             1.066631
global_clustering                                     -0.833571
global_frequency                                      -1.060416
global_letters_count                                   0.302626
global_orthographic_density                           -1.614717
global_synonyms_count                                 -1.015290
global_aoa * global_clustering                         0.158209
global_aoa * global_frequency                          0.023829
global_aoa * global_letters_count                     -0.030217
global_aoa * global_orthographic_density              -0.026649
global_aoa * global_synonyms_count                     0.104842
global_clustering * global_frequency                  -0.058619
global_clustering * global_letters_count               0.025800
global_clustering * global_orthographic_density       -0.080751
global_clustering * global_synonyms_count             -0.002080
global_frequency * global_letters_count                0.021963
global_frequency * global_orthographic_density         0.185463
global_frequency * global_synonyms_count               0.049383
global_letters_count * global_orthographic_density    -0.065706
global_letters_count * global_synonyms_count          -0.070637
global_orthographic_density * global_synonyms_count    0.208166
dtype: float64

Regressing global aoa with 535 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.054961171579019474

intercept                   6.668709
rel_aoa                     0.076960
rel_clustering              0.239551
rel_frequency               0.128275
rel_letters_count           0.073270
rel_orthographic_density   -0.381534
rel_synonyms_count         -0.205938
dtype: float64

Regressing global aoa with 535 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10854304470439258

intercept                                        6.680921
rel_aoa                                         -0.230164
rel_clustering                                   0.155978
rel_frequency                                    0.184652
rel_letters_count                                0.008959
rel_orthographic_density                        -0.258741
rel_synonyms_count                              -0.012658
rel_aoa * rel_clustering                         0.116956
rel_aoa * rel_frequency                         -0.094673
rel_aoa * rel_letters_count                      0.020650
rel_aoa * rel_orthographic_density               0.040038
rel_aoa * rel_synonyms_count                     0.120668
rel_clustering * rel_frequency                   0.112756
rel_clustering * rel_letters_count               0.013944
rel_clustering * rel_orthographic_density       -0.288729
rel_clustering * rel_synonyms_count              0.101292
rel_frequency * rel_letters_count               -0.027814
rel_frequency * rel_orthographic_density         0.023267
rel_frequency * rel_synonyms_count              -0.005021
rel_letters_count * rel_orthographic_density     0.003531
rel_letters_count * rel_synonyms_count           0.084363
rel_orthographic_density * rel_synonyms_count    0.517996
dtype: float64

Regressing rel aoa with 535 measures, no interactions
           ^^^^^^^
R^2 = 0.20264455297538905

intercept                   0.480522
rel_aoa                     0.487262
rel_clustering             -0.135634
rel_frequency              -0.070724
rel_letters_count           0.017374
rel_orthographic_density    0.140369
rel_synonyms_count         -0.137867
dtype: float64

Regressing rel aoa with 535 measures, with interactions
           ^^^^^^^
R^2 = 0.23915664362652653

intercept                                        0.717423
rel_aoa                                          0.381820
rel_clustering                                  -0.154412
rel_frequency                                    0.057500
rel_letters_count                               -0.050212
rel_orthographic_density                         0.505542
rel_synonyms_count                               0.020997
rel_aoa * rel_clustering                         0.093708
rel_aoa * rel_frequency                         -0.024749
rel_aoa * rel_letters_count                     -0.026019
rel_aoa * rel_orthographic_density              -0.059045
rel_aoa * rel_synonyms_count                     0.004364
rel_clustering * rel_frequency                   0.048658
rel_clustering * rel_letters_count               0.013447
rel_clustering * rel_orthographic_density       -0.054859
rel_clustering * rel_synonyms_count              0.026064
rel_frequency * rel_letters_count               -0.022080
rel_frequency * rel_orthographic_density         0.122207
rel_frequency * rel_synonyms_count               0.011116
rel_letters_count * rel_orthographic_density    -0.015852
rel_letters_count * rel_synonyms_count           0.005849
rel_orthographic_density * rel_synonyms_count    0.156622
dtype: float64

Regressing global aoa with 535 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.19685881814156422

intercept                      0.845746
global_aoa                     0.463783
global_clustering             -0.688148
global_frequency              -0.274694
global_letters_count           0.363817
global_orthographic_density    0.055615
global_synonyms_count          0.413354
rel_aoa                       -0.206087
rel_clustering                 0.471269
rel_frequency                  0.132486
rel_letters_count             -0.276951
rel_orthographic_density      -0.049432
rel_synonyms_count            -0.547346
dtype: float64

Regressing global aoa with 535 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.35035190236736447

intercept                                                 50.339558
global_aoa                                                -0.320085
global_clustering                                          5.190957
global_frequency                                          -0.708395
global_letters_count                                      -3.500628
global_orthographic_density                              -19.543634
global_synonyms_count                                     -3.018365
rel_aoa                                                   -1.206789
rel_clustering                                             2.583395
rel_frequency                                              2.699024
rel_letters_count                                          3.760062
rel_orthographic_density                                  13.559297
rel_synonyms_count                                        -6.006540
global_aoa * global_clustering                             0.295411
global_aoa * global_frequency                              0.141277
global_aoa * global_letters_count                          0.098444
global_aoa * global_orthographic_density                   0.287970
global_aoa * global_synonyms_count                        -0.147917
global_aoa * rel_aoa                                       0.030673
global_aoa * rel_clustering                               -0.423207
global_aoa * rel_frequency                                -0.087129
global_aoa * rel_letters_count                            -0.058992
global_aoa * rel_orthographic_density                     -0.355397
global_aoa * rel_synonyms_count                            0.334981
global_clustering * global_frequency                      -0.012169
global_clustering * global_letters_count                  -0.291116
global_clustering * global_orthographic_density           -2.934182
global_clustering * global_synonyms_count                 -0.897999
global_clustering * rel_aoa                               -0.200361
global_clustering * rel_clustering                         0.017810
global_clustering * rel_frequency                          0.281730
global_clustering * rel_letters_count                      0.192878
global_clustering * rel_orthographic_density               2.261561
global_clustering * rel_synonyms_count                     0.330621
global_frequency * global_letters_count                    0.098943
global_frequency * global_orthographic_density            -0.158237
global_frequency * global_synonyms_count                  -0.269783
global_frequency * rel_aoa                                -0.018496
global_frequency * rel_clustering                         -0.558256
global_frequency * rel_frequency                          -0.012646
global_frequency * rel_letters_count                      -0.166174
global_frequency * rel_orthographic_density                0.326136
global_frequency * rel_synonyms_count                      0.725062
global_letters_count * global_orthographic_density         0.247850
global_letters_count * global_synonyms_count               0.108102
global_letters_count * rel_aoa                            -0.094832
global_letters_count * rel_clustering                      0.419123
global_letters_count * rel_frequency                       0.012273
global_letters_count * rel_letters_count                   0.007178
global_letters_count * rel_orthographic_density           -0.140972
global_letters_count * rel_synonyms_count                  0.023030
global_orthographic_density * global_synonyms_count        0.708995
global_orthographic_density * rel_aoa                      0.021715
global_orthographic_density * rel_clustering               2.313012
global_orthographic_density * rel_frequency               -0.170790
global_orthographic_density * rel_letters_count           -0.520607
global_orthographic_density * rel_orthographic_density     0.098963
global_orthographic_density * rel_synonyms_count          -1.251331
global_synonyms_count * rel_aoa                            0.533378
global_synonyms_count * rel_clustering                     0.816182
global_synonyms_count * rel_frequency                      0.051477
global_synonyms_count * rel_letters_count                 -0.119743
global_synonyms_count * rel_orthographic_density          -0.234649
global_synonyms_count * rel_synonyms_count                 0.094812
rel_aoa * rel_clustering                                   0.448977
rel_aoa * rel_frequency                                   -0.014930
rel_aoa * rel_letters_count                                0.014979
rel_aoa * rel_orthographic_density                        -0.025333
rel_aoa * rel_synonyms_count                              -0.566309
rel_clustering * rel_frequency                             0.234622
rel_clustering * rel_letters_count                        -0.153705
rel_clustering * rel_orthographic_density                 -1.416429
rel_clustering * rel_synonyms_count                       -0.454050
rel_frequency * rel_letters_count                         -0.026004
rel_frequency * rel_orthographic_density                   0.199398
rel_frequency * rel_synonyms_count                        -0.495216
rel_letters_count * rel_orthographic_density               0.421250
rel_letters_count * rel_synonyms_count                    -0.090995
rel_orthographic_density * rel_synonyms_count              0.996116
dtype: float64

Regressing rel aoa with 535 measures, no interactions
           ^^^^^^^
R^2 = 0.24886735608806207

intercept                      0.109264
global_aoa                    -0.352041
global_clustering             -0.522120
global_frequency              -0.186674
global_letters_count           0.253570
global_orthographic_density   -0.001040
global_synonyms_count          0.483591
rel_aoa                        0.732702
rel_clustering                 0.392960
rel_frequency                  0.040810
rel_letters_count             -0.175282
rel_orthographic_density       0.019740
rel_synonyms_count            -0.623055
dtype: float64

Regressing rel aoa with 535 measures, with interactions
           ^^^^^^^
R^2 = 0.38156237097404544

intercept                                                 54.121899
global_aoa                                                -2.428607
global_clustering                                          5.638330
global_frequency                                          -2.172423
global_letters_count                                      -2.662370
global_orthographic_density                              -13.793868
global_synonyms_count                                     -3.557062
rel_aoa                                                   -0.115924
rel_clustering                                             0.406160
rel_frequency                                              2.932943
rel_letters_count                                          4.425778
rel_orthographic_density                                   8.846910
rel_synonyms_count                                        -0.620694
global_aoa * global_clustering                             0.101699
global_aoa * global_frequency                              0.126500
global_aoa * global_letters_count                          0.133620
global_aoa * global_orthographic_density                   0.271614
global_aoa * global_synonyms_count                         0.249930
global_aoa * rel_aoa                                       0.013631
global_aoa * rel_clustering                               -0.139332
global_aoa * rel_frequency                                -0.119082
global_aoa * rel_letters_count                            -0.150078
global_aoa * rel_orthographic_density                     -0.357662
global_aoa * rel_synonyms_count                           -0.128097
global_clustering * global_frequency                      -0.216909
global_clustering * global_letters_count                  -0.107674
global_clustering * global_orthographic_density           -1.997171
global_clustering * global_synonyms_count                 -0.458558
global_clustering * rel_aoa                               -0.194374
global_clustering * rel_clustering                        -0.012101
global_clustering * rel_frequency                          0.319488
global_clustering * rel_letters_count                      0.182480
global_clustering * rel_orthographic_density               1.532663
global_clustering * rel_synonyms_count                     0.364772
global_frequency * global_letters_count                    0.122685
global_frequency * global_orthographic_density            -0.080426
global_frequency * global_synonyms_count                  -0.159503
global_frequency * rel_aoa                                -0.001757
global_frequency * rel_clustering                         -0.269413
global_frequency * rel_frequency                          -0.017661
global_frequency * rel_letters_count                      -0.190464
global_frequency * rel_orthographic_density                0.248151
global_frequency * rel_synonyms_count                      0.500180
global_letters_count * global_orthographic_density         0.130161
global_letters_count * global_synonyms_count              -0.020640
global_letters_count * rel_aoa                            -0.101519
global_letters_count * rel_clustering                      0.222615
global_letters_count * rel_frequency                       0.009710
global_letters_count * rel_letters_count                   0.023493
global_letters_count * rel_orthographic_density            0.050978
global_letters_count * rel_synonyms_count                  0.064002
global_orthographic_density * global_synonyms_count        0.855960
global_orthographic_density * rel_aoa                     -0.013324
global_orthographic_density * rel_clustering               1.497310
global_orthographic_density * rel_frequency               -0.113794
global_orthographic_density * rel_letters_count           -0.439210
global_orthographic_density * rel_orthographic_density     0.149534
global_orthographic_density * rel_synonyms_count          -1.342041
global_synonyms_count * rel_aoa                            0.283839
global_synonyms_count * rel_clustering                     0.209659
global_synonyms_count * rel_frequency                      0.090044
global_synonyms_count * rel_letters_count                 -0.026894
global_synonyms_count * rel_orthographic_density          -0.472905
global_synonyms_count * rel_synonyms_count                -0.054007
rel_aoa * rel_clustering                                   0.368893
rel_aoa * rel_frequency                                   -0.005586
rel_aoa * rel_letters_count                                0.057573
rel_aoa * rel_orthographic_density                         0.012268
rel_aoa * rel_synonyms_count                              -0.338563
rel_clustering * rel_frequency                             0.120742
rel_clustering * rel_letters_count                        -0.116372
rel_clustering * rel_orthographic_density                 -0.744045
rel_clustering * rel_synonyms_count                       -0.287453
rel_frequency * rel_letters_count                          0.013877
rel_frequency * rel_orthographic_density                   0.139860
rel_frequency * rel_synonyms_count                        -0.410667
rel_letters_count * rel_orthographic_density               0.305114
rel_letters_count * rel_synonyms_count                    -0.107115
rel_orthographic_density * rel_synonyms_count              1.045854
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05142206894645285

intercept                     -4.280348
global_aoa                    -0.027510
global_clustering              0.137567
global_frequency              -0.063875
global_letters_count           0.018243
global_orthographic_density   -0.022867
global_synonyms_count         -0.018733
dtype: float64

Regressing global clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.13167846618055112

intercept                                             -0.555248
global_aoa                                             0.331447
global_clustering                                      0.605902
global_frequency                                      -0.784057
global_letters_count                                  -0.100375
global_orthographic_density                           -0.001642
global_synonyms_count                                 -0.890254
global_aoa * global_clustering                         0.021187
global_aoa * global_frequency                         -0.009173
global_aoa * global_letters_count                     -0.009168
global_aoa * global_orthographic_density              -0.064364
global_aoa * global_synonyms_count                    -0.010072
global_clustering * global_frequency                  -0.081933
global_clustering * global_letters_count               0.010020
global_clustering * global_orthographic_density        0.067476
global_clustering * global_synonyms_count             -0.112660
global_frequency * global_letters_count                0.026666
global_frequency * global_orthographic_density         0.089729
global_frequency * global_synonyms_count               0.062046
global_letters_count * global_orthographic_density     0.005516
global_letters_count * global_synonyms_count          -0.010930
global_orthographic_density * global_synonyms_count   -0.179259
dtype: float64

Regressing rel clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.03587002880767953

intercept                      1.676049
global_aoa                    -0.028683
global_clustering              0.099134
global_frequency              -0.060134
global_letters_count           0.010488
global_orthographic_density   -0.029238
global_synonyms_count         -0.032809
dtype: float64

Regressing rel clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.08754885037413396

intercept                                              5.973297
global_aoa                                             0.254595
global_clustering                                      0.655701
global_frequency                                      -0.772404
global_letters_count                                  -0.200488
global_orthographic_density                            0.034074
global_synonyms_count                                 -0.212899
global_aoa * global_clustering                         0.022714
global_aoa * global_frequency                          0.001656
global_aoa * global_letters_count                     -0.009228
global_aoa * global_orthographic_density              -0.066834
global_aoa * global_synonyms_count                    -0.047864
global_clustering * global_frequency                  -0.080076
global_clustering * global_letters_count              -0.013095
global_clustering * global_orthographic_density        0.060874
global_clustering * global_synonyms_count             -0.022129
global_frequency * global_letters_count                0.019837
global_frequency * global_orthographic_density         0.070656
global_frequency * global_synonyms_count               0.053764
global_letters_count * global_orthographic_density     0.029249
global_letters_count * global_synonyms_count           0.010167
global_orthographic_density * global_synonyms_count   -0.142926
dtype: float64

Regressing global clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.01707486038293704

intercept                  -5.830991
rel_aoa                     0.001899
rel_clustering              0.127688
rel_frequency              -0.002178
rel_letters_count           0.005176
rel_orthographic_density    0.013955
rel_synonyms_count         -0.000437
dtype: float64

Regressing global clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06620016644251392

intercept                                       -5.726244
rel_aoa                                         -0.032812
rel_clustering                                  -0.013953
rel_frequency                                    0.074300
rel_letters_count                                0.009857
rel_orthographic_density                         0.072941
rel_synonyms_count                              -0.126439
rel_aoa * rel_clustering                         0.022008
rel_aoa * rel_frequency                         -0.015396
rel_aoa * rel_letters_count                     -0.027854
rel_aoa * rel_orthographic_density              -0.053266
rel_aoa * rel_synonyms_count                    -0.003849
rel_clustering * rel_frequency                  -0.056965
rel_clustering * rel_letters_count              -0.025728
rel_clustering * rel_orthographic_density       -0.054798
rel_clustering * rel_synonyms_count             -0.097964
rel_frequency * rel_letters_count               -0.016696
rel_frequency * rel_orthographic_density        -0.000786
rel_frequency * rel_synonyms_count              -0.011663
rel_letters_count * rel_orthographic_density     0.003798
rel_letters_count * rel_synonyms_count           0.025612
rel_orthographic_density * rel_synonyms_count   -0.047367
dtype: float64

Regressing rel clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.09929799625996072

intercept                   0.256352
rel_aoa                    -0.023105
rel_clustering              0.311547
rel_frequency              -0.001579
rel_letters_count           0.010049
rel_orthographic_density    0.016092
rel_synonyms_count          0.017074
dtype: float64

Regressing rel clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1445804020175114

intercept                                        0.341568
rel_aoa                                         -0.036413
rel_clustering                                   0.131416
rel_frequency                                    0.063431
rel_letters_count                                0.021496
rel_orthographic_density                         0.016583
rel_synonyms_count                              -0.067580
rel_aoa * rel_clustering                         0.015474
rel_aoa * rel_frequency                         -0.003291
rel_aoa * rel_letters_count                     -0.031035
rel_aoa * rel_orthographic_density              -0.075581
rel_aoa * rel_synonyms_count                    -0.009316
rel_clustering * rel_frequency                  -0.074961
rel_clustering * rel_letters_count              -0.002905
rel_clustering * rel_orthographic_density        0.005793
rel_clustering * rel_synonyms_count             -0.068227
rel_frequency * rel_letters_count               -0.015422
rel_frequency * rel_orthographic_density        -0.012078
rel_frequency * rel_synonyms_count              -0.005225
rel_letters_count * rel_orthographic_density     0.016194
rel_letters_count * rel_synonyms_count           0.047482
rel_orthographic_density * rel_synonyms_count    0.023631
dtype: float64

Regressing global clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08470141209684923

intercept                     -2.582948
global_aoa                    -0.048187
global_clustering              0.219546
global_frequency              -0.110708
global_letters_count           0.020340
global_orthographic_density   -0.225877
global_synonyms_count         -0.085516
rel_aoa                        0.024399
rel_clustering                -0.078620
rel_frequency                  0.059599
rel_letters_count             -0.004486
rel_orthographic_density       0.233442
rel_synonyms_count             0.058645
dtype: float64

Regressing global clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2697042755363883

intercept                                                 13.139926
global_aoa                                                 0.705654
global_clustering                                          2.260347
global_frequency                                          -1.778163
global_letters_count                                      -0.500973
global_orthographic_density                               -1.884769
global_synonyms_count                                     -3.264594
rel_aoa                                                   -0.382447
rel_clustering                                            -2.846404
rel_frequency                                              0.791035
rel_letters_count                                          0.059207
rel_orthographic_density                                   1.348443
rel_synonyms_count                                         2.208427
global_aoa * global_clustering                             0.025885
global_aoa * global_frequency                             -0.002882
global_aoa * global_letters_count                         -0.067539
global_aoa * global_orthographic_density                  -0.190140
global_aoa * global_synonyms_count                         0.154310
global_aoa * rel_aoa                                       0.011907
global_aoa * rel_clustering                                0.022134
global_aoa * rel_frequency                                 0.000606
global_aoa * rel_letters_count                             0.067046
global_aoa * rel_orthographic_density                      0.111757
global_aoa * rel_synonyms_count                           -0.146572
global_clustering * global_frequency                      -0.194542
global_clustering * global_letters_count                  -0.067558
global_clustering * global_orthographic_density            0.092585
global_clustering * global_synonyms_count                 -0.434961
global_clustering * rel_aoa                               -0.061181
global_clustering * rel_clustering                        -0.067225
global_clustering * rel_frequency                          0.125144
global_clustering * rel_letters_count                      0.072690
global_clustering * rel_orthographic_density              -0.163914
global_clustering * rel_synonyms_count                     0.601551
global_frequency * global_letters_count                    0.009517
global_frequency * global_orthographic_density             0.232944
global_frequency * global_synonyms_count                   0.177885
global_frequency * rel_aoa                                -0.019265
global_frequency * rel_clustering                          0.140498
global_frequency * rel_frequency                           0.026206
global_frequency * rel_letters_count                       0.028600
global_frequency * rel_orthographic_density               -0.144591
global_frequency * rel_synonyms_count                      0.041357
global_letters_count * global_orthographic_density         0.237947
global_letters_count * global_synonyms_count              -0.149097
global_letters_count * rel_aoa                             0.010096
global_letters_count * rel_clustering                      0.134619
global_letters_count * rel_frequency                       0.032481
global_letters_count * rel_letters_count                   0.005086
global_letters_count * rel_orthographic_density           -0.279749
global_letters_count * rel_synonyms_count                  0.282892
global_orthographic_density * global_synonyms_count       -0.679114
global_orthographic_density * rel_aoa                      0.113331
global_orthographic_density * rel_clustering               0.066484
global_orthographic_density * rel_frequency               -0.125423
global_orthographic_density * rel_letters_count           -0.179032
global_orthographic_density * rel_orthographic_density    -0.063282
global_orthographic_density * rel_synonyms_count           0.216674
global_synonyms_count * rel_aoa                           -0.093601
global_synonyms_count * rel_clustering                     0.166099
global_synonyms_count * rel_frequency                     -0.130084
global_synonyms_count * rel_letters_count                 -0.083655
global_synonyms_count * rel_orthographic_density           0.438983
global_synonyms_count * rel_synonyms_count                -0.099963
rel_aoa * rel_clustering                                   0.072493
rel_aoa * rel_frequency                                    0.031628
rel_aoa * rel_letters_count                               -0.041831
rel_aoa * rel_orthographic_density                        -0.122109
rel_aoa * rel_synonyms_count                               0.058766
rel_clustering * rel_frequency                            -0.141052
rel_clustering * rel_letters_count                        -0.143163
rel_clustering * rel_orthographic_density                  0.074407
rel_clustering * rel_synonyms_count                       -0.381417
rel_frequency * rel_letters_count                         -0.056126
rel_frequency * rel_orthographic_density                   0.099822
rel_frequency * rel_synonyms_count                        -0.036962
rel_letters_count * rel_orthographic_density               0.219973
rel_letters_count * rel_synonyms_count                    -0.019911
rel_orthographic_density * rel_synonyms_count             -0.025891
dtype: float64

Regressing rel clustering with 480 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.20624901182248226

intercept                     -1.703458
global_aoa                    -0.038274
global_clustering             -0.590809
global_frequency              -0.092268
global_letters_count           0.008427
global_orthographic_density   -0.205833
global_synonyms_count         -0.109906
rel_aoa                        0.017458
rel_clustering                 0.810717
rel_frequency                  0.045181
rel_letters_count             -0.001258
rel_orthographic_density       0.193970
rel_synonyms_count             0.100391
dtype: float64

Regressing rel clustering with 480 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.35824184830492634

intercept                                                 13.858931
global_aoa                                                 0.412153
global_clustering                                          0.606960
global_frequency                                          -1.628317
global_letters_count                                      -1.155396
global_orthographic_density                               -2.719737
global_synonyms_count                                     -2.648855
rel_aoa                                                   -0.178422
rel_clustering                                            -1.004331
rel_frequency                                              0.761827
rel_letters_count                                          0.537342
rel_orthographic_density                                   1.699760
rel_synonyms_count                                         1.958233
global_aoa * global_clustering                             0.040071
global_aoa * global_frequency                              0.009372
global_aoa * global_letters_count                         -0.034339
global_aoa * global_orthographic_density                  -0.122996
global_aoa * global_synonyms_count                         0.136061
global_aoa * rel_aoa                                       0.011349
global_aoa * rel_clustering                               -0.009594
global_aoa * rel_frequency                                -0.005385
global_aoa * rel_letters_count                             0.040105
global_aoa * rel_orthographic_density                      0.067635
global_aoa * rel_synonyms_count                           -0.144405
global_clustering * global_frequency                      -0.123816
global_clustering * global_letters_count                  -0.093780
global_clustering * global_orthographic_density            0.123363
global_clustering * global_synonyms_count                 -0.306753
global_clustering * rel_aoa                               -0.076469
global_clustering * rel_clustering                        -0.068732
global_clustering * rel_frequency                          0.073929
global_clustering * rel_letters_count                      0.067429
global_clustering * rel_orthographic_density              -0.244982
global_clustering * rel_synonyms_count                     0.466027
global_frequency * global_letters_count                    0.035688
global_frequency * global_orthographic_density             0.288269
global_frequency * global_synonyms_count                   0.135134
global_frequency * rel_aoa                                -0.028230
global_frequency * rel_clustering                          0.078796
global_frequency * rel_frequency                           0.021402
global_frequency * rel_letters_count                      -0.000952
global_frequency * rel_orthographic_density               -0.205062
global_frequency * rel_synonyms_count                      0.057315
global_letters_count * global_orthographic_density         0.226755
global_letters_count * global_synonyms_count              -0.095940
global_letters_count * rel_aoa                            -0.014297
global_letters_count * rel_clustering                      0.141608
global_letters_count * rel_frequency                       0.005133
global_letters_count * rel_letters_count                   0.002801
global_letters_count * rel_orthographic_density           -0.255270
global_letters_count * rel_synonyms_count                  0.201992
global_orthographic_density * global_synonyms_count       -0.454439
global_orthographic_density * rel_aoa                      0.061719
global_orthographic_density * rel_clustering              -0.006459
global_orthographic_density * rel_frequency               -0.183299
global_orthographic_density * rel_letters_count           -0.177438
global_orthographic_density * rel_orthographic_density    -0.050779
global_orthographic_density * rel_synonyms_count           0.041502
global_synonyms_count * rel_aoa                           -0.122868
global_synonyms_count * rel_clustering                     0.161081
global_synonyms_count * rel_frequency                     -0.084557
global_synonyms_count * rel_letters_count                 -0.126512
global_synonyms_count * rel_orthographic_density           0.155278
global_synonyms_count * rel_synonyms_count                -0.081031
rel_aoa * rel_clustering                                   0.080077
rel_aoa * rel_frequency                                    0.031919
rel_aoa * rel_letters_count                               -0.023052
rel_aoa * rel_orthographic_density                        -0.087013
rel_aoa * rel_synonyms_count                               0.100730
rel_clustering * rel_frequency                            -0.095943
rel_clustering * rel_letters_count                        -0.105408
rel_clustering * rel_orthographic_density                  0.175088
rel_clustering * rel_synonyms_count                       -0.345496
rel_frequency * rel_letters_count                         -0.030065
rel_frequency * rel_orthographic_density                   0.146069
rel_frequency * rel_synonyms_count                        -0.058125
rel_letters_count * rel_orthographic_density               0.206283
rel_letters_count * rel_synonyms_count                     0.036969
rel_orthographic_density * rel_synonyms_count              0.184175
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10140550578569296

intercept                      5.207477
global_aoa                    -0.044423
global_clustering              0.030516
global_frequency              -0.004609
global_letters_count           0.296838
global_orthographic_density   -0.142730
global_synonyms_count         -0.301903
dtype: float64

Regressing global letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13291296285889576

intercept                                             -0.267476
global_aoa                                            -0.403496
global_clustering                                     -2.476731
global_frequency                                      -0.397672
global_letters_count                                   0.718812
global_orthographic_density                           -1.240769
global_synonyms_count                                  3.343434
global_aoa * global_clustering                         0.027851
global_aoa * global_frequency                          0.062711
global_aoa * global_letters_count                      0.000773
global_aoa * global_orthographic_density              -0.033995
global_aoa * global_synonyms_count                     0.024374
global_clustering * global_frequency                   0.117003
global_clustering * global_letters_count               0.145983
global_clustering * global_orthographic_density        0.108924
global_clustering * global_synonyms_count              0.423027
global_frequency * global_letters_count                0.049505
global_frequency * global_orthographic_density         0.228253
global_frequency * global_synonyms_count               0.057935
global_letters_count * global_orthographic_density    -0.005090
global_letters_count * global_synonyms_count          -0.214133
global_orthographic_density * global_synonyms_count   -0.439871
dtype: float64

Regressing rel letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04933875907092766

intercept                      2.346035
global_aoa                    -0.073588
global_clustering              0.017261
global_frequency              -0.041978
global_letters_count           0.186225
global_orthographic_density   -0.135384
global_synonyms_count         -0.326478
dtype: float64

Regressing rel letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0866474910030881

intercept                                             -3.551554
global_aoa                                             0.129032
global_clustering                                     -2.621013
global_frequency                                      -0.698982
global_letters_count                                   0.416312
global_orthographic_density                           -1.177127
global_synonyms_count                                  2.391792
global_aoa * global_clustering                         0.105207
global_aoa * global_frequency                          0.078317
global_aoa * global_letters_count                     -0.033039
global_aoa * global_orthographic_density              -0.080827
global_aoa * global_synonyms_count                     0.015528
global_clustering * global_frequency                   0.119500
global_clustering * global_letters_count               0.087493
global_clustering * global_orthographic_density        0.117175
global_clustering * global_synonyms_count              0.193740
global_frequency * global_letters_count                0.066278
global_frequency * global_orthographic_density         0.284255
global_frequency * global_synonyms_count               0.039935
global_letters_count * global_orthographic_density    -0.040184
global_letters_count * global_synonyms_count          -0.236555
global_orthographic_density * global_synonyms_count   -0.446662
dtype: float64

Regressing global letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09076565249517776

intercept                   5.695140
rel_aoa                    -0.106081
rel_clustering              0.226854
rel_frequency               0.095422
rel_letters_count           0.224493
rel_orthographic_density   -0.277285
rel_synonyms_count         -0.388092
dtype: float64

Regressing global letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10616809527308901

intercept                                        5.705630
rel_aoa                                         -0.224098
rel_clustering                                   0.067912
rel_frequency                                    0.136133
rel_letters_count                                0.341911
rel_orthographic_density                        -0.152781
rel_synonyms_count                              -0.447878
rel_aoa * rel_clustering                        -0.046643
rel_aoa * rel_frequency                         -0.039545
rel_aoa * rel_letters_count                      0.007874
rel_aoa * rel_orthographic_density              -0.026185
rel_aoa * rel_synonyms_count                    -0.011603
rel_clustering * rel_frequency                  -0.015687
rel_clustering * rel_letters_count               0.090968
rel_clustering * rel_orthographic_density        0.099434
rel_clustering * rel_synonyms_count              0.337968
rel_frequency * rel_letters_count                0.034357
rel_frequency * rel_orthographic_density         0.099681
rel_frequency * rel_synonyms_count              -0.002447
rel_letters_count * rel_orthographic_density     0.057151
rel_letters_count * rel_synonyms_count          -0.090361
rel_orthographic_density * rel_synonyms_count   -0.116509
dtype: float64

Regressing rel letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1377012901434429

intercept                   1.190503
rel_aoa                    -0.068704
rel_clustering              0.026649
rel_frequency              -0.131049
rel_letters_count           0.362286
rel_orthographic_density   -0.011029
rel_synonyms_count         -0.367829
dtype: float64

Regressing rel letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1575597230697552

intercept                                        1.226088
rel_aoa                                         -0.078604
rel_clustering                                  -0.024594
rel_frequency                                   -0.115712
rel_letters_count                                0.510258
rel_orthographic_density                         0.223829
rel_synonyms_count                              -0.315804
rel_aoa * rel_clustering                         0.000442
rel_aoa * rel_frequency                          0.008484
rel_aoa * rel_letters_count                     -0.038371
rel_aoa * rel_orthographic_density              -0.120400
rel_aoa * rel_synonyms_count                    -0.025449
rel_clustering * rel_frequency                   0.038101
rel_clustering * rel_letters_count               0.142419
rel_clustering * rel_orthographic_density        0.212351
rel_clustering * rel_synonyms_count              0.207785
rel_frequency * rel_letters_count                0.044410
rel_frequency * rel_orthographic_density         0.152192
rel_frequency * rel_synonyms_count               0.032468
rel_letters_count * rel_orthographic_density     0.065623
rel_letters_count * rel_synonyms_count          -0.130222
rel_orthographic_density * rel_synonyms_count   -0.282950
dtype: float64

Regressing global letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11823096242049014

intercept                      1.815260
global_aoa                     0.035207
global_clustering             -0.414635
global_frequency              -0.064337
global_letters_count           0.363692
global_orthographic_density    0.053356
global_synonyms_count          0.283844
rel_aoa                       -0.118216
rel_clustering                 0.492924
rel_frequency                  0.048537
rel_letters_count             -0.087398
rel_orthographic_density      -0.202726
rel_synonyms_count            -0.651815
dtype: float64

Regressing global letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.23205965781012194

intercept                                                -8.319126
global_aoa                                                0.355799
global_clustering                                        -1.330943
global_frequency                                          2.705233
global_letters_count                                     -2.448969
global_orthographic_density                              -4.802973
global_synonyms_count                                     5.336947
rel_aoa                                                  -3.834774
rel_clustering                                           -0.230172
rel_frequency                                            -0.288210
rel_letters_count                                         4.336382
rel_orthographic_density                                 -0.747280
rel_synonyms_count                                       -1.879628
global_aoa * global_clustering                            0.304699
global_aoa * global_frequency                             0.107401
global_aoa * global_letters_count                         0.042071
global_aoa * global_orthographic_density                  0.101521
global_aoa * global_synonyms_count                        0.088492
global_aoa * rel_aoa                                      0.025660
global_aoa * rel_clustering                              -0.326778
global_aoa * rel_frequency                               -0.091970
global_aoa * rel_letters_count                           -0.092436
global_aoa * rel_orthographic_density                    -0.112959
global_aoa * rel_synonyms_count                          -0.119434
global_clustering * global_frequency                      0.403283
global_clustering * global_letters_count                 -0.544287
global_clustering * global_orthographic_density          -1.266412
global_clustering * global_synonyms_count                -0.737570
global_clustering * rel_aoa                              -0.278529
global_clustering * rel_clustering                       -0.004418
global_clustering * rel_frequency                         0.001116
global_clustering * rel_letters_count                     0.518051
global_clustering * rel_orthographic_density              0.836012
global_clustering * rel_synonyms_count                    0.664372
global_frequency * global_letters_count                  -0.013160
global_frequency * global_orthographic_density           -0.198767
global_frequency * global_synonyms_count                 -0.479060
global_frequency * rel_aoa                                0.167114
global_frequency * rel_clustering                        -0.244752
global_frequency * rel_frequency                          0.010996
global_frequency * rel_letters_count                     -0.094578
global_frequency * rel_orthographic_density               0.489664
global_frequency * rel_synonyms_count                     0.523045
global_letters_count * global_orthographic_density       -0.225359
global_letters_count * global_synonyms_count             -0.506692
global_letters_count * rel_aoa                            0.058897
global_letters_count * rel_clustering                     0.744763
global_letters_count * rel_frequency                      0.155163
global_letters_count * rel_letters_count                  0.018888
global_letters_count * rel_orthographic_density           0.158938
global_letters_count * rel_synonyms_count                 0.206334
global_orthographic_density * global_synonyms_count      -1.121864
global_orthographic_density * rel_aoa                    -0.164585
global_orthographic_density * rel_clustering              0.803633
global_orthographic_density * rel_frequency              -0.037424
global_orthographic_density * rel_letters_count           0.067652
global_orthographic_density * rel_orthographic_density   -0.021931
global_orthographic_density * rel_synonyms_count         -0.375028
global_synonyms_count * rel_aoa                           0.409312
global_synonyms_count * rel_clustering                    1.142395
global_synonyms_count * rel_frequency                     0.355588
global_synonyms_count * rel_letters_count                 0.310723
global_synonyms_count * rel_orthographic_density          1.573984
global_synonyms_count * rel_synonyms_count               -0.238368
rel_aoa * rel_clustering                                  0.298095
rel_aoa * rel_frequency                                  -0.150197
rel_aoa * rel_letters_count                              -0.066700
rel_aoa * rel_orthographic_density                        0.134124
rel_aoa * rel_synonyms_count                             -0.321249
rel_clustering * rel_frequency                           -0.049789
rel_clustering * rel_letters_count                       -0.479106
rel_clustering * rel_orthographic_density                -0.083169
rel_clustering * rel_synonyms_count                      -0.666639
rel_frequency * rel_letters_count                        -0.007168
rel_frequency * rel_orthographic_density                  0.006338
rel_frequency * rel_synonyms_count                       -0.423505
rel_letters_count * rel_orthographic_density             -0.013042
rel_letters_count * rel_synonyms_count                   -0.326014
rel_orthographic_density * rel_synonyms_count            -0.500637
dtype: float64

Regressing rel letters_count with 599 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18308957944864943

intercept                      1.064841
global_aoa                    -0.003168
global_clustering             -0.353389
global_frequency              -0.005660
global_letters_count          -0.465103
global_orthographic_density    0.018866
global_synonyms_count          0.287001
rel_aoa                       -0.067868
rel_clustering                 0.428417
rel_frequency                 -0.021287
rel_letters_count              0.755463
rel_orthographic_density      -0.187386
rel_synonyms_count            -0.627265
dtype: float64

Regressing rel letters_count with 599 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.28551916782407993

intercept                                                -6.275177
global_aoa                                                0.028154
global_clustering                                        -2.955296
global_frequency                                          1.740740
global_letters_count                                     -3.200523
global_orthographic_density                              -6.715714
global_synonyms_count                                     5.060965
rel_aoa                                                  -3.098215
rel_clustering                                            0.196732
rel_frequency                                             0.073903
rel_letters_count                                         4.604833
rel_orthographic_density                                 -0.539933
rel_synonyms_count                                       -1.517512
global_aoa * global_clustering                            0.283433
global_aoa * global_frequency                             0.122290
global_aoa * global_letters_count                         0.052952
global_aoa * global_orthographic_density                  0.070016
global_aoa * global_synonyms_count                       -0.003577
global_aoa * rel_aoa                                      0.023338
global_aoa * rel_clustering                              -0.271635
global_aoa * rel_frequency                               -0.121701
global_aoa * rel_letters_count                           -0.103570
global_aoa * rel_orthographic_density                    -0.081579
global_aoa * rel_synonyms_count                          -0.023869
global_clustering * global_frequency                      0.417429
global_clustering * global_letters_count                 -0.358061
global_clustering * global_orthographic_density          -0.968181
global_clustering * global_synonyms_count                -0.297084
global_clustering * rel_aoa                              -0.261102
global_clustering * rel_clustering                       -0.100728
global_clustering * rel_frequency                        -0.058816
global_clustering * rel_letters_count                     0.261342
global_clustering * rel_orthographic_density              0.355095
global_clustering * rel_synonyms_count                   -0.072617
global_frequency * global_letters_count                   0.064410
global_frequency * global_orthographic_density            0.110327
global_frequency * global_synonyms_count                 -0.230296
global_frequency * rel_aoa                                0.134933
global_frequency * rel_clustering                        -0.260779
global_frequency * rel_frequency                          0.021896
global_frequency * rel_letters_count                     -0.164932
global_frequency * rel_orthographic_density               0.234206
global_frequency * rel_synonyms_count                     0.116692
global_letters_count * global_orthographic_density       -0.137766
global_letters_count * global_synonyms_count             -0.401870
global_letters_count * rel_aoa                            0.035250
global_letters_count * rel_clustering                     0.604633
global_letters_count * rel_frequency                      0.116257
global_letters_count * rel_letters_count                  0.003226
global_letters_count * rel_orthographic_density           0.074742
global_letters_count * rel_synonyms_count                 0.007601
global_orthographic_density * global_synonyms_count      -1.042043
global_orthographic_density * rel_aoa                    -0.181135
global_orthographic_density * rel_clustering              0.591467
global_orthographic_density * rel_frequency              -0.276451
global_orthographic_density * rel_letters_count           0.039274
global_orthographic_density * rel_orthographic_density   -0.020964
global_orthographic_density * rel_synonyms_count         -0.319083
global_synonyms_count * rel_aoa                           0.410925
global_synonyms_count * rel_clustering                    0.728668
global_synonyms_count * rel_frequency                     0.155952
global_synonyms_count * rel_letters_count                 0.294618
global_synonyms_count * rel_orthographic_density          1.437643
global_synonyms_count * rel_synonyms_count               -0.243237
rel_aoa * rel_clustering                                  0.253763
rel_aoa * rel_frequency                                  -0.091540
rel_aoa * rel_letters_count                              -0.032394
rel_aoa * rel_orthographic_density                        0.175977
rel_aoa * rel_synonyms_count                             -0.297558
rel_clustering * rel_frequency                           -0.029197
rel_clustering * rel_letters_count                       -0.283978
rel_clustering * rel_orthographic_density                 0.263935
rel_clustering * rel_synonyms_count                      -0.103288
rel_frequency * rel_letters_count                         0.033294
rel_frequency * rel_orthographic_density                  0.200890
rel_frequency * rel_synonyms_count                       -0.096698
rel_letters_count * rel_orthographic_density              0.004975
rel_letters_count * rel_synonyms_count                   -0.218872
rel_orthographic_density * rel_synonyms_count            -0.462401
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.061638592145976556

intercept                      0.797468
global_aoa                    -0.008650
global_clustering              0.013142
global_frequency              -0.016586
global_letters_count          -0.032128
global_orthographic_density   -0.007856
global_synonyms_count          0.207167
dtype: float64

Regressing global synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09125671306009964

intercept                                              3.216410
global_aoa                                            -0.053755
global_clustering                                      0.400135
global_frequency                                      -0.065054
global_letters_count                                  -0.209968
global_orthographic_density                           -0.704577
global_synonyms_count                                  0.208110
global_aoa * global_clustering                        -0.025664
global_aoa * global_frequency                         -0.018660
global_aoa * global_letters_count                      0.005194
global_aoa * global_orthographic_density               0.021693
global_aoa * global_synonyms_count                     0.034086
global_clustering * global_frequency                  -0.014375
global_clustering * global_letters_count              -0.001798
global_clustering * global_orthographic_density       -0.079971
global_clustering * global_synonyms_count              0.116450
global_frequency * global_letters_count                0.014616
global_frequency * global_orthographic_density         0.010182
global_frequency * global_synonyms_count               0.015522
global_letters_count * global_orthographic_density    -0.010119
global_letters_count * global_synonyms_count           0.024756
global_orthographic_density * global_synonyms_count    0.112789
dtype: float64

Regressing rel synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.04640168770906606

intercept                      0.572938
global_aoa                    -0.018069
global_clustering              0.006769
global_frequency              -0.018007
global_letters_count          -0.031258
global_orthographic_density   -0.026392
global_synonyms_count          0.161038
dtype: float64

Regressing rel synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07852363194055056

intercept                                              5.322238
global_aoa                                            -0.133026
global_clustering                                      0.650421
global_frequency                                      -0.314382
global_letters_count                                  -0.294909
global_orthographic_density                           -0.821819
global_synonyms_count                                  0.415586
global_aoa * global_clustering                        -0.023595
global_aoa * global_frequency                         -0.007574
global_aoa * global_letters_count                      0.004242
global_aoa * global_orthographic_density               0.019157
global_aoa * global_synonyms_count                     0.016307
global_clustering * global_frequency                  -0.038536
global_clustering * global_letters_count              -0.014322
global_clustering * global_orthographic_density       -0.066843
global_clustering * global_synonyms_count              0.116987
global_frequency * global_letters_count                0.016565
global_frequency * global_orthographic_density         0.031487
global_frequency * global_synonyms_count               0.003364
global_letters_count * global_orthographic_density    -0.007374
global_letters_count * global_synonyms_count           0.025960
global_orthographic_density * global_synonyms_count    0.095025
dtype: float64

Regressing global synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06477435510584417

intercept                   0.422224
rel_aoa                     0.030386
rel_clustering             -0.054999
rel_frequency              -0.016643
rel_letters_count          -0.035395
rel_orthographic_density    0.018167
rel_synonyms_count          0.211762
dtype: float64

Regressing global synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11104745362682

intercept                                        0.515761
rel_aoa                                          0.037071
rel_clustering                                  -0.203450
rel_frequency                                    0.022477
rel_letters_count                               -0.085012
rel_orthographic_density                         0.053456
rel_synonyms_count                               0.222772
rel_aoa * rel_clustering                        -0.034539
rel_aoa * rel_frequency                         -0.003263
rel_aoa * rel_letters_count                      0.014058
rel_aoa * rel_orthographic_density               0.027599
rel_aoa * rel_synonyms_count                     0.036819
rel_clustering * rel_frequency                  -0.036598
rel_clustering * rel_letters_count              -0.012946
rel_clustering * rel_orthographic_density       -0.106112
rel_clustering * rel_synonyms_count              0.132183
rel_frequency * rel_letters_count               -0.009900
rel_frequency * rel_orthographic_density        -0.001474
rel_frequency * rel_synonyms_count               0.020302
rel_letters_count * rel_orthographic_density    -0.014070
rel_letters_count * rel_synonyms_count           0.024705
rel_orthographic_density * rel_synonyms_count    0.099869
dtype: float64

Regressing rel synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.12802958104345363

intercept                   0.091023
rel_aoa                     0.006130
rel_clustering             -0.025370
rel_frequency              -0.014525
rel_letters_count          -0.033452
rel_orthographic_density   -0.015443
rel_synonyms_count          0.341603
dtype: float64

Regressing rel synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.1822190044132359

intercept                                        0.200229
rel_aoa                                          0.013675
rel_clustering                                  -0.188309
rel_frequency                                    0.035395
rel_letters_count                               -0.077848
rel_orthographic_density                         0.051253
rel_synonyms_count                               0.405250
rel_aoa * rel_clustering                        -0.020675
rel_aoa * rel_frequency                          0.000833
rel_aoa * rel_letters_count                      0.012446
rel_aoa * rel_orthographic_density               0.020090
rel_aoa * rel_synonyms_count                     0.021877
rel_clustering * rel_frequency                  -0.050513
rel_clustering * rel_letters_count              -0.016222
rel_clustering * rel_orthographic_density       -0.085381
rel_clustering * rel_synonyms_count              0.113398
rel_frequency * rel_letters_count               -0.008323
rel_frequency * rel_orthographic_density         0.012396
rel_frequency * rel_synonyms_count               0.021654
rel_letters_count * rel_orthographic_density    -0.013846
rel_letters_count * rel_synonyms_count           0.036392
rel_orthographic_density * rel_synonyms_count    0.152859
dtype: float64

Regressing global synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08806930993562156

intercept                      2.057769
global_aoa                    -0.051611
global_clustering              0.177366
global_frequency              -0.011505
global_letters_count          -0.008352
global_orthographic_density   -0.052658
global_synonyms_count          0.090615
rel_aoa                        0.058568
rel_clustering                -0.180286
rel_frequency                 -0.001388
rel_letters_count             -0.023954
rel_orthographic_density       0.043126
rel_synonyms_count             0.119230
dtype: float64

Regressing global synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.22850932022224613

intercept                                                -1.525263
global_aoa                                                1.063513
global_clustering                                         1.072051
global_frequency                                         -0.205933
global_letters_count                                      0.558104
global_orthographic_density                               2.988369
global_synonyms_count                                     1.647992
rel_aoa                                                  -0.114412
rel_clustering                                           -2.424090
rel_frequency                                            -0.162823
rel_letters_count                                        -0.779442
rel_orthographic_density                                 -0.700447
rel_synonyms_count                                       -2.937510
global_aoa * global_clustering                           -0.020195
global_aoa * global_frequency                            -0.067623
global_aoa * global_letters_count                        -0.065774
global_aoa * global_orthographic_density                 -0.123795
global_aoa * global_synonyms_count                        0.122077
global_aoa * rel_aoa                                     -0.006402
global_aoa * rel_clustering                               0.048009
global_aoa * rel_frequency                                0.045884
global_aoa * rel_letters_count                            0.058595
global_aoa * rel_orthographic_density                     0.118159
global_aoa * rel_synonyms_count                          -0.044995
global_clustering * global_frequency                     -0.105143
global_clustering * global_letters_count                 -0.009783
global_clustering * global_orthographic_density           0.187095
global_clustering * global_synonyms_count                 0.399689
global_clustering * rel_aoa                              -0.002046
global_clustering * rel_clustering                        0.055196
global_clustering * rel_frequency                         0.080072
global_clustering * rel_letters_count                     0.017471
global_clustering * rel_orthographic_density             -0.020920
global_clustering * rel_synonyms_count                   -0.441591
global_frequency * global_letters_count                  -0.011291
global_frequency * global_orthographic_density           -0.074300
global_frequency * global_synonyms_count                  0.189499
global_frequency * rel_aoa                                0.004904
global_frequency * rel_clustering                         0.205084
global_frequency * rel_frequency                         -0.002828
global_frequency * rel_letters_count                      0.012267
global_frequency * rel_orthographic_density              -0.023676
global_frequency * rel_synonyms_count                    -0.132860
global_letters_count * global_orthographic_density       -0.016306
global_letters_count * global_synonyms_count             -0.159569
global_letters_count * rel_aoa                            0.024424
global_letters_count * rel_clustering                     0.042184
global_letters_count * rel_frequency                      0.038641
global_letters_count * rel_letters_count                  0.006375
global_letters_count * rel_orthographic_density          -0.027640
global_letters_count * rel_synonyms_count                 0.205169
global_orthographic_density * global_synonyms_count      -0.452978
global_orthographic_density * rel_aoa                     0.027880
global_orthographic_density * rel_clustering             -0.156552
global_orthographic_density * rel_frequency               0.159399
global_orthographic_density * rel_letters_count           0.148688
global_orthographic_density * rel_orthographic_density    0.075218
global_orthographic_density * rel_synonyms_count          0.429885
global_synonyms_count * rel_aoa                          -0.105254
global_synonyms_count * rel_clustering                   -0.146182
global_synonyms_count * rel_frequency                     0.028652
global_synonyms_count * rel_letters_count                 0.114582
global_synonyms_count * rel_orthographic_density          0.343191
global_synonyms_count * rel_synonyms_count                0.134944
rel_aoa * rel_clustering                                 -0.058315
rel_aoa * rel_frequency                                  -0.009340
rel_aoa * rel_letters_count                               0.001403
rel_aoa * rel_orthographic_density                       -0.001462
rel_aoa * rel_synonyms_count                              0.062508
rel_clustering * rel_frequency                           -0.182294
rel_clustering * rel_letters_count                       -0.089616
rel_clustering * rel_orthographic_density                -0.145280
rel_clustering * rel_synonyms_count                       0.272765
rel_frequency * rel_letters_count                        -0.041050
rel_frequency * rel_orthographic_density                 -0.084757
rel_frequency * rel_synonyms_count                       -0.051208
rel_letters_count * rel_orthographic_density             -0.059005
rel_letters_count * rel_synonyms_count                   -0.094908
rel_orthographic_density * rel_synonyms_count            -0.177008
dtype: float64

Regressing rel synonyms_count with 581 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21538704951245402

intercept                      1.472104
global_aoa                    -0.043524
global_clustering              0.134177
global_frequency              -0.004958
global_letters_count          -0.000460
global_orthographic_density   -0.022475
global_synonyms_count         -0.624322
rel_aoa                        0.042530
rel_clustering                -0.142123
rel_frequency                 -0.003993
rel_letters_count             -0.029174
rel_orthographic_density      -0.003581
rel_synonyms_count             0.918486
dtype: float64

Regressing rel synonyms_count with 581 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.35388558278644666

intercept                                                -6.979520
global_aoa                                                0.887398
global_clustering                                         0.272307
global_frequency                                          0.113327
global_letters_count                                      0.721167
global_orthographic_density                               3.629692
global_synonyms_count                                     2.074374
rel_aoa                                                   0.074200
rel_clustering                                           -1.544355
rel_frequency                                            -0.536798
rel_letters_count                                        -0.964941
rel_orthographic_density                                 -1.541725
rel_synonyms_count                                       -3.625074
global_aoa * global_clustering                           -0.018743
global_aoa * global_frequency                            -0.049965
global_aoa * global_letters_count                        -0.057412
global_aoa * global_orthographic_density                 -0.134163
global_aoa * global_synonyms_count                        0.096958
global_aoa * rel_aoa                                     -0.005037
global_aoa * rel_clustering                               0.051291
global_aoa * rel_frequency                                0.037842
global_aoa * rel_letters_count                            0.052928
global_aoa * rel_orthographic_density                     0.126154
global_aoa * rel_synonyms_count                          -0.015046
global_clustering * global_frequency                     -0.054174
global_clustering * global_letters_count                  0.010715
global_clustering * global_orthographic_density           0.216726
global_clustering * global_synonyms_count                 0.401501
global_clustering * rel_aoa                               0.004599
global_clustering * rel_clustering                        0.040956
global_clustering * rel_frequency                         0.014602
global_clustering * rel_letters_count                     0.005198
global_clustering * rel_orthographic_density             -0.014698
global_clustering * rel_synonyms_count                   -0.427788
global_frequency * global_letters_count                  -0.013791
global_frequency * global_orthographic_density           -0.098611
global_frequency * global_synonyms_count                  0.084632
global_frequency * rel_aoa                               -0.009755
global_frequency * rel_clustering                         0.127834
global_frequency * rel_frequency                         -0.006417
global_frequency * rel_letters_count                      0.022877
global_frequency * rel_orthographic_density               0.032824
global_frequency * rel_synonyms_count                     0.002082
global_letters_count * global_orthographic_density       -0.040684
global_letters_count * global_synonyms_count             -0.136362
global_letters_count * rel_aoa                            0.012818
global_letters_count * rel_clustering                     0.020769
global_letters_count * rel_frequency                      0.034113
global_letters_count * rel_letters_count                  0.006442
global_letters_count * rel_orthographic_density           0.014095
global_letters_count * rel_synonyms_count                 0.191475
global_orthographic_density * global_synonyms_count      -0.408251
global_orthographic_density * rel_aoa                     0.037817
global_orthographic_density * rel_clustering             -0.138092
global_orthographic_density * rel_frequency               0.176978
global_orthographic_density * rel_letters_count           0.157323
global_orthographic_density * rel_orthographic_density    0.065854
global_orthographic_density * rel_synonyms_count          0.409892
global_synonyms_count * rel_aoa                          -0.072529
global_synonyms_count * rel_clustering                   -0.159070
global_synonyms_count * rel_frequency                     0.119308
global_synonyms_count * rel_letters_count                 0.119379
global_synonyms_count * rel_orthographic_density          0.358277
global_synonyms_count * rel_synonyms_count                0.157372
rel_aoa * rel_clustering                                 -0.059282
rel_aoa * rel_frequency                                  -0.000645
rel_aoa * rel_letters_count                               0.003638
rel_aoa * rel_orthographic_density                       -0.023658
rel_aoa * rel_synonyms_count                              0.021034
rel_clustering * rel_frequency                           -0.109650
rel_clustering * rel_letters_count                       -0.082307
rel_clustering * rel_orthographic_density                -0.172391
rel_clustering * rel_synonyms_count                       0.244973
rel_frequency * rel_letters_count                        -0.045645
rel_frequency * rel_orthographic_density                 -0.122505
rel_frequency * rel_synonyms_count                       -0.165951
rel_letters_count * rel_orthographic_density             -0.084788
rel_letters_count * rel_synonyms_count                   -0.110533
rel_orthographic_density * rel_synonyms_count            -0.215845
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1060397021678805

intercept                      1.190554
global_aoa                    -0.016494
global_clustering             -0.003055
global_frequency               0.008178
global_letters_count          -0.029961
global_orthographic_density    0.254983
global_synonyms_count          0.059025
dtype: float64

Regressing global orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14463023614432746

intercept                                             -0.968035
global_aoa                                             0.237889
global_clustering                                      0.141653
global_frequency                                       0.394145
global_letters_count                                  -0.202274
global_orthographic_density                            1.079873
global_synonyms_count                                  0.193987
global_aoa * global_clustering                        -0.017314
global_aoa * global_frequency                         -0.042657
global_aoa * global_letters_count                      0.003018
global_aoa * global_orthographic_density               0.033056
global_aoa * global_synonyms_count                    -0.085574
global_clustering * global_frequency                   0.007333
global_clustering * global_letters_count              -0.032921
global_clustering * global_orthographic_density        0.067408
global_clustering * global_synonyms_count              0.058936
global_frequency * global_letters_count               -0.003163
global_frequency * global_orthographic_density        -0.058962
global_frequency * global_synonyms_count               0.075708
global_letters_count * global_orthographic_density    -0.011933
global_letters_count * global_synonyms_count           0.027806
global_orthographic_density * global_synonyms_count   -0.086091
dtype: float64

Regressing rel orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09087031987197292

intercept                     -1.213554
global_aoa                    -0.002615
global_clustering             -0.025710
global_frequency               0.013832
global_letters_count          -0.037544
global_orthographic_density    0.208834
global_synonyms_count          0.092616
dtype: float64

Regressing rel orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12879548558347875

intercept                                             -2.044258
global_aoa                                             0.106371
global_clustering                                      0.253472
global_frequency                                       0.299320
global_letters_count                                  -0.211633
global_orthographic_density                            1.077423
global_synonyms_count                                 -0.012590
global_aoa * global_clustering                        -0.022491
global_aoa * global_frequency                         -0.035137
global_aoa * global_letters_count                      0.010090
global_aoa * global_orthographic_density               0.034043
global_aoa * global_synonyms_count                    -0.065581
global_clustering * global_frequency                  -0.008215
global_clustering * global_letters_count              -0.029893
global_clustering * global_orthographic_density        0.093527
global_clustering * global_synonyms_count              0.026925
global_frequency * global_letters_count               -0.008489
global_frequency * global_orthographic_density        -0.052476
global_frequency * global_synonyms_count               0.042131
global_letters_count * global_orthographic_density    -0.007302
global_letters_count * global_synonyms_count           0.058446
global_orthographic_density * global_synonyms_count   -0.050968
dtype: float64

Regressing global orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08454046229027656

intercept                   1.607495
rel_aoa                     0.011858
rel_clustering             -0.082533
rel_frequency              -0.008796
rel_letters_count           0.008311
rel_orthographic_density    0.301290
rel_synonyms_count          0.136240
dtype: float64

Regressing global orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11260359647507134

intercept                                        1.597849
rel_aoa                                          0.099756
rel_clustering                                   0.058448
rel_frequency                                   -0.032886
rel_letters_count                               -0.016220
rel_orthographic_density                         0.286214
rel_synonyms_count                               0.253158
rel_aoa * rel_clustering                         0.028449
rel_aoa * rel_frequency                          0.021989
rel_aoa * rel_letters_count                     -0.007090
rel_aoa * rel_orthographic_density               0.022501
rel_aoa * rel_synonyms_count                    -0.062153
rel_clustering * rel_frequency                   0.006189
rel_clustering * rel_letters_count              -0.001243
rel_clustering * rel_orthographic_density        0.163078
rel_clustering * rel_synonyms_count             -0.023557
rel_frequency * rel_letters_count                0.002483
rel_frequency * rel_orthographic_density        -0.002084
rel_frequency * rel_synonyms_count               0.045931
rel_letters_count * rel_orthographic_density    -0.027951
rel_letters_count * rel_synonyms_count           0.025292
rel_orthographic_density * rel_synonyms_count   -0.019284
dtype: float64

Regressing rel orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15002197023988628

intercept                  -0.450579
rel_aoa                     0.018079
rel_clustering             -0.067683
rel_frequency               0.040373
rel_letters_count           0.010039
rel_orthographic_density    0.374944
rel_synonyms_count          0.119695
dtype: float64

Regressing rel orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1675317028938912

intercept                                       -0.397584
rel_aoa                                          0.082486
rel_clustering                                  -0.003808
rel_frequency                                    0.057852
rel_letters_count                               -0.021183
rel_orthographic_density                         0.381010
rel_synonyms_count                               0.127523
rel_aoa * rel_clustering                         0.005391
rel_aoa * rel_frequency                          0.007635
rel_aoa * rel_letters_count                     -0.003014
rel_aoa * rel_orthographic_density               0.035188
rel_aoa * rel_synonyms_count                    -0.048436
rel_clustering * rel_frequency                  -0.016936
rel_clustering * rel_letters_count               0.001037
rel_clustering * rel_orthographic_density        0.131210
rel_clustering * rel_synonyms_count              0.017664
rel_frequency * rel_letters_count               -0.004079
rel_frequency * rel_orthographic_density         0.008840
rel_frequency * rel_synonyms_count               0.024054
rel_letters_count * rel_orthographic_density    -0.020500
rel_letters_count * rel_synonyms_count           0.030514
rel_orthographic_density * rel_synonyms_count   -0.015384
dtype: float64

Regressing global orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1299821193052373

intercept                      3.875628
global_aoa                    -0.039235
global_clustering              0.197228
global_frequency              -0.050859
global_letters_count          -0.163440
global_orthographic_density    0.246389
global_synonyms_count         -0.238036
rel_aoa                        0.036337
rel_clustering                -0.216328
rel_frequency                  0.077634
rel_letters_count              0.159004
rel_orthographic_density       0.012363
rel_synonyms_count             0.354227
dtype: float64

Regressing global orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.25847387410293676

intercept                                                -5.864431
global_aoa                                                0.140517
global_clustering                                        -0.163183
global_frequency                                          0.620544
global_letters_count                                      0.602063
global_orthographic_density                               3.244603
global_synonyms_count                                    -1.397120
rel_aoa                                                  -0.009469
rel_clustering                                           -0.213508
rel_frequency                                            -0.748555
rel_letters_count                                        -0.946859
rel_orthographic_density                                 -2.847895
rel_synonyms_count                                        5.263848
global_aoa * global_clustering                           -0.127611
global_aoa * global_frequency                            -0.092839
global_aoa * global_letters_count                        -0.022988
global_aoa * global_orthographic_density                  0.028991
global_aoa * global_synonyms_count                       -0.049603
global_aoa * rel_aoa                                      0.002542
global_aoa * rel_clustering                               0.118834
global_aoa * rel_frequency                                0.027620
global_aoa * rel_letters_count                            0.057869
global_aoa * rel_orthographic_density                     0.026133
global_aoa * rel_synonyms_count                           0.022506
global_clustering * global_frequency                      0.018751
global_clustering * global_letters_count                  0.061966
global_clustering * global_orthographic_density           0.311119
global_clustering * global_synonyms_count                -0.210151
global_clustering * rel_aoa                              -0.066434
global_clustering * rel_clustering                        0.044291
global_clustering * rel_frequency                        -0.181758
global_clustering * rel_letters_count                     0.042608
global_clustering * rel_orthographic_density             -0.021538
global_clustering * rel_synonyms_count                    0.884465
global_frequency * global_letters_count                   0.014008
global_frequency * global_orthographic_density           -0.053450
global_frequency * global_synonyms_count                 -0.090885
global_frequency * rel_aoa                               -0.002027
global_frequency * rel_clustering                         0.094863
global_frequency * rel_frequency                          0.008002
global_frequency * rel_letters_count                      0.074380
global_frequency * rel_orthographic_density               0.108657
global_frequency * rel_synonyms_count                     0.176857
global_letters_count * global_orthographic_density       -0.161355
global_letters_count * global_synonyms_count              0.009820
global_letters_count * rel_aoa                           -0.030106
global_letters_count * rel_clustering                    -0.104091
global_letters_count * rel_frequency                     -0.094633
global_letters_count * rel_letters_count                 -0.011660
global_letters_count * rel_orthographic_density           0.280402
global_letters_count * rel_synonyms_count                -0.109576
global_orthographic_density * global_synonyms_count       0.434333
global_orthographic_density * rel_aoa                    -0.047768
global_orthographic_density * rel_clustering             -0.341677
global_orthographic_density * rel_frequency              -0.069023
global_orthographic_density * rel_letters_count           0.091502
global_orthographic_density * rel_orthographic_density    0.107733
global_orthographic_density * rel_synonyms_count         -0.416264
global_synonyms_count * rel_aoa                          -0.062931
global_synonyms_count * rel_clustering                   -0.524072
global_synonyms_count * rel_frequency                     0.069889
global_synonyms_count * rel_letters_count                 0.113034
global_synonyms_count * rel_orthographic_density         -0.714023
global_synonyms_count * rel_synonyms_count               -0.031921
rel_aoa * rel_clustering                                  0.041527
rel_aoa * rel_frequency                                   0.048608
rel_aoa * rel_letters_count                               0.007315
rel_aoa * rel_orthographic_density                        0.028146
rel_aoa * rel_synonyms_count                             -0.012845
rel_clustering * rel_frequency                            0.093394
rel_clustering * rel_letters_count                       -0.039257
rel_clustering * rel_orthographic_density                 0.154172
rel_clustering * rel_synonyms_count                      -0.066089
rel_frequency * rel_letters_count                        -0.000487
rel_frequency * rel_orthographic_density                 -0.056624
rel_frequency * rel_synonyms_count                       -0.074770
rel_letters_count * rel_orthographic_density             -0.178193
rel_letters_count * rel_synonyms_count                    0.028928
rel_orthographic_density * rel_synonyms_count             0.702728
dtype: float64

Regressing rel orthographic_density with 489 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17884359317634976

intercept                      2.812414
global_aoa                    -0.020679
global_clustering              0.206041
global_frequency              -0.029357
global_letters_count          -0.133197
global_orthographic_density   -0.443814
global_synonyms_count         -0.144354
rel_aoa                        0.022750
rel_clustering                -0.227586
rel_frequency                  0.067068
rel_letters_count              0.119866
rel_orthographic_density       0.755090
rel_synonyms_count             0.247161
dtype: float64

Regressing rel orthographic_density with 489 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.30222527446964076

intercept                                                 0.032672
global_aoa                                                0.363918
global_clustering                                         1.533494
global_frequency                                          0.149440
global_letters_count                                      0.303488
global_orthographic_density                               3.521605
global_synonyms_count                                    -2.415110
rel_aoa                                                  -0.445421
rel_clustering                                           -1.899637
rel_frequency                                            -0.560414
rel_letters_count                                        -0.525069
rel_orthographic_density                                 -2.688287
rel_synonyms_count                                        6.122144
global_aoa * global_clustering                           -0.098151
global_aoa * global_frequency                            -0.079849
global_aoa * global_letters_count                        -0.019982
global_aoa * global_orthographic_density                 -0.042773
global_aoa * global_synonyms_count                       -0.028516
global_aoa * rel_aoa                                      0.005410
global_aoa * rel_clustering                               0.109864
global_aoa * rel_frequency                                0.030712
global_aoa * rel_letters_count                            0.048810
global_aoa * rel_orthographic_density                     0.092181
global_aoa * rel_synonyms_count                           0.006506
global_clustering * global_frequency                     -0.096829
global_clustering * global_letters_count                  0.012982
global_clustering * global_orthographic_density           0.194828
global_clustering * global_synonyms_count                -0.407234
global_clustering * rel_aoa                              -0.087304
global_clustering * rel_clustering                        0.032839
global_clustering * rel_frequency                        -0.084135
global_clustering * rel_letters_count                     0.106476
global_clustering * rel_orthographic_density              0.186188
global_clustering * rel_synonyms_count                    1.002512
global_frequency * global_letters_count                   0.010303
global_frequency * global_orthographic_density           -0.170595
global_frequency * global_synonyms_count                 -0.104327
global_frequency * rel_aoa                                0.004933
global_frequency * rel_clustering                         0.159806
global_frequency * rel_frequency                          0.010313
global_frequency * rel_letters_count                      0.079084
global_frequency * rel_orthographic_density               0.230469
global_frequency * rel_synonyms_count                     0.160150
global_letters_count * global_orthographic_density       -0.121909
global_letters_count * global_synonyms_count              0.043692
global_letters_count * rel_aoa                           -0.020408
global_letters_count * rel_clustering                    -0.061510
global_letters_count * rel_frequency                     -0.079968
global_letters_count * rel_letters_count                 -0.007647
global_letters_count * rel_orthographic_density           0.281860
global_letters_count * rel_synonyms_count                -0.125667
global_orthographic_density * global_synonyms_count       0.370224
global_orthographic_density * rel_aoa                     0.017552
global_orthographic_density * rel_clustering             -0.059603
global_orthographic_density * rel_frequency               0.069559
global_orthographic_density * rel_letters_count           0.030145
global_orthographic_density * rel_orthographic_density    0.146860
global_orthographic_density * rel_synonyms_count         -0.429493
global_synonyms_count * rel_aoa                          -0.062955
global_synonyms_count * rel_clustering                   -0.253794
global_synonyms_count * rel_frequency                     0.088707
global_synonyms_count * rel_letters_count                 0.045635
global_synonyms_count * rel_orthographic_density         -0.635648
global_synonyms_count * rel_synonyms_count               -0.056447
rel_aoa * rel_clustering                                  0.037791
rel_aoa * rel_frequency                                   0.033142
rel_aoa * rel_letters_count                              -0.001258
rel_aoa * rel_orthographic_density                       -0.040487
rel_aoa * rel_synonyms_count                             -0.028687
rel_clustering * rel_frequency                            0.038517
rel_clustering * rel_letters_count                       -0.077156
rel_clustering * rel_orthographic_density                -0.189344
rel_clustering * rel_synonyms_count                      -0.252343
rel_frequency * rel_letters_count                        -0.016609
rel_frequency * rel_orthographic_density                 -0.182251
rel_frequency * rel_synonyms_count                       -0.095734
rel_letters_count * rel_orthographic_density             -0.153960
rel_letters_count * rel_synonyms_count                    0.089232
rel_orthographic_density * rel_synonyms_count             0.690817
dtype: float64