Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# Whether figures are also written to disk (see the `if SAVE_FIGURES:` guards
# in the plotting helpers).
SAVE_FIGURES = False

# Subset of features plotted in the "paper-*" figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not referenced in the cells visible here; presumably the
# number of PCA components used further down -- confirm.
N_COMPONENTS = 3

# Number of bins tried first when binning source feature values; the plotting
# helpers retry with fewer bins when binning raises a ValueError.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Initialise database access through the project helper and keep its return
# value. NOTE(review): presumably the SQLAlchemy engine that session_scope()
# relies on -- confirm against brainscopypaste.utils.
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed below.
model = Model(time=Time.continuous,
              source=Source.all,
              past=Past.last_bin,
              durl=Durl.exclude_past,
              max_distance=1)
data = []

# First pass: collect only the ids of the substitutions mined with `model`.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id_row[0] for id_row in substitutions]

# Second pass: load each substitution in its own session and record one row
# per (substitution, feature) pair.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Absolute, then sentence-relative (median) feature values of the
            # source and destination words.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')
            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis values: feature average (h0) and feature
                # average over the source word's synonyms (h0n), each in
                # absolute and sentence-relative form.
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                    feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                    feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            }
            data.append(record)

# Build the frame once from the accumulated records, then free the list.
original_variations = pd.DataFrame(data)
del data


Got 2168 substitutions for model Model(time=Time.continuous, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=1)
100% (2168 of 2168) |######################| Elapsed Time: 0:00:59 Time: 0:00:59

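Compute cluster averages (so that substitutions from the same cluster are not counted as independent observations, which would make the confidence intervals artificially narrow), and crop the data so that we have acceptable CIs.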


In [4]:
# Columns holding feature values; they are averaged per cluster and blanked
# together when a row is cropped.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average first over rows sharing (destination_id, occurrence, position,
# feature), then over (cluster_id, feature). The value columns are selected
# with a list (indexing a groupby with a bare tuple of names is no longer
# supported by pandas) and without the grouping key `feature`, which
# `as_index=False` already restores as a column.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# NOTE(review): `variation` is computed before the crop and is not among the
# blanked columns, so it keeps its value on cropped rows -- confirm intended.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Order features by the docstring of their transformed-feature function (the
# same docstring is used as the display title in the plots below).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average (computed in the code as the sentence median, via `sentence_relative='median'`), i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

These values are plotted first with absolute (global) feature values, then with sentence-relative feature values; in each case we use fixed-width bins first, then quantile bins.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against both nulls.

    For each bin, the mean of `values` is compared to the mean of `h0`
    (row ``H_0``) and to the mean of `h0n` (row ``H_00``). A bin gets one,
    two or three stars when the two means differ by more than the 95%, 99%
    or 99.9% Student-t confidence half-width of the bin mean, and ``ns.``
    otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        Zero-based bin label of each observation, as returned by
        `pd.cut`/`pd.qcut` with ``labels=False``. NaN labels belong to no bin.
    h0, h0n : pandas.Series
        Null values under H_0 and H_00, aligned with `values`.
    values : pandas.Series
        Observed values.
    """
    # Bin labels come back as floats as soon as one input is NaN, so cast
    # before using the count with range() and np.zeros().
    bin_count = int(bins.max()) + 1
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and confidence half-widths do not depend on the null
    # hypothesis: compute them once.
    masks = []
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i in range(bin_count):
        indices = bins == i
        masks.append(indices)
        n = indices.sum()
        # Standard error of the mean: the sample standard deviation
        # (ddof=1) is divided by sqrt(n), not sqrt(n - 1).
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        bin_values[i] = values[indices].mean()
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        print(null_name + ' |', end='')
        # differences[i, j] is True when bin i differs from the null at
        # significance level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level reached decides the number of stars.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the binned destination feature against the source feature.

    Written to be passed to `FacetGrid.map_dataframe` (see `plot_grid`),
    which supplies the `data` and `color` keyword arguments. Draws on the
    current matplotlib axes, then prints the significance table for the
    same bins.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows for one feature, with columns `source`, `destination`, `h0`
        and `h0n` (or their `_rel` variants), plus `feature_field`.
    color : matplotlib color, default 'blue'
        Base color of the curves.
    relative : bool, default False
        Use the sentence-relative (`_rel`) columns.
    quantiles : bool, default False
        Bin the source values with `pd.qcut` instead of `pd.cut`.
    feature_field : str, default 'feature'
        Column whose first value names the significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even with a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins when the cut function
    # rejects the requested count (e.g. pd.qcut on non-unique bin edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        raise ValueError("Could not bin the source values, "
                         "even with a single bin")
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% confidence half-width: t quantile times the standard error of
        # the mean, i.e. sample std (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently so that a null curve overflowing on both sides stays
    # fully visible.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    # Same limits on both axes keep y = x on the diagonal.
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function`, optionally saving it.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; only those whose `feature_field` is in `features` are
        kept.
    features : list of str
        Features to plot, in facet (and hue) order.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Passed to `FacetGrid.map_dataframe` along with `plot_kws`.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, default 'feature'
        Column used to split the data into facets.
    plot_kws : dict, optional
        Extra keyword arguments for `plot_function`.
    """
    if plot_kws is None:
        plot_kws = {}

    # NOTE(review): `size` was renamed `height` in seaborn 0.9 -- update this
    # call when upgrading seaborn.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title set above is the raw feature name; replace it with
        # the docstring of the feature's transformed function.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the binned bias (destination minus H_00) of one feature on `ax`.

    The x axis is the bin rank spread evenly over [0, 1]; the y axis is the
    difference between the bin's mean destination value and its mean `h0n`
    value, divided by the absolute mean of the binned `h0` values.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature, with columns `feature`, `source`,
        `destination`, `h0` and `h0n` (or their `_rel` variants).
    color : matplotlib color
        Color of the curve; the CI band uses a desaturated version.
    ci : bool, default True
        Whether to shade the 95% confidence band.
    relative : bool, default False
        Use the sentence-relative (`_rel`) columns.
    quantiles : bool, default False
        Bin the source values with `pd.qcut` instead of `pd.cut`.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even with a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins when the cut function
    # rejects the requested count (e.g. pd.qcut on non-unique bin edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        raise ValueError("Could not bin the source values, "
                         "even with a single bin")
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% confidence half-width: t quantile times the standard error of
        # the mean, i.e. sample std (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): the scale is close to zero when the mean of the binned H_0
    # values is (possible for sentence-relative values), which inflates the
    # curve -- confirm the normalisation is meaningful there.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions,
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single figure and return its axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; must have a `feature` column. Rows containing NaN are
        dropped before being handed to `plot_function`.
    features : list of str
        Features to plot, one curve (and palette color) each.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Seaborn palette from which `len(features)` colors are drawn.
    plot_function : callable
        Called as ``plot_function(ax, feature_data, color=..., **plot_kws)``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plot_kws : dict, optional
        Extra keyword arguments for `plot_function`.

    Returns
    -------
    matplotlib axes
        The axes the curves were drawn on.
    """
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above.


In [11]:
# All features, global feature values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *   |
H_00 | *** | **  | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *   | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | **  | *** | *** |
H_00 | *** | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# All features overlaid, global feature values, fixed-width bins, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False));



In [13]:
# Paper features, global feature values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | *   | *   |

In [14]:
# Paper features overlaid, global feature values, fixed-width bins; the
# y range is then set by hand.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-fixedbins_global',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all features',
              xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
              ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                      '\n(normalised to feature average)'))
     .set_ylim(-2, .7));


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, global feature values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | **  | *   | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | *** | *   | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *** | **  | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | *** | *** |
H_00 | *** | *   | ns. | *** |

In [16]:
# All features overlaid, global feature values, quantile bins, no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False, quantiles=True));



In [17]:
# Paper features, global feature values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *** | **  | ns. | *** |

In [18]:
# Paper features overlaid, global feature values, quantile bins; the y range
# is then set by hand.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-quantilebins_global',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all features',
              xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
              ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                      '\n(normalised to feature average)'),
              plot_kws=dict(quantiles=True))
     .set_ylim(-1.2, .6));


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative feature values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | *   | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *   |
H_00 | ns. | **  | **  | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *   | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | *** | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *   |
H_00 | *** | *   | ns. | ns. |

In [20]:
# All features overlaid, sentence-relative feature values, fixed-width bins,
# no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(ci=False, relative=True));



In [21]:
# Paper features, sentence-relative feature values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | ns. | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | *** | ns. | ns. |

In [22]:
# Paper features overlaid, sentence-relative feature values, fixed-width
# bins; the y range is then set by hand.
(plot_overlay(data=variations,
              features=PAPER_FEATURES,
              filename='paper-variations_bias-fixedbins_sentencerel',
              palette_name='deep',
              plot_function=plot_bias,
              title='Measured bias for all sentence-relative features',
              xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                      r' (normalised to $[0, 1]$)'),
              ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                      '\n(normalised to sentence-relative feature average)'),
              plot_kws=dict(relative=True))
     .set_ylim(-2, .7));


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative feature values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True, quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *** |
H_00 | **  | ns. | ns. | **  |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | ns. | ns. | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | *** |
H_00 | *** | *   | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

In [24]:
# All features overlaid, sentence-relative feature values, quantile bins,
# no CI band.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(ci=False, relative=True, quantiles=True));



In [25]:
# Paper features, sentence-relative feature values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws=dict(relative=True, quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | *** |
H_00 | *** | *   | ns. | **  |

In [26]:
# Paper features overlaid, sentence-relative feature values, quantile bins.
# Uses the 'deep' palette, like the other three paper-feature overlays
# (this cell previously used 'husl', the palette of the all-features plots).
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot, for each (absolute, relative) source feature pair, the general direction (arrows) and strength (color) of the move towards the destination word. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Streamplot of substitution moves in (absolute, relative) feature space.

    Meant to be called through `FacetGrid.map_dataframe`, so every input
    arrives as a keyword argument.

    Keyword arguments
    -----------------
    data : DataFrame
        Must provide the columns `source`, `source_rel`, `destination`,
        `destination_rel` and `h0`.
    color : optional
        Base colour of the two reference lines (defaults to 'blue'); it is
        desaturated before use.
    bin_count : int, optional
        Number of equal-width bins along each axis. Defaults to the
        notebook-wide `BIN_COUNT`.

    The x axis bins `source` and the y axis bins `source_rel`. For each
    (x, y) cell the arrow direction is the mean displacement
    (`destination - source`, `destination_rel - source_rel`) of the rows
    falling in that cell, and the colour is the mean Euclidean length of
    those displacements. Cells without any row get NaN means.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', BIN_COUNT)
    source = data['source']
    source_rel = data['source_rel']
    h0 = data['h0']

    # Per-row displacement along each axis and its Euclidean length,
    # computed once instead of once per grid cell.
    delta = data['destination'] - source
    delta_rel = data['destination_rel'] - source_rel
    distance = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values.
    # The vertical reference line uses the h0 of the first row only
    # (presumably h0 is constant within a facet -- TODO confirm).
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_x = x_bins == x
        for y in range(bin_count):
            cell = in_x & (y_bins == y)
            # Arrays are indexed [row=y, column=x], as streamplot expects.
            u_values[y, x] = delta[cell].mean()
            v_values[y, x] = delta_rel[cell].mean()
            strength[y, x] = distance[cell].mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    # Restrict the view to the span of the bin centres.
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One streamplot facet per feature, three facets per row.
g = sb.FacetGrid(
    data=variations, col='feature', hue='feature',
    col_order=ordered_features, hue_order=ordered_features,
    col_wrap=3, aspect=1, size=4.5, sharex=False, sharey=False)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style the legend and retitle the facet when a legend exists.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title holds the raw feature name at this point; replace
        # it with the docstring of the matching transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same streamplots, restricted to the features exposed in the paper.
g = sb.FacetGrid(
    data=variations[variations['feature'].isin(PAPER_FEATURES)],
    col='feature', hue='feature',
    col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES,
    col_wrap=3, aspect=1, size=4.5, sharex=False, sharey=False)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style the legend and retitle the facet when a legend exists.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title holds the raw feature name at this point; replace
        # it with the docstring of the matching transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on the cluster x feature table of variations, keeping only
# the clusters for which every feature variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = (
    variations
    .pivot(index='cluster_id', columns='feature', values='variation')
    .dropna()
)
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Loadings of the first N_COMPONENTS components (displayed as cell output).
pd.DataFrame(
    pca.components_[:N_COMPONENTS],
    index=list(map('Component-{}'.format, range(N_COMPONENTS))),
    columns=pcafeatures)


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.51105539  0.176333    0.08603859  0.07817559  0.03734429  0.02868622
  0.02356144  0.02031939  0.01675882  0.01116853]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 0.510785 -0.286850 0.091126 -0.238796 -0.237656 0.410184 -0.227300 -0.275261 0.374168 -0.281107 0.142458 0.002576
Component-1 0.287258 -0.398103 0.145273 -0.297842 -0.230560 -0.432424 0.165063 -0.288286 -0.461695 0.251920 -0.147527 0.016993
Component-2 0.612450 0.639580 -0.069962 0.181683 -0.330446 -0.109939 -0.000906 0.172796 -0.089919 0.094101 -0.037564 -0.091906

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Collect one record per (substitution, component): the source and
# destination component values plus the two null-hypothesis averages.
data = []
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            pca_args = (component, pca, pcafeatures)
            source, destination = substitution.components(*pca_args)
            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            record['h0'] = substitution.component_average(*pca_args)
            record['h0n'] = substitution.component_average(
                *pca_args, source_synonyms=True)
            data.append(record)

original_component_variations = pd.DataFrame(data)
del data


100% (2168 of 2168) |######################| Elapsed Time: 0:00:58 Time: 0:00:58

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Two-stage averaging:
#   1. average the rows sharing the same (destination, occurrence, position,
#      component), then
#   2. average those results per (cluster, component), so that each cluster
#      contributes a single point per component.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple is deprecated and rejected by recent pandas releases.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']]
    .mean()
)

Plot the actual variations of components (see the caveat section below)


In [33]:
# One facet per PCA component, showing the variation of component values.
g = sb.FacetGrid(
    data=component_variations, col='component', hue='component',
    col_wrap=3, aspect=1.5, size=3, sharex=False, sharey=False)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.flat:
    legend = ax.legend(loc='upper left', frameon=True)
    if legend:
        # Only style the legend when one exists.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *   | *   | *   |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | **  | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | **  | *** | ns. |

4.2 On a subset of relevant features


In [34]:
# Feature subset on which the second PCA of this section is computed.
relevant_features = [
    'frequency',
    'aoa',
    'letters_count',
]

Compute the actual PCA


In [35]:
# Fit the PCA on the variations of the relevant features only, keeping the
# clusters for which all of those variations are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = (
    variations[variations['feature'].isin(pcafeatures)]
    .pivot(index='cluster_id', columns='feature', values='variation')
    .dropna()
)
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of every retained component (displayed as cell output).
pd.DataFrame(
    pca.components_,
    index=list(map('Component-{}'.format, range(pca.n_components_))),
    columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.65354481  0.20826127]

Out[35]:
aoa frequency letters_count
Component-0 0.758444 -0.379467 0.529874
Component-1 -0.375015 0.410843 0.831007

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Collect one record per (substitution, component) for every component the
# PCA retained: source and destination component values plus the two
# null-hypothesis averages.
data = []
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            pca_args = (component, pca, pcafeatures)
            source, destination = substitution.components(*pca_args)
            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            record['h0'] = substitution.component_average(*pca_args)
            record['h0n'] = substitution.component_average(
                *pca_args, source_synonyms=True)
            data.append(record)

original_component_variations = pd.DataFrame(data)
del data


100% (2168 of 2168) |######################| Elapsed Time: 0:00:16 Time: 0:00:16

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Two-stage averaging:
#   1. average the rows sharing the same (destination, occurrence, position,
#      component), then
#   2. average those results per (cluster, component), so that each cluster
#      contributes a single point per component.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple is deprecated and rejected by recent pandas releases.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']]
    .mean()
)

Plot the actual variations of components


In [38]:
# One facet per PCA component, showing the variation of component values.
g = sb.FacetGrid(
    data=component_variations, col='component', hue='component',
    col_wrap=3, aspect=1.5, size=3, sharex=False, sharey=False)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style the legend when one exists.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | ns. | ns. |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Resolve each transformed-feature function once; it does not depend on the
# word being looked up, so there is no need to fetch it again per word.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

# Calling a transformed feature without argument yields its collection of
# words; report its size for each feature.
for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined.
# Start from the union of the words known to at least one of the features...
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# ...evaluate every feature on every such word...
words_list = list(words)
data = dict((feature, [tfeature(word) for word in words_list])
            for feature, tfeature in tfeatures)
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

# ...and count the rows left once words with a missing value are dropped.
print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 896 (cluster-unique) substitutions, but the PCA is in fact computed on 689 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# (label, binning function) pairs tried by the Anova cells. Only fixed-width
# binning is enabled; ('quantiles', pd.qcut) is deliberately left out.
cuts = [
    ('fixed bins', pd.cut),
]
# (label, column suffix) pairs for global and sentence-relative values.
rels = [
    ('global', ''),
    ('sentence-relative', '_rel'),
]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' below .001, ' **' below .01, '  *' below .05, and 'ns.' otherwise.
    """
    for threshold, marker in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess whether it interacts with the other features' destination values. We look at this for all pairs of features and all combinations of global/sentence-relative values, for each type of binning listed in `cuts` (only fixed-width bins are enabled above; quantile binning is commented out). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# Pivot each source/destination column once (cluster_id x feature) instead
# of re-pivoting the whole table for every feature pair.
pivots = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It only depends on the source values, so do
            # it once per (cut, rel1). Try fewer and fewer bins until the
            # binning function accepts the data.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # No bin count worked: make that explicit rather than
                # silently reusing the bins of a previous iteration.
                source_bins = None

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                destination = pivots['destination' + rel2][feature2]

                # One-way Anova of the destination values across the
                # source bins.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot each source/destination column once (cluster_id x feature) instead
# of re-pivoting the whole table for every feature pair.
pivots = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It only depends on the source values, so do
            # it once per (cut, rel1). Try fewer and fewer bins until the
            # binning function accepts the data.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # No bin count worked: make that explicit rather than
                # silently reusing the bins of a previous iteration.
                source_bins = None

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # Variation of feature2 (destination minus source), in the
                # global or sentence-relative flavour selected by rel2.
                destination = (pivots['destination' + rel2][feature2]
                               - pivots['source' + rel2][feature2])

                # One-way Anova of those variations across the source bins.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

OK, so this could go on for a long time, and I'm not going to look at interactions through this lens (i.e. at the interaction of pairs of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps "is the measure sentence-relative?" to a (label, column suffix) pair.
# NOTE(review): this rebinds `rels`, which the Anova cells earlier in the
# notebook iterate over as a list of pairs; re-running those cells after
# this one will not work as before.
rels = {
    False: ('global', ''),
    True: ('rel', '_rel'),
}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For every paper feature, run the regression for each combination of source
# relativeness (global, sentence-relative, both) and destination relativeness,
# first without and then with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product((False, True, 'both'),
                                        (False, True)):
        # Plain linear model.
        regress(data=variations, features=PAPER_FEATURES, target=target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=False)
        print()
        # Same model with pairwise interaction terms added.
        regress(data=variations, features=PAPER_FEATURES, target=target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=True)
        print()


----------------------------------------------------------------------
Regressing global frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.14886845910616564

intercept                      3.865443
global_aoa                     0.087262
global_clustering              0.087832
global_frequency               0.496344
global_letters_count           0.041137
global_orthographic_density    0.123764
global_synonyms_count         -0.070474
dtype: float64

Regressing global frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.17534336034615694

intercept                                             -5.730113
global_aoa                                             0.378421
global_clustering                                     -0.687814
global_frequency                                       1.062962
global_letters_count                                   0.525178
global_orthographic_density                            3.646511
global_synonyms_count                                  0.988535
global_aoa * global_clustering                         0.010460
global_aoa * global_frequency                         -0.008650
global_aoa * global_letters_count                     -0.011511
global_aoa * global_orthographic_density              -0.085285
global_aoa * global_synonyms_count                    -0.016272
global_clustering * global_frequency                   0.018866
global_clustering * global_letters_count               0.025487
global_clustering * global_orthographic_density        0.293226
global_clustering * global_synonyms_count              0.086446
global_frequency * global_letters_count               -0.031427
global_frequency * global_orthographic_density        -0.170737
global_frequency * global_synonyms_count              -0.017713
global_letters_count * global_orthographic_density     0.073626
global_letters_count * global_synonyms_count          -0.017812
global_orthographic_density * global_synonyms_count   -0.130875
dtype: float64

Regressing rel frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.08717622805909209

intercept                     -7.268910
global_aoa                     0.088445
global_clustering              0.033910
global_frequency               0.414733
global_letters_count           0.067682
global_orthographic_density    0.002708
global_synonyms_count         -0.021535
dtype: float64

Regressing rel frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.10561771479957627

intercept                                             -22.255920
global_aoa                                              0.614169
global_clustering                                      -1.716851
global_frequency                                        1.413467
global_letters_count                                    0.630309
global_orthographic_density                             2.975062
global_synonyms_count                                   1.098289
global_aoa * global_clustering                          0.038959
global_aoa * global_frequency                          -0.025029
global_aoa * global_letters_count                      -0.002233
global_aoa * global_orthographic_density               -0.063103
global_aoa * global_synonyms_count                     -0.023149
global_clustering * global_frequency                    0.084198
global_clustering * global_letters_count                0.052606
global_clustering * global_orthographic_density         0.268596
global_clustering * global_synonyms_count               0.224750
global_frequency * global_letters_count                -0.032674
global_frequency * global_orthographic_density         -0.149730
global_frequency * global_synonyms_count                0.039986
global_letters_count * global_orthographic_density      0.076221
global_letters_count * global_synonyms_count            0.010503
global_orthographic_density * global_synonyms_count    -0.059961
dtype: float64

Regressing global frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07609052610104816

intercept                   9.589478
rel_aoa                     0.108956
rel_clustering             -0.043579
rel_frequency               0.301839
rel_letters_count           0.040712
rel_orthographic_density    0.119310
rel_synonyms_count         -0.167450
dtype: float64

Regressing global frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11504190771426814

intercept                                        9.405093
rel_aoa                                          0.232841
rel_clustering                                   0.023914
rel_frequency                                    0.200599
rel_letters_count                                0.142372
rel_orthographic_density                        -0.059933
rel_synonyms_count                              -0.067884
rel_aoa * rel_clustering                        -0.033517
rel_aoa * rel_frequency                          0.036908
rel_aoa * rel_letters_count                      0.000703
rel_aoa * rel_orthographic_density               0.003140
rel_aoa * rel_synonyms_count                    -0.019245
rel_clustering * rel_frequency                  -0.002394
rel_clustering * rel_letters_count               0.104648
rel_clustering * rel_orthographic_density        0.314494
rel_clustering * rel_synonyms_count              0.231336
rel_frequency * rel_letters_count                0.033382
rel_frequency * rel_orthographic_density        -0.021560
rel_frequency * rel_synonyms_count               0.055716
rel_letters_count * rel_orthographic_density     0.040290
rel_letters_count * rel_synonyms_count          -0.163461
rel_orthographic_density * rel_synonyms_count   -0.401025
dtype: float64

Regressing rel frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.31851161977496834

intercept                  -1.122215
rel_aoa                     0.088315
rel_clustering              0.111644
rel_frequency               0.672306
rel_letters_count          -0.040039
rel_orthographic_density   -0.082382
rel_synonyms_count         -0.169701
dtype: float64

Regressing rel frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.350031073981955

intercept                                       -1.351717
rel_aoa                                          0.097186
rel_clustering                                   0.065455
rel_frequency                                    0.610241
rel_letters_count                                0.040910
rel_orthographic_density                        -0.490311
rel_synonyms_count                              -0.104349
rel_aoa * rel_clustering                        -0.060808
rel_aoa * rel_frequency                         -0.031812
rel_aoa * rel_letters_count                     -0.003590
rel_aoa * rel_orthographic_density               0.051232
rel_aoa * rel_synonyms_count                     0.079390
rel_clustering * rel_frequency                  -0.027029
rel_clustering * rel_letters_count               0.057011
rel_clustering * rel_orthographic_density        0.136978
rel_clustering * rel_synonyms_count              0.294896
rel_frequency * rel_letters_count               -0.002330
rel_frequency * rel_orthographic_density        -0.125925
rel_frequency * rel_synonyms_count               0.052968
rel_letters_count * rel_orthographic_density     0.053935
rel_letters_count * rel_synonyms_count          -0.086653
rel_orthographic_density * rel_synonyms_count   -0.099000
dtype: float64

Regressing global frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.16132628123979387

intercept                      3.561551
global_aoa                     0.024985
global_clustering              0.238394
global_frequency               0.569925
global_letters_count           0.142780
global_orthographic_density    0.235544
global_synonyms_count          0.273009
rel_aoa                        0.089153
rel_clustering                -0.191740
rel_frequency                 -0.079280
rel_letters_count             -0.111258
rel_orthographic_density      -0.146177
rel_synonyms_count            -0.410409
dtype: float64

Regressing global frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.3170801237892523

intercept                                                -41.884154
global_aoa                                                 1.676217
global_clustering                                         -5.393265
global_frequency                                           1.374928
global_letters_count                                       2.779867
global_orthographic_density                               10.913831
global_synonyms_count                                     15.783461
rel_aoa                                                    2.601883
rel_clustering                                             1.725264
rel_frequency                                             -2.013926
rel_letters_count                                         -0.831047
rel_orthographic_density                                  -1.305978
rel_synonyms_count                                       -14.012967
global_aoa * global_clustering                             0.217658
global_aoa * global_frequency                              0.074310
global_aoa * global_letters_count                         -0.088539
global_aoa * global_orthographic_density                  -0.178033
global_aoa * global_synonyms_count                        -0.701961
global_aoa * rel_aoa                                      -0.038973
global_aoa * rel_clustering                               -0.023404
global_aoa * rel_frequency                                 0.085620
global_aoa * rel_letters_count                             0.055171
global_aoa * rel_orthographic_density                     -0.002027
global_aoa * rel_synonyms_count                            0.616636
global_clustering * global_frequency                       0.021500
global_clustering * global_letters_count                  -0.096451
global_clustering * global_orthographic_density            1.772170
global_clustering * global_synonyms_count                  1.033447
global_clustering * rel_aoa                               -0.120730
global_clustering * rel_clustering                         0.075173
global_clustering * rel_frequency                         -0.244758
global_clustering * rel_letters_count                      0.302101
global_clustering * rel_orthographic_density              -0.931273
global_clustering * rel_synonyms_count                    -0.888246
global_frequency * global_letters_count                   -0.327105
global_frequency * global_orthographic_density             0.043642
global_frequency * global_synonyms_count                  -0.515688
global_frequency * rel_aoa                                -0.261335
global_frequency * rel_clustering                          0.296279
global_frequency * rel_frequency                          -0.010013
global_frequency * rel_letters_count                       0.265484
global_frequency * rel_orthographic_density               -0.363718
global_frequency * rel_synonyms_count                      0.467127
global_letters_count * global_orthographic_density         0.248457
global_letters_count * global_synonyms_count               0.334600
global_letters_count * rel_aoa                            -0.072849
global_letters_count * rel_clustering                     -0.044483
global_letters_count * rel_frequency                       0.103349
global_letters_count * rel_letters_count                   0.034210
global_letters_count * rel_orthographic_density           -0.223712
global_letters_count * rel_synonyms_count                 -0.402563
global_orthographic_density * global_synonyms_count       -0.773936
global_orthographic_density * rel_aoa                      0.006911
global_orthographic_density * rel_clustering              -1.785808
global_orthographic_density * rel_frequency               -0.157569
global_orthographic_density * rel_letters_count           -0.239851
global_orthographic_density * rel_orthographic_density    -0.012543
global_orthographic_density * rel_synonyms_count           0.928808
global_synonyms_count * rel_aoa                            0.208251
global_synonyms_count * rel_clustering                    -0.856386
global_synonyms_count * rel_frequency                      0.220589
global_synonyms_count * rel_letters_count                  0.132937
global_synonyms_count * rel_orthographic_density           0.675447
global_synonyms_count * rel_synonyms_count                 0.022257
rel_aoa * rel_clustering                                  -0.064879
rel_aoa * rel_frequency                                    0.109587
rel_aoa * rel_letters_count                                0.078534
rel_aoa * rel_orthographic_density                         0.057978
rel_aoa * rel_synonyms_count                              -0.174007
rel_clustering * rel_frequency                            -0.037276
rel_clustering * rel_letters_count                        -0.119944
rel_clustering * rel_orthographic_density                  1.160640
rel_clustering * rel_synonyms_count                        1.154291
rel_frequency * rel_letters_count                         -0.048658
rel_frequency * rel_orthographic_density                   0.396398
rel_frequency * rel_synonyms_count                        -0.072040
rel_letters_count * rel_orthographic_density               0.369802
rel_letters_count * rel_synonyms_count                    -0.070529
rel_orthographic_density * rel_synonyms_count             -1.056295
dtype: float64

Regressing rel frequency with 507 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3729447830450848

intercept                      3.286546
global_aoa                     0.023412
global_clustering              0.295338
global_frequency              -0.366645
global_letters_count           0.145573
global_orthographic_density    0.233940
global_synonyms_count          0.234891
rel_aoa                        0.072305
rel_clustering                -0.197041
rel_frequency                  0.894478
rel_letters_count             -0.121924
rel_orthographic_density      -0.144235
rel_synonyms_count            -0.351947
dtype: float64

Regressing rel frequency with 507 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4792737558891161

intercept                                                -42.238394
global_aoa                                                 1.508006
global_clustering                                         -5.708813
global_frequency                                           0.364511
global_letters_count                                       2.806978
global_orthographic_density                               10.656937
global_synonyms_count                                     14.663760
rel_aoa                                                    2.348249
rel_clustering                                             2.887246
rel_frequency                                             -0.895058
rel_letters_count                                         -1.303507
rel_orthographic_density                                  -2.465583
rel_synonyms_count                                       -13.600385
global_aoa * global_clustering                             0.174627
global_aoa * global_frequency                              0.069591
global_aoa * global_letters_count                         -0.096281
global_aoa * global_orthographic_density                  -0.204735
global_aoa * global_synonyms_count                        -0.649681
global_aoa * rel_aoa                                      -0.037339
global_aoa * rel_clustering                               -0.008310
global_aoa * rel_frequency                                 0.084623
global_aoa * rel_letters_count                             0.086822
global_aoa * rel_orthographic_density                      0.055651
global_aoa * rel_synonyms_count                            0.577605
global_clustering * global_frequency                       0.065239
global_clustering * global_letters_count                   0.041189
global_clustering * global_orthographic_density            1.614257
global_clustering * global_synonyms_count                  0.957884
global_clustering * rel_aoa                               -0.127321
global_clustering * rel_clustering                         0.052009
global_clustering * rel_frequency                         -0.234864
global_clustering * rel_letters_count                      0.213563
global_clustering * rel_orthographic_density              -0.786991
global_clustering * rel_synonyms_count                    -0.879692
global_frequency * global_letters_count                   -0.219239
global_frequency * global_orthographic_density             0.025657
global_frequency * global_synonyms_count                  -0.446953
global_frequency * rel_aoa                                -0.239779
global_frequency * rel_clustering                          0.181197
global_frequency * rel_frequency                           0.003402
global_frequency * rel_letters_count                       0.199290
global_frequency * rel_orthographic_density               -0.268188
global_frequency * rel_synonyms_count                      0.411726
global_letters_count * global_orthographic_density         0.163045
global_letters_count * global_synonyms_count               0.202567
global_letters_count * rel_aoa                            -0.082315
global_letters_count * rel_clustering                     -0.174143
global_letters_count * rel_frequency                       0.036271
global_letters_count * rel_letters_count                   0.042766
global_letters_count * rel_orthographic_density           -0.094082
global_letters_count * rel_synonyms_count                 -0.263487
global_orthographic_density * global_synonyms_count       -0.747734
global_orthographic_density * rel_aoa                     -0.000644
global_orthographic_density * rel_clustering              -1.625519
global_orthographic_density * rel_frequency               -0.115875
global_orthographic_density * rel_letters_count           -0.086043
global_orthographic_density * rel_orthographic_density     0.036658
global_orthographic_density * rel_synonyms_count           0.901076
global_synonyms_count * rel_aoa                            0.176554
global_synonyms_count * rel_clustering                    -0.771641
global_synonyms_count * rel_frequency                      0.125227
global_synonyms_count * rel_letters_count                  0.168641
global_synonyms_count * rel_orthographic_density           0.556806
global_synonyms_count * rel_synonyms_count                 0.037414
rel_aoa * rel_clustering                                  -0.041861
rel_aoa * rel_frequency                                    0.079686
rel_aoa * rel_letters_count                                0.053517
rel_aoa * rel_orthographic_density                         0.019748
rel_aoa * rel_synonyms_count                              -0.152377
rel_clustering * rel_frequency                             0.029973
rel_clustering * rel_letters_count                        -0.016807
rel_clustering * rel_orthographic_density                  1.013504
rel_clustering * rel_synonyms_count                        1.131416
rel_frequency * rel_letters_count                         -0.014428
rel_frequency * rel_orthographic_density                   0.283180
rel_frequency * rel_synonyms_count                        -0.007071
rel_letters_count * rel_orthographic_density               0.222169
rel_letters_count * rel_synonyms_count                    -0.119171
rel_orthographic_density * rel_synonyms_count             -0.920979
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 470 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.18893766793483024

intercept                      4.891597
global_aoa                     0.394746
global_clustering             -0.077363
global_frequency              -0.119720
global_letters_count           0.058670
global_orthographic_density   -0.114097
global_synonyms_count         -0.020356
dtype: float64

Regressing global aoa with 470 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.2362758673294877

intercept                                              4.097852
global_aoa                                             0.885761
global_clustering                                     -0.598441
global_frequency                                      -0.403054
global_letters_count                                   0.280952
global_orthographic_density                           -1.621734
global_synonyms_count                                 -4.108459
global_aoa * global_clustering                         0.127023
global_aoa * global_frequency                         -0.017747
global_aoa * global_letters_count                      0.055262
global_aoa * global_orthographic_density               0.035068
global_aoa * global_synonyms_count                     0.041864
global_clustering * global_frequency                  -0.078823
global_clustering * global_letters_count               0.094814
global_clustering * global_orthographic_density       -0.099051
global_clustering * global_synonyms_count             -0.232717
global_frequency * global_letters_count               -0.020161
global_frequency * global_orthographic_density         0.032114
global_frequency * global_synonyms_count               0.050358
global_letters_count * global_orthographic_density     0.007087
global_letters_count * global_synonyms_count           0.203492
global_orthographic_density * global_synonyms_count    0.601168
dtype: float64

Regressing rel aoa with 470 measures, no interactions
           ^^^^^^^
R^2 = 0.08046811241597673

intercept                      0.096487
global_aoa                     0.228699
global_clustering             -0.024656
global_frequency              -0.162187
global_letters_count           0.038993
global_orthographic_density    0.128308
global_synonyms_count          0.060349
dtype: float64

Regressing rel aoa with 470 measures, with interactions
           ^^^^^^^
R^2 = 0.1317044831280274

intercept                                              2.783645
global_aoa                                             1.325265
global_clustering                                      0.271603
global_frequency                                      -0.696587
global_letters_count                                  -0.482013
global_orthographic_density                           -2.000187
global_synonyms_count                                 -2.391719
global_aoa * global_clustering                         0.146706
global_aoa * global_frequency                         -0.051816
global_aoa * global_letters_count                      0.017701
global_aoa * global_orthographic_density               0.049544
global_aoa * global_synonyms_count                     0.076102
global_clustering * global_frequency                  -0.100995
global_clustering * global_letters_count              -0.018105
global_clustering * global_orthographic_density       -0.196196
global_clustering * global_synonyms_count             -0.178251
global_frequency * global_letters_count                0.035863
global_frequency * global_orthographic_density         0.077471
global_frequency * global_synonyms_count              -0.016491
global_letters_count * global_orthographic_density    -0.050337
global_letters_count * global_synonyms_count           0.080396
global_orthographic_density * global_synonyms_count    0.408415
dtype: float64

Regressing global aoa with 470 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.08429904400830979

intercept                   6.641004
rel_aoa                     0.165179
rel_clustering              0.357230
rel_frequency               0.071016
rel_letters_count          -0.026591
rel_orthographic_density   -0.516210
rel_synonyms_count         -0.157778
dtype: float64

Regressing global aoa with 470 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.16216254194893565

intercept                                        6.672644
rel_aoa                                         -0.189527
rel_clustering                                   0.134613
rel_frequency                                    0.151150
rel_letters_count                               -0.145432
rel_orthographic_density                        -0.689767
rel_synonyms_count                               0.108312
rel_aoa * rel_clustering                         0.147810
rel_aoa * rel_frequency                         -0.097420
rel_aoa * rel_letters_count                      0.068869
rel_aoa * rel_orthographic_density               0.116075
rel_aoa * rel_synonyms_count                    -0.034432
rel_clustering * rel_frequency                   0.028803
rel_clustering * rel_letters_count               0.056030
rel_clustering * rel_orthographic_density       -0.082439
rel_clustering * rel_synonyms_count             -0.239183
rel_frequency * rel_letters_count               -0.046112
rel_frequency * rel_orthographic_density         0.001078
rel_frequency * rel_synonyms_count              -0.000255
rel_letters_count * rel_orthographic_density     0.055488
rel_letters_count * rel_synonyms_count           0.299543
rel_orthographic_density * rel_synonyms_count    0.928665
dtype: float64

Regressing rel aoa with 470 measures, no interactions
           ^^^^^^^
R^2 = 0.28536846839612895

intercept                   0.468653
rel_aoa                     0.572870
rel_clustering              0.115028
rel_frequency              -0.044664
rel_letters_count          -0.022135
rel_orthographic_density    0.086072
rel_synonyms_count         -0.064745
dtype: float64

Regressing rel aoa with 470 measures, with interactions
           ^^^^^^^
R^2 = 0.33380962174891904

intercept                                        0.781310
rel_aoa                                          0.458602
rel_clustering                                  -0.137835
rel_frequency                                    0.101798
rel_letters_count                               -0.214438
rel_orthographic_density                         0.237728
rel_synonyms_count                               0.122574
rel_aoa * rel_clustering                         0.110038
rel_aoa * rel_frequency                         -0.019098
rel_aoa * rel_letters_count                      0.008299
rel_aoa * rel_orthographic_density              -0.006172
rel_aoa * rel_synonyms_count                    -0.096981
rel_clustering * rel_frequency                  -0.029006
rel_clustering * rel_letters_count               0.061340
rel_clustering * rel_orthographic_density        0.038691
rel_clustering * rel_synonyms_count              0.005509
rel_frequency * rel_letters_count               -0.046712
rel_frequency * rel_orthographic_density         0.065295
rel_frequency * rel_synonyms_count               0.039677
rel_letters_count * rel_orthographic_density    -0.026242
rel_letters_count * rel_synonyms_count           0.137425
rel_orthographic_density * rel_synonyms_count    0.369770
dtype: float64

Regressing global aoa with 470 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.2340297703114438

intercept                     -0.951863
global_aoa                     0.487530
global_clustering             -0.790464
global_frequency              -0.199684
global_letters_count           0.374525
global_orthographic_density    0.119936
global_synonyms_count          0.319216
rel_aoa                       -0.139254
rel_clustering                 0.812401
rel_frequency                  0.067614
rel_letters_count             -0.358960
rel_orthographic_density      -0.200794
rel_synonyms_count            -0.376479
dtype: float64

Regressing global aoa with 470 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.42720786078292017

intercept                                                 87.009056
global_aoa                                                -0.624043
global_clustering                                         10.657230
global_frequency                                          -1.349423
global_letters_count                                      -6.913005
global_orthographic_density                              -18.815137
global_synonyms_count                                    -30.324953
rel_aoa                                                   -2.033737
rel_clustering                                            -1.715798
rel_frequency                                              3.376187
rel_letters_count                                          4.774869
rel_orthographic_density                                  13.087706
rel_synonyms_count                                        13.275195
global_aoa * global_clustering                             0.072321
global_aoa * global_frequency                             -0.015568
global_aoa * global_letters_count                          0.220362
global_aoa * global_orthographic_density                   0.130061
global_aoa * global_synonyms_count                         0.399857
global_aoa * rel_aoa                                       0.053362
global_aoa * rel_clustering                               -0.168760
global_aoa * rel_frequency                                -0.088706
global_aoa * rel_letters_count                            -0.156814
global_aoa * rel_orthographic_density                     -0.148451
global_aoa * rel_synonyms_count                            0.114923
global_clustering * global_frequency                      -0.092872
global_clustering * global_letters_count                  -0.330855
global_clustering * global_orthographic_density           -3.522000
global_clustering * global_synonyms_count                 -2.987309
global_clustering * rel_aoa                               -0.102754
global_clustering * rel_clustering                         0.080231
global_clustering * rel_frequency                          0.393966
global_clustering * rel_letters_count                     -0.001712
global_clustering * rel_orthographic_density               2.601213
global_clustering * rel_synonyms_count                     1.376109
global_frequency * global_letters_count                    0.350538
global_frequency * global_orthographic_density            -0.352340
global_frequency * global_synonyms_count                   0.625025
global_frequency * rel_aoa                                 0.134324
global_frequency * rel_clustering                         -0.509644
global_frequency * rel_frequency                          -0.025600
global_frequency * rel_letters_count                      -0.379473
global_frequency * rel_orthographic_density                0.314236
global_frequency * rel_synonyms_count                     -0.328613
global_letters_count * global_orthographic_density         0.040662
global_letters_count * global_synonyms_count               0.441225
global_letters_count * rel_aoa                            -0.072820
global_letters_count * rel_clustering                      0.342623
global_letters_count * rel_frequency                      -0.107597
global_letters_count * rel_letters_count                   0.012627
global_letters_count * rel_orthographic_density           -0.016841
global_letters_count * rel_synonyms_count                 -0.090334
global_orthographic_density * global_synonyms_count        0.536041
global_orthographic_density * rel_aoa                     -0.052652
global_orthographic_density * rel_clustering               2.976588
global_orthographic_density * rel_frequency                0.283082
global_orthographic_density * rel_letters_count           -0.043027
global_orthographic_density * rel_orthographic_density     0.158108
global_orthographic_density * rel_synonyms_count          -0.801784
global_synonyms_count * rel_aoa                           -0.314955
global_synonyms_count * rel_clustering                     2.474420
global_synonyms_count * rel_frequency                     -0.835120
global_synonyms_count * rel_letters_count                 -1.116768
global_synonyms_count * rel_orthographic_density          -0.828563
global_synonyms_count * rel_synonyms_count                -0.048860
rel_aoa * rel_clustering                                   0.293107
rel_aoa * rel_frequency                                   -0.079371
rel_aoa * rel_letters_count                                0.047682
rel_aoa * rel_orthographic_density                         0.101475
rel_aoa * rel_synonyms_count                              -0.060648
rel_clustering * rel_frequency                             0.085672
rel_clustering * rel_letters_count                         0.130238
rel_clustering * rel_orthographic_density                 -1.784129
rel_clustering * rel_synonyms_count                       -1.219213
rel_frequency * rel_letters_count                          0.086626
rel_frequency * rel_orthographic_density                  -0.185892
rel_frequency * rel_synonyms_count                         0.441017
rel_letters_count * rel_orthographic_density               0.112672
rel_letters_count * rel_synonyms_count                     0.865360
rel_orthographic_density * rel_synonyms_count              1.919770
dtype: float64

Regressing rel aoa with 470 measures, no interactions
           ^^^^^^^
R^2 = 0.3332098128299459

intercept                      0.021954
global_aoa                    -0.341557
global_clustering             -0.550388
global_frequency              -0.188850
global_letters_count           0.210151
global_orthographic_density    0.034826
global_synonyms_count          0.359052
rel_aoa                        0.802764
rel_clustering                 0.655628
rel_frequency                  0.070022
rel_letters_count             -0.177389
rel_orthographic_density      -0.087301
rel_synonyms_count            -0.452529
dtype: float64

Regressing rel aoa with 470 measures, with interactions
           ^^^^^^^
R^2 = 0.48621644137564907

intercept                                                 85.352662
global_aoa                                                -1.778246
global_clustering                                         10.326316
global_frequency                                          -3.248837
global_letters_count                                      -5.077348
global_orthographic_density                              -13.385527
global_synonyms_count                                    -23.847718
rel_aoa                                                   -0.470502
rel_clustering                                            -2.285170
rel_frequency                                              3.951983
rel_letters_count                                          4.199306
rel_orthographic_density                                  10.632207
rel_synonyms_count                                        12.962421
global_aoa * global_clustering                            -0.034462
global_aoa * global_frequency                             -0.025653
global_aoa * global_letters_count                          0.158954
global_aoa * global_orthographic_density                   0.145585
global_aoa * global_synonyms_count                         0.608472
global_aoa * rel_aoa                                       0.033178
global_aoa * rel_clustering                               -0.049456
global_aoa * rel_frequency                                -0.076672
global_aoa * rel_letters_count                            -0.128245
global_aoa * rel_orthographic_density                     -0.184739
global_aoa * rel_synonyms_count                           -0.278908
global_clustering * global_frequency                      -0.345235
global_clustering * global_letters_count                  -0.146670
global_clustering * global_orthographic_density           -2.362687
global_clustering * global_synonyms_count                 -2.247581
global_clustering * rel_aoa                               -0.094037
global_clustering * rel_clustering                         0.062983
global_clustering * rel_frequency                          0.465204
global_clustering * rel_letters_count                      0.016545
global_clustering * rel_orthographic_density               1.770908
global_clustering * rel_synonyms_count                     1.306057
global_frequency * global_letters_count                    0.338294
global_frequency * global_orthographic_density            -0.163789
global_frequency * global_synonyms_count                   0.545148
global_frequency * rel_aoa                                 0.116591
global_frequency * rel_clustering                         -0.203555
global_frequency * rel_frequency                          -0.008687
global_frequency * rel_letters_count                      -0.349351
global_frequency * rel_orthographic_density                0.068648
global_frequency * rel_synonyms_count                     -0.315374
global_letters_count * global_orthographic_density        -0.062194
global_letters_count * global_synonyms_count               0.000976
global_letters_count * rel_aoa                            -0.067575
global_letters_count * rel_clustering                      0.134777
global_letters_count * rel_frequency                      -0.113129
global_letters_count * rel_letters_count                   0.020698
global_letters_count * rel_orthographic_density            0.067523
global_letters_count * rel_synonyms_count                  0.220827
global_orthographic_density * global_synonyms_count        0.344951
global_orthographic_density * rel_aoa                     -0.128326
global_orthographic_density * rel_clustering               1.972180
global_orthographic_density * rel_frequency                0.123107
global_orthographic_density * rel_letters_count            0.048571
global_orthographic_density * rel_orthographic_density     0.158158
global_orthographic_density * rel_synonyms_count          -0.487038
global_synonyms_count * rel_aoa                           -0.464876
global_synonyms_count * rel_clustering                     1.658850
global_synonyms_count * rel_frequency                     -0.614754
global_synonyms_count * rel_letters_count                 -0.556154
global_synonyms_count * rel_orthographic_density          -0.717166
global_synonyms_count * rel_synonyms_count                -0.087101
rel_aoa * rel_clustering                                   0.240969
rel_aoa * rel_frequency                                   -0.060285
rel_aoa * rel_letters_count                                0.029124
rel_aoa * rel_orthographic_density                         0.136878
rel_aoa * rel_synonyms_count                               0.146199
rel_clustering * rel_frequency                            -0.016138
rel_clustering * rel_letters_count                         0.146352
rel_clustering * rel_orthographic_density                 -1.118640
rel_clustering * rel_synonyms_count                       -1.018334
rel_frequency * rel_letters_count                          0.080213
rel_frequency * rel_orthographic_density                  -0.006286
rel_frequency * rel_synonyms_count                         0.330182
rel_letters_count * rel_orthographic_density               0.052762
rel_letters_count * rel_synonyms_count                     0.347335
rel_orthographic_density * rel_synonyms_count              1.249955
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11234038070185215

intercept                     -3.155383
global_aoa                    -0.020345
global_clustering              0.319214
global_frequency              -0.042629
global_letters_count          -0.022256
global_orthographic_density   -0.065568
global_synonyms_count         -0.117631
dtype: float64

Regressing global clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.23639593705527837

intercept                                             -1.009290
global_aoa                                             0.454051
global_clustering                                      0.334894
global_frequency                                      -0.716941
global_letters_count                                  -0.160809
global_orthographic_density                           -0.048777
global_synonyms_count                                 -2.069877
global_aoa * global_clustering                         0.092516
global_aoa * global_frequency                         -0.010695
global_aoa * global_letters_count                      0.022663
global_aoa * global_orthographic_density               0.003177
global_aoa * global_synonyms_count                     0.024388
global_clustering * global_frequency                  -0.100863
global_clustering * global_letters_count               0.028882
global_clustering * global_orthographic_density        0.106440
global_clustering * global_synonyms_count             -0.172973
global_frequency * global_letters_count                0.012961
global_frequency * global_orthographic_density         0.052775
global_frequency * global_synonyms_count               0.047302
global_letters_count * global_orthographic_density     0.006056
global_letters_count * global_synonyms_count           0.053601
global_orthographic_density * global_synonyms_count    0.021687
dtype: float64

Regressing rel clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.07326986210555686

intercept                      2.525630
global_aoa                    -0.011585
global_clustering              0.255409
global_frequency              -0.028014
global_letters_count          -0.039046
global_orthographic_density   -0.059420
global_synonyms_count         -0.107421
dtype: float64

Regressing rel clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.17486321226225476

intercept                                              5.222768
global_aoa                                             0.281466
global_clustering                                      0.189442
global_frequency                                      -0.715950
global_letters_count                                  -0.232232
global_orthographic_density                            0.013491
global_synonyms_count                                 -1.718352
global_aoa * global_clustering                         0.084862
global_aoa * global_frequency                         -0.004810
global_aoa * global_letters_count                      0.032135
global_aoa * global_orthographic_density               0.028892
global_aoa * global_synonyms_count                     0.010953
global_clustering * global_frequency                  -0.097651
global_clustering * global_letters_count               0.039481
global_clustering * global_orthographic_density        0.132561
global_clustering * global_synonyms_count             -0.136908
global_frequency * global_letters_count                0.016429
global_frequency * global_orthographic_density         0.043137
global_frequency * global_synonyms_count               0.015247
global_letters_count * global_orthographic_density     0.006234
global_letters_count * global_synonyms_count           0.086775
global_orthographic_density * global_synonyms_count    0.058881
dtype: float64

Regressing global clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08355053373310771

intercept                  -5.857019
rel_aoa                     0.007949
rel_clustering              0.309629
rel_frequency               0.012782
rel_letters_count          -0.021625
rel_orthographic_density   -0.037722
rel_synonyms_count         -0.118279
dtype: float64

Regressing global clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1534952955617822

intercept                                       -5.756006
rel_aoa                                         -0.050427
rel_clustering                                   0.064191
rel_frequency                                    0.067615
rel_letters_count                               -0.069930
rel_orthographic_density                        -0.031491
rel_synonyms_count                              -0.147130
rel_aoa * rel_clustering                         0.078249
rel_aoa * rel_frequency                         -0.010397
rel_aoa * rel_letters_count                     -0.010643
rel_aoa * rel_orthographic_density              -0.015824
rel_aoa * rel_synonyms_count                    -0.014898
rel_clustering * rel_frequency                  -0.053684
rel_clustering * rel_letters_count              -0.002001
rel_clustering * rel_orthographic_density       -0.051620
rel_clustering * rel_synonyms_count             -0.070247
rel_frequency * rel_letters_count               -0.023107
rel_frequency * rel_orthographic_density        -0.018113
rel_frequency * rel_synonyms_count               0.004649
rel_letters_count * rel_orthographic_density    -0.010714
rel_letters_count * rel_synonyms_count           0.045991
rel_orthographic_density * rel_synonyms_count    0.039249
dtype: float64

Regressing rel clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1977268185878558

intercept                   0.260183
rel_aoa                    -0.004838
rel_clustering              0.490031
rel_frequency               0.019708
rel_letters_count          -0.018964
rel_orthographic_density   -0.021184
rel_synonyms_count         -0.062927
dtype: float64

Regressing rel clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2535277386639546

intercept                                        0.336487
rel_aoa                                         -0.041119
rel_clustering                                   0.194284
rel_frequency                                    0.048910
rel_letters_count                               -0.066831
rel_orthographic_density                        -0.056474
rel_synonyms_count                              -0.079372
rel_aoa * rel_clustering                         0.046417
rel_aoa * rel_frequency                         -0.006483
rel_aoa * rel_letters_count                     -0.005517
rel_aoa * rel_orthographic_density              -0.009428
rel_aoa * rel_synonyms_count                    -0.017293
rel_clustering * rel_frequency                  -0.061474
rel_clustering * rel_letters_count               0.053826
rel_clustering * rel_orthographic_density        0.010725
rel_clustering * rel_synonyms_count             -0.092414
rel_frequency * rel_letters_count               -0.012654
rel_frequency * rel_orthographic_density        -0.021882
rel_frequency * rel_synonyms_count               0.006018
rel_letters_count * rel_orthographic_density    -0.007737
rel_letters_count * rel_synonyms_count           0.055278
rel_orthographic_density * rel_synonyms_count    0.060545
dtype: float64

Regressing global clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1381896904088591

intercept                     -1.680761
global_aoa                    -0.041391
global_clustering              0.269538
global_frequency              -0.145097
global_letters_count          -0.077268
global_orthographic_density   -0.158706
global_synonyms_count         -0.125142
rel_aoa                        0.025131
rel_clustering                 0.079633
rel_frequency                  0.119110
rel_letters_count              0.054311
rel_orthographic_density       0.100324
rel_synonyms_count            -0.002974
dtype: float64

Regressing global clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3598358214558345

intercept                                                 15.545952
global_aoa                                                 0.608728
global_clustering                                          2.578958
global_frequency                                          -2.497986
global_letters_count                                      -0.360702
global_orthographic_density                                0.687592
global_synonyms_count                                     -4.367075
rel_aoa                                                   -1.161017
rel_clustering                                            -2.609958
rel_frequency                                              0.607554
rel_letters_count                                         -0.366701
rel_orthographic_density                                  -1.795711
rel_synonyms_count                                         0.234607
global_aoa * global_clustering                             0.037816
global_aoa * global_frequency                             -0.002366
global_aoa * global_letters_count                         -0.011936
global_aoa * global_orthographic_density                  -0.232365
global_aoa * global_synonyms_count                         0.027043
global_aoa * rel_aoa                                       0.008939
global_aoa * rel_clustering                                0.045353
global_aoa * rel_frequency                                -0.020859
global_aoa * rel_letters_count                             0.057120
global_aoa * rel_orthographic_density                      0.259896
global_aoa * rel_synonyms_count                            0.127575
global_clustering * global_frequency                      -0.305001
global_clustering * global_letters_count                   0.078719
global_clustering * global_orthographic_density            0.195952
global_clustering * global_synonyms_count                 -0.567431
global_clustering * rel_aoa                               -0.083429
global_clustering * rel_clustering                        -0.033084
global_clustering * rel_frequency                          0.057991
global_clustering * rel_letters_count                     -0.064768
global_clustering * rel_orthographic_density              -0.086372
global_clustering * rel_synonyms_count                     0.593973
global_frequency * global_letters_count                    0.057510
global_frequency * global_orthographic_density             0.117827
global_frequency * global_synonyms_count                   0.322169
global_frequency * rel_aoa                                 0.000965
global_frequency * rel_clustering                          0.187302
global_frequency * rel_frequency                           0.000819
global_frequency * rel_letters_count                      -0.010745
global_frequency * rel_orthographic_density                0.027371
global_frequency * rel_synonyms_count                     -0.055804
global_letters_count * global_orthographic_density         0.194750
global_letters_count * global_synonyms_count              -0.178689
global_letters_count * rel_aoa                             0.060523
global_letters_count * rel_clustering                     -0.010282
global_letters_count * rel_frequency                       0.017675
global_letters_count * rel_letters_count                  -0.017485
global_letters_count * rel_orthographic_density           -0.184862
global_letters_count * rel_synonyms_count                  0.312292
global_orthographic_density * global_synonyms_count       -1.027310
global_orthographic_density * rel_aoa                      0.188782
global_orthographic_density * rel_clustering               0.056586
global_orthographic_density * rel_frequency               -0.013022
global_orthographic_density * rel_letters_count           -0.143664
global_orthographic_density * rel_orthographic_density    -0.026416
global_orthographic_density * rel_synonyms_count           0.886626
global_synonyms_count * rel_aoa                           -0.031084
global_synonyms_count * rel_clustering                     0.330323
global_synonyms_count * rel_frequency                     -0.231897
global_synonyms_count * rel_letters_count                 -0.002939
global_synonyms_count * rel_orthographic_density           0.662627
global_synonyms_count * rel_synonyms_count                 0.023913
rel_aoa * rel_clustering                                   0.078750
rel_aoa * rel_frequency                                    0.011074
rel_aoa * rel_letters_count                               -0.073676
rel_aoa * rel_orthographic_density                        -0.195560
rel_aoa * rel_synonyms_count                              -0.104642
rel_clustering * rel_frequency                            -0.054519
rel_clustering * rel_letters_count                         0.013002
rel_clustering * rel_orthographic_density                 -0.081386
rel_clustering * rel_synonyms_count                       -0.492275
rel_frequency * rel_letters_count                         -0.066382
rel_frequency * rel_orthographic_density                  -0.111099
rel_frequency * rel_synonyms_count                        -0.007117
rel_letters_count * rel_orthographic_density               0.097216
rel_letters_count * rel_synonyms_count                    -0.054744
rel_orthographic_density * rel_synonyms_count             -0.372659
dtype: float64

Regressing rel clustering with 430 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.286341131044069

intercept                     -0.816235
global_aoa                    -0.034148
global_clustering             -0.536476
global_frequency              -0.127411
global_letters_count          -0.079561
global_orthographic_density   -0.117181
global_synonyms_count         -0.153109
rel_aoa                        0.018806
rel_clustering                 0.947353
rel_frequency                  0.107627
rel_letters_count              0.051437
rel_orthographic_density       0.042147
rel_synonyms_count             0.050924
dtype: float64

Regressing rel clustering with 430 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.4531538462732221

intercept                                                 13.555865
global_aoa                                                 0.365467
global_clustering                                          0.897010
global_frequency                                          -2.121001
global_letters_count                                      -0.737472
global_orthographic_density                                0.376531
global_synonyms_count                                     -1.586738
rel_aoa                                                   -0.959394
rel_clustering                                            -0.881024
rel_frequency                                              0.575680
rel_letters_count                                         -0.005739
rel_orthographic_density                                  -1.480007
rel_synonyms_count                                        -1.543459
global_aoa * global_clustering                             0.056145
global_aoa * global_frequency                              0.009994
global_aoa * global_letters_count                          0.013119
global_aoa * global_orthographic_density                  -0.165242
global_aoa * global_synonyms_count                        -0.012033
global_aoa * rel_aoa                                       0.009637
global_aoa * rel_clustering                                0.015555
global_aoa * rel_frequency                                -0.028513
global_aoa * rel_letters_count                             0.033236
global_aoa * rel_orthographic_density                      0.202015
global_aoa * rel_synonyms_count                            0.131831
global_clustering * global_frequency                      -0.232347
global_clustering * global_letters_count                   0.035206
global_clustering * global_orthographic_density            0.247991
global_clustering * global_synonyms_count                 -0.270422
global_clustering * rel_aoa                               -0.090881
global_clustering * rel_clustering                        -0.036541
global_clustering * rel_frequency                          0.035500
global_clustering * rel_letters_count                     -0.031536
global_clustering * rel_orthographic_density              -0.128618
global_clustering * rel_synonyms_count                     0.365896
global_frequency * global_letters_count                    0.054891
global_frequency * global_orthographic_density             0.143398
global_frequency * global_synonyms_count                   0.207887
global_frequency * rel_aoa                                -0.001903
global_frequency * rel_clustering                          0.137761
global_frequency * rel_frequency                           0.002674
global_frequency * rel_letters_count                      -0.017940
global_frequency * rel_orthographic_density               -0.005895
global_frequency * rel_synonyms_count                      0.024770
global_letters_count * global_orthographic_density         0.169586
global_letters_count * global_synonyms_count              -0.165384
global_letters_count * rel_aoa                             0.038752
global_letters_count * rel_clustering                      0.011265
global_letters_count * rel_frequency                       0.013479
global_letters_count * rel_letters_count                  -0.011062
global_letters_count * rel_orthographic_density           -0.145218
global_letters_count * rel_synonyms_count                  0.297228
global_orthographic_density * global_synonyms_count       -0.915493
global_orthographic_density * rel_aoa                      0.133141
global_orthographic_density * rel_clustering              -0.056568
global_orthographic_density * rel_frequency               -0.058100
global_orthographic_density * rel_letters_count           -0.122692
global_orthographic_density * rel_orthographic_density    -0.040117
global_orthographic_density * rel_synonyms_count           0.788204
global_synonyms_count * rel_aoa                           -0.064522
global_synonyms_count * rel_clustering                     0.145593
global_synonyms_count * rel_frequency                     -0.203115
global_synonyms_count * rel_letters_count                 -0.012831
global_synonyms_count * rel_orthographic_density           0.438065
global_synonyms_count * rel_synonyms_count                 0.004769
rel_aoa * rel_clustering                                   0.064903
rel_aoa * rel_frequency                                    0.005640
rel_aoa * rel_letters_count                               -0.056318
rel_aoa * rel_orthographic_density                        -0.139213
rel_aoa * rel_synonyms_count                              -0.051600
rel_clustering * rel_frequency                            -0.031287
rel_clustering * rel_letters_count                         0.036434
rel_clustering * rel_orthographic_density                  0.015106
rel_clustering * rel_synonyms_count                       -0.325593
rel_frequency * rel_letters_count                         -0.048944
rel_frequency * rel_orthographic_density                  -0.070970
rel_frequency * rel_synonyms_count                        -0.007589
rel_letters_count * rel_orthographic_density               0.067631
rel_letters_count * rel_synonyms_count                    -0.070163
rel_orthographic_density * rel_synonyms_count             -0.224188
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17738009962795798

intercept                      4.937193
global_aoa                    -0.040005
global_clustering             -0.090776
global_frequency              -0.088025
global_letters_count           0.365101
global_orthographic_density   -0.194639
global_synonyms_count         -0.357829
dtype: float64

Regressing global letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2334438363847675

intercept                                              4.247792
global_aoa                                            -0.129144
global_clustering                                     -2.120064
global_frequency                                      -0.299885
global_letters_count                                  -0.361516
global_orthographic_density                           -2.699095
global_synonyms_count                                 -1.343888
global_aoa * global_clustering                         0.223861
global_aoa * global_frequency                          0.108282
global_aoa * global_letters_count                      0.053499
global_aoa * global_orthographic_density               0.052334
global_aoa * global_synonyms_count                     0.131574
global_clustering * global_frequency                   0.107075
global_clustering * global_letters_count              -0.108042
global_clustering * global_orthographic_density        0.053645
global_clustering * global_synonyms_count              0.333585
global_frequency * global_letters_count               -0.040553
global_frequency * global_orthographic_density         0.227924
global_frequency * global_synonyms_count               0.149570
global_letters_count * global_orthographic_density     0.040880
global_letters_count * global_synonyms_count           0.063143
global_orthographic_density * global_synonyms_count    0.354873
dtype: float64

Regressing rel letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.10710340786377703

intercept                      1.836490
global_aoa                    -0.056167
global_clustering             -0.113071
global_frequency              -0.127826
global_letters_count           0.265452
global_orthographic_density   -0.128110
global_synonyms_count         -0.411980
dtype: float64

Regressing rel letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16377346688515815

intercept                                             -1.236984
global_aoa                                             0.369891
global_clustering                                     -2.599642
global_frequency                                      -0.301454
global_letters_count                                  -0.671935
global_orthographic_density                           -2.861836
global_synonyms_count                                 -2.136046
global_aoa * global_clustering                         0.272374
global_aoa * global_frequency                          0.100591
global_aoa * global_letters_count                      0.028333
global_aoa * global_orthographic_density               0.030921
global_aoa * global_synonyms_count                     0.133022
global_clustering * global_frequency                   0.147568
global_clustering * global_letters_count              -0.133586
global_clustering * global_orthographic_density        0.026501
global_clustering * global_synonyms_count              0.143150
global_frequency * global_letters_count               -0.008017
global_frequency * global_orthographic_density         0.269588
global_frequency * global_synonyms_count               0.123094
global_letters_count * global_orthographic_density     0.016048
global_letters_count * global_synonyms_count           0.051558
global_orthographic_density * global_synonyms_count    0.308326
dtype: float64

Regressing global letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13785450696702473

intercept                   5.370114
rel_aoa                    -0.167774
rel_clustering              0.186391
rel_frequency               0.029990
rel_letters_count           0.296323
rel_orthographic_density   -0.363742
rel_synonyms_count         -0.275723
dtype: float64

Regressing global letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1767559159282206

intercept                                        5.433228
rel_aoa                                         -0.295646
rel_clustering                                   0.407700
rel_frequency                                    0.169098
rel_letters_count                                0.331298
rel_orthographic_density                        -0.551051
rel_synonyms_count                              -0.060826
rel_aoa * rel_clustering                         0.122178
rel_aoa * rel_frequency                         -0.003558
rel_aoa * rel_letters_count                      0.040512
rel_aoa * rel_orthographic_density               0.028872
rel_aoa * rel_synonyms_count                     0.117624
rel_clustering * rel_frequency                   0.011944
rel_clustering * rel_letters_count              -0.062457
rel_clustering * rel_orthographic_density        0.166565
rel_clustering * rel_synonyms_count              0.249160
rel_frequency * rel_letters_count               -0.046081
rel_frequency * rel_orthographic_density         0.057197
rel_frequency * rel_synonyms_count               0.037444
rel_letters_count * rel_orthographic_density     0.123292
rel_letters_count * rel_synonyms_count           0.175471
rel_orthographic_density * rel_synonyms_count    0.658791
dtype: float64

Regressing rel letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.21368942103571742

intercept                   0.860313
rel_aoa                    -0.131576
rel_clustering              0.077136
rel_frequency              -0.167865
rel_letters_count           0.452525
rel_orthographic_density   -0.075312
rel_synonyms_count         -0.227157
dtype: float64

Regressing rel letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2524729609432135

intercept                                        0.913973
rel_aoa                                         -0.170146
rel_clustering                                   0.358839
rel_frequency                                   -0.067962
rel_letters_count                                0.555657
rel_orthographic_density                        -0.069058
rel_synonyms_count                               0.035819
rel_aoa * rel_clustering                         0.160944
rel_aoa * rel_frequency                          0.032195
rel_aoa * rel_letters_count                     -0.016441
rel_aoa * rel_orthographic_density              -0.079690
rel_aoa * rel_synonyms_count                     0.117758
rel_clustering * rel_frequency                   0.059010
rel_clustering * rel_letters_count              -0.018360
rel_clustering * rel_orthographic_density        0.234672
rel_clustering * rel_synonyms_count              0.248085
rel_frequency * rel_letters_count               -0.019364
rel_frequency * rel_orthographic_density         0.108056
rel_frequency * rel_synonyms_count               0.055270
rel_letters_count * rel_orthographic_density     0.104152
rel_letters_count * rel_synonyms_count           0.114293
rel_orthographic_density * rel_synonyms_count    0.486170
dtype: float64

Regressing global letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.21045569151183074

intercept                     -0.048071
global_aoa                     0.119095
global_clustering             -0.658219
global_frequency              -0.089791
global_letters_count           0.444004
global_orthographic_density   -0.090330
global_synonyms_count         -0.460133
rel_aoa                       -0.229065
rel_clustering                 0.637080
rel_frequency                 -0.025016
rel_letters_count             -0.088282
rel_orthographic_density      -0.067984
rel_synonyms_count             0.164749
dtype: float64

Regressing global letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3858153034623193

intercept                                                -18.507725
global_aoa                                                 1.379963
global_clustering                                         -5.081764
global_frequency                                           2.346076
global_letters_count                                      -2.645087
global_orthographic_density                               -5.903893
global_synonyms_count                                     -8.106992
rel_aoa                                                   -5.865589
rel_clustering                                             4.205088
rel_frequency                                             -1.178268
rel_letters_count                                          1.075038
rel_orthographic_density                                  -3.809107
rel_synonyms_count                                         5.575317
global_aoa * global_clustering                             0.395642
global_aoa * global_frequency                              0.044533
global_aoa * global_letters_count                          0.064518
global_aoa * global_orthographic_density                   0.144460
global_aoa * global_synonyms_count                         0.250345
global_aoa * rel_aoa                                       0.078536
global_aoa * rel_clustering                               -0.308318
global_aoa * rel_frequency                                 0.013169
global_aoa * rel_letters_count                             0.013153
global_aoa * rel_orthographic_density                     -0.012135
global_aoa * rel_synonyms_count                           -0.108938
global_clustering * global_frequency                       0.550831
global_clustering * global_letters_count                   0.015804
global_clustering * global_orthographic_density           -1.521096
global_clustering * global_synonyms_count                 -0.294234
global_clustering * rel_aoa                               -0.178781
global_clustering * rel_clustering                        -0.036286
global_clustering * rel_frequency                         -0.219269
global_clustering * rel_letters_count                     -0.295762
global_clustering * rel_orthographic_density               0.949095
global_clustering * rel_synonyms_count                     0.386451
global_frequency * global_letters_count                    0.361598
global_frequency * global_orthographic_density            -0.201416
global_frequency * global_synonyms_count                   0.208453
global_frequency * rel_aoa                                 0.245002
global_frequency * rel_clustering                         -0.414306
global_frequency * rel_frequency                          -0.032853
global_frequency * rel_letters_count                      -0.402047
global_frequency * rel_orthographic_density                0.627884
global_frequency * rel_synonyms_count                     -0.065583
global_letters_count * global_orthographic_density        -0.469156
global_letters_count * global_synonyms_count               0.432307
global_letters_count * rel_aoa                             0.223897
global_letters_count * rel_clustering                      0.088222
global_letters_count * rel_frequency                      -0.187206
global_letters_count * rel_letters_count                   0.037372
global_letters_count * rel_orthographic_density            0.495745
global_letters_count * rel_synonyms_count                 -0.219947
global_orthographic_density * global_synonyms_count        0.809809
global_orthographic_density * rel_aoa                      0.178736
global_orthographic_density * rel_clustering               0.957617
global_orthographic_density * rel_frequency                0.273694
global_orthographic_density * rel_letters_count            0.537121
global_orthographic_density * rel_orthographic_density     0.269212
global_orthographic_density * rel_synonyms_count          -0.682288
global_synonyms_count * rel_aoa                            0.281741
global_synonyms_count * rel_clustering                    -0.066442
global_synonyms_count * rel_frequency                     -0.034838
global_synonyms_count * rel_letters_count                 -1.180685
global_synonyms_count * rel_orthographic_density          -1.132685
global_synonyms_count * rel_synonyms_count                -0.285865
rel_aoa * rel_clustering                                   0.297074
rel_aoa * rel_frequency                                   -0.216537
rel_aoa * rel_letters_count                               -0.282398
rel_aoa * rel_orthographic_density                        -0.181084
rel_aoa * rel_synonyms_count                              -0.113938
rel_clustering * rel_frequency                             0.170757
rel_clustering * rel_letters_count                         0.138648
rel_clustering * rel_orthographic_density                 -0.037467
rel_clustering * rel_synonyms_count                        0.422332
rel_frequency * rel_letters_count                          0.166509
rel_frequency * rel_orthographic_density                  -0.465723
rel_frequency * rel_synonyms_count                         0.038986
rel_letters_count * rel_orthographic_density              -0.351109
rel_letters_count * rel_synonyms_count                     0.956284
rel_orthographic_density * rel_synonyms_count              1.643254
dtype: float64

Regressing rel letters_count with 507 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.25923262087241794

intercept                     -0.190206
global_aoa                     0.078593
global_clustering             -0.668469
global_frequency              -0.097523
global_letters_count          -0.467481
global_orthographic_density   -0.109678
global_synonyms_count         -0.514917
rel_aoa                       -0.180594
rel_clustering                 0.636892
rel_frequency                 -0.041143
rel_letters_count              0.839720
rel_orthographic_density      -0.084892
rel_synonyms_count             0.212575
dtype: float64

Regressing rel letters_count with 507 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.4220285220814608

intercept                                                -24.759642
global_aoa                                                 0.796426
global_clustering                                         -7.256875
global_frequency                                           2.115995
global_letters_count                                      -2.489968
global_orthographic_density                               -5.871800
global_synonyms_count                                     -7.171161
rel_aoa                                                   -4.708238
rel_clustering                                             4.868133
rel_frequency                                             -1.059206
rel_letters_count                                          1.173899
rel_orthographic_density                                  -3.911621
rel_synonyms_count                                         3.206255
global_aoa * global_clustering                             0.341703
global_aoa * global_frequency                              0.070357
global_aoa * global_letters_count                          0.076449
global_aoa * global_orthographic_density                   0.075024
global_aoa * global_synonyms_count                         0.218986
global_aoa * rel_aoa                                       0.081563
global_aoa * rel_clustering                               -0.206990
global_aoa * rel_frequency                                -0.010458
global_aoa * rel_letters_count                            -0.010398
global_aoa * rel_orthographic_density                      0.015202
global_aoa * rel_synonyms_count                           -0.115013
global_clustering * global_frequency                       0.601372
global_clustering * global_letters_count                   0.226752
global_clustering * global_orthographic_density           -1.079868
global_clustering * global_synonyms_count                 -0.155120
global_clustering * rel_aoa                               -0.111803
global_clustering * rel_clustering                        -0.076534
global_clustering * rel_frequency                         -0.267993
global_clustering * rel_letters_count                     -0.498130
global_clustering * rel_orthographic_density               0.455618
global_clustering * rel_synonyms_count                     0.051944
global_frequency * global_letters_count                    0.338373
global_frequency * global_orthographic_density             0.007443
global_frequency * global_synonyms_count                   0.245587
global_frequency * rel_aoa                                 0.199891
global_frequency * rel_clustering                         -0.467044
global_frequency * rel_frequency                          -0.034680
global_frequency * rel_letters_count                      -0.392284
global_frequency * rel_orthographic_density                0.399953
global_frequency * rel_synonyms_count                     -0.074121
global_letters_count * global_orthographic_density        -0.295736
global_letters_count * global_synonyms_count               0.439299
global_letters_count * rel_aoa                             0.169755
global_letters_count * rel_clustering                     -0.067736
global_letters_count * rel_frequency                      -0.180235
global_letters_count * rel_letters_count                   0.037101
global_letters_count * rel_orthographic_density            0.355861
global_letters_count * rel_synonyms_count                 -0.191353
global_orthographic_density * global_synonyms_count        0.633221
global_orthographic_density * rel_aoa                      0.206885
global_orthographic_density * rel_clustering               0.816827
global_orthographic_density * rel_frequency                0.128262
global_orthographic_density * rel_letters_count            0.406431
global_orthographic_density * rel_orthographic_density     0.264025
global_orthographic_density * rel_synonyms_count          -0.505278
global_synonyms_count * rel_aoa                            0.318857
global_synonyms_count * rel_clustering                    -0.106128
global_synonyms_count * rel_frequency                     -0.065389
global_synonyms_count * rel_letters_count                 -1.190523
global_synonyms_count * rel_orthographic_density          -0.957459
global_synonyms_count * rel_synonyms_count                -0.270976
rel_aoa * rel_clustering                                   0.165095
rel_aoa * rel_frequency                                   -0.172200
rel_aoa * rel_letters_count                               -0.231667
rel_aoa * rel_orthographic_density                        -0.183619
rel_aoa * rel_synonyms_count                              -0.103964
rel_clustering * rel_frequency                             0.200491
rel_clustering * rel_letters_count                         0.303906
rel_clustering * rel_orthographic_density                  0.154649
rel_clustering * rel_synonyms_count                        0.643525
rel_frequency * rel_letters_count                          0.176424
rel_frequency * rel_orthographic_density                  -0.320992
rel_frequency * rel_synonyms_count                         0.045252
rel_letters_count * rel_orthographic_density              -0.230834
rel_letters_count * rel_synonyms_count                     0.933891
rel_orthographic_density * rel_synonyms_count              1.472888
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10535896643101117

intercept                      1.066926
global_aoa                    -0.019574
global_clustering              0.024069
global_frequency              -0.023793
global_letters_count          -0.042994
global_orthographic_density   -0.030655
global_synonyms_count          0.268824
dtype: float64

Regressing global synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14294037430001194

intercept                                              4.267623
global_aoa                                            -0.247835
global_clustering                                      0.298177
global_frequency                                      -0.243656
global_letters_count                                  -0.161809
global_orthographic_density                           -0.457742
global_synonyms_count                                 -0.093912
global_aoa * global_clustering                        -0.005914
global_aoa * global_frequency                          0.003278
global_aoa * global_letters_count                      0.015720
global_aoa * global_orthographic_density               0.047620
global_aoa * global_synonyms_count                     0.063269
global_clustering * global_frequency                  -0.029829
global_clustering * global_letters_count              -0.002507
global_clustering * global_orthographic_density        0.014693
global_clustering * global_synonyms_count              0.102696
global_frequency * global_letters_count               -0.000779
global_frequency * global_orthographic_density         0.020347
global_frequency * global_synonyms_count               0.048541
global_letters_count * global_orthographic_density    -0.003598
global_letters_count * global_synonyms_count          -0.004416
global_orthographic_density * global_synonyms_count    0.097969
dtype: float64

Regressing rel synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0785368906365429

intercept                      0.607499
global_aoa                    -0.016875
global_clustering              0.004356
global_frequency              -0.020929
global_letters_count          -0.038016
global_orthographic_density   -0.036494
global_synonyms_count          0.229367
dtype: float64

Regressing rel synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.11447226276756207

intercept                                              4.514770
global_aoa                                            -0.285395
global_clustering                                      0.409313
global_frequency                                      -0.274727
global_letters_count                                  -0.180321
global_orthographic_density                           -0.579431
global_synonyms_count                                  0.295421
global_aoa * global_clustering                        -0.013615
global_aoa * global_frequency                          0.006429
global_aoa * global_letters_count                      0.012079
global_aoa * global_orthographic_density               0.046276
global_aoa * global_synonyms_count                     0.040763
global_clustering * global_frequency                  -0.032283
global_clustering * global_letters_count              -0.010285
global_clustering * global_orthographic_density       -0.003046
global_clustering * global_synonyms_count              0.125263
global_frequency * global_letters_count               -0.000620
global_frequency * global_orthographic_density         0.022066
global_frequency * global_synonyms_count               0.037189
global_letters_count * global_orthographic_density     0.002709
global_letters_count * global_synonyms_count          -0.003625
global_orthographic_density * global_synonyms_count    0.057154
dtype: float64

Regressing global synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09416370641915972

intercept                   0.414068
rel_aoa                     0.005794
rel_clustering             -0.046028
rel_frequency              -0.025040
rel_letters_count          -0.053602
rel_orthographic_density   -0.014165
rel_synonyms_count          0.250869
dtype: float64

Regressing global synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.140644843686771

intercept                                        0.486320
rel_aoa                                         -0.005376
rel_clustering                                  -0.151249
rel_frequency                                   -0.003125
rel_letters_count                               -0.102726
rel_orthographic_density                        -0.004023
rel_synonyms_count                               0.345334
rel_aoa * rel_clustering                         0.000843
rel_aoa * rel_frequency                          0.002189
rel_aoa * rel_letters_count                      0.035260
rel_aoa * rel_orthographic_density               0.060019
rel_aoa * rel_synonyms_count                     0.052333
rel_clustering * rel_frequency                  -0.029338
rel_clustering * rel_letters_count               0.013364
rel_clustering * rel_orthographic_density       -0.000922
rel_clustering * rel_synonyms_count              0.096875
rel_frequency * rel_letters_count               -0.000576
rel_frequency * rel_orthographic_density         0.018978
rel_frequency * rel_synonyms_count               0.044521
rel_letters_count * rel_orthographic_density    -0.005737
rel_letters_count * rel_synonyms_count           0.007664
rel_orthographic_density * rel_synonyms_count    0.065480
dtype: float64

Regressing rel synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.18797612094971705

intercept                   0.083377
rel_aoa                    -0.004691
rel_clustering             -0.017620
rel_frequency              -0.018269
rel_letters_count          -0.043323
rel_orthographic_density   -0.031279
rel_synonyms_count          0.406078
dtype: float64

Regressing rel synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.22573394773571254

intercept                                        0.157904
rel_aoa                                         -0.000373
rel_clustering                                  -0.120357
rel_frequency                                    0.012219
rel_letters_count                               -0.084100
rel_orthographic_density                        -0.040024
rel_synonyms_count                               0.543520
rel_aoa * rel_clustering                         0.002390
rel_aoa * rel_frequency                          0.006105
rel_aoa * rel_letters_count                      0.027508
rel_aoa * rel_orthographic_density               0.046229
rel_aoa * rel_synonyms_count                     0.030480
rel_clustering * rel_frequency                  -0.030667
rel_clustering * rel_letters_count               0.009752
rel_clustering * rel_orthographic_density       -0.001190
rel_clustering * rel_synonyms_count              0.093324
rel_frequency * rel_letters_count               -0.006352
rel_frequency * rel_orthographic_density         0.016387
rel_frequency * rel_synonyms_count               0.048952
rel_letters_count * rel_orthographic_density     0.006656
rel_letters_count * rel_synonyms_count           0.013296
rel_orthographic_density * rel_synonyms_count    0.083831
dtype: float64

Regressing global synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12457193586753834

intercept                      1.824411
global_aoa                    -0.039564
global_clustering              0.188253
global_frequency              -0.009142
global_letters_count           0.008211
global_orthographic_density   -0.010306
global_synonyms_count          0.196049
rel_aoa                        0.028203
rel_clustering                -0.188112
rel_frequency                 -0.013514
rel_letters_count             -0.056196
rel_orthographic_density      -0.018254
rel_synonyms_count             0.074266
dtype: float64

Regressing global synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2819158751261386

intercept                                                 9.742595
global_aoa                                                0.105114
global_clustering                                         2.762962
global_frequency                                         -0.354834
global_letters_count                                     -0.253778
global_orthographic_density                               1.456562
global_synonyms_count                                     4.183921
rel_aoa                                                  -0.393226
rel_clustering                                           -2.729920
rel_frequency                                            -0.057243
rel_letters_count                                        -0.322725
rel_orthographic_density                                 -0.599950
rel_synonyms_count                                       -4.993044
global_aoa * global_clustering                           -0.041248
global_aoa * global_frequency                            -0.021054
global_aoa * global_letters_count                        -0.014924
global_aoa * global_orthographic_density                 -0.028437
global_aoa * global_synonyms_count                        0.074660
global_aoa * rel_aoa                                      0.001212
global_aoa * rel_clustering                               0.062012
global_aoa * rel_frequency                                0.027730
global_aoa * rel_letters_count                            0.015091
global_aoa * rel_orthographic_density                     0.063658
global_aoa * rel_synonyms_count                          -0.020983
global_clustering * global_frequency                     -0.141819
global_clustering * global_letters_count                 -0.085423
global_clustering * global_orthographic_density          -0.250806
global_clustering * global_synonyms_count                 0.773717
global_clustering * rel_aoa                              -0.029997
global_clustering * rel_clustering                        0.038060
global_clustering * rel_frequency                         0.083832
global_clustering * rel_letters_count                    -0.002060
global_clustering * rel_orthographic_density              0.306128
global_clustering * rel_synonyms_count                   -0.777384
global_frequency * global_letters_count                   0.009736
global_frequency * global_orthographic_density           -0.198607
global_frequency * global_synonyms_count                  0.043785
global_frequency * rel_aoa                                0.010806
global_frequency * rel_clustering                         0.147831
global_frequency * rel_frequency                          0.009741
global_frequency * rel_letters_count                     -0.012510
global_frequency * rel_orthographic_density               0.171757
global_frequency * rel_synonyms_count                     0.000364
global_letters_count * global_orthographic_density       -0.163658
global_letters_count * global_synonyms_count             -0.037006
global_letters_count * rel_aoa                            0.011794
global_letters_count * rel_clustering                     0.079850
global_letters_count * rel_frequency                     -0.025706
global_letters_count * rel_letters_count                 -0.011991
global_letters_count * rel_orthographic_density           0.087896
global_letters_count * rel_synonyms_count                 0.099061
global_orthographic_density * global_synonyms_count       0.160166
global_orthographic_density * rel_aoa                     0.005389
global_orthographic_density * rel_clustering              0.198817
global_orthographic_density * rel_frequency               0.187911
global_orthographic_density * rel_letters_count           0.194344
global_orthographic_density * rel_orthographic_density   -0.016220
global_orthographic_density * rel_synonyms_count         -0.125425
global_synonyms_count * rel_aoa                           0.067461
global_synonyms_count * rel_clustering                   -0.449592
global_synonyms_count * rel_frequency                     0.160627
global_synonyms_count * rel_letters_count                -0.039414
global_synonyms_count * rel_orthographic_density         -0.315313
global_synonyms_count * rel_synonyms_count                0.089611
rel_aoa * rel_clustering                                  0.006354
rel_aoa * rel_frequency                                  -0.006299
rel_aoa * rel_letters_count                               0.026710
rel_aoa * rel_orthographic_density                        0.034770
rel_aoa * rel_synonyms_count                             -0.053526
rel_clustering * rel_frequency                           -0.104345
rel_clustering * rel_letters_count                       -0.011148
rel_clustering * rel_orthographic_density                -0.254131
rel_clustering * rel_synonyms_count                       0.464546
rel_frequency * rel_letters_count                         0.020690
rel_frequency * rel_orthographic_density                 -0.138310
rel_frequency * rel_synonyms_count                       -0.166545
rel_letters_count * rel_orthographic_density             -0.158592
rel_letters_count * rel_synonyms_count                   -0.008183
rel_orthographic_density * rel_synonyms_count             0.375205
dtype: float64

Regressing rel synonyms_count with 496 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.2705500998025506

intercept                      1.344003
global_aoa                    -0.032182
global_clustering              0.155983
global_frequency              -0.004398
global_letters_count           0.021378
global_orthographic_density    0.019647
global_synonyms_count         -0.632783
rel_aoa                        0.020922
rel_clustering                -0.161586
rel_frequency                 -0.016431
rel_letters_count             -0.057186
rel_orthographic_density      -0.040022
rel_synonyms_count             0.980982
dtype: float64

Regressing rel synonyms_count with 496 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.40096653969414864

intercept                                                 8.343245
global_aoa                                                0.178517
global_clustering                                         2.511627
global_frequency                                         -0.351280
global_letters_count                                     -0.192495
global_orthographic_density                               1.579291
global_synonyms_count                                     2.758945
rel_aoa                                                  -0.274555
rel_clustering                                           -2.540023
rel_frequency                                            -0.099692
rel_letters_count                                        -0.403456
rel_orthographic_density                                 -0.540014
rel_synonyms_count                                       -3.592415
global_aoa * global_clustering                           -0.027854
global_aoa * global_frequency                            -0.017677
global_aoa * global_letters_count                        -0.014066
global_aoa * global_orthographic_density                 -0.045455
global_aoa * global_synonyms_count                        0.059432
global_aoa * rel_aoa                                      0.000318
global_aoa * rel_clustering                               0.048185
global_aoa * rel_frequency                                0.022153
global_aoa * rel_letters_count                            0.014088
global_aoa * rel_orthographic_density                     0.070124
global_aoa * rel_synonyms_count                           0.002477
global_clustering * global_frequency                     -0.133848
global_clustering * global_letters_count                 -0.080708
global_clustering * global_orthographic_density          -0.217043
global_clustering * global_synonyms_count                 0.620124
global_clustering * rel_aoa                              -0.018508
global_clustering * rel_clustering                        0.034525
global_clustering * rel_frequency                         0.065104
global_clustering * rel_letters_count                     0.001941
global_clustering * rel_orthographic_density              0.317111
global_clustering * rel_synonyms_count                   -0.646755
global_frequency * global_letters_count                   0.004110
global_frequency * global_orthographic_density           -0.186763
global_frequency * global_synonyms_count                  0.037910
global_frequency * rel_aoa                                0.003760
global_frequency * rel_clustering                         0.140127
global_frequency * rel_frequency                          0.005099
global_frequency * rel_letters_count                      0.002739
global_frequency * rel_orthographic_density               0.165991
global_frequency * rel_synonyms_count                     0.013651
global_letters_count * global_orthographic_density       -0.130143
global_letters_count * global_synonyms_count             -0.047484
global_letters_count * rel_aoa                            0.011199
global_letters_count * rel_clustering                     0.086777
global_letters_count * rel_frequency                     -0.016766
global_letters_count * rel_letters_count                 -0.010708
global_letters_count * rel_orthographic_density           0.072181
global_letters_count * rel_synonyms_count                 0.094078
global_orthographic_density * global_synonyms_count       0.100638
global_orthographic_density * rel_aoa                     0.022827
global_orthographic_density * rel_clustering              0.172202
global_orthographic_density * rel_frequency               0.176298
global_orthographic_density * rel_letters_count           0.156759
global_orthographic_density * rel_orthographic_density   -0.014563
global_orthographic_density * rel_synonyms_count         -0.105748
global_synonyms_count * rel_aoa                           0.067196
global_synonyms_count * rel_clustering                   -0.355333
global_synonyms_count * rel_frequency                     0.167538
global_synonyms_count * rel_letters_count                -0.016706
global_synonyms_count * rel_orthographic_density         -0.242598
global_synonyms_count * rel_synonyms_count                0.090958
rel_aoa * rel_clustering                                  0.001217
rel_aoa * rel_frequency                                  -0.000816
rel_aoa * rel_letters_count                               0.018882
rel_aoa * rel_orthographic_density                        0.009137
rel_aoa * rel_synonyms_count                             -0.078753
rel_clustering * rel_frequency                           -0.086429
rel_clustering * rel_letters_count                       -0.025203
rel_clustering * rel_orthographic_density                -0.260703
rel_clustering * rel_synonyms_count                       0.372348
rel_frequency * rel_letters_count                         0.002280
rel_frequency * rel_orthographic_density                 -0.136437
rel_frequency * rel_synonyms_count                       -0.181975
rel_letters_count * rel_orthographic_density             -0.123148
rel_letters_count * rel_synonyms_count                   -0.020455
rel_orthographic_density * rel_synonyms_count             0.329659
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18347933188835075

intercept                      0.987646
global_aoa                    -0.021302
global_clustering             -0.002785
global_frequency               0.014097
global_letters_count          -0.030968
global_orthographic_density    0.332275
global_synonyms_count          0.124809
dtype: float64

Regressing global orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24215041575874174

intercept                                             -1.014954
global_aoa                                            -0.057334
global_clustering                                      0.137902
global_frequency                                       0.484922
global_letters_count                                   0.004595
global_orthographic_density                            0.441827
global_synonyms_count                                  1.571760
global_aoa * global_clustering                        -0.054044
global_aoa * global_frequency                         -0.025660
global_aoa * global_letters_count                     -0.017385
global_aoa * global_orthographic_density               0.072825
global_aoa * global_synonyms_count                    -0.074161
global_clustering * global_frequency                   0.045463
global_clustering * global_letters_count              -0.027304
global_clustering * global_orthographic_density       -0.047198
global_clustering * global_synonyms_count              0.088065
global_frequency * global_letters_count                0.002074
global_frequency * global_orthographic_density        -0.046215
global_frequency * global_synonyms_count               0.032859
global_letters_count * global_orthographic_density    -0.058541
global_letters_count * global_synonyms_count          -0.076777
global_orthographic_density * global_synonyms_count   -0.227525
dtype: float64

Regressing rel orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16084023904619493

intercept                     -1.641487
global_aoa                    -0.013052
global_clustering             -0.037538
global_frequency               0.022749
global_letters_count          -0.013293
global_orthographic_density    0.312372
global_synonyms_count          0.126064
dtype: float64

Regressing rel orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2073093626022967

intercept                                             -0.589592
global_aoa                                            -0.204654
global_clustering                                      0.405156
global_frequency                                       0.286348
global_letters_count                                  -0.139759
global_orthographic_density                            0.256530
global_synonyms_count                                  1.052692
global_aoa * global_clustering                        -0.046835
global_aoa * global_frequency                         -0.016361
global_aoa * global_letters_count                     -0.001369
global_aoa * global_orthographic_density               0.081380
global_aoa * global_synonyms_count                    -0.058626
global_clustering * global_frequency                   0.013920
global_clustering * global_letters_count              -0.040456
global_clustering * global_orthographic_density       -0.019633
global_clustering * global_synonyms_count              0.045810
global_frequency * global_letters_count               -0.005390
global_frequency * global_orthographic_density        -0.032114
global_frequency * global_synonyms_count               0.014172
global_letters_count * global_orthographic_density    -0.041949
global_letters_count * global_synonyms_count          -0.036498
global_orthographic_density * global_synonyms_count   -0.150553
dtype: float64

Regressing global orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1498284446324043

intercept                   1.638087
rel_aoa                     0.000165
rel_clustering             -0.083425
rel_frequency              -0.007408
rel_letters_count          -0.008093
rel_orthographic_density    0.370414
rel_synonyms_count          0.148940
dtype: float64

Regressing global orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18464638660187604

intercept                                        1.638913
rel_aoa                                          0.102805
rel_clustering                                   0.147803
rel_frequency                                   -0.017581
rel_letters_count                               -0.038724
rel_orthographic_density                         0.427296
rel_synonyms_count                               0.365666
rel_aoa * rel_clustering                         0.007770
rel_aoa * rel_frequency                          0.006087
rel_aoa * rel_letters_count                     -0.027164
rel_aoa * rel_orthographic_density               0.023171
rel_aoa * rel_synonyms_count                    -0.021413
rel_clustering * rel_frequency                   0.042277
rel_clustering * rel_letters_count              -0.045053
rel_clustering * rel_orthographic_density        0.041735
rel_clustering * rel_synonyms_count             -0.011917
rel_frequency * rel_letters_count               -0.008482
rel_frequency * rel_orthographic_density        -0.015150
rel_frequency * rel_synonyms_count               0.041298
rel_letters_count * rel_orthographic_density    -0.048485
rel_letters_count * rel_synonyms_count          -0.088711
rel_orthographic_density * rel_synonyms_count   -0.156169
dtype: float64

Regressing rel orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24391253423243506

intercept                  -0.418340
rel_aoa                     0.003763
rel_clustering             -0.091484
rel_frequency               0.035463
rel_letters_count           0.007459
rel_orthographic_density    0.462839
rel_synonyms_count          0.110991
dtype: float64

Regressing rel orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.26778084180258244

intercept                                       -0.360657
rel_aoa                                          0.090286
rel_clustering                                   0.059833
rel_frequency                                    0.063468
rel_letters_count                               -0.029555
rel_orthographic_density                         0.499822
rel_synonyms_count                               0.252917
rel_aoa * rel_clustering                        -0.001550
rel_aoa * rel_frequency                          0.000432
rel_aoa * rel_letters_count                     -0.017139
rel_aoa * rel_orthographic_density               0.041414
rel_aoa * rel_synonyms_count                    -0.009277
rel_clustering * rel_frequency                   0.012916
rel_clustering * rel_letters_count              -0.048854
rel_clustering * rel_orthographic_density        0.017135
rel_clustering * rel_synonyms_count             -0.021519
rel_frequency * rel_letters_count               -0.016549
rel_frequency * rel_orthographic_density        -0.007535
rel_frequency * rel_synonyms_count               0.024024
rel_letters_count * rel_orthographic_density    -0.030924
rel_letters_count * rel_synonyms_count          -0.070098
rel_orthographic_density * rel_synonyms_count   -0.120526
dtype: float64

Regressing global orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.20941892778678461

intercept                      4.380703
global_aoa                    -0.034263
global_clustering              0.278645
global_frequency              -0.047870
global_letters_count          -0.191621
global_orthographic_density    0.251976
global_synonyms_count         -0.023413
rel_aoa                        0.017443
rel_clustering                -0.312638
rel_frequency                  0.076476
rel_letters_count              0.178510
rel_orthographic_density       0.084594
rel_synonyms_count             0.171184
dtype: float64

Regressing global orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.34601970694630246

intercept                                                 3.135476
global_aoa                                               -0.646649
global_clustering                                         0.125532
global_frequency                                          0.231341
global_letters_count                                     -0.409319
global_orthographic_density                               0.416533
global_synonyms_count                                     6.299264
rel_aoa                                                   0.164909
rel_clustering                                           -0.360952
rel_frequency                                            -0.140169
rel_letters_count                                         0.127116
rel_orthographic_density                                 -0.763003
rel_synonyms_count                                        0.953241
global_aoa * global_clustering                           -0.074504
global_aoa * global_frequency                            -0.015736
global_aoa * global_letters_count                        -0.005479
global_aoa * global_orthographic_density                  0.218729
global_aoa * global_synonyms_count                       -0.152354
global_aoa * rel_aoa                                     -0.006595
global_aoa * rel_clustering                              -0.006405
global_aoa * rel_frequency                                0.001661
global_aoa * rel_letters_count                            0.015892
global_aoa * rel_orthographic_density                    -0.129222
global_aoa * rel_synonyms_count                           0.021675
global_clustering * global_frequency                      0.029185
global_clustering * global_letters_count                 -0.202031
global_clustering * global_orthographic_density           0.320551
global_clustering * global_synonyms_count                 0.559764
global_clustering * rel_aoa                              -0.115728
global_clustering * rel_clustering                       -0.020761
global_clustering * rel_frequency                        -0.129357
global_clustering * rel_letters_count                     0.246701
global_clustering * rel_orthographic_density             -0.207400
global_clustering * rel_synonyms_count                    0.127896
global_frequency * global_letters_count                  -0.061833
global_frequency * global_orthographic_density            0.107975
global_frequency * global_synonyms_count                 -0.258557
global_frequency * rel_aoa                               -0.041397
global_frequency * rel_clustering                         0.044940
global_frequency * rel_frequency                         -0.003431
global_frequency * rel_letters_count                      0.137812
global_frequency * rel_orthographic_density              -0.034521
global_frequency * rel_synonyms_count                     0.219537
global_letters_count * global_orthographic_density       -0.164577
global_letters_count * global_synonyms_count             -0.110758
global_letters_count * rel_aoa                           -0.038128
global_letters_count * rel_clustering                     0.289063
global_letters_count * rel_frequency                     -0.021288
global_letters_count * rel_letters_count                 -0.007737
global_letters_count * rel_orthographic_density           0.189428
global_letters_count * rel_synonyms_count                -0.150653
global_orthographic_density * global_synonyms_count       0.448159
global_orthographic_density * rel_aoa                    -0.082091
global_orthographic_density * rel_clustering             -0.521871
global_orthographic_density * rel_frequency              -0.226460
global_orthographic_density * rel_letters_count          -0.046710
global_orthographic_density * rel_orthographic_density   -0.038754
global_orthographic_density * rel_synonyms_count         -0.638091
global_synonyms_count * rel_aoa                           0.038131
global_synonyms_count * rel_clustering                   -0.792214
global_synonyms_count * rel_frequency                     0.151392
global_synonyms_count * rel_letters_count                 0.285545
global_synonyms_count * rel_orthographic_density         -0.518628
global_synonyms_count * rel_synonyms_count               -0.057518
rel_aoa * rel_clustering                                  0.153499
rel_aoa * rel_frequency                                   0.044532
rel_aoa * rel_letters_count                               0.015154
rel_aoa * rel_orthographic_density                        0.058624
rel_aoa * rel_synonyms_count                              0.003209
rel_clustering * rel_frequency                            0.071321
rel_clustering * rel_letters_count                       -0.357294
rel_clustering * rel_orthographic_density                 0.377701
rel_clustering * rel_synonyms_count                       0.071336
rel_frequency * rel_letters_count                        -0.055672
rel_frequency * rel_orthographic_density                  0.109744
rel_frequency * rel_synonyms_count                       -0.057905
rel_letters_count * rel_orthographic_density             -0.064823
rel_letters_count * rel_synonyms_count                   -0.115540
rel_orthographic_density * rel_synonyms_count             0.484834
dtype: float64

Regressing rel orthographic_density with 433 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.27534927535968223

intercept                      2.955543
global_aoa                    -0.022053
global_clustering              0.244408
global_frequency              -0.022437
global_letters_count          -0.137760
global_orthographic_density   -0.435771
global_synonyms_count          0.047807
rel_aoa                        0.007468
rel_clustering                -0.269614
rel_frequency                  0.058901
rel_letters_count              0.111637
rel_orthographic_density       0.814387
rel_synonyms_count             0.078877
dtype: float64

Regressing rel orthographic_density with 433 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3943280381724837

intercept                                                 2.657511
global_aoa                                               -0.229411
global_clustering                                         1.033528
global_frequency                                          0.410014
global_letters_count                                     -0.718972
global_orthographic_density                               0.240041
global_synonyms_count                                     5.476968
rel_aoa                                                  -0.253744
rel_clustering                                           -1.769544
rel_frequency                                            -0.633999
rel_letters_count                                         0.246963
rel_orthographic_density                                 -0.790155
rel_synonyms_count                                        1.022201
global_aoa * global_clustering                           -0.067831
global_aoa * global_frequency                            -0.034737
global_aoa * global_letters_count                        -0.008284
global_aoa * global_orthographic_density                  0.154069
global_aoa * global_synonyms_count                       -0.170153
global_aoa * rel_aoa                                     -0.010215
global_aoa * rel_clustering                              -0.000769
global_aoa * rel_frequency                                0.016269
global_aoa * rel_letters_count                            0.025446
global_aoa * rel_orthographic_density                    -0.055787
global_aoa * rel_synonyms_count                           0.061485
global_clustering * global_frequency                     -0.003398
global_clustering * global_letters_count                 -0.225504
global_clustering * global_orthographic_density           0.108180
global_clustering * global_synonyms_count                 0.376933
global_clustering * rel_aoa                              -0.110457
global_clustering * rel_clustering                       -0.015890
global_clustering * rel_frequency                        -0.104343
global_clustering * rel_letters_count                     0.262726
global_clustering * rel_orthographic_density             -0.015136
global_clustering * rel_synonyms_count                    0.257565
global_frequency * global_letters_count                  -0.041954
global_frequency * global_orthographic_density           -0.033371
global_frequency * global_synonyms_count                 -0.293940
global_frequency * rel_aoa                               -0.027598
global_frequency * rel_clustering                         0.094933
global_frequency * rel_frequency                         -0.002768
global_frequency * rel_letters_count                      0.123749
global_frequency * rel_orthographic_density               0.108795
global_frequency * rel_synonyms_count                     0.241736
global_letters_count * global_orthographic_density       -0.119239
global_letters_count * global_synonyms_count             -0.054650
global_letters_count * rel_aoa                           -0.007770
global_letters_count * rel_clustering                     0.322101
global_letters_count * rel_frequency                     -0.004107
global_letters_count * rel_letters_count                 -0.014496
global_letters_count * rel_orthographic_density           0.164231
global_letters_count * rel_synonyms_count                -0.183441
global_orthographic_density * global_synonyms_count       0.488322
global_orthographic_density * rel_aoa                    -0.007128
global_orthographic_density * rel_clustering             -0.208076
global_orthographic_density * rel_frequency              -0.018342
global_orthographic_density * rel_letters_count          -0.064404
global_orthographic_density * rel_orthographic_density   -0.014878
global_orthographic_density * rel_synonyms_count         -0.564014
global_synonyms_count * rel_aoa                           0.053172
global_synonyms_count * rel_clustering                   -0.480375
global_synonyms_count * rel_frequency                     0.212889
global_synonyms_count * rel_letters_count                 0.225216
global_synonyms_count * rel_orthographic_density         -0.525459
global_synonyms_count * rel_synonyms_count               -0.086103
rel_aoa * rel_clustering                                  0.143281
rel_aoa * rel_frequency                                   0.030000
rel_aoa * rel_letters_count                              -0.009545
rel_aoa * rel_orthographic_density                       -0.015712
rel_aoa * rel_synonyms_count                             -0.033126
rel_clustering * rel_frequency                            0.043215
rel_clustering * rel_letters_count                       -0.391816
rel_clustering * rel_orthographic_density                 0.075626
rel_clustering * rel_synonyms_count                      -0.185957
rel_frequency * rel_letters_count                        -0.078576
rel_frequency * rel_orthographic_density                 -0.091159
rel_frequency * rel_synonyms_count                       -0.131259
rel_letters_count * rel_orthographic_density             -0.082888
rel_letters_count * rel_synonyms_count                   -0.062322
rel_orthographic_density * rel_synonyms_count             0.393312
dtype: float64