Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also save each figure to the path built
# from settings.FIGURE; when False, figures are only rendered in the notebook.
SAVE_FIGURES = False
# Subset of features passed to the "paper-*" versions of the figures.
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
# NOTE(review): not used in the visible part of this notebook; presumably the
# number of PCA components for a later section -- confirm.
N_COMPONENTS = 3
# Maximum number of bins for source feature values; the plotting functions
# retry with fewer bins when binning with this many raises a ValueError.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed in this notebook.
model = Model(time=Time.discrete, source=Source.majority, past=Past.all, durl=Durl.all, max_distance=1)
# Accumulates one dict (future DataFrame row) per (substitution, feature).
data = []

# First pass: only collect the ids of the substitutions mined with `model`,
# so that each substitution can then be loaded in its own short session.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

# Second pass: load each substitution and record its feature values.
# All attribute accesses on `substitution` happen inside the session scope.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            # Feature values of the source (disappearing) and destination
            # (appearing) words, in absolute terms and sentence-relative.
            # NOTE(review): sentence_relative='median' presumably means
            # "relative to the sentence median" although the prose below
            # speaks of the sentence average -- confirm.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                # Identifiers, used later to average per substitution and
                # then per cluster.
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis values: h0 is the feature average (H_0),
                # h0n the average over synonyms of the source word (H_00);
                # the *_rel variants are their sentence-relative versions.
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

# Long-format frame: one row per (substitution, feature).
original_variations = pd.DataFrame(data)
# Free the list of dicts now that the DataFrame holds the data.
del data


Got 14308 substitutions for model Model(time=Time.discrete, source=Source.majority, past=Past.all, durl=Durl.all, max_distance=1)
100% (14308 of 14308) |####################| Elapsed Time: 0:03:32 Time: 0:03:32

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Average in two steps so that each cluster contributes a single point per
# feature: first over rows sharing the same (destination, occurrence,
# position, feature), then over all of those within a cluster.
#
# The value columns are selected with a *list*: subsetting a GroupBy with a
# bare tuple of names is deprecated and rejected by recent pandas. The
# grouping keys ('cluster_id', 'feature') are not repeated in the selection
# since `as_index=False` already returns them as columns.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)\
    [['source', 'source_rel', 'destination', 'destination_rel',
      'h0', 'h0_rel', 'h0n', 'h0n_rel']].mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# NOTE(review): the 'variation' column computed above is not blanked for
# these rows -- confirm whether that is intended.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               ['source', 'source_rel', 'destination', 'destination_rel',
                'h0', 'h0_rel', 'h0n', 'h0n_rel']] = np.nan

Prepare feature ordering.


In [5]:
# Sort features by the docstring of their transformed-feature function, which
# is the same text used for plot titles and legend labels further down.
ordered_features = sorted(Substitution.__features__,
                          key=lambda name: (Substitution
                                            ._transformed_feature(name)
                                            .__doc__))

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against H_0 and H_00.

    For each bin, the mean of `values` is compared to the mean of the null
    values (`h0`, then `h0n`) in that bin. The difference is flagged when the
    observed mean falls outside the null mean +/- the Student-t confidence
    half-width of the observed mean, at levels .05 (`*`), .01 (`**`) and
    .001 (`***`); `ns.` is printed when no level is reached.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like
        Zero-based bin label of each observation. Labels may be floats and
        may contain NaN (observations without a bin are ignored).
    h0, h0n : array-like
        Null-hypothesis values (H_0 and H_00) for each observation.
    values : array-like
        Observed values for each observation.

    All four arrays must support boolean-mask indexing and be aligned.
    """
    # Bin labels come back as floats as soon as one of them is missing, so
    # convert explicitly before using the maximum as a count.
    bin_count = int(np.nanmax(np.asarray(bins, dtype=float))) + 1
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Observed means and CI half-widths do not depend on the null hypothesis:
    # compute them once.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        # Standard error of the mean: the sample standard deviation
        # (ddof=1) divided by sqrt(n). Dividing by sqrt(n - 1) on top of
        # ddof=1 would apply the degrees-of-freedom correction twice.
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        print(null_name + ' |', end='')
        # differences[i, j] is True when the observed mean of bin i lies
        # outside the null mean +/- CI at significance level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level reached gives the star count.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the binned feature of appearing words against that of source words.

    Meant to be called through `FacetGrid.map_dataframe`, which supplies the
    `data` and `color` keyword arguments. On the current axes, draws the
    per-bin mean destination value with its 95% confidence band, the per-bin
    H_0 and H_00 means, and the `y = x` line; then prints the per-bin
    significance table for the feature.

    Keyword arguments
    -----------------
    data : DataFrame
        Rows for a single feature, with columns `source`, `destination`,
        `h0`, `h0n` (and their `_rel` variants when `relative` is True).
    color : color, optional
        Base color of the curves (default 'blue').
    relative : bool, optional
        Use the sentence-relative (`*_rel`) columns (default False).
    quantiles : bool, optional
        Use quantile bins (`pd.qcut`) instead of fixed-width bins (`pd.cut`).
    feature_field : str, optional
        Column holding the feature name, used as the title of the
        significance table (default 'feature').

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and fall back to fewer bins
    # whenever the cut function refuses the current count with a ValueError.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a NameError further down.
        raise ValueError("Could not bin the {!r} values into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample std (ddof=1) over sqrt(n). Using sqrt(n - 1)
        # together with ddof=1 would correct for degrees of freedom twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently so that the null curves are never clipped on either side.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function` and style the grid.

    Parameters
    ----------
    data : DataFrame
        Long-format data with one row per observation and a column
        `feature_field` naming the feature of each row.
    features : list of str
        Features to plot, in facet order.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Function passed to `FacetGrid.map_dataframe`.
    xlabel, ylabel : str
        Axis labels applied to the facets.
    feature_field : str, optional
        Name of the feature column (default 'feature').
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a shared mutable default argument.
    plot_kws = {} if plot_kws is None else plot_kws
    # NOTE(review): seaborn >= 0.9 renamed FacetGrid's `size` to `height`;
    # `size` is kept for the seaborn version this notebook was written with.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        # Skip if nothing was plotted on these axes. Checking the handles
        # does not depend on whether `ax.legend()` returns None or an empty
        # legend for empty axes, which differs between matplotlib versions.
        handles, _ = ax.get_legend_handles_labels()
        if not handles:
            continue
        legend = ax.legend(frameon=True, loc='best')
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name by the feature's docstring.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (observed mean minus H_00 mean) for one feature.

    The source values are binned, and for each bin the difference between the
    mean destination value and the mean H_00 value is drawn on `ax`, divided
    by the absolute mean of the per-bin H_0 values. The x axis is the bin
    rank spread evenly over [0, 1].

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : DataFrame
        Rows for a single feature (the feature name is read from the first
        row), with columns `source`, `destination`, `h0`, `h0n` (and their
        `_rel` variants when `relative` is True).
    color : color
        Color of the curve.
    ci : bool, optional
        Also draw the 95% confidence band (default True).
    relative : bool, optional
        Use the sentence-relative (`*_rel`) columns (default False).
    quantiles : bool, optional
        Use quantile bins (`pd.qcut`) instead of fixed-width bins (`pd.cut`).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and fall back to fewer bins
    # whenever the cut function refuses the current count with a ValueError.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a NameError further down.
        raise ValueError("Could not bin the {!r} values into {} bins or fewer"
                         .format('source' + rel, BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample std (ddof=1) over sqrt(n). Using sqrt(n - 1)
        # together with ddof=1 would correct for degrees of freedom twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): the scale is zero (or close to it) when the per-bin H_0
    # means average out to ~0, which can presumably happen with
    # sentence-relative values -- confirm the normalisation is safe there.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions,
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single figure and return its axes.

    Parameters
    ----------
    data : DataFrame
        Long-format data with a `feature` column; rows containing NaN are
        dropped per feature before plotting.
    features : list of str
        Features to plot, one color each.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Seaborn palette from which `len(features)` colors are drawn.
    plot_function : callable
        Called as `plot_function(ax, feature_data, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes the curves were drawn on.
    """
    # Avoid a shared mutable default argument.
    plot_kws = {} if plot_kws is None else plot_kws
    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature, color in zip(features, palette):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, global feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | **  | *   |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *   |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay for all features, fixed-width bins, global feature values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:
# Paper features, fixed-width bins, global feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | ns. |

In [14]:
# Bias overlay for the paper features, fixed-width bins, global feature
# values; the y range is set by hand on the returned axes.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, global feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | **  |
H_00 | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *   |

In [16]:
# Bias overlay for all features, quantile bins, global feature values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:
# Paper features, quantile bins, global feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

In [18]:
# Bias overlay for the paper features, quantile bins, global feature values;
# the y range is set by hand on the returned axes.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *   | *   |
H_00 | ns. | **  | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | **  | ns. |
H_00 | ns. | *** | *** | **  |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
# Bias overlay for all features, fixed-width bins, sentence-relative values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:
# Paper features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | **  | ns. |
H_00 | ns. | *** | *** | **  |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *   | ns. |

In [22]:
# Bias overlay for the paper features, fixed-width bins, sentence-relative
# values; the y range is set by hand on the returned axes.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | **  | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | ns. | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *   | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [24]:
# Bias overlay for all features, quantile bins, sentence-relative values.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:
# Paper features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *   | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | *   | ns. |

In [26]:
# Bias overlay for the paper features, quantile bins, sentence-relative
# values. Uses the 'deep' palette like the other paper-feature overlays, so
# that each feature keeps the same color across the paper figures.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True, 'quantiles': True},
);


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Streamplot of substitution moves in (absolute, relative) feature space.

    Meant to be used through `FacetGrid.map_dataframe`, which passes the
    facet's rows as `data` and the hue colour as `color`.

    Keyword arguments
    -----------------
    data : DataFrame
        Must provide the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : optional, default 'blue'
        Base colour, desaturated for the two reference lines.
    bin_count : int, optional, default 4
        Number of fixed-width bins along each axis of the grid.

    For each cell of the `bin_count` x `bin_count` grid over
    (source, source_rel), the arrow is the mean displacement
    (destination - source, destination_rel - source_rel) of the points in
    that cell, and the colour is the mean Euclidean length of those
    displacements. Cells with no points yield NaN.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning (fixed-width, left-closed bins on both axes).
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Only the first h0 value is used: the vertical
    # reference line is drawn at a single x position.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask and the displacements once per cell.
            in_cell = (x_bins == x) & (y_bins == y)
            du = dest[in_cell] - source[in_cell]
            dv = dest_rel[in_cell] - source_rel[in_cell]
            # Arrays are indexed [row, column], i.e. [y, x], as expected
            # by `plt.streamplot`.
            u_values[y, x] = du.mean()
            v_values[y, x] = dv.mean()
            strength[y, x] = np.sqrt(du ** 2 + dv ** 2).mean()

    # Plot. (arrowsize=4 looks oversized inline but gives normal-sized
    # arrows in the saved figures.)
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One streamplot panel per feature, three panels per row, each panel with
# its own axis ranges.
g = sb.FacetGrid(data=variations,
                 col='feature', hue='feature',
                 col_order=ordered_features, hue_order=ordered_features,
                 col_wrap=3, sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend and replace
        # the raw feature name in the title by the feature's docstring.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same streamplots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature']
                                 .isin(PAPER_FEATURES)],
                 col='feature', hue='feature',
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES,
                 col_wrap=3, sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend and replace
        # the raw feature name in the title by the feature's docstring.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on per-cluster feature variations, keeping only the clusters
# for which the variation of every feature is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report how many components were kept and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Last expression of the cell: the loadings of the first N_COMPONENTS
# components, one row per component and one column per feature.
pd.DataFrame(pca.components_[:N_COMPONENTS],
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)],
             columns=pcafeatures)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.51786487  0.17388159  0.08890802  0.07508941  0.03518086  0.03013275
  0.02110013  0.01877856  0.01680934  0.0097375   0.00739816]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.442412 0.317401 -0.087970 0.250589 0.253772 -0.421929 0.225935 0.300588 -0.384290 0.283213 -0.145876 -0.001686
Component-1 0.267895 -0.388741 0.124541 -0.280744 -0.293438 -0.425413 0.190106 -0.303450 -0.436118 0.263697 -0.165862 0.015354
Component-2 0.717075 0.140934 -0.115671 0.055199 0.651838 -0.107659 -0.033700 0.002848 -0.060739 0.071341 -0.005332 -0.055494

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one row per (substitution, component), with the source and
# destination component values and the two null-hypothesis averages.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            row = dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            )
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data


100% (14308 of 14308) |####################| Elapsed Time: 0:02:44 Time: 0:02:44

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over rows sharing the same
# (destination_id, occurrence, position, component), then over all of a
# cluster's rows for each component.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (gb['a', 'b']) is deprecated and no longer accepted by recent pandas.
# 'component' is deliberately kept in the selection so the resulting frame
# is the same as before.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per principal component, each panel with its own axis ranges.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Something was plotted on these axes: box the legend.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | **  | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | ns. |

4.2 On a subset of relevant features


In [34]:
# Features whose variations feed the reduced PCA of this section.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Fit the PCA on the per-cluster variations of the relevant features only,
# keeping the clusters for which all of those variations are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report how many components were kept and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Last expression of the cell: the loadings of every component, one row
# per component and one column per feature.
pd.DataFrame(pca.components_,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)],
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67483025  0.18244027]

Out[35]:
aoa frequency letters_count
Component-0 -0.735332 0.371846 -0.566585
Component-1 0.396944 -0.441303 -0.804790

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one row per (substitution, component), with the source and
# destination component values and the two null-hypothesis averages.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            row = dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            )
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data


100% (14308 of 14308) |####################| Elapsed Time: 0:01:32 Time: 0:01:32

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over rows sharing the same
# (destination_id, occurrence, position, component), then over all of a
# cluster's rows for each component.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (gb['a', 'b']) is deprecated and no longer accepted by recent pandas.
# 'component' is deliberately kept in the selection so the resulting frame
# is the same as before.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One panel per principal component, each panel with its own axis ranges.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Number of words each relevant feature is defined on (calling a
# transformed feature with no argument gives its collection of words).
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(Substitution
                               ._transformed_feature(feature)())))

# Look up each transformed feature once, instead of once per
# (word, feature) pair in the loops below.
tfeatures = {feature: Substitution._transformed_feature(feature)
             for feature in relevant_features}

# Compute the number of words that have all relevant_features defined:
# take the union of the words known to any of these features, ...
words = set()
for feature in relevant_features:
    words.update(tfeatures[feature]())

# ... evaluate every feature on every word of that union, ...
words_list = list(words)
data = {feature: [tfeatures[feature](word) for word in words_list]
        for feature in relevant_features}
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

# ... and count the rows that survive `dropna()`, i.e. the words for which
# no feature value is missing.
print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1311 (cluster-unique) substitutions, but the PCA is in fact computed on 1039 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to go through, as (label, binning function) pairs.
# Only fixed-width bins are enabled; the quantile variant would be
# ('quantiles', pd.qcut).
cuts = [
    ('fixed bins', pd.cut),
]
# (label, column-name suffix) pairs for global and sentence-relative values.
rels = [
    ('global', ''),
    ('sentence-relative', '_rel'),
]

def star_level(p):
    """Return the 3-character significance marker for the p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since every comparison with
    NaN is false).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# Pivot each needed value column once, up front: the pivots depend neither
# on the feature pair nor on the binning, so there is no need to recompute
# them inside the loops.
pivoted = {
    values: variations.pivot(index='cluster_id', columns='feature',
                             values=values)
    for values in (['source' + rel for _, rel in rels] +
                   ['destination' + rel for _, rel in rels])
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivoted['source' + rel1][feature1]

            # Compute binning. Fall back to fewer bins if the binning
            # function rejects the requested number; the bins only depend
            # on the source, so they are computed once per source variant.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Never silently reuse the bins of a previous iteration.
                raise ValueError(
                    "Could not bin '{}' ({}) source values with {}"
                    .format(feature1, rel1_label, cut_label))

            for (rel2_label, rel2) in rels:
                destination = pivoted['destination' + rel2][feature2]

                # One-way ANOVA of the destination values across the
                # source bins.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot each needed value column once, up front: the pivots depend neither
# on the feature pair nor on the binning, so there is no need to recompute
# them inside the loops.
pivoted = {
    values: variations.pivot(index='cluster_id', columns='feature',
                             values=values)
    for values in (['source' + rel for _, rel in rels] +
                   ['destination' + rel for _, rel in rels])
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivoted['source' + rel1][feature1]

            # Compute binning. Fall back to fewer bins if the binning
            # function rejects the requested number; the bins only depend
            # on the source, so they are computed once per source variant.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Never silently reuse the bins of a previous iteration.
                raise ValueError(
                    "Could not bin '{}' ({}) source values with {}"
                    .format(feature1, rel1_label, cut_label))

            for (rel2_label, rel2) in rels:
                # Variation of feature2 upon substitution; despite its
                # name, `destination` holds destination - source here.
                destination = pivoted['destination' + rel2][feature2]\
                    - pivoted['source' + rel2][feature2]

                # One-way ANOVA of the variations across the source bins.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions through this lens (meaning the interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" flag to a (label, column-name suffix) pair.
# NOTE(review): this rebinds `rels`, which the ANOVA cells above define and
# iterate as a list of (label, suffix) pairs; re-running those cells after
# this one will fail unless their `rels` definition is re-run first.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For every paper feature, run the regression for each combination of
# source relativeness (global / relative / both) and destination
# relativeness, first without and then with interaction terms.
relativeness_grid = list(product([False, True, 'both'], [False, True]))
for target in PAPER_FEATURES:
    print('-' * 70)
    for src_flag, dst_flag in relativeness_grid:
        for with_interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=src_flag, dest_rel=dst_flag,
                    interactions=with_interactions)
            print()


----------------------------------------------------------------------
Regressing global frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06386448522873689

intercept                      5.488940
global_aoa                     0.011408
global_clustering             -0.063756
global_frequency               0.314143
global_letters_count          -0.000679
global_orthographic_density   -0.095758
global_synonyms_count         -0.092697
dtype: float64

Regressing global frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07888278734647702

intercept                                              12.932701
global_aoa                                             -0.208257
global_clustering                                       1.321946
global_frequency                                        0.349837
global_letters_count                                   -0.961198
global_orthographic_density                            -0.301046
global_synonyms_count                                   0.398688
global_aoa * global_clustering                         -0.045593
global_aoa * global_frequency                          -0.012182
global_aoa * global_letters_count                       0.018588
global_aoa * global_orthographic_density               -0.052793
global_aoa * global_synonyms_count                      0.011047
global_clustering * global_frequency                   -0.024773
global_clustering * global_letters_count               -0.141502
global_clustering * global_orthographic_density        -0.059110
global_clustering * global_synonyms_count               0.092764
global_frequency * global_letters_count                -0.008647
global_frequency * global_orthographic_density         -0.041576
global_frequency * global_synonyms_count               -0.016031
global_letters_count * global_orthographic_density      0.105312
global_letters_count * global_synonyms_count           -0.003390
global_orthographic_density * global_synonyms_count     0.104735
dtype: float64

Regressing rel frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03343771076796376

intercept                     -7.024530
global_aoa                     0.033661
global_clustering             -0.070488
global_frequency               0.254327
global_letters_count           0.119411
global_orthographic_density    0.001370
global_synonyms_count          0.077953
dtype: float64

Regressing rel frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.05067907391891713

intercept                                              4.987310
global_aoa                                            -0.468675
global_clustering                                      1.180862
global_frequency                                      -0.240627
global_letters_count                                  -1.122960
global_orthographic_density                           -0.696731
global_synonyms_count                                 -1.589191
global_aoa * global_clustering                        -0.036194
global_aoa * global_frequency                         -0.009036
global_aoa * global_letters_count                      0.062420
global_aoa * global_orthographic_density              -0.032849
global_aoa * global_synonyms_count                     0.087277
global_clustering * global_frequency                  -0.062193
global_clustering * global_letters_count              -0.079099
global_clustering * global_orthographic_density        0.019854
global_clustering * global_synonyms_count              0.009702
global_frequency * global_letters_count                0.022667
global_frequency * global_orthographic_density         0.018643
global_frequency * global_synonyms_count               0.058737
global_letters_count * global_orthographic_density     0.140646
global_letters_count * global_synonyms_count           0.050216
global_orthographic_density * global_synonyms_count    0.267072
dtype: float64

Regressing global frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.05369788008345444

intercept                   9.336749
rel_aoa                     0.028877
rel_clustering             -0.118159
rel_frequency               0.228948
rel_letters_count          -0.054257
rel_orthographic_density   -0.146347
rel_synonyms_count         -0.223717
dtype: float64

Regressing global frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07024013009956809

intercept                                        9.205042
rel_aoa                                          0.151617
rel_clustering                                   0.089987
rel_frequency                                    0.208201
rel_letters_count                                0.056988
rel_orthographic_density                        -0.209049
rel_synonyms_count                               0.174326
rel_aoa * rel_clustering                         0.023751
rel_aoa * rel_frequency                          0.013697
rel_aoa * rel_letters_count                     -0.027380
rel_aoa * rel_orthographic_density               0.033381
rel_aoa * rel_synonyms_count                     0.045585
rel_clustering * rel_frequency                   0.033025
rel_clustering * rel_letters_count              -0.114843
rel_clustering * rel_orthographic_density       -0.096961
rel_clustering * rel_synonyms_count              0.239370
rel_frequency * rel_letters_count                0.001813
rel_frequency * rel_orthographic_density        -0.007147
rel_frequency * rel_synonyms_count               0.078456
rel_letters_count * rel_orthographic_density     0.029536
rel_letters_count * rel_synonyms_count          -0.106034
rel_orthographic_density * rel_synonyms_count    0.007168
dtype: float64

Regressing rel frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.23860908666791103

intercept                  -1.717323
rel_aoa                     0.050168
rel_clustering              0.084282
rel_frequency               0.618688
rel_letters_count          -0.120396
rel_orthographic_density   -0.266593
rel_synonyms_count         -0.051290
dtype: float64

Regressing rel frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2626408641210872

intercept                                       -1.770070
rel_aoa                                          0.092841
rel_clustering                                   0.151750
rel_frequency                                    0.651110
rel_letters_count                               -0.035463
rel_orthographic_density                        -0.397215
rel_synonyms_count                               0.285190
rel_aoa * rel_clustering                        -0.035604
rel_aoa * rel_frequency                         -0.031763
rel_aoa * rel_letters_count                      0.013262
rel_aoa * rel_orthographic_density               0.147720
rel_aoa * rel_synonyms_count                     0.179318
rel_clustering * rel_frequency                  -0.018081
rel_clustering * rel_letters_count              -0.159480
rel_clustering * rel_orthographic_density       -0.274368
rel_clustering * rel_synonyms_count              0.099662
rel_frequency * rel_letters_count               -0.004937
rel_frequency * rel_orthographic_density        -0.030613
rel_frequency * rel_synonyms_count               0.051635
rel_letters_count * rel_orthographic_density     0.022724
rel_letters_count * rel_synonyms_count          -0.073198
rel_orthographic_density * rel_synonyms_count    0.170834
dtype: float64

Regressing global frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07478150252227866

intercept                      4.343760
global_aoa                    -0.036945
global_clustering             -0.050247
global_frequency               0.314065
global_letters_count           0.230415
global_orthographic_density    0.158846
global_synonyms_count          0.221786
rel_aoa                        0.061642
rel_clustering                -0.024874
rel_frequency                 -0.002304
rel_letters_count             -0.259051
rel_orthographic_density      -0.292333
rel_synonyms_count            -0.401115
dtype: float64

Regressing global frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.15574273108613368

intercept                                                -17.325116
global_aoa                                                 1.418253
global_clustering                                         -2.493746
global_frequency                                           0.784454
global_letters_count                                      -1.250294
global_orthographic_density                                8.622820
global_synonyms_count                                     14.780665
rel_aoa                                                   -0.132744
rel_clustering                                             5.071599
rel_frequency                                              0.174838
rel_letters_count                                          0.455594
rel_orthographic_density                                  -6.697307
rel_synonyms_count                                       -11.605345
global_aoa * global_clustering                             0.389768
global_aoa * global_frequency                              0.115846
global_aoa * global_letters_count                          0.026360
global_aoa * global_orthographic_density                  -0.137097
global_aoa * global_synonyms_count                        -0.487501
global_aoa * rel_aoa                                      -0.028307
global_aoa * rel_clustering                               -0.443685
global_aoa * rel_frequency                                -0.043665
global_aoa * rel_letters_count                            -0.004381
global_aoa * rel_orthographic_density                      0.027229
global_aoa * rel_synonyms_count                            0.402448
global_clustering * global_frequency                       0.015572
global_clustering * global_letters_count                  -0.404078
global_clustering * global_orthographic_density            0.883157
global_clustering * global_synonyms_count                  0.801782
global_clustering * rel_aoa                               -0.473093
global_clustering * rel_clustering                         0.127256
global_clustering * rel_frequency                          0.037016
global_clustering * rel_letters_count                      0.270097
global_clustering * rel_orthographic_density              -0.505869
global_clustering * rel_synonyms_count                    -0.182484
global_frequency * global_letters_count                   -0.107237
global_frequency * global_orthographic_density            -0.182533
global_frequency * global_synonyms_count                  -0.493313
global_frequency * rel_aoa                                -0.245830
global_frequency * rel_clustering                         -0.001550
global_frequency * rel_frequency                          -0.003741
global_frequency * rel_letters_count                       0.094064
global_frequency * rel_orthographic_density                0.218926
global_frequency * rel_synonyms_count                      0.551076
global_letters_count * global_orthographic_density         0.012165
global_letters_count * global_synonyms_count              -0.008991
global_letters_count * rel_aoa                            -0.021798
global_letters_count * rel_clustering                      0.240854
global_letters_count * rel_frequency                       0.003043
global_letters_count * rel_letters_count                  -0.006973
global_letters_count * rel_orthographic_density            0.109802
global_letters_count * rel_synonyms_count                  0.013211
global_orthographic_density * global_synonyms_count       -0.806131
global_orthographic_density * rel_aoa                      0.090045
global_orthographic_density * rel_clustering              -1.188702
global_orthographic_density * rel_frequency                0.090066
global_orthographic_density * rel_letters_count            0.042197
global_orthographic_density * rel_orthographic_density     0.004272
global_orthographic_density * rel_synonyms_count           0.995745
global_synonyms_count * rel_aoa                            0.211361
global_synonyms_count * rel_clustering                    -1.584907
global_synonyms_count * rel_frequency                      0.177753
global_synonyms_count * rel_letters_count                  0.219693
global_synonyms_count * rel_orthographic_density           0.718924
global_synonyms_count * rel_synonyms_count                -0.030628
rel_aoa * rel_clustering                                   0.482571
rel_aoa * rel_frequency                                    0.127173
rel_aoa * rel_letters_count                                0.028565
rel_aoa * rel_orthographic_density                         0.059118
rel_aoa * rel_synonyms_count                              -0.124879
rel_clustering * rel_frequency                            -0.013973
rel_clustering * rel_letters_count                        -0.273468
rel_clustering * rel_orthographic_density                  0.658824
rel_clustering * rel_synonyms_count                        1.330109
rel_frequency * rel_letters_count                          0.009671
rel_frequency * rel_orthographic_density                  -0.093760
rel_frequency * rel_synonyms_count                        -0.158379
rel_letters_count * rel_orthographic_density              -0.084315
rel_letters_count * rel_synonyms_count                    -0.253416
rel_orthographic_density * rel_synonyms_count             -0.835025
dtype: float64

Regressing rel frequency with 764 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3442248751426459

intercept                      4.262882
global_aoa                    -0.033787
global_clustering              0.018853
global_frequency              -0.630657
global_letters_count           0.239048
global_orthographic_density    0.135524
global_synonyms_count          0.162757
rel_aoa                        0.045453
rel_clustering                -0.058768
rel_frequency                  0.982214
rel_letters_count             -0.268947
rel_orthographic_density      -0.245354
rel_synonyms_count            -0.328592
dtype: float64

Regressing rel frequency with 764 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3998046816702713

intercept                                                -21.596016
global_aoa                                                 1.700583
global_clustering                                         -3.086947
global_frequency                                           0.039755
global_letters_count                                      -1.078587
global_orthographic_density                                8.517026
global_synonyms_count                                     13.263576
rel_aoa                                                   -0.620391
rel_clustering                                             6.340745
rel_frequency                                              0.834737
rel_letters_count                                          0.294733
rel_orthographic_density                                  -6.348599
rel_synonyms_count                                       -10.145791
global_aoa * global_clustering                             0.383322
global_aoa * global_frequency                              0.096241
global_aoa * global_letters_count                          0.013470
global_aoa * global_orthographic_density                  -0.152719
global_aoa * global_synonyms_count                        -0.483088
global_aoa * rel_aoa                                      -0.025446
global_aoa * rel_clustering                               -0.448751
global_aoa * rel_frequency                                -0.024762
global_aoa * rel_letters_count                             0.016767
global_aoa * rel_orthographic_density                      0.064062
global_aoa * rel_synonyms_count                            0.394602
global_clustering * global_frequency                       0.055774
global_clustering * global_letters_count                  -0.329047
global_clustering * global_orthographic_density            0.873005
global_clustering * global_synonyms_count                  0.786340
global_clustering * rel_aoa                               -0.497804
global_clustering * rel_clustering                         0.121776
global_clustering * rel_frequency                          0.019509
global_clustering * rel_letters_count                      0.224181
global_clustering * rel_orthographic_density              -0.494231
global_clustering * rel_synonyms_count                    -0.196176
global_frequency * global_letters_count                   -0.071393
global_frequency * global_orthographic_density            -0.172369
global_frequency * global_synonyms_count                  -0.399546
global_frequency * rel_aoa                                -0.224816
global_frequency * rel_clustering                         -0.103572
global_frequency * rel_frequency                           0.008191
global_frequency * rel_letters_count                       0.065126
global_frequency * rel_orthographic_density                0.183824
global_frequency * rel_synonyms_count                      0.447042
global_letters_count * global_orthographic_density         0.017405
global_letters_count * global_synonyms_count               0.047721
global_letters_count * rel_aoa                            -0.019098
global_letters_count * rel_clustering                      0.165300
global_letters_count * rel_frequency                       0.002205
global_letters_count * rel_letters_count                  -0.007132
global_letters_count * rel_orthographic_density            0.096250
global_letters_count * rel_synonyms_count                 -0.037149
global_orthographic_density * global_synonyms_count       -0.812032
global_orthographic_density * rel_aoa                      0.094573
global_orthographic_density * rel_clustering              -1.132410
global_orthographic_density * rel_frequency                0.114314
global_orthographic_density * rel_letters_count            0.069888
global_orthographic_density * rel_orthographic_density     0.049080
global_orthographic_density * rel_synonyms_count           0.994986
global_synonyms_count * rel_aoa                            0.227038
global_synonyms_count * rel_clustering                    -1.522041
global_synonyms_count * rel_frequency                      0.083808
global_synonyms_count * rel_letters_count                  0.177076
global_synonyms_count * rel_orthographic_density           0.759050
global_synonyms_count * rel_synonyms_count                -0.012362
rel_aoa * rel_clustering                                   0.511617
rel_aoa * rel_frequency                                    0.099796
rel_aoa * rel_letters_count                                0.019259
rel_aoa * rel_orthographic_density                         0.034549
rel_aoa * rel_synonyms_count                              -0.133461
rel_clustering * rel_frequency                             0.056653
rel_clustering * rel_letters_count                        -0.205449
rel_clustering * rel_orthographic_density                  0.628628
rel_clustering * rel_synonyms_count                        1.256997
rel_frequency * rel_letters_count                          0.013785
rel_frequency * rel_orthographic_density                  -0.077030
rel_frequency * rel_synonyms_count                        -0.058337
rel_letters_count * rel_orthographic_density              -0.093228
rel_letters_count * rel_synonyms_count                    -0.211936
rel_orthographic_density * rel_synonyms_count             -0.876267
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 702 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.06075930257920503

intercept                      7.031841
global_aoa                     0.193878
global_clustering              0.087860
global_frequency              -0.099359
global_letters_count           0.058535
global_orthographic_density   -0.057365
global_synonyms_count         -0.151805
dtype: float64

Regressing global aoa with 702 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10552925818929028

intercept                                             -0.967222
global_aoa                                             0.020650
global_clustering                                     -0.862598
global_frequency                                       0.497982
global_letters_count                                   1.692671
global_orthographic_density                           -0.862552
global_synonyms_count                                 -5.183520
global_aoa * global_clustering                        -0.046452
global_aoa * global_frequency                         -0.023574
global_aoa * global_letters_count                      0.011736
global_aoa * global_orthographic_density              -0.008481
global_aoa * global_synonyms_count                     0.141725
global_clustering * global_frequency                   0.039287
global_clustering * global_letters_count               0.261843
global_clustering * global_orthographic_density       -0.206103
global_clustering * global_synonyms_count             -0.905902
global_frequency * global_letters_count               -0.021559
global_frequency * global_orthographic_density        -0.041999
global_frequency * global_synonyms_count              -0.080072
global_letters_count * global_orthographic_density    -0.020885
global_letters_count * global_synonyms_count          -0.076810
global_orthographic_density * global_synonyms_count    0.000248
dtype: float64

Regressing rel aoa with 702 measures, no interactions
           ^^^^^^^
R^2 = 0.013057316990630063

intercept                      1.830502
global_aoa                     0.028828
global_clustering             -0.015277
global_frequency              -0.141996
global_letters_count           0.042442
global_orthographic_density    0.085924
global_synonyms_count         -0.107418
dtype: float64

Regressing rel aoa with 702 measures, with interactions
           ^^^^^^^
R^2 = 0.04918866635834174

intercept                                             -5.951714
global_aoa                                             0.341088
global_clustering                                      0.235066
global_frequency                                       0.841572
global_letters_count                                   1.404505
global_orthographic_density                           -0.489832
global_synonyms_count                                 -2.693305
global_aoa * global_clustering                        -0.021983
global_aoa * global_frequency                         -0.027833
global_aoa * global_letters_count                     -0.036190
global_aoa * global_orthographic_density              -0.006323
global_aoa * global_synonyms_count                     0.083599
global_clustering * global_frequency                   0.028384
global_clustering * global_letters_count               0.068080
global_clustering * global_orthographic_density       -0.347646
global_clustering * global_synonyms_count             -0.740418
global_frequency * global_letters_count               -0.069368
global_frequency * global_orthographic_density        -0.121347
global_frequency * global_synonyms_count              -0.145965
global_letters_count * global_orthographic_density    -0.068360
global_letters_count * global_synonyms_count          -0.147603
global_orthographic_density * global_synonyms_count   -0.071240
dtype: float64

Regressing global aoa with 702 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.02680251537670975

intercept                   6.489764
rel_aoa                    -0.009506
rel_clustering              0.255294
rel_frequency              -0.035121
rel_letters_count           0.018288
rel_orthographic_density   -0.328264
rel_synonyms_count         -0.181591
dtype: float64

Regressing global aoa with 702 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.07199418471972385

intercept                                        6.654969
rel_aoa                                         -0.261909
rel_clustering                                  -0.423385
rel_frequency                                   -0.002442
rel_letters_count                               -0.068379
rel_orthographic_density                        -0.447710
rel_synonyms_count                              -0.096000
rel_aoa * rel_clustering                        -0.134037
rel_aoa * rel_frequency                         -0.075644
rel_aoa * rel_letters_count                      0.058506
rel_aoa * rel_orthographic_density               0.020098
rel_aoa * rel_synonyms_count                     0.085264
rel_clustering * rel_frequency                  -0.043324
rel_clustering * rel_letters_count               0.342218
rel_clustering * rel_orthographic_density        0.022859
rel_clustering * rel_synonyms_count             -0.730861
rel_frequency * rel_letters_count                0.003177
rel_frequency * rel_orthographic_density        -0.020681
rel_frequency * rel_synonyms_count              -0.037196
rel_letters_count * rel_orthographic_density     0.048370
rel_letters_count * rel_synonyms_count           0.076660
rel_orthographic_density * rel_synonyms_count    0.203078
dtype: float64

Regressing rel aoa with 702 measures, no interactions
           ^^^^^^^
R^2 = 0.1158461193229453

intercept                   0.742720
rel_aoa                     0.396022
rel_clustering             -0.076542
rel_frequency              -0.123111
rel_letters_count          -0.003826
rel_orthographic_density    0.158193
rel_synonyms_count         -0.151861
dtype: float64

Regressing rel aoa with 702 measures, with interactions
           ^^^^^^^
R^2 = 0.14953019318345429

intercept                                        1.033006
rel_aoa                                          0.372409
rel_clustering                                  -0.594264
rel_frequency                                   -0.078212
rel_letters_count                               -0.083177
rel_orthographic_density                         0.365335
rel_synonyms_count                              -0.168860
rel_aoa * rel_clustering                        -0.071060
rel_aoa * rel_frequency                         -0.001525
rel_aoa * rel_letters_count                      0.035832
rel_aoa * rel_orthographic_density               0.017713
rel_aoa * rel_synonyms_count                     0.009717
rel_clustering * rel_frequency                  -0.000235
rel_clustering * rel_letters_count               0.340874
rel_clustering * rel_orthographic_density        0.172956
rel_clustering * rel_synonyms_count             -0.382136
rel_frequency * rel_letters_count                0.022345
rel_frequency * rel_orthographic_density         0.121750
rel_frequency * rel_synonyms_count              -0.033562
rel_letters_count * rel_orthographic_density     0.022555
rel_letters_count * rel_synonyms_count           0.100875
rel_orthographic_density * rel_synonyms_count    0.218718
dtype: float64

Regressing global aoa with 702 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.08100584748179085

intercept                      3.830448
global_aoa                     0.377041
global_clustering             -0.057115
global_frequency              -0.064687
global_letters_count           0.184270
global_orthographic_density    0.103964
global_synonyms_count          0.021032
rel_aoa                       -0.291664
rel_clustering                 0.140251
rel_frequency                 -0.067513
rel_letters_count             -0.140182
rel_orthographic_density      -0.120991
rel_synonyms_count            -0.217795
dtype: float64

Regressing global aoa with 702 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.19405798331618496

intercept                                                 68.107202
global_aoa                                                 0.303217
global_clustering                                          6.604758
global_frequency                                          -0.794990
global_letters_count                                      -6.367328
global_orthographic_density                              -19.674868
global_synonyms_count                                    -17.604270
rel_aoa                                                    0.221795
rel_clustering                                            -4.564948
rel_frequency                                              2.768187
rel_letters_count                                          5.504666
rel_orthographic_density                                  15.704724
rel_synonyms_count                                         5.655013
global_aoa * global_clustering                            -0.062908
global_aoa * global_frequency                             -0.109519
global_aoa * global_letters_count                          0.052583
global_aoa * global_orthographic_density                   0.308107
global_aoa * global_synonyms_count                        -0.080317
global_aoa * rel_aoa                                       0.015166
global_aoa * rel_clustering                                0.029740
global_aoa * rel_frequency                                 0.066284
global_aoa * rel_letters_count                            -0.068538
global_aoa * rel_orthographic_density                     -0.310778
global_aoa * rel_synonyms_count                            0.305020
global_clustering * global_frequency                       0.080990
global_clustering * global_letters_count                  -0.374551
global_clustering * global_orthographic_density           -2.282229
global_clustering * global_synonyms_count                 -1.499726
global_clustering * rel_aoa                                0.134053
global_clustering * rel_clustering                         0.170078
global_clustering * rel_frequency                          0.213893
global_clustering * rel_letters_count                      0.206158
global_clustering * rel_orthographic_density               1.336628
global_clustering * rel_synonyms_count                    -0.023512
global_frequency * global_letters_count                    0.289803
global_frequency * global_orthographic_density             0.300115
global_frequency * global_synonyms_count                  -0.055025
global_frequency * rel_aoa                                 0.149127
global_frequency * rel_clustering                         -0.149315
global_frequency * rel_frequency                          -0.039764
global_frequency * rel_letters_count                      -0.327725
global_frequency * rel_orthographic_density               -0.438699
global_frequency * rel_synonyms_count                      0.253324
global_letters_count * global_orthographic_density         0.057095
global_letters_count * global_synonyms_count               1.264831
global_letters_count * rel_aoa                            -0.102999
global_letters_count * rel_clustering                      0.386247
global_letters_count * rel_frequency                      -0.187650
global_letters_count * rel_letters_count                   0.049876
global_letters_count * rel_orthographic_density           -0.208630
global_letters_count * rel_synonyms_count                 -1.305401
global_orthographic_density * global_synonyms_count        1.484979
global_orthographic_density * rel_aoa                     -0.459564
global_orthographic_density * rel_clustering               1.890022
global_orthographic_density * rel_frequency               -0.211066
global_orthographic_density * rel_letters_count           -0.080675
global_orthographic_density * rel_orthographic_density     0.155443
global_orthographic_density * rel_synonyms_count          -1.899955
global_synonyms_count * rel_aoa                           -0.080314
global_synonyms_count * rel_clustering                     1.900985
global_synonyms_count * rel_frequency                     -0.632579
global_synonyms_count * rel_letters_count                 -1.511492
global_synonyms_count * rel_orthographic_density          -1.726487
global_synonyms_count * rel_synonyms_count                -0.000914
rel_aoa * rel_clustering                                  -0.210078
rel_aoa * rel_frequency                                   -0.129361
rel_aoa * rel_letters_count                                0.099717
rel_aoa * rel_orthographic_density                         0.420330
rel_aoa * rel_synonyms_count                              -0.038926
rel_clustering * rel_frequency                            -0.216099
rel_clustering * rel_letters_count                         0.105761
rel_clustering * rel_orthographic_density                 -0.903333
rel_clustering * rel_synonyms_count                       -1.181618
rel_frequency * rel_letters_count                          0.187039
rel_frequency * rel_orthographic_density                   0.374809
rel_frequency * rel_synonyms_count                         0.436736
rel_letters_count * rel_orthographic_density               0.378672
rel_letters_count * rel_synonyms_count                     1.623615
rel_orthographic_density * rel_synonyms_count              2.412763
dtype: float64

Regressing rel aoa with 702 measures, no interactions
           ^^^^^^^
R^2 = 0.1697146325736284

intercept                      1.194666
global_aoa                    -0.413253
global_clustering             -0.053966
global_frequency               0.019360
global_letters_count           0.220306
global_orthographic_density    0.100004
global_synonyms_count          0.289468
rel_aoa                        0.693180
rel_clustering                 0.092674
rel_frequency                 -0.126896
rel_letters_count             -0.172106
rel_orthographic_density      -0.134068
rel_synonyms_count            -0.456730
dtype: float64

Regressing rel aoa with 702 measures, with interactions
           ^^^^^^^
R^2 = 0.2727295140157403

intercept                                                 45.415875
global_aoa                                                -1.396785
global_clustering                                          4.163777
global_frequency                                          -0.070905
global_letters_count                                      -3.740303
global_orthographic_density                              -15.255321
global_synonyms_count                                    -13.148188
rel_aoa                                                    2.457215
rel_clustering                                             0.834543
rel_frequency                                              1.975651
rel_letters_count                                          3.910829
rel_orthographic_density                                  13.647155
rel_synonyms_count                                         2.396434
global_aoa * global_clustering                            -0.183800
global_aoa * global_frequency                             -0.066722
global_aoa * global_letters_count                          0.010832
global_aoa * global_orthographic_density                   0.230566
global_aoa * global_synonyms_count                         0.077740
global_aoa * rel_aoa                                      -0.016557
global_aoa * rel_clustering                                0.040305
global_aoa * rel_frequency                                 0.035150
global_aoa * rel_letters_count                            -0.046723
global_aoa * rel_orthographic_density                     -0.312075
global_aoa * rel_synonyms_count                            0.160759
global_clustering * global_frequency                       0.128327
global_clustering * global_letters_count                  -0.218488
global_clustering * global_orthographic_density           -1.489933
global_clustering * global_synonyms_count                 -1.358531
global_clustering * rel_aoa                                0.242476
global_clustering * rel_clustering                         0.261237
global_clustering * rel_frequency                          0.084422
global_clustering * rel_letters_count                      0.078646
global_clustering * rel_orthographic_density               0.695117
global_clustering * rel_synonyms_count                     0.052987
global_frequency * global_letters_count                    0.153797
global_frequency * global_orthographic_density             0.327789
global_frequency * global_synonyms_count                  -0.194305
global_frequency * rel_aoa                                 0.107965
global_frequency * rel_clustering                         -0.276055
global_frequency * rel_frequency                          -0.024905
global_frequency * rel_letters_count                      -0.247561
global_frequency * rel_orthographic_density               -0.554875
global_frequency * rel_synonyms_count                      0.364874
global_letters_count * global_orthographic_density         0.182728
global_letters_count * global_synonyms_count               0.889055
global_letters_count * rel_aoa                            -0.093903
global_letters_count * rel_clustering                      0.140925
global_letters_count * rel_frequency                      -0.132344
global_letters_count * rel_letters_count                   0.035959
global_letters_count * rel_orthographic_density           -0.291385
global_letters_count * rel_synonyms_count                 -1.009039
global_orthographic_density * global_synonyms_count        1.083430
global_orthographic_density * rel_aoa                     -0.432068
global_orthographic_density * rel_clustering               0.903195
global_orthographic_density * rel_frequency               -0.398247
global_orthographic_density * rel_letters_count           -0.229750
global_orthographic_density * rel_orthographic_density     0.021934
global_orthographic_density * rel_synonyms_count          -1.268924
global_synonyms_count * rel_aoa                           -0.154471
global_synonyms_count * rel_clustering                     1.647064
global_synonyms_count * rel_frequency                     -0.337138
global_synonyms_count * rel_letters_count                 -0.999550
global_synonyms_count * rel_orthographic_density          -1.167586
global_synonyms_count * rel_synonyms_count                 0.020283
rel_aoa * rel_clustering                                  -0.190725
rel_aoa * rel_frequency                                   -0.093969
rel_aoa * rel_letters_count                                0.108539
rel_aoa * rel_orthographic_density                         0.442107
rel_aoa * rel_synonyms_count                               0.022339
rel_clustering * rel_frequency                             0.050880
rel_clustering * rel_letters_count                         0.312547
rel_clustering * rel_orthographic_density                 -0.101880
rel_clustering * rel_synonyms_count                       -1.118532
rel_frequency * rel_letters_count                          0.184636
rel_frequency * rel_orthographic_density                   0.611265
rel_frequency * rel_synonyms_count                         0.094214
rel_letters_count * rel_orthographic_density               0.420300
rel_letters_count * rel_synonyms_count                     1.184434
rel_orthographic_density * rel_synonyms_count              1.617357
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.033931914377605676

intercept                     -4.820888
global_aoa                    -0.000421
global_clustering              0.105409
global_frequency              -0.048591
global_letters_count           0.012076
global_orthographic_density    0.014113
global_synonyms_count          0.033203
dtype: float64

Regressing global clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08123231760532856

intercept                                             -3.884690
global_aoa                                             0.030359
global_clustering                                      0.396772
global_frequency                                      -0.570067
global_letters_count                                   0.560928
global_orthographic_density                            0.854922
global_synonyms_count                                  0.240942
global_aoa * global_clustering                        -0.013652
global_aoa * global_frequency                         -0.000076
global_aoa * global_letters_count                     -0.011983
global_aoa * global_orthographic_density              -0.027398
global_aoa * global_synonyms_count                    -0.009936
global_clustering * global_frequency                  -0.080046
global_clustering * global_letters_count               0.084381
global_clustering * global_orthographic_density        0.075157
global_clustering * global_synonyms_count             -0.120412
global_frequency * global_letters_count                0.009301
global_frequency * global_orthographic_density         0.000354
global_frequency * global_synonyms_count              -0.000775
global_letters_count * global_orthographic_density    -0.028294
global_letters_count * global_synonyms_count          -0.095631
global_orthographic_density * global_synonyms_count   -0.212156
dtype: float64

Regressing rel clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.012374512677456151

intercept                      0.849321
global_aoa                     0.001467
global_clustering              0.081037
global_frequency              -0.018157
global_letters_count           0.019221
global_orthographic_density    0.044314
global_synonyms_count         -0.004106
dtype: float64

Regressing rel clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.04234841436161774

intercept                                              1.292706
global_aoa                                             0.007535
global_clustering                                      0.268409
global_frequency                                      -0.379462
global_letters_count                                   0.363336
global_orthographic_density                            0.896393
global_synonyms_count                                  0.809552
global_aoa * global_clustering                        -0.016971
global_aoa * global_frequency                          0.004142
global_aoa * global_letters_count                     -0.014141
global_aoa * global_orthographic_density              -0.043420
global_aoa * global_synonyms_count                    -0.024054
global_clustering * global_frequency                  -0.047983
global_clustering * global_letters_count               0.050682
global_clustering * global_orthographic_density        0.060740
global_clustering * global_synonyms_count             -0.031243
global_frequency * global_letters_count                0.010402
global_frequency * global_orthographic_density        -0.003396
global_frequency * global_synonyms_count              -0.011429
global_letters_count * global_orthographic_density    -0.019703
global_letters_count * global_synonyms_count          -0.076162
global_orthographic_density * global_synonyms_count   -0.217304
dtype: float64

Regressing global clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.011155759137808974

intercept                  -5.875131
rel_aoa                     0.008922
rel_clustering              0.086256
rel_frequency              -0.013897
rel_letters_count           0.004652
rel_orthographic_density    0.009439
rel_synonyms_count          0.013087
dtype: float64

Regressing global clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.033163522917485366

intercept                                       -5.874513
rel_aoa                                         -0.007130
rel_clustering                                  -0.052851
rel_frequency                                   -0.033577
rel_letters_count                               -0.016396
rel_orthographic_density                         0.050759
rel_synonyms_count                              -0.034835
rel_aoa * rel_clustering                        -0.003246
rel_aoa * rel_frequency                         -0.011366
rel_aoa * rel_letters_count                     -0.013494
rel_aoa * rel_orthographic_density              -0.016021
rel_aoa * rel_synonyms_count                    -0.005327
rel_clustering * rel_frequency                  -0.010156
rel_clustering * rel_letters_count               0.071089
rel_clustering * rel_orthographic_density        0.031520
rel_clustering * rel_synonyms_count             -0.061312
rel_frequency * rel_letters_count                0.010712
rel_frequency * rel_orthographic_density        -0.008284
rel_frequency * rel_synonyms_count              -0.021835
rel_letters_count * rel_orthographic_density    -0.031939
rel_letters_count * rel_synonyms_count          -0.048118
rel_orthographic_density * rel_synonyms_count   -0.121515
dtype: float64

Regressing rel clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10861778136851286

intercept                   0.297803
rel_aoa                    -0.004811
rel_clustering              0.346959
rel_frequency               0.009841
rel_letters_count           0.022631
rel_orthographic_density    0.047397
rel_synonyms_count          0.046346
dtype: float64

Regressing rel clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1305238294197909

intercept                                        0.265931
rel_aoa                                         -0.021263
rel_clustering                                   0.225564
rel_frequency                                   -0.021398
rel_letters_count                                0.021208
rel_orthographic_density                         0.056690
rel_synonyms_count                              -0.008122
rel_aoa * rel_clustering                         0.006491
rel_aoa * rel_frequency                         -0.002598
rel_aoa * rel_letters_count                     -0.017415
rel_aoa * rel_orthographic_density              -0.043953
rel_aoa * rel_synonyms_count                     0.003082
rel_clustering * rel_frequency                  -0.012028
rel_clustering * rel_letters_count               0.058360
rel_clustering * rel_orthographic_density        0.036358
rel_clustering * rel_synonyms_count             -0.089687
rel_frequency * rel_letters_count                0.009533
rel_frequency * rel_orthographic_density        -0.018356
rel_frequency * rel_synonyms_count              -0.023646
rel_letters_count * rel_orthographic_density    -0.018361
rel_letters_count * rel_synonyms_count          -0.020245
rel_orthographic_density * rel_synonyms_count   -0.061578
dtype: float64

Regressing global clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.042658207111811675

intercept                     -3.843839
global_aoa                    -0.004422
global_clustering              0.138302
global_frequency              -0.086727
global_letters_count          -0.015335
global_orthographic_density   -0.067784
global_synonyms_count          0.113083
rel_aoa                        0.003438
rel_clustering                -0.032841
rel_frequency                  0.045011
rel_letters_count              0.023776
rel_orthographic_density       0.085796
rel_synonyms_count            -0.106382
dtype: float64

Regressing global clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16576558099857897

intercept                                                 28.225704
global_aoa                                                -0.499101
global_clustering                                          5.985093
global_frequency                                          -2.454450
global_letters_count                                       0.153155
global_orthographic_density                                0.466818
global_synonyms_count                                     -3.657983
rel_aoa                                                    0.811772
rel_clustering                                            -5.700256
rel_frequency                                              1.711202
rel_letters_count                                          0.100021
rel_orthographic_density                                  -0.373099
rel_synonyms_count                                         1.241798
global_aoa * global_clustering                            -0.141999
global_aoa * global_frequency                             -0.002288
global_aoa * global_letters_count                         -0.044548
global_aoa * global_orthographic_density                  -0.090428
global_aoa * global_synonyms_count                         0.023579
global_aoa * rel_aoa                                       0.005127
global_aoa * rel_clustering                                0.176993
global_aoa * rel_frequency                                 0.019584
global_aoa * rel_letters_count                             0.039095
global_aoa * rel_orthographic_density                      0.042774
global_aoa * rel_synonyms_count                            0.019116
global_clustering * global_frequency                      -0.398437
global_clustering * global_letters_count                  -0.025595
global_clustering * global_orthographic_density           -0.136694
global_clustering * global_synonyms_count                 -0.377096
global_clustering * rel_aoa                                0.101975
global_clustering * rel_clustering                        -0.082592
global_clustering * rel_frequency                          0.318465
global_clustering * rel_letters_count                      0.060380
global_clustering * rel_orthographic_density               0.057204
global_clustering * rel_synonyms_count                     0.316893
global_frequency * global_letters_count                   -0.001322
global_frequency * global_orthographic_density            -0.033601
global_frequency * global_synonyms_count                   0.155274
global_frequency * rel_aoa                                -0.017493
global_frequency * rel_clustering                          0.271457
global_frequency * rel_frequency                           0.012447
global_frequency * rel_letters_count                       0.017540
global_frequency * rel_orthographic_density                0.022506
global_frequency * rel_synonyms_count                      0.009484
global_letters_count * global_orthographic_density        -0.023222
global_letters_count * global_synonyms_count               0.111149
global_letters_count * rel_aoa                            -0.030654
global_letters_count * rel_clustering                      0.091576
global_letters_count * rel_frequency                       0.005978
global_letters_count * rel_letters_count                   0.002638
global_letters_count * rel_orthographic_density            0.027717
global_letters_count * rel_synonyms_count                 -0.038115
global_orthographic_density * global_synonyms_count       -0.483804
global_orthographic_density * rel_aoa                      0.093601
global_orthographic_density * rel_clustering               0.252837
global_orthographic_density * rel_frequency                0.069158
global_orthographic_density * rel_letters_count           -0.046753
global_orthographic_density * rel_orthographic_density    -0.008917
global_orthographic_density * rel_synonyms_count           0.325114
global_synonyms_count * rel_aoa                           -0.092794
global_synonyms_count * rel_clustering                     0.107171
global_synonyms_count * rel_frequency                     -0.301871
global_synonyms_count * rel_letters_count                 -0.340981
global_synonyms_count * rel_orthographic_density           0.121767
global_synonyms_count * rel_synonyms_count                -0.017377
rel_aoa * rel_clustering                                  -0.123215
rel_aoa * rel_frequency                                    0.018560
rel_aoa * rel_letters_count                                0.007908
rel_aoa * rel_orthographic_density                        -0.095578
rel_aoa * rel_synonyms_count                               0.036720
rel_clustering * rel_frequency                            -0.269356
rel_clustering * rel_letters_count                        -0.037480
rel_clustering * rel_orthographic_density                 -0.054060
rel_clustering * rel_synonyms_count                       -0.155159
rel_frequency * rel_letters_count                         -0.018165
rel_frequency * rel_orthographic_density                  -0.065649
rel_frequency * rel_synonyms_count                         0.149033
rel_letters_count * rel_orthographic_density               0.012219
rel_letters_count * rel_synonyms_count                     0.205616
rel_orthographic_density * rel_synonyms_count             -0.107492
dtype: float64

Regressing rel clustering with 607 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.20274883140649025

intercept                     -2.500263
global_aoa                    -0.003360
global_clustering             -0.585110
global_frequency              -0.065435
global_letters_count          -0.011746
global_orthographic_density   -0.026039
global_synonyms_count         -0.007917
rel_aoa                       -0.002826
rel_clustering                 0.819395
rel_frequency                  0.035453
rel_letters_count              0.031799
rel_orthographic_density       0.050717
rel_synonyms_count             0.020362
dtype: float64

Regressing rel clustering with 607 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2970684743635801

intercept                                                 22.459585
global_aoa                                                -0.377562
global_clustering                                          3.715809
global_frequency                                          -1.777195
global_letters_count                                      -0.028506
global_orthographic_density                               -0.552711
global_synonyms_count                                     -2.395249
rel_aoa                                                    0.803425
rel_clustering                                            -3.924445
rel_frequency                                              1.275820
rel_letters_count                                         -0.038088
rel_orthographic_density                                   0.552512
rel_synonyms_count                                        -0.160361
global_aoa * global_clustering                            -0.101224
global_aoa * global_frequency                              0.004746
global_aoa * global_letters_count                         -0.036484
global_aoa * global_orthographic_density                  -0.082817
global_aoa * global_synonyms_count                        -0.006996
global_aoa * rel_aoa                                       0.003867
global_aoa * rel_clustering                                0.131517
global_aoa * rel_frequency                                 0.012916
global_aoa * rel_letters_count                             0.037945
global_aoa * rel_orthographic_density                      0.040010
global_aoa * rel_synonyms_count                            0.052745
global_clustering * global_frequency                      -0.269604
global_clustering * global_letters_count                  -0.030503
global_clustering * global_orthographic_density           -0.148714
global_clustering * global_synonyms_count                 -0.417536
global_clustering * rel_aoa                                0.090281
global_clustering * rel_clustering                        -0.133228
global_clustering * rel_frequency                          0.220330
global_clustering * rel_letters_count                      0.028590
global_clustering * rel_orthographic_density               0.095727
global_clustering * rel_synonyms_count                     0.359157
global_frequency * global_letters_count                   -0.000408
global_frequency * global_orthographic_density             0.028183
global_frequency * global_synonyms_count                   0.000787
global_frequency * rel_aoa                                -0.023631
global_frequency * rel_clustering                          0.168546
global_frequency * rel_frequency                           0.011568
global_frequency * rel_letters_count                       0.019338
global_frequency * rel_orthographic_density               -0.031385
global_frequency * rel_synonyms_count                      0.158344
global_letters_count * global_orthographic_density         0.021476
global_letters_count * global_synonyms_count               0.063178
global_letters_count * rel_aoa                            -0.025372
global_letters_count * rel_clustering                      0.116261
global_letters_count * rel_frequency                      -0.001829
global_letters_count * rel_letters_count                   0.004587
global_letters_count * rel_orthographic_density           -0.000528
global_letters_count * rel_synonyms_count                  0.030277
global_orthographic_density * global_synonyms_count       -0.319618
global_orthographic_density * rel_aoa                      0.086281
global_orthographic_density * rel_clustering               0.268187
global_orthographic_density * rel_frequency                0.004980
global_orthographic_density * rel_letters_count           -0.081146
global_orthographic_density * rel_orthographic_density     0.018598
global_orthographic_density * rel_synonyms_count           0.197922
global_synonyms_count * rel_aoa                           -0.118148
global_synonyms_count * rel_clustering                     0.191062
global_synonyms_count * rel_frequency                     -0.177757
global_synonyms_count * rel_letters_count                 -0.220178
global_synonyms_count * rel_orthographic_density          -0.030527
global_synonyms_count * rel_synonyms_count                -0.001182
rel_aoa * rel_clustering                                  -0.104774
rel_aoa * rel_frequency                                    0.019743
rel_aoa * rel_letters_count                                0.004840
rel_aoa * rel_orthographic_density                        -0.083873
rel_aoa * rel_synonyms_count                               0.066078
rel_clustering * rel_frequency                            -0.188685
rel_clustering * rel_letters_count                        -0.037019
rel_clustering * rel_orthographic_density                 -0.114708
rel_clustering * rel_synonyms_count                       -0.233168
rel_frequency * rel_letters_count                         -0.012172
rel_frequency * rel_orthographic_density                  -0.010192
rel_frequency * rel_synonyms_count                         0.020865
rel_letters_count * rel_orthographic_density               0.058479
rel_letters_count * rel_synonyms_count                     0.084326
rel_orthographic_density * rel_synonyms_count              0.034080
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0807771419811466

intercept                      4.496455
global_aoa                     0.051809
global_clustering              0.048775
global_frequency               0.063696
global_letters_count           0.245774
global_orthographic_density   -0.083222
global_synonyms_count         -0.333391
dtype: float64

Regressing global letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09736858487767219

intercept                                             -5.545684
global_aoa                                             0.520133
global_clustering                                     -2.155078
global_frequency                                       0.698981
global_letters_count                                   0.374575
global_orthographic_density                           -1.337591
global_synonyms_count                                 -1.796726
global_aoa * global_clustering                         0.162747
global_aoa * global_frequency                          0.044115
global_aoa * global_letters_count                      0.004257
global_aoa * global_orthographic_density               0.058031
global_aoa * global_synonyms_count                    -0.032052
global_clustering * global_frequency                   0.142347
global_clustering * global_letters_count              -0.015473
global_clustering * global_orthographic_density       -0.052860
global_clustering * global_synonyms_count              0.015182
global_frequency * global_letters_count               -0.032657
global_frequency * global_orthographic_density         0.062385
global_frequency * global_synonyms_count               0.030693
global_letters_count * global_orthographic_density    -0.023829
global_letters_count * global_synonyms_count           0.191081
global_orthographic_density * global_synonyms_count    0.283425
dtype: float64

Regressing rel letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03649221783495882

intercept                      1.873680
global_aoa                    -0.000753
global_clustering              0.014498
global_frequency               0.030166
global_letters_count           0.147424
global_orthographic_density   -0.079323
global_synonyms_count         -0.438244
dtype: float64

Regressing rel letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05843822163288248

intercept                                             -11.898934
global_aoa                                              0.835790
global_clustering                                      -2.256448
global_frequency                                        1.079004
global_letters_count                                    0.393079
global_orthographic_density                            -1.491340
global_synonyms_count                                  -0.966995
global_aoa * global_clustering                          0.179728
global_aoa * global_frequency                           0.041232
global_aoa * global_letters_count                      -0.032115
global_aoa * global_orthographic_density                0.047814
global_aoa * global_synonyms_count                     -0.076989
global_clustering * global_frequency                    0.186257
global_clustering * global_letters_count               -0.071498
global_clustering * global_orthographic_density        -0.163630
global_clustering * global_synonyms_count               0.056968
global_frequency * global_letters_count                -0.048976
global_frequency * global_orthographic_density          0.039032
global_frequency * global_synonyms_count                0.000632
global_letters_count * global_orthographic_density     -0.059355
global_letters_count * global_synonyms_count            0.173956
global_orthographic_density * global_synonyms_count     0.251396
dtype: float64

Regressing global letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06838693981693988

intercept                   5.750199
rel_aoa                    -0.039160
rel_clustering              0.208020
rel_frequency               0.045976
rel_letters_count           0.215220
rel_orthographic_density   -0.220607
rel_synonyms_count         -0.325934
dtype: float64

Regressing global letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07758768476991895

intercept                                        5.778664
rel_aoa                                         -0.054680
rel_clustering                                  -0.003456
rel_frequency                                    0.080733
rel_letters_count                                0.251589
rel_orthographic_density                        -0.350158
rel_synonyms_count                              -0.345890
rel_aoa * rel_clustering                         0.050369
rel_aoa * rel_frequency                         -0.006537
rel_aoa * rel_letters_count                     -0.007696
rel_aoa * rel_orthographic_density              -0.013256
rel_aoa * rel_synonyms_count                    -0.082041
rel_clustering * rel_frequency                  -0.054386
rel_clustering * rel_letters_count               0.047550
rel_clustering * rel_orthographic_density        0.045881
rel_clustering * rel_synonyms_count             -0.213270
rel_frequency * rel_letters_count               -0.008665
rel_frequency * rel_orthographic_density        -0.022939
rel_frequency * rel_synonyms_count              -0.009878
rel_letters_count * rel_orthographic_density     0.050377
rel_letters_count * rel_synonyms_count           0.161692
rel_orthographic_density * rel_synonyms_count    0.283412
dtype: float64

Regressing rel letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12320731979103328

intercept                   1.533476
rel_aoa                    -0.057192
rel_clustering              0.050852
rel_frequency              -0.151599
rel_letters_count           0.398339
rel_orthographic_density    0.085029
rel_synonyms_count         -0.384832
dtype: float64

Regressing rel letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.145411366421076

intercept                                        1.536042
rel_aoa                                          0.010773
rel_clustering                                  -0.059689
rel_frequency                                   -0.132869
rel_letters_count                                0.535550
rel_orthographic_density                         0.108709
rel_synonyms_count                              -0.345829
rel_aoa * rel_clustering                         0.127293
rel_aoa * rel_frequency                          0.029905
rel_aoa * rel_letters_count                     -0.057811
rel_aoa * rel_orthographic_density              -0.114758
rel_aoa * rel_synonyms_count                    -0.107585
rel_clustering * rel_frequency                  -0.009860
rel_clustering * rel_letters_count               0.067731
rel_clustering * rel_orthographic_density        0.158397
rel_clustering * rel_synonyms_count             -0.142832
rel_frequency * rel_letters_count                0.008678
rel_frequency * rel_orthographic_density         0.048660
rel_frequency * rel_synonyms_count               0.025260
rel_letters_count * rel_orthographic_density     0.077818
rel_letters_count * rel_synonyms_count           0.207998
rel_orthographic_density * rel_synonyms_count    0.345578
dtype: float64

Regressing global letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09202256694440736

intercept                      0.020962
global_aoa                     0.139630
global_clustering             -0.269572
global_frequency               0.190914
global_letters_count           0.328175
global_orthographic_density   -0.057750
global_synonyms_count         -0.178095
rel_aoa                       -0.140628
rel_clustering                 0.360391
rel_frequency                 -0.162721
rel_letters_count             -0.078699
rel_orthographic_density      -0.004010
rel_synonyms_count            -0.160751
dtype: float64

Regressing global letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18121530106451755

intercept                                                 23.767466
global_aoa                                                 1.270540
global_clustering                                          1.349395
global_frequency                                          -0.575397
global_letters_count                                      -2.150712
global_orthographic_density                               -9.211766
global_synonyms_count                                     -7.902953
rel_aoa                                                   -2.771944
rel_clustering                                            -8.362601
rel_frequency                                              1.579317
rel_letters_count                                          1.602366
rel_orthographic_density                                   3.042989
rel_synonyms_count                                         2.278906
global_aoa * global_clustering                             0.245396
global_aoa * global_frequency                              0.015277
global_aoa * global_letters_count                          0.008427
global_aoa * global_orthographic_density                   0.152382
global_aoa * global_synonyms_count                        -0.397471
global_aoa * rel_aoa                                       0.051941
global_aoa * rel_clustering                               -0.080714
global_aoa * rel_frequency                                -0.017393
global_aoa * rel_letters_count                            -0.036708
global_aoa * rel_orthographic_density                     -0.059845
global_aoa * rel_synonyms_count                            0.398334
global_clustering * global_frequency                       0.062057
global_clustering * global_letters_count                  -0.032557
global_clustering * global_orthographic_density           -1.200611
global_clustering * global_synonyms_count                 -0.701694
global_clustering * rel_aoa                               -0.021298
global_clustering * rel_clustering                         0.060735
global_clustering * rel_frequency                          0.138499
global_clustering * rel_letters_count                     -0.283776
global_clustering * rel_orthographic_density               0.714540
global_clustering * rel_synonyms_count                     0.246943
global_frequency * global_letters_count                    0.204723
global_frequency * global_orthographic_density             0.133304
global_frequency * global_synonyms_count                  -0.041181
global_frequency * rel_aoa                                 0.204427
global_frequency * rel_clustering                          0.369266
global_frequency * rel_frequency                          -0.044732
global_frequency * rel_letters_count                      -0.318251
global_frequency * rel_orthographic_density                0.024256
global_frequency * rel_synonyms_count                      0.155130
global_letters_count * global_orthographic_density        -0.125055
global_letters_count * global_synonyms_count               1.082502
global_letters_count * rel_aoa                             0.143533
global_letters_count * rel_clustering                      0.379319
global_letters_count * rel_frequency                      -0.057522
global_letters_count * rel_letters_count                   0.041940
global_letters_count * rel_orthographic_density            0.215607
global_letters_count * rel_synonyms_count                 -0.822860
global_orthographic_density * global_synonyms_count        0.682514
global_orthographic_density * rel_aoa                     -0.275887
global_orthographic_density * rel_clustering               1.145644
global_orthographic_density * rel_frequency               -0.052805
global_orthographic_density * rel_letters_count            0.083257
global_orthographic_density * rel_orthographic_density     0.136868
global_orthographic_density * rel_synonyms_count          -0.561296
global_synonyms_count * rel_aoa                            0.280637
global_synonyms_count * rel_clustering                     1.595205
global_synonyms_count * rel_frequency                     -0.116702
global_synonyms_count * rel_letters_count                 -0.906695
global_synonyms_count * rel_orthographic_density          -0.371282
global_synonyms_count * rel_synonyms_count                -0.215535
rel_aoa * rel_clustering                                  -0.055969
rel_aoa * rel_frequency                                   -0.155612
rel_aoa * rel_letters_count                               -0.209031
rel_aoa * rel_orthographic_density                         0.200245
rel_aoa * rel_synonyms_count                              -0.324468
rel_clustering * rel_frequency                            -0.505605
rel_clustering * rel_letters_count                        -0.037419
rel_clustering * rel_orthographic_density                 -0.568488
rel_clustering * rel_synonyms_count                       -1.143110
rel_frequency * rel_letters_count                          0.111661
rel_frequency * rel_orthographic_density                  -0.053247
rel_frequency * rel_synonyms_count                         0.028949
rel_letters_count * rel_orthographic_density              -0.078660
rel_letters_count * rel_synonyms_count                     0.805622
rel_orthographic_density * rel_synonyms_count              0.576901
dtype: float64

Regressing rel letters_count with 764 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18865821615654876

intercept                     -0.430233
global_aoa                     0.120902
global_clustering             -0.275053
global_frequency               0.187469
global_letters_count          -0.579197
global_orthographic_density   -0.021588
global_synonyms_count         -0.146319
rel_aoa                       -0.123557
rel_clustering                 0.346299
rel_frequency                 -0.183016
rel_letters_count              0.854699
rel_orthographic_density      -0.066558
rel_synonyms_count            -0.170556
dtype: float64

Regressing rel letters_count with 764 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2700247236607787

intercept                                                 10.220119
global_aoa                                                 1.171135
global_clustering                                         -0.534370
global_frequency                                          -0.231807
global_letters_count                                      -1.897864
global_orthographic_density                               -7.080533
global_synonyms_count                                     -5.067495
rel_aoa                                                   -2.586032
rel_clustering                                            -6.571813
rel_frequency                                              1.095107
rel_letters_count                                          1.963342
rel_orthographic_density                                   1.268203
rel_synonyms_count                                         0.122778
global_aoa * global_clustering                             0.230806
global_aoa * global_frequency                              0.023034
global_aoa * global_letters_count                          0.014365
global_aoa * global_orthographic_density                   0.089680
global_aoa * global_synonyms_count                        -0.427891
global_aoa * rel_aoa                                       0.043583
global_aoa * rel_clustering                               -0.077498
global_aoa * rel_frequency                                -0.034312
global_aoa * rel_letters_count                            -0.041006
global_aoa * rel_orthographic_density                      0.002245
global_aoa * rel_synonyms_count                            0.412189
global_clustering * global_frequency                       0.115104
global_clustering * global_letters_count                   0.104322
global_clustering * global_orthographic_density           -0.904299
global_clustering * global_synonyms_count                 -0.607973
global_clustering * rel_aoa                               -0.030890
global_clustering * rel_clustering                         0.072973
global_clustering * rel_frequency                          0.063215
global_clustering * rel_letters_count                     -0.364144
global_clustering * rel_orthographic_density               0.455451
global_clustering * rel_synonyms_count                     0.197213
global_frequency * global_letters_count                    0.175109
global_frequency * global_orthographic_density             0.156976
global_frequency * global_synonyms_count                  -0.118340
global_frequency * rel_aoa                                 0.190151
global_frequency * rel_clustering                          0.328675
global_frequency * rel_frequency                          -0.045746
global_frequency * rel_letters_count                      -0.284901
global_frequency * rel_orthographic_density                0.005150
global_frequency * rel_synonyms_count                      0.246471
global_letters_count * global_orthographic_density        -0.112352
global_letters_count * global_synonyms_count               0.985013
global_letters_count * rel_aoa                             0.111291
global_letters_count * rel_clustering                      0.255726
global_letters_count * rel_frequency                      -0.035085
global_letters_count * rel_letters_count                   0.020821
global_letters_count * rel_orthographic_density            0.166275
global_letters_count * rel_synonyms_count                 -0.767834
global_orthographic_density * global_synonyms_count        0.391029
global_orthographic_density * rel_aoa                     -0.199686
global_orthographic_density * rel_clustering               0.853070
global_orthographic_density * rel_frequency               -0.057824
global_orthographic_density * rel_letters_count            0.036158
global_orthographic_density * rel_orthographic_density     0.123194
global_orthographic_density * rel_synonyms_count          -0.371652
global_synonyms_count * rel_aoa                            0.326651
global_synonyms_count * rel_clustering                     1.412211
global_synonyms_count * rel_frequency                     -0.043212
global_synonyms_count * rel_letters_count                 -0.893505
global_synonyms_count * rel_orthographic_density          -0.130409
global_synonyms_count * rel_synonyms_count                -0.207271
rel_aoa * rel_clustering                                  -0.016607
rel_aoa * rel_frequency                                   -0.123407
rel_aoa * rel_letters_count                               -0.176510
rel_aoa * rel_orthographic_density                         0.137852
rel_aoa * rel_synonyms_count                              -0.353698
rel_clustering * rel_frequency                            -0.442512
rel_clustering * rel_letters_count                         0.033337
rel_clustering * rel_orthographic_density                 -0.313527
rel_clustering * rel_synonyms_count                       -0.996954
rel_frequency * rel_letters_count                          0.090605
rel_frequency * rel_orthographic_density                  -0.062597
rel_frequency * rel_synonyms_count                        -0.045261
rel_letters_count * rel_orthographic_density              -0.019264
rel_letters_count * rel_synonyms_count                     0.805542
rel_orthographic_density * rel_synonyms_count              0.406515
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0258899059436557

intercept                      0.162043
global_aoa                    -0.005955
global_clustering             -0.001344
global_frequency               0.004297
global_letters_count           0.016607
global_orthographic_density    0.039986
global_synonyms_count          0.165200
dtype: float64

Regressing global synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04418717580175424

intercept                                             -0.401800
global_aoa                                             0.143009
global_clustering                                      0.138424
global_frequency                                      -0.071927
global_letters_count                                   0.255395
global_orthographic_density                            0.178185
global_synonyms_count                                  0.081303
global_aoa * global_clustering                         0.005183
global_aoa * global_frequency                         -0.003376
global_aoa * global_letters_count                     -0.014482
global_aoa * global_orthographic_density              -0.000144
global_aoa * global_synonyms_count                     0.017348
global_clustering * global_frequency                  -0.026764
global_clustering * global_letters_count               0.012847
global_clustering * global_orthographic_density       -0.007467
global_clustering * global_synonyms_count              0.081962
global_frequency * global_letters_count               -0.005918
global_frequency * global_orthographic_density        -0.014401
global_frequency * global_synonyms_count               0.021614
global_letters_count * global_orthographic_density    -0.010951
global_letters_count * global_synonyms_count           0.021990
global_orthographic_density * global_synonyms_count    0.099744
dtype: float64

Regressing rel synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.02018622385823543

intercept                     -0.099143
global_aoa                    -0.002629
global_clustering             -0.002314
global_frequency               0.001596
global_letters_count           0.011717
global_orthographic_density    0.031847
global_synonyms_count          0.138871
dtype: float64

Regressing rel synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.037439080582630124

intercept                                             -0.354043
global_aoa                                             0.097753
global_clustering                                      0.033386
global_frequency                                      -0.143221
global_letters_count                                   0.203823
global_orthographic_density                            0.080491
global_synonyms_count                                  0.265103
global_aoa * global_clustering                         0.006410
global_aoa * global_frequency                          0.003134
global_aoa * global_letters_count                     -0.013835
global_aoa * global_orthographic_density              -0.001988
global_aoa * global_synonyms_count                     0.005146
global_clustering * global_frequency                  -0.021309
global_clustering * global_letters_count               0.015483
global_clustering * global_orthographic_density        0.002484
global_clustering * global_synonyms_count              0.107901
global_frequency * global_letters_count               -0.000083
global_frequency * global_orthographic_density         0.002585
global_frequency * global_synonyms_count               0.021601
global_letters_count * global_orthographic_density    -0.008756
global_letters_count * global_synonyms_count           0.029525
global_orthographic_density * global_synonyms_count    0.084387
dtype: float64

Regressing global synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02702042590328624

intercept                   0.423667
rel_aoa                     0.009452
rel_clustering             -0.043373
rel_frequency               0.012040
rel_letters_count           0.016288
rel_orthographic_density    0.047845
rel_synonyms_count          0.175789
dtype: float64

Regressing global synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03957561273765242

intercept                                        0.477939
rel_aoa                                         -0.002337
rel_clustering                                  -0.137343
rel_frequency                                    0.028655
rel_letters_count                               -0.023514
rel_orthographic_density                         0.067137
rel_synonyms_count                               0.218913
rel_aoa * rel_clustering                        -0.005476
rel_aoa * rel_frequency                         -0.005885
rel_aoa * rel_letters_count                     -0.001361
rel_aoa * rel_orthographic_density              -0.001319
rel_aoa * rel_synonyms_count                     0.020417
rel_clustering * rel_frequency                  -0.011012
rel_clustering * rel_letters_count               0.022695
rel_clustering * rel_orthographic_density       -0.023125
rel_clustering * rel_synonyms_count              0.073823
rel_frequency * rel_letters_count               -0.006458
rel_frequency * rel_orthographic_density        -0.004015
rel_frequency * rel_synonyms_count               0.021089
rel_letters_count * rel_orthographic_density    -0.013382
rel_letters_count * rel_synonyms_count           0.008112
rel_orthographic_density * rel_synonyms_count    0.044178
dtype: float64

Regressing rel synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.09086803300209456

intercept                   0.097421
rel_aoa                    -0.002289
rel_clustering              0.021639
rel_frequency               0.017665
rel_letters_count           0.012817
rel_orthographic_density    0.024157
rel_synonyms_count          0.318237
dtype: float64

Regressing rel synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.10737474794407365

intercept                                        0.134595
rel_aoa                                         -0.007622
rel_clustering                                  -0.071137
rel_frequency                                    0.024487
rel_letters_count                               -0.014757
rel_orthographic_density                         0.049280
rel_synonyms_count                               0.424949
rel_aoa * rel_clustering                         0.013787
rel_aoa * rel_frequency                         -0.000758
rel_aoa * rel_letters_count                      0.001546
rel_aoa * rel_orthographic_density               0.004209
rel_aoa * rel_synonyms_count                     0.000707
rel_clustering * rel_frequency                  -0.005370
rel_clustering * rel_letters_count               0.037407
rel_clustering * rel_orthographic_density        0.004708
rel_clustering * rel_synonyms_count              0.067167
rel_frequency * rel_letters_count               -0.001693
rel_frequency * rel_orthographic_density         0.003933
rel_frequency * rel_synonyms_count               0.033255
rel_letters_count * rel_orthographic_density    -0.008915
rel_letters_count * rel_synonyms_count           0.021277
rel_orthographic_density * rel_synonyms_count    0.082897
dtype: float64

Regressing global synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03504486455132905

intercept                      1.444999
global_aoa                    -0.019031
global_clustering              0.098909
global_frequency              -0.027096
global_letters_count          -0.019685
global_orthographic_density    0.027598
global_synonyms_count          0.057983
rel_aoa                        0.022041
rel_clustering                -0.116550
rel_frequency                  0.039098
rel_letters_count              0.037009
rel_orthographic_density       0.010543
rel_synonyms_count             0.121298
dtype: float64

Regressing global synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12343681858697632

intercept                                                 10.498360
global_aoa                                                -0.355345
global_clustering                                          1.957920
global_frequency                                          -0.609985
global_letters_count                                       0.530898
global_orthographic_density                               -0.333605
global_synonyms_count                                      1.199690
rel_aoa                                                    0.001444
rel_clustering                                            -1.271726
rel_frequency                                              0.244551
rel_letters_count                                         -0.566895
rel_orthographic_density                                   0.655830
rel_synonyms_count                                        -2.595133
global_aoa * global_clustering                            -0.069129
global_aoa * global_frequency                              0.009646
global_aoa * global_letters_count                         -0.010284
global_aoa * global_orthographic_density                  -0.069786
global_aoa * global_synonyms_count                         0.014935
global_aoa * rel_aoa                                      -0.006496
global_aoa * rel_clustering                                0.075351
global_aoa * rel_frequency                                -0.028789
global_aoa * rel_letters_count                            -0.002601
global_aoa * rel_orthographic_density                      0.083114
global_aoa * rel_synonyms_count                            0.017284
global_clustering * global_frequency                      -0.112312
global_clustering * global_letters_count                   0.074372
global_clustering * global_orthographic_density           -0.208440
global_clustering * global_synonyms_count                  0.165779
global_clustering * rel_aoa                               -0.004697
global_clustering * rel_clustering                         0.072905
global_clustering * rel_frequency                          0.048930
global_clustering * rel_letters_count                     -0.136461
global_clustering * rel_orthographic_density               0.167556
global_clustering * rel_synonyms_count                    -0.170751
global_frequency * global_letters_count                   -0.009082
global_frequency * global_orthographic_density            -0.052385
global_frequency * global_synonyms_count                   0.071252
global_frequency * rel_aoa                                -0.000990
global_frequency * rel_clustering                          0.076582
global_frequency * rel_frequency                           0.004356
global_frequency * rel_letters_count                      -0.016851
global_frequency * rel_orthographic_density                0.006488
global_frequency * rel_synonyms_count                     -0.024381
global_letters_count * global_orthographic_density         0.049285
global_letters_count * global_synonyms_count               0.052500
global_letters_count * rel_aoa                            -0.008291
global_letters_count * rel_clustering                     -0.065541
global_letters_count * rel_frequency                       0.028006
global_letters_count * rel_letters_count                  -0.001870
global_letters_count * rel_orthographic_density           -0.053266
global_letters_count * rel_synonyms_count                  0.044230
global_orthographic_density * global_synonyms_count       -0.433153
global_orthographic_density * rel_aoa                      0.023331
global_orthographic_density * rel_clustering               0.195511
global_orthographic_density * rel_frequency                0.047589
global_orthographic_density * rel_letters_count           -0.039374
global_orthographic_density * rel_orthographic_density    -0.053808
global_orthographic_density * rel_synonyms_count           0.631559
global_synonyms_count * rel_aoa                            0.058744
global_synonyms_count * rel_clustering                     0.118544
global_synonyms_count * rel_frequency                     -0.013369
global_synonyms_count * rel_letters_count                 -0.066444
global_synonyms_count * rel_orthographic_density           0.576360
global_synonyms_count * rel_synonyms_count                 0.142958
rel_aoa * rel_clustering                                   0.004447
rel_aoa * rel_frequency                                    0.018909
rel_aoa * rel_letters_count                                0.012223
rel_aoa * rel_orthographic_density                        -0.064389
rel_aoa * rel_synonyms_count                              -0.075072
rel_clustering * rel_frequency                            -0.010039
rel_clustering * rel_letters_count                         0.147684
rel_clustering * rel_orthographic_density                 -0.137738
rel_clustering * rel_synonyms_count                       -0.069442
rel_frequency * rel_letters_count                         -0.006789
rel_frequency * rel_orthographic_density                  -0.026409
rel_frequency * rel_synonyms_count                        -0.011180
rel_letters_count * rel_orthographic_density               0.002087
rel_letters_count * rel_synonyms_count                     0.019757
rel_orthographic_density * rel_synonyms_count             -0.678181
dtype: float64

Regressing rel synonyms_count with 735 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.17499505310116914

intercept                      1.081055
global_aoa                    -0.015003
global_clustering              0.050933
global_frequency              -0.041471
global_letters_count          -0.019046
global_orthographic_density    0.073297
global_synonyms_count         -0.661065
rel_aoa                        0.016688
rel_clustering                -0.066187
rel_frequency                  0.046671
rel_letters_count              0.033778
rel_orthographic_density      -0.042685
rel_synonyms_count             0.944580
dtype: float64

Regressing rel synonyms_count with 735 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.2526252498091932

intercept                                                 11.868882
global_aoa                                                -0.334001
global_clustering                                          1.832409
global_frequency                                          -0.766276
global_letters_count                                       0.080026
global_orthographic_density                               -0.442568
global_synonyms_count                                     -0.314967
rel_aoa                                                    0.102694
rel_clustering                                            -1.256825
rel_frequency                                              0.408555
rel_letters_count                                         -0.221776
rel_orthographic_density                                   0.690293
rel_synonyms_count                                        -1.016661
global_aoa * global_clustering                            -0.057142
global_aoa * global_frequency                              0.006941
global_aoa * global_letters_count                         -0.000691
global_aoa * global_orthographic_density                  -0.053271
global_aoa * global_synonyms_count                         0.038192
global_aoa * rel_aoa                                      -0.001350
global_aoa * rel_clustering                                0.056660
global_aoa * rel_frequency                                -0.023305
global_aoa * rel_letters_count                            -0.010909
global_aoa * rel_orthographic_density                      0.057446
global_aoa * rel_synonyms_count                            0.006080
global_clustering * global_frequency                      -0.111430
global_clustering * global_letters_count                   0.038943
global_clustering * global_orthographic_density           -0.152597
global_clustering * global_synonyms_count                  0.143648
global_clustering * rel_aoa                               -0.004180
global_clustering * rel_clustering                         0.070176
global_clustering * rel_frequency                          0.048294
global_clustering * rel_letters_count                     -0.098909
global_clustering * rel_orthographic_density               0.124752
global_clustering * rel_synonyms_count                    -0.111735
global_frequency * global_letters_count                    0.005965
global_frequency * global_orthographic_density            -0.015487
global_frequency * global_synonyms_count                   0.102607
global_frequency * rel_aoa                                -0.002460
global_frequency * rel_clustering                          0.091241
global_frequency * rel_frequency                           0.004453
global_frequency * rel_letters_count                      -0.021263
global_frequency * rel_orthographic_density               -0.013434
global_frequency * rel_synonyms_count                     -0.028804
global_letters_count * global_orthographic_density         0.043682
global_letters_count * global_synonyms_count               0.065085
global_letters_count * rel_aoa                            -0.022177
global_letters_count * rel_clustering                     -0.032868
global_letters_count * rel_frequency                       0.006119
global_letters_count * rel_letters_count                   0.000619
global_letters_count * rel_orthographic_density           -0.042940
global_letters_count * rel_synonyms_count                  0.022977
global_orthographic_density * global_synonyms_count       -0.388138
global_orthographic_density * rel_aoa                      0.009756
global_orthographic_density * rel_clustering               0.133542
global_orthographic_density * rel_frequency                0.010044
global_orthographic_density * rel_letters_count           -0.035273
global_orthographic_density * rel_orthographic_density    -0.043250
global_orthographic_density * rel_synonyms_count           0.569098
global_synonyms_count * rel_aoa                            0.021001
global_synonyms_count * rel_clustering                     0.048168
global_synonyms_count * rel_frequency                     -0.053711
global_synonyms_count * rel_letters_count                 -0.116413
global_synonyms_count * rel_orthographic_density           0.429746
global_synonyms_count * rel_synonyms_count                 0.138515
rel_aoa * rel_clustering                                   0.010505
rel_aoa * rel_frequency                                    0.020704
rel_aoa * rel_letters_count                                0.021388
rel_aoa * rel_orthographic_density                        -0.037231
rel_aoa * rel_synonyms_count                              -0.065615
rel_clustering * rel_frequency                            -0.020809
rel_clustering * rel_letters_count                         0.111770
rel_clustering * rel_orthographic_density                 -0.092046
rel_clustering * rel_synonyms_count                       -0.040957
rel_frequency * rel_letters_count                          0.005467
rel_frequency * rel_orthographic_density                   0.005897
rel_frequency * rel_synonyms_count                         0.011151
rel_letters_count * rel_orthographic_density               0.010469
rel_letters_count * rel_synonyms_count                     0.067783
rel_orthographic_density * rel_synonyms_count             -0.521424
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08340176085129958

intercept                      1.844739
global_aoa                    -0.025669
global_clustering              0.017896
global_frequency              -0.019246
global_letters_count          -0.054348
global_orthographic_density    0.171781
global_synonyms_count          0.064145
dtype: float64

Regressing global orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10878895553984713

intercept                                              4.663776
global_aoa                                            -0.336204
global_clustering                                      0.239288
global_frequency                                      -0.100464
global_letters_count                                  -0.414033
global_orthographic_density                            0.366238
global_synonyms_count                                  0.526188
global_aoa * global_clustering                        -0.023004
global_aoa * global_frequency                          0.000281
global_aoa * global_letters_count                      0.016563
global_aoa * global_orthographic_density               0.062094
global_aoa * global_synonyms_count                     0.003946
global_clustering * global_frequency                  -0.004755
global_clustering * global_letters_count              -0.027818
global_clustering * global_orthographic_density        0.066683
global_clustering * global_synonyms_count              0.097325
global_frequency * global_letters_count                0.012315
global_frequency * global_orthographic_density        -0.008322
global_frequency * global_synonyms_count               0.004158
global_letters_count * global_orthographic_density    -0.023739
global_letters_count * global_synonyms_count          -0.007731
global_orthographic_density * global_synonyms_count    0.075736
dtype: float64

Regressing rel orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05590958251390987

intercept                     -0.557442
global_aoa                    -0.011095
global_clustering              0.011167
global_frequency              -0.014920
global_letters_count          -0.050357
global_orthographic_density    0.128628
global_synonyms_count          0.073338
dtype: float64

Regressing rel orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0812463424741372

intercept                                              2.578666
global_aoa                                            -0.163737
global_clustering                                      0.357911
global_frequency                                      -0.072296
global_letters_count                                  -0.529116
global_orthographic_density                            0.007942
global_synonyms_count                                  0.421335
global_aoa * global_clustering                        -0.009509
global_aoa * global_frequency                         -0.006840
global_aoa * global_letters_count                      0.014624
global_aoa * global_orthographic_density               0.065180
global_aoa * global_synonyms_count                    -0.008301
global_clustering * global_frequency                  -0.007685
global_clustering * global_letters_count              -0.056921
global_clustering * global_orthographic_density        0.060120
global_clustering * global_synonyms_count              0.047621
global_frequency * global_letters_count                0.007096
global_frequency * global_orthographic_density         0.014419
global_frequency * global_synonyms_count              -0.004809
global_letters_count * global_orthographic_density    -0.013778
global_letters_count * global_synonyms_count          -0.005940
global_orthographic_density * global_synonyms_count    0.044657
dtype: float64

Regressing global orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06730597950859041

intercept                   1.631419
rel_aoa                     0.022472
rel_clustering              0.008478
rel_frequency               0.000635
rel_letters_count          -0.046258
rel_orthographic_density    0.221978
rel_synonyms_count          0.079584
dtype: float64

Regressing global orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0884162050551478

intercept                                        1.613460
rel_aoa                                          0.094425
rel_clustering                                   0.143336
rel_frequency                                   -0.009736
rel_letters_count                               -0.026913
rel_orthographic_density                         0.324785
rel_synonyms_count                               0.181232
rel_aoa * rel_clustering                         0.060962
rel_aoa * rel_frequency                          0.017527
rel_aoa * rel_letters_count                      0.001980
rel_aoa * rel_orthographic_density               0.056255
rel_aoa * rel_synonyms_count                     0.038296
rel_clustering * rel_frequency                  -0.006238
rel_clustering * rel_letters_count              -0.068022
rel_clustering * rel_orthographic_density        0.073270
rel_clustering * rel_synonyms_count              0.055540
rel_frequency * rel_letters_count                0.013873
rel_frequency * rel_orthographic_density         0.033560
rel_frequency * rel_synonyms_count               0.026472
rel_letters_count * rel_orthographic_density    -0.040688
rel_letters_count * rel_synonyms_count           0.002565
rel_orthographic_density * rel_synonyms_count    0.069294
dtype: float64

Regressing rel orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11208995340744275

intercept                  -0.511748
rel_aoa                     0.032491
rel_clustering              0.016042
rel_frequency               0.041687
rel_letters_count          -0.040982
rel_orthographic_density    0.276655
rel_synonyms_count          0.079900
dtype: float64

Regressing rel orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.134255565167144

intercept                                       -0.459477
rel_aoa                                          0.095004
rel_clustering                                   0.092566
rel_frequency                                    0.075317
rel_letters_count                               -0.046085
rel_orthographic_density                         0.350117
rel_synonyms_count                               0.145089
rel_aoa * rel_clustering                         0.040404
rel_aoa * rel_frequency                          0.002677
rel_aoa * rel_letters_count                      0.004869
rel_aoa * rel_orthographic_density               0.078699
rel_aoa * rel_synonyms_count                     0.042078
rel_clustering * rel_frequency                  -0.026634
rel_clustering * rel_letters_count              -0.068452
rel_clustering * rel_orthographic_density        0.038226
rel_clustering * rel_synonyms_count              0.036122
rel_frequency * rel_letters_count                0.000692
rel_frequency * rel_orthographic_density         0.027386
rel_frequency * rel_synonyms_count               0.007362
rel_letters_count * rel_orthographic_density    -0.031706
rel_letters_count * rel_synonyms_count          -0.019886
rel_orthographic_density * rel_synonyms_count    0.038232
dtype: float64

Regressing global orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09607664513424485

intercept                      3.484618
global_aoa                    -0.077865
global_clustering              0.024619
global_frequency              -0.107105
global_letters_count          -0.116404
global_orthographic_density    0.174192
global_synonyms_count         -0.001924
rel_aoa                        0.077096
rel_clustering                 0.006760
rel_frequency                  0.104324
rel_letters_count              0.059586
rel_orthographic_density      -0.031871
rel_synonyms_count             0.070265
dtype: float64

Regressing global orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18294027905248766

intercept                                                -14.905925
global_aoa                                                -0.472937
global_clustering                                         -1.486658
global_frequency                                           0.885009
global_letters_count                                       1.187471
global_orthographic_density                                5.318963
global_synonyms_count                                      6.090350
rel_aoa                                                    0.496271
rel_clustering                                             3.100639
rel_frequency                                             -0.295189
rel_letters_count                                         -0.013904
rel_orthographic_density                                  -2.159619
rel_synonyms_count                                        -0.186972
global_aoa * global_clustering                            -0.124231
global_aoa * global_frequency                             -0.022840
global_aoa * global_letters_count                         -0.011603
global_aoa * global_orthographic_density                  -0.044095
global_aoa * global_synonyms_count                         0.123962
global_aoa * rel_aoa                                      -0.018309
global_aoa * rel_clustering                                0.052576
global_aoa * rel_frequency                                 0.008249
global_aoa * rel_letters_count                             0.038145
global_aoa * rel_orthographic_density                      0.086726
global_aoa * rel_synonyms_count                           -0.182559
global_clustering * global_frequency                       0.051601
global_clustering * global_letters_count                   0.035698
global_clustering * global_orthographic_density            0.619575
global_clustering * global_synonyms_count                  0.212920
global_clustering * rel_aoa                                0.072913
global_clustering * rel_clustering                        -0.042562
global_clustering * rel_frequency                         -0.028616
global_clustering * rel_letters_count                      0.155597
global_clustering * rel_orthographic_density              -0.273839
global_clustering * rel_synonyms_count                    -0.008733
global_frequency * global_letters_count                   -0.074932
global_frequency * global_orthographic_density            -0.094750
global_frequency * global_synonyms_count                  -0.172198
global_frequency * rel_aoa                                 0.013016
global_frequency * rel_clustering                         -0.090247
global_frequency * rel_frequency                           0.007940
global_frequency * rel_letters_count                       0.097288
global_frequency * rel_orthographic_density                0.054604
global_frequency * rel_synonyms_count                     -0.072558
global_letters_count * global_orthographic_density         0.010799
global_letters_count * global_synonyms_count              -0.670618
global_letters_count * rel_aoa                            -0.002873
global_letters_count * rel_clustering                     -0.135911
global_letters_count * rel_frequency                       0.022298
global_letters_count * rel_letters_count                  -0.038299
global_letters_count * rel_orthographic_density           -0.088098
global_letters_count * rel_synonyms_count                  0.469794
global_orthographic_density * global_synonyms_count       -0.281463
global_orthographic_density * rel_aoa                      0.019174
global_orthographic_density * rel_clustering              -0.814868
global_orthographic_density * rel_frequency               -0.032771
global_orthographic_density * rel_letters_count           -0.116148
global_orthographic_density * rel_orthographic_density    -0.191136
global_orthographic_density * rel_synonyms_count           0.051335
global_synonyms_count * rel_aoa                           -0.164998
global_synonyms_count * rel_clustering                    -0.196877
global_synonyms_count * rel_frequency                      0.251743
global_synonyms_count * rel_letters_count                  0.560649
global_synonyms_count * rel_orthographic_density           0.212475
global_synonyms_count * rel_synonyms_count                -0.062234
rel_aoa * rel_clustering                                   0.035142
rel_aoa * rel_frequency                                   -0.000821
rel_aoa * rel_letters_count                                0.010661
rel_aoa * rel_orthographic_density                         0.009485
rel_aoa * rel_synonyms_count                               0.249622
rel_clustering * rel_frequency                             0.075259
rel_clustering * rel_letters_count                        -0.121021
rel_clustering * rel_orthographic_density                  0.487736
rel_clustering * rel_synonyms_count                        0.114165
rel_frequency * rel_letters_count                         -0.033847
rel_frequency * rel_orthographic_density                   0.055150
rel_frequency * rel_synonyms_count                        -0.036668
rel_letters_count * rel_orthographic_density               0.029156
rel_letters_count * rel_synonyms_count                    -0.424423
rel_orthographic_density * rel_synonyms_count              0.007750
dtype: float64

Regressing rel orthographic_density with 616 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15956341767380133

intercept                      2.422620
global_aoa                    -0.062222
global_clustering              0.019275
global_frequency              -0.082913
global_letters_count          -0.069062
global_orthographic_density   -0.569379
global_synonyms_count          0.015332
rel_aoa                        0.065494
rel_clustering                 0.020154
rel_frequency                  0.095330
rel_letters_count              0.010219
rel_orthographic_density       0.782052
rel_synonyms_count             0.040275
dtype: float64

Regressing rel orthographic_density with 616 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2431026076862033

intercept                                                -12.363285
global_aoa                                                -0.249221
global_clustering                                         -0.916923
global_frequency                                           0.770367
global_letters_count                                       0.647979
global_orthographic_density                                4.265004
global_synonyms_count                                      5.494445
rel_aoa                                                    0.445610
rel_clustering                                             2.127627
rel_frequency                                             -0.341149
rel_letters_count                                          0.016198
rel_orthographic_density                                  -1.624123
rel_synonyms_count                                        -0.153686
global_aoa * global_clustering                            -0.100035
global_aoa * global_frequency                             -0.021014
global_aoa * global_letters_count                         -0.009258
global_aoa * global_orthographic_density                  -0.077656
global_aoa * global_synonyms_count                         0.080907
global_aoa * rel_aoa                                      -0.015597
global_aoa * rel_clustering                                0.032213
global_aoa * rel_frequency                                 0.012122
global_aoa * rel_letters_count                             0.042628
global_aoa * rel_orthographic_density                      0.122216
global_aoa * rel_synonyms_count                           -0.126482
global_clustering * global_frequency                       0.029500
global_clustering * global_letters_count                  -0.017963
global_clustering * global_orthographic_density            0.532799
global_clustering * global_synonyms_count                  0.172574
global_clustering * rel_aoa                                0.054785
global_clustering * rel_clustering                        -0.035841
global_clustering * rel_frequency                         -0.008326
global_clustering * rel_letters_count                      0.172017
global_clustering * rel_orthographic_density              -0.205077
global_clustering * rel_synonyms_count                     0.018046
global_frequency * global_letters_count                   -0.062134
global_frequency * global_orthographic_density            -0.117111
global_frequency * global_synonyms_count                  -0.149592
global_frequency * rel_aoa                                 0.008677
global_frequency * rel_clustering                         -0.071581
global_frequency * rel_frequency                           0.008402
global_frequency * rel_letters_count                       0.089947
global_frequency * rel_orthographic_density                0.100323
global_frequency * rel_synonyms_count                     -0.085749
global_letters_count * global_orthographic_density         0.061878
global_letters_count * global_synonyms_count              -0.575380
global_letters_count * rel_aoa                            -0.004057
global_letters_count * rel_clustering                     -0.041806
global_letters_count * rel_frequency                       0.028648
global_letters_count * rel_letters_count                  -0.030008
global_letters_count * rel_orthographic_density           -0.088626
global_letters_count * rel_synonyms_count                  0.408228
global_orthographic_density * global_synonyms_count       -0.301359
global_orthographic_density * rel_aoa                     -0.003009
global_orthographic_density * rel_clustering              -0.608765
global_orthographic_density * rel_frequency                0.023457
global_orthographic_density * rel_letters_count           -0.113997
global_orthographic_density * rel_orthographic_density    -0.145664
global_orthographic_density * rel_synonyms_count           0.122770
global_synonyms_count * rel_aoa                           -0.164360
global_synonyms_count * rel_clustering                    -0.075559
global_synonyms_count * rel_frequency                      0.226800
global_synonyms_count * rel_letters_count                  0.454579
global_synonyms_count * rel_orthographic_density           0.149400
global_synonyms_count * rel_synonyms_count                -0.065979
rel_aoa * rel_clustering                                   0.035273
rel_aoa * rel_frequency                                   -0.008077
rel_aoa * rel_letters_count                                0.003285
rel_aoa * rel_orthographic_density                         0.025619
rel_aoa * rel_synonyms_count                               0.229662
rel_clustering * rel_frequency                             0.049235
rel_clustering * rel_letters_count                        -0.181773
rel_clustering * rel_orthographic_density                  0.287434
rel_clustering * rel_synonyms_count                       -0.013663
rel_frequency * rel_letters_count                         -0.045669
rel_frequency * rel_orthographic_density                  -0.011694
rel_frequency * rel_synonyms_count                        -0.025769
rel_letters_count * rel_orthographic_density              -0.005978
rel_letters_count * rel_synonyms_count                    -0.333468
rel_orthographic_density * rel_synonyms_count              0.029272
dtype: float64