Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers write each figure to disk
# (path built from settings.FIGURE) in addition to showing it inline.
SAVE_FIGURES = False

# Subset of features used for the 'paper-*' figures.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not used in the cells visible here; presumably the number
# of PCA components for a later section — confirm.
N_COMPONENTS = 3

# Maximum number of bins for the source feature values; the plotting
# helpers fall back to fewer bins when binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Set up database access before any session_scope() call below.
# NOTE(review): presumably init_db() returns the database engine —
# confirm in brainscopypaste.utils.
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions this notebook analyses.
model = Model(time=Time.continuous, source=Source.all, past=Past.all,
              durl=Durl.exclude_past, max_distance=1)

# First pass: only collect the ids of the matching substitutions, so that
# each one can then be loaded and processed in its own session below.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Second pass: build one record per (substitution, feature) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Feature values of the (source, destination) words, first
            # absolute, then relative to the sentence median.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            # Null-hypothesis values: feature average over all words (h0)
            # and over synonyms of the source word (h0n), each in absolute
            # and sentence-relative form.
            h0 = substitution.feature_average(feature)
            h0_rel = substitution.feature_average(
                feature, sentence_relative='median')
            h0n = substitution.feature_average(
                feature, source_synonyms=True)
            h0n_rel = substitution.feature_average(
                feature, source_synonyms=True,
                sentence_relative='median')

            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                'h0': h0,
                'h0_rel': h0_rel,
                'h0n': h0n,
                'h0n_rel': h0n_rel})

# The list of records is only needed to build the frame; free it afterwards.
original_variations = pd.DataFrame(data)
del data


Got 2796 substitutions for model Model(time=Time.continuous, source=Source.all, past=Past.all, durl=Durl.exclude_past, max_distance=1)
100% (2796 of 2796) |######################| Elapsed Time: 0:01:11 Time: 0:01:11

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Average in two stages so that no cluster is over-represented:
# first over rows sharing the same (destination, occurrence, position,
# feature), then over each (cluster, feature) pair.
# The value columns are selected with a *list*: indexing a GroupBy with a
# bare tuple of names is deprecated/removed in recent pandas. The grouping
# key 'feature' is not part of the selection; with as_index=False the keys
# are returned as columns anyway.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)\
    [['source', 'source_rel', 'destination', 'destination_rel',
      'h0', 'h0_rel', 'h0n', 'h0n_rel']].mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# NOTE(review): 'variation' is computed before this crop and is not in the
# list of blanked columns, so it keeps its value for the cropped rows —
# confirm this is intended.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               ['source', 'source_rel', 'destination', 'destination_rel',
                'h0', 'h0_rel', 'h0n', 'h0n_rel']] = np.nan

Prepare feature ordering.


In [5]:
# Sort the feature names by the docstring of their transformed feature
# function, i.e. by the text used as plot title / legend label below.
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

These values are plotted first with absolute feature values, then with sentence-relative feature values; in each case with fixed-width bins first, then with quantile bins.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of each
    null (`h0`, then `h0n`). The difference is flagged when the null mean
    falls outside the Student-t confidence interval around the mean of
    `values`, at levels 5% ('*'), 1% ('**') and 0.1% ('***'); 'ns.' is
    printed otherwise (including when the interval is undefined, e.g. for
    an empty bin).

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        0-based bin index of each observation; NaN entries (observations
        that were not binned) are ignored.
    h0, h0n : pandas.Series
        Per-observation null values, printed as rows 'H_0' and 'H_00'.
    values : pandas.Series
        Per-observation measured values.
    """
    # Bin labels are floats as soon as one observation is unbinned (NaN),
    # so cast explicitly: range() and np.zeros() need a real integer.
    bin_count = int(np.nanmax(bins)) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and confidence half-widths depend only on `values`, not on
    # the null hypothesis: compute them once.
    alphas = [.05, .01, .001]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i in range(bin_count):
        indices = bins == i
        n = indices.sum()
        spread = values[indices].std(ddof=1)
        bin_values[i] = values[indices].mean()
        for j, alpha in enumerate(alphas):
            # NOTE(review): std(ddof=1) divided by sqrt(n - 1) looks like a
            # double small-sample correction (the usual standard error is
            # std(ddof=1) / sqrt(n)). Kept as-is so that the table stays
            # consistent with the CI bands drawn by the plotting helpers.
            cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                         * spread
                         / np.sqrt(n - 1))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[bins == i].mean()
                              for i in range(bin_count)])

        print(null_name + ' |', end='')
        # differences[i, j] is True when bin i differs from the null at
        # level alphas[j].
        differences = ((bin_values[:,np.newaxis]
                        < bin_nulls[:,np.newaxis] - cis)
                       | (bin_values[:,np.newaxis]
                          > bin_nulls[:,np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level reached; pad to 3 chars.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the mean destination feature against the binned source feature.

    Draws on the current pyplot axes, so it can be used through
    ``FacetGrid.map_dataframe`` (as done by ``plot_grid``). Four curves are
    drawn over the bin middles: the measured mean with its 95% confidence
    band, the two nulls (``h0`` and ``h0n``), and the identity ``y = x``.
    A significance table is then printed with ``print_significance``.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows to plot, with columns ``source``, ``destination``, ``h0`` and
        ``h0n`` (their ``*_rel`` counterparts are used when `relative` is
        true), plus the `feature_field` column.
    color : optional
        Base colour of the curves (default ``'blue'``).
    relative : bool, optional
        Use the sentence-relative columns (default False).
    quantiles : bool, optional
        Bin with ``pd.qcut`` instead of fixed-width ``pd.cut``
        (default False).
    feature_field : str, optional
        Column holding the name printed above the significance table
        (default ``'feature'``).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Previously this fell through with `x_bins`/`bins` undefined.
        raise ValueError('could not bin the source values, '
                         'even into a single bin')
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # NOTE(review): std(ddof=1) / sqrt(n - 1) looks like a double
        # small-sample correction; kept to match print_significance.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    faded = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=faded, alpha=0.2)
    plt.plot(middles, h0s, '--', color=faded,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=faded,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=faded,
             label='$y = x$')

    # Same limits on both axes, so that y = x is the diagonal.
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. The two sides are checked
    # independently, so the upper limit is also extended when the lower one
    # was (the original `elif` skipped it in that case).
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function`, 3 facets per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; only those whose `feature_field` is in `features`
        are kept.
    features : list of str
        Features to plot, in facet (and hue) order.
    filename : str
        Name passed to ``settings.FIGURE.format`` when SAVE_FIGURES is set.
    plot_function : callable
        Called through ``FacetGrid.map_dataframe`` for each facet.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, optional
        Column holding the feature name (default ``'feature'``).
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    selected = data[data[feature_field].isin(features)]
    # NOTE(review): `size` was renamed `height` in later seaborn releases;
    # kept as-is to match the seaborn version this notebook runs with.
    g = sb.FacetGrid(data=selected,
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title is the raw feature name at this point (set_titles
        # above); replace it with the transformed feature's docstring.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (mean destination minus H_00) of one feature.

    The source values are binned, and for each bin the mean destination
    value minus the mean ``h0n`` value is plotted, divided by the absolute
    mean of the per-bin ``h0`` values. Bins are spread evenly over
    ``[0, 1]`` on the x-axis, whatever their actual position, so that
    several features can share the same axes.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows of a single feature, with columns ``feature``, ``source``,
        ``destination``, ``h0`` and ``h0n`` (their ``*_rel`` counterparts
        are used when `relative` is true).
    color
        Colour of the curve.
    ci : bool, optional
        Also draw the 95% confidence band (default True).
    relative : bool, optional
        Use the sentence-relative columns (default False).
    quantiles : bool, optional
        Bin with ``pd.qcut`` instead of fixed-width ``pd.cut``
        (default False).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and retry with fewer bins
    # whenever the cut fails. Only the bin labels are needed here.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, _ = cut(x, bin_count, labels=False,
                            retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Previously this fell through with `x_bins` undefined.
        raise ValueError('could not bin the source values, '
                         'even into a single bin')

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # NOTE(review): std(ddof=1) / sqrt(n - 1) looks like a double
        # small-sample correction; kept to match the other helpers.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    # An empty bin yields NaN in h0s; ignore it when computing the scale so
    # that a single empty bin does not blank out the whole curve.
    scale = abs(np.nanmean(h0s))
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single pair of axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot, with a ``feature`` column; rows containing any NaN
        are dropped, feature by feature, before plotting.
    features : list of str
        Features to plot; each gets its own colour from the palette.
    filename : str
        Name passed to ``settings.FIGURE.format`` when SAVE_FIGURES is set.
    palette_name : str
        Name of the seaborn colour palette to use.
    plot_function : callable
        Called as ``plot_function(ax, feature_data, color=..., **plot_kws)``
        for each feature.
    title, xlabel, ylabel : str
        Title and axis labels of the figure.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes the curves were drawn on.
    """
    # Avoid a shared mutable default argument.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above.


In [11]:
# All features, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | ns. |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | **  | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | *   |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# All features, absolute values, fixed-width bins; no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False});



In [13]:
# Paper features, absolute values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | **  | ns. |

In [14]:
# Paper features, absolute values, fixed-width bins; y-range set by hand.
overlay_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'))
overlay_ax.set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, absolute values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | **  |
H_00 | *** | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | *   |

In [16]:
# All features, absolute values, quantile bins; no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# Paper features, absolute values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | ns. | **  |

In [18]:
# Paper features, absolute values, quantile bins; y-range set by hand.
overlay_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True})
overlay_ax.set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *   | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | **  |
H_00 | ns. | **  | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | **  | ns. | ns. |

In [20]:
# All features, sentence-relative values, fixed-width bins; no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True});



In [21]:
# Paper features, sentence-relative values, fixed-width bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | **  |
H_00 | ns. | **  | ns. | ns. |

In [22]:
# Paper features, sentence-relative values, fixed-width bins;
# y-range set by hand.
overlay_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True})
overlay_ax.set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | **  |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | **  | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | ns. | *** |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | **  | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | **  | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *   | ns. | ns. |

In [24]:
# All features, sentence-relative values, quantile bins; no CI bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features, sentence-relative values, quantile bins.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | ns. | *** |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | **  | ns. | ns. |

In [26]:
# Paper features, sentence-relative values, quantile bins.
# Uses the 'deep' palette like the three other paper-feature overlays
# (this cell used 'husl', the palette of the all-features overlays).
# NOTE(review): unlike the other paper overlays, no y-range is set here —
# confirm whether a .set_ylim(...) is wanted.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which effect wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that looks huge in the inline output here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Streamplot of substitutions in (absolute, relative) feature space.

    Written to be called through `FacetGrid.map_dataframe`, which provides
    the `data` and `color` keyword arguments.

    Source words are binned on a `bin_count` x `bin_count` grid over their
    absolute (`source`, x axis) and sentence-relative (`source_rel`, y axis)
    feature values, using equal-width bins. In each grid cell the arrow is
    the mean (destination - source) displacement along both axes, and the
    colour is the mean length of the individual displacements.

    Keyword arguments
    -----------------
    data : DataFrame
        Must hold the columns `source`, `source_rel`, `destination`,
        `destination_rel` and `h0`.
    color : matplotlib color, optional
        Base colour of the two reference lines (desaturated before use);
        defaults to 'blue'.
    bin_count : int, optional
        Number of bins along each axis; defaults to the notebook-level
        `BIN_COUNT`.

    """

    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', BIN_COUNT)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning: integer bin codes for each row, plus the bin edges
    # from which the bin centres (the grid coordinates) are derived.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Arrays are indexed [y, x], as streamplot expects
    # rows to follow the y coordinates.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_column = x_bins == x
        for y in range(bin_count):
            # Select the rows of this grid cell once, and reuse the
            # displacements for the arrow and for its strength.
            in_cell = in_column & (y_bins == y)
            du = dest[in_cell] - source[in_cell]
            dv = dest_rel[in_cell] - source_rel[in_cell]
            # An empty cell yields NaN here, i.e. no arrow information.
            u_values[y, x] = du.mean()
            v_values[y, x] = dv.mean()
            strength[y, x] = np.sqrt(du ** 2 + dv ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: relative value 0, i.e. the sentence average.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the h0 value (taken from the first row of `data`).
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature, each facet with its own axes ranges.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features, col_wrap=3,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        # Swap the raw feature name in the title for the docstring of the
        # corresponding transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', col_order=PAPER_FEATURES, col_wrap=3,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        # Swap the raw feature name in the title for the docstring of the
        # corresponding transformed feature.
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
# Fit the PCA on the per-cluster variations of every feature, keeping only
# the clusters for which all features are defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Loadings of the plotted components, one row per component.
# NOTE(review): labelling columns with `pcafeatures` assumes the pivot
# orders its feature columns alphabetically -- confirm.
pd.DataFrame(pca.components_[:N_COMPONENTS],
             index=list(map('Component-{}'.format, range(N_COMPONENTS))),
             columns=pcafeatures)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.53640433  0.16946246  0.08062606  0.07142061  0.03433222  0.02946902
  0.02067933  0.01957253  0.01658681  0.0091576   0.00707813]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.464299 0.312558 -0.086096 0.244891 0.221365 -0.413891 0.229662 0.289709 -0.393644 0.289423 -0.153745 0.002565
Component-1 0.272056 -0.414401 0.150023 -0.283838 -0.283874 -0.411372 0.188006 -0.287212 -0.441902 0.248555 -0.162299 0.011784
Component-2 -0.704369 -0.134468 0.123523 -0.084282 -0.655714 0.098032 0.027147 -0.028908 0.091986 -0.098466 0.008599 0.063609

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Collect one record per (substitution, component): identifiers, projected
# source/destination values, and the two null-hypothesis averages.
data = []
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            ))

original_component_variations = pd.DataFrame(data)
# The raw records are no longer needed once the frame is built.
del data


100% (2796 of 2796) |######################| Elapsed Time: 0:01:07 Time: 0:01:07

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows sharing the same destination,
# occurrence, position and component, then over each (cluster, component).
# The columns are selected with a list: the implicit tuple form
# (`gb['a', 'b']`) is deprecated in pandas 1.0 and rejected in pandas 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One variation plot per principal component.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(loc='upper left', frameon=True)
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | **  | ns. |

4.2 On a subset of relevant features


In [34]:
# Feature subset on which the PCA of this section is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
# Fit the PCA on the per-cluster variations of the relevant features only,
# keeping the clusters for which all of them are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Loadings of every estimated component, one row per component.
pd.DataFrame(pca.components_,
             index=list(map('Component-{}'.format,
                            range(pca.n_components_))),
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67489944  0.18438052]

Out[35]:
aoa frequency letters_count
Component-0 -0.745390 0.367805 -0.55598
Component-1 0.403847 -0.414413 -0.81558

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Collect one record per (substitution, component): identifiers, projected
# source/destination values, and the two null-hypothesis averages.
data = []
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=substitution.component_average(
                    component, pca, pcafeatures),
                h0n=substitution.component_average(
                    component, pca, pcafeatures, source_synonyms=True),
            ))

original_component_variations = pd.DataFrame(data)
# The raw records are no longer needed once the frame is built.
del data


100% (2796 of 2796) |######################| Elapsed Time: 0:00:20 Time: 0:00:20

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows sharing the same destination,
# occurrence, position and component, then over each (cluster, component).
# The columns are selected with a list: the implicit tuple form
# (`gb['a', 'b']`) is deprecated in pandas 1.0 and rejected in pandas 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One variation plot per principal component.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Only style axes on which something was plotted.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Report, for each relevant feature, how many words it is based on.
# NOTE(review): calling a transformed feature with no argument presumably
# returns the collection of words it covers -- confirm in Substitution.
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(Substitution
                               ._transformed_feature(feature)())))

# Compute the number of words that have all `relevant_features` defined.
# First gather the union of the words used by any of these features.
words = set()
for tfeature in [Substitution._transformed_feature(feature)
                 for feature in relevant_features]:
    words.update(tfeature())

# Then build a table with one row per word and one column per feature;
# `words_list` records the iteration order so that rows and words line up.
data = dict((feature, []) for feature in relevant_features)
words_list = []
for word in words:
    words_list.append(word)
    for feature in relevant_features:
        data[feature].append(Substitution
                             ._transformed_feature(feature)(word))
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

# Rows surviving `dropna()` are the words with a value for every feature.
print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

# Compare the number of distinct clusters mined with the number of rows the
# current PCA was actually fitted on.
print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1527 (cluster-unique) substitutions, but the PCA is in fact computed on 1226 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to run, as (label, cutting function) pairs. Quantile
# binning is currently disabled; append ('quantiles', pd.qcut) to enable it.
cuts = [('fixed bins', pd.cut)]
# Value flavours, as (label, column-name suffix) pairs.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    The marker is '***' below .001, ' **' below .01, '  *' below .05 and
    'ns.' otherwise (which includes values that compare false against every
    threshold, such as NaN).

    """

    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles; only the binnings listed in `cuts` are actually run). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# Pivot every value column once into a (cluster_id x feature) table. These
# tables do not depend on the feature pair being tested, so they are built
# here rather than inside the innermost loop.
pivots = {kind + suffix: variations.pivot(index='cluster_id',
                                          columns='feature',
                                          values=kind + suffix)
          for kind in ('source', 'destination')
          for _label, suffix in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (_cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins whenever the cutting
            # function raises ValueError. The bins only depend on the source
            # values, so they are shared by both destination flavours.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: report it instead of silently
                    # reusing the bins of a previous iteration.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                destination = pivots['destination' + rel2][feature2]
                # One-way Anova of the destination values across the bins
                # of the source values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot every value column once into a (cluster_id x feature) table. These
# tables do not depend on the feature pair being tested, so they are built
# here rather than inside the innermost loop.
pivots = {kind + suffix: variations.pivot(index='cluster_id',
                                          columns='feature',
                                          values=kind + suffix)
          for kind in ('source', 'destination')
          for _label, suffix in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (_cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins whenever the cutting
            # function raises ValueError. The bins only depend on the source
            # values, so they are shared by both variation flavours.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: report it instead of silently
                    # reusing the bins of a previous iteration.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # Variation of feature2 upon substitution, i.e.
                # destination - source, in the requested flavour.
                variation = (pivots['destination' + rel2][feature2]
                             - pivots['source' + rel2][feature2])
                # One-way Anova of the variations across the bins of the
                # source values.
                _, p = stats.f_oneway(*[variation[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Map a "relative?" flag to its (label, column-name suffix) pair.
# NOTE(review): this rebinds `rels`, which the Anova cells above iterate
# over as a list of (label, suffix) pairs. Re-running those cells after
# this one would fail when unpacking the dict keys; consider a distinct
# name here.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Regress a destination feature on source features and print the fit.

    Source values are pivoted out of `data` into one row per `cluster_id`
    and one column per (source column, feature) pair; the destination
    values of `target` are aligned on the same `cluster_id` index. Rows
    with a missing source or destination value are dropped. A linear
    regression is then fitted, and its R^2 and coefficients are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame. The code reads the columns `cluster_id`,
        `feature`, `source` / `source_rel` and `destination` /
        `destination_rel` (the `_rel` variants only when requested).
    features : iterable of str
        Feature names used as regressors; they are sorted before use.
    target : str
        Feature whose destination value is regressed.
    source_rel : bool or 'both'
        `False` uses the `source` column, `True` uses `source_rel`,
        `'both'` uses both sets of columns as regressors.
    dest_rel : bool
        `False` regresses `destination`, `True` regresses
        `destination_rel`.
    interactions : bool
        If true, add all pairwise products of the regressors.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If `source_rel` is not a bool or `'both'`, or if `dest_rel` is
        not a bool.
    """
    # Process source/destination relativeness arguments. Check the type
    # explicitly: a membership test like `x in [True, False, 'both']` also
    # accepts 0/1 (they compare equal to the bools), which would then be
    # silently treated as 'both'.
    if isinstance(source_rel, bool):
        source_rels = [source_rel]
    elif isinstance(source_rel, str) and source_rel == 'both':
        source_rels = [False, True]
    else:
        raise ValueError("source_rel must be True, False or 'both', "
                         "got {!r}".format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}"
                         .format(dest_rel))
    dest_rel_name, dest_suffix = rels[dest_rel]

    features = tuple(sorted(features))
    # Column keys of the pivoted source table, and the matching
    # human-readable regressor names (same order).
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rels
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rels
                     for feature in features]

    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rels],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination from `data` (not from a notebook-level global),
    # so the function is self-contained. `reindex` aligns on the source
    # clusters and yields NaN for clusters without a target row; those are
    # then removed by `dropna` instead of raising a KeyError.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_suffix)\
        .reindex(source.index)[target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the regressors with a non-zero
        # power; the column with no such regressor is the constant term.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress. With interactions, the expanded design matrix already has a
    # constant column (labelled 'intercept' above), so no separate
    # intercept is fitted.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # `Series.append` was removed in pandas 2.0; `pd.concat` is the
        # equivalent and works on older versions too.
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# Run every regression variant for each paper feature: source values taken
# as global, sentence-relative or both; destination as global or
# sentence-relative; each combination fitted first without, then with,
# interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel, interactions in product(
            [False, True, 'both'], [False, True], [False, True]):
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07897840707295034

intercept                      5.439621
global_aoa                     0.040497
global_clustering              0.083370
global_frequency               0.385553
global_letters_count          -0.014396
global_orthographic_density   -0.002815
global_synonyms_count         -0.019218
dtype: float64

Regressing global frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09153388872159574

intercept                                              6.091697
global_aoa                                             0.129805
global_clustering                                      0.285129
global_frequency                                       0.571458
global_letters_count                                  -0.575781
global_orthographic_density                            0.826915
global_synonyms_count                                 -0.637141
global_aoa * global_clustering                         0.019919
global_aoa * global_frequency                          0.001233
global_aoa * global_letters_count                      0.007632
global_aoa * global_orthographic_density              -0.047812
global_aoa * global_synonyms_count                     0.058522
global_clustering * global_frequency                   0.013675
global_clustering * global_letters_count              -0.086728
global_clustering * global_orthographic_density        0.054739
global_clustering * global_synonyms_count             -0.009821
global_frequency * global_letters_count               -0.005907
global_frequency * global_orthographic_density        -0.061048
global_frequency * global_synonyms_count               0.010207
global_letters_count * global_orthographic_density     0.073650
global_letters_count * global_synonyms_count           0.015812
global_orthographic_density * global_synonyms_count    0.006539
dtype: float64

Regressing rel frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.040481533216506604

intercept                     -6.858621
global_aoa                     0.057496
global_clustering              0.058550
global_frequency               0.320526
global_letters_count           0.090073
global_orthographic_density    0.038458
global_synonyms_count          0.098529
dtype: float64

Regressing rel frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.05436493781992202

intercept                                             -13.291979
global_aoa                                              0.306044
global_clustering                                      -0.647558
global_frequency                                        0.901609
global_letters_count                                    0.050771
global_orthographic_density                             1.901505
global_synonyms_count                                  -0.629341
global_aoa * global_clustering                          0.045058
global_aoa * global_frequency                           0.000639
global_aoa * global_letters_count                       0.009656
global_aoa * global_orthographic_density               -0.080730
global_aoa * global_synonyms_count                      0.134812
global_clustering * global_frequency                    0.050023
global_clustering * global_letters_count               -0.027818
global_clustering * global_orthographic_density         0.093275
global_clustering * global_synonyms_count               0.112022
global_frequency * global_letters_count                -0.025578
global_frequency * global_orthographic_density         -0.125189
global_frequency * global_synonyms_count                0.070957
global_letters_count * global_orthographic_density      0.063783
global_letters_count * global_synonyms_count           -0.035831
global_orthographic_density * global_synonyms_count     0.081983
dtype: float64

Regressing global frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.0512920523394621

intercept                   9.457621
rel_aoa                     0.066585
rel_clustering             -0.023113
rel_frequency               0.255115
rel_letters_count          -0.038927
rel_orthographic_density    0.003233
rel_synonyms_count         -0.094181
dtype: float64

Regressing global frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.0648108593161627

intercept                                        9.305871
rel_aoa                                          0.165013
rel_clustering                                   0.117116
rel_frequency                                    0.183072
rel_letters_count                                0.055331
rel_orthographic_density                         0.058484
rel_synonyms_count                               0.085247
rel_aoa * rel_clustering                         0.032267
rel_aoa * rel_frequency                          0.037314
rel_aoa * rel_letters_count                     -0.006904
rel_aoa * rel_orthographic_density              -0.006621
rel_aoa * rel_synonyms_count                     0.027549
rel_clustering * rel_frequency                   0.014248
rel_clustering * rel_letters_count              -0.020804
rel_clustering * rel_orthographic_density        0.086216
rel_clustering * rel_synonyms_count              0.125064
rel_frequency * rel_letters_count                0.029922
rel_frequency * rel_orthographic_density         0.025069
rel_frequency * rel_synonyms_count               0.090769
rel_letters_count * rel_orthographic_density    -0.002626
rel_letters_count * rel_synonyms_count          -0.072689
rel_orthographic_density * rel_synonyms_count   -0.210514
dtype: float64

Regressing rel frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.25327777871562807

intercept                  -1.551479
rel_aoa                     0.064562
rel_clustering              0.169761
rel_frequency               0.649874
rel_letters_count          -0.113948
rel_orthographic_density   -0.174954
rel_synonyms_count         -0.007296
dtype: float64

Regressing rel frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.26592861821905633

intercept                                       -1.713013
rel_aoa                                          0.067926
rel_clustering                                   0.259996
rel_frequency                                    0.600974
rel_letters_count                               -0.014486
rel_orthographic_density                        -0.218676
rel_synonyms_count                               0.251031
rel_aoa * rel_clustering                        -0.033377
rel_aoa * rel_frequency                         -0.015783
rel_aoa * rel_letters_count                      0.010759
rel_aoa * rel_orthographic_density               0.054280
rel_aoa * rel_synonyms_count                     0.161753
rel_clustering * rel_frequency                  -0.002647
rel_clustering * rel_letters_count              -0.072286
rel_clustering * rel_orthographic_density       -0.102090
rel_clustering * rel_synonyms_count              0.074110
rel_frequency * rel_letters_count                0.023657
rel_frequency * rel_orthographic_density        -0.018485
rel_frequency * rel_synonyms_count               0.079259
rel_letters_count * rel_orthographic_density     0.002748
rel_letters_count * rel_synonyms_count          -0.046943
rel_orthographic_density * rel_synonyms_count    0.058599
dtype: float64

Regressing global frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08484625194542394

intercept                      5.128044
global_aoa                    -0.016895
global_clustering              0.170245
global_frequency               0.408346
global_letters_count           0.162026
global_orthographic_density    0.112913
global_synonyms_count          0.022986
rel_aoa                        0.081344
rel_clustering                -0.120482
rel_frequency                 -0.024160
rel_letters_count             -0.190992
rel_orthographic_density      -0.121799
rel_synonyms_count            -0.069331
dtype: float64

Regressing global frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.14845278966891262

intercept                                                -67.890862
global_aoa                                                 2.427407
global_clustering                                        -11.857517
global_frequency                                           2.428381
global_letters_count                                       1.772574
global_orthographic_density                               11.475390
global_synonyms_count                                     16.659860
rel_aoa                                                   -1.179979
rel_clustering                                            12.424861
rel_frequency                                             -1.476616
rel_letters_count                                         -2.484580
rel_orthographic_density                                  -9.551289
rel_synonyms_count                                       -14.720376
global_aoa * global_clustering                             0.431764
global_aoa * global_frequency                              0.098702
global_aoa * global_letters_count                         -0.034409
global_aoa * global_orthographic_density                  -0.254713
global_aoa * global_synonyms_count                        -0.450448
global_aoa * rel_aoa                                       0.004127
global_aoa * rel_clustering                               -0.393697
global_aoa * rel_frequency                                -0.030075
global_aoa * rel_letters_count                             0.033372
global_aoa * rel_orthographic_density                      0.183018
global_aoa * rel_synonyms_count                            0.458573
global_clustering * global_frequency                       0.389135
global_clustering * global_letters_count                   0.220654
global_clustering * global_orthographic_density            1.917394
global_clustering * global_synonyms_count                  1.048699
global_clustering * rel_aoa                               -0.358740
global_clustering * rel_clustering                         0.031720
global_clustering * rel_frequency                         -0.297554
global_clustering * rel_letters_count                     -0.328843
global_clustering * rel_orthographic_density              -1.678408
global_clustering * rel_synonyms_count                    -0.973729
global_frequency * global_letters_count                   -0.038921
global_frequency * global_orthographic_density             0.085781
global_frequency * global_synonyms_count                  -0.519448
global_frequency * rel_aoa                                -0.150108
global_frequency * rel_clustering                         -0.445766
global_frequency * rel_frequency                          -0.004629
global_frequency * rel_letters_count                      -0.004392
global_frequency * rel_orthographic_density               -0.152772
global_frequency * rel_synonyms_count                      0.417007
global_letters_count * global_orthographic_density         0.190085
global_letters_count * global_synonyms_count              -0.021857
global_letters_count * rel_aoa                             0.081121
global_letters_count * rel_clustering                     -0.323631
global_letters_count * rel_frequency                      -0.025482
global_letters_count * rel_letters_count                   0.019072
global_letters_count * rel_orthographic_density           -0.049772
global_letters_count * rel_synonyms_count                 -0.050371
global_orthographic_density * global_synonyms_count       -0.928785
global_orthographic_density * rel_aoa                      0.118177
global_orthographic_density * rel_clustering              -1.661508
global_orthographic_density * rel_frequency               -0.145123
global_orthographic_density * rel_letters_count           -0.020240
global_orthographic_density * rel_orthographic_density    -0.006861
global_orthographic_density * rel_synonyms_count           0.903789
global_synonyms_count * rel_aoa                            0.293123
global_synonyms_count * rel_clustering                    -1.042076
global_synonyms_count * rel_frequency                      0.421389
global_synonyms_count * rel_letters_count                  0.481529
global_synonyms_count * rel_orthographic_density           1.189233
global_synonyms_count * rel_synonyms_count                 0.090314
rel_aoa * rel_clustering                                   0.277767
rel_aoa * rel_frequency                                    0.074389
rel_aoa * rel_letters_count                               -0.075149
rel_aoa * rel_orthographic_density                        -0.059849
rel_aoa * rel_synonyms_count                              -0.244245
rel_clustering * rel_frequency                             0.341239
rel_clustering * rel_letters_count                         0.359905
rel_clustering * rel_orthographic_density                  1.345450
rel_clustering * rel_synonyms_count                        1.141519
rel_frequency * rel_letters_count                          0.087667
rel_frequency * rel_orthographic_density                   0.200876
rel_frequency * rel_synonyms_count                        -0.235127
rel_letters_count * rel_orthographic_density              -0.038049
rel_letters_count * rel_synonyms_count                    -0.332000
rel_orthographic_density * rel_synonyms_count             -1.101036
dtype: float64

Regressing rel frequency with 908 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.33637231094276354

intercept                      4.637880
global_aoa                    -0.020634
global_clustering              0.195192
global_frequency              -0.536062
global_letters_count           0.194408
global_orthographic_density    0.120868
global_synonyms_count          0.004599
rel_aoa                        0.067918
rel_clustering                -0.115936
rel_frequency                  0.960111
rel_letters_count             -0.232230
rel_orthographic_density      -0.136640
rel_synonyms_count            -0.057207
dtype: float64

Regressing rel frequency with 908 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3796716906703421

intercept                                                -67.317548
global_aoa                                                 2.162265
global_clustering                                        -11.821873
global_frequency                                           1.460143
global_letters_count                                       1.577788
global_orthographic_density                               11.640783
global_synonyms_count                                     15.169521
rel_aoa                                                   -1.215735
rel_clustering                                            13.138377
rel_frequency                                             -0.582533
rel_letters_count                                         -2.321704
rel_orthographic_density                                 -10.153807
rel_synonyms_count                                       -13.405354
global_aoa * global_clustering                             0.408992
global_aoa * global_frequency                              0.096873
global_aoa * global_letters_count                         -0.014540
global_aoa * global_orthographic_density                  -0.236697
global_aoa * global_synonyms_count                        -0.419447
global_aoa * rel_aoa                                       0.004030
global_aoa * rel_clustering                               -0.381349
global_aoa * rel_frequency                                -0.032659
global_aoa * rel_letters_count                             0.025950
global_aoa * rel_orthographic_density                      0.185222
global_aoa * rel_synonyms_count                            0.422207
global_clustering * global_frequency                       0.398423
global_clustering * global_letters_count                   0.253480
global_clustering * global_orthographic_density            1.870155
global_clustering * global_synonyms_count                  1.001040
global_clustering * rel_aoa                               -0.348088
global_clustering * rel_clustering                         0.020644
global_clustering * rel_frequency                         -0.287482
global_clustering * rel_letters_count                     -0.309938
global_clustering * rel_orthographic_density              -1.591877
global_clustering * rel_synonyms_count                    -0.925189
global_frequency * global_letters_count                    0.004864
global_frequency * global_orthographic_density             0.057698
global_frequency * global_synonyms_count                  -0.427483
global_frequency * rel_aoa                                -0.132568
global_frequency * rel_clustering                         -0.504071
global_frequency * rel_frequency                           0.011628
global_frequency * rel_letters_count                      -0.027105
global_frequency * rel_orthographic_density               -0.089434
global_frequency * rel_synonyms_count                      0.350210
global_letters_count * global_orthographic_density         0.145742
global_letters_count * global_synonyms_count              -0.034756
global_letters_count * rel_aoa                             0.064116
global_letters_count * rel_clustering                     -0.363435
global_letters_count * rel_frequency                      -0.033128
global_letters_count * rel_letters_count                   0.021513
global_letters_count * rel_orthographic_density            0.027918
global_letters_count * rel_synonyms_count                 -0.033385
global_orthographic_density * global_synonyms_count       -0.954435
global_orthographic_density * rel_aoa                      0.099905
global_orthographic_density * rel_clustering              -1.659653
global_orthographic_density * rel_frequency               -0.101484
global_orthographic_density * rel_letters_count            0.045128
global_orthographic_density * rel_orthographic_density     0.038636
global_orthographic_density * rel_synonyms_count           0.877229
global_synonyms_count * rel_aoa                            0.256450
global_synonyms_count * rel_clustering                    -1.042503
global_synonyms_count * rel_frequency                      0.316542
global_synonyms_count * rel_letters_count                  0.423188
global_synonyms_count * rel_orthographic_density           1.091780
global_synonyms_count * rel_synonyms_count                 0.096887
rel_aoa * rel_clustering                                   0.282082
rel_aoa * rel_frequency                                    0.052526
rel_aoa * rel_letters_count                               -0.071594
rel_aoa * rel_orthographic_density                        -0.056332
rel_aoa * rel_synonyms_count                              -0.195402
rel_clustering * rel_frequency                             0.384432
rel_clustering * rel_letters_count                         0.362698
rel_clustering * rel_orthographic_density                  1.309525
rel_clustering * rel_synonyms_count                        1.134617
rel_frequency * rel_letters_count                          0.075976
rel_frequency * rel_orthographic_density                   0.120905
rel_frequency * rel_synonyms_count                        -0.161293
rel_letters_count * rel_orthographic_density              -0.118027
rel_letters_count * rel_synonyms_count                    -0.284456
rel_orthographic_density * rel_synonyms_count             -0.937737
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 841 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.08458383095835142

intercept                      6.715848
global_aoa                     0.237840
global_clustering              0.041469
global_frequency              -0.126834
global_letters_count           0.066535
global_orthographic_density   -0.038418
global_synonyms_count         -0.160542
dtype: float64

Regressing global aoa with 841 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1116695334373361

intercept                                              9.875626
global_aoa                                            -0.411538
global_clustering                                     -0.356118
global_frequency                                      -0.529593
global_letters_count                                   0.535328
global_orthographic_density                           -1.301176
global_synonyms_count                                 -3.090728
global_aoa * global_clustering                         0.005296
global_aoa * global_frequency                          0.020254
global_aoa * global_letters_count                      0.073503
global_aoa * global_orthographic_density               0.041905
global_aoa * global_synonyms_count                    -0.031146
global_clustering * global_frequency                  -0.046237
global_clustering * global_letters_count               0.159608
global_clustering * global_orthographic_density       -0.015862
global_clustering * global_synonyms_count             -0.331313
global_frequency * global_letters_count               -0.018022
global_frequency * global_orthographic_density         0.056646
global_frequency * global_synonyms_count               0.016292
global_letters_count * global_orthographic_density     0.021861
global_letters_count * global_synonyms_count           0.118837
global_orthographic_density * global_synonyms_count    0.273395
dtype: float64

Regressing rel aoa with 841 measures, no interactions
           ^^^^^^^
R^2 = 0.022179767798859662

intercept                      1.203034
global_aoa                     0.092275
global_clustering             -0.035223
global_frequency              -0.125148
global_letters_count           0.031127
global_orthographic_density    0.052630
global_synonyms_count         -0.061545
dtype: float64

Regressing rel aoa with 841 measures, with interactions
           ^^^^^^^
R^2 = 0.04491229797401952

intercept                                              6.544300
global_aoa                                            -0.038111
global_clustering                                      0.032089
global_frequency                                      -0.526154
global_letters_count                                  -0.175113
global_orthographic_density                           -2.484188
global_synonyms_count                                 -2.757175
global_aoa * global_clustering                         0.045087
global_aoa * global_frequency                          0.004469
global_aoa * global_letters_count                      0.037588
global_aoa * global_orthographic_density               0.106437
global_aoa * global_synonyms_count                    -0.027990
global_clustering * global_frequency                  -0.026641
global_clustering * global_letters_count               0.035754
global_clustering * global_orthographic_density       -0.133934
global_clustering * global_synonyms_count             -0.457535
global_frequency * global_letters_count                0.013918
global_frequency * global_orthographic_density         0.109423
global_frequency * global_synonyms_count              -0.057191
global_letters_count * global_orthographic_density    -0.019281
global_letters_count * global_synonyms_count           0.073464
global_orthographic_density * global_synonyms_count    0.239862
dtype: float64

Regressing global aoa with 841 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.034157697755070826

intercept                   6.519641
rel_aoa                     0.017441
rel_clustering              0.301028
rel_frequency              -0.032590
rel_letters_count           0.027626
rel_orthographic_density   -0.322715
rel_synonyms_count         -0.189203
dtype: float64

Regressing global aoa with 841 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.06788813002308225

intercept                                        6.594780
rel_aoa                                         -0.167061
rel_clustering                                  -0.028205
rel_frequency                                    0.010770
rel_letters_count                               -0.003092
rel_orthographic_density                        -0.550635
rel_synonyms_count                              -0.278068
rel_aoa * rel_clustering                        -0.056013
rel_aoa * rel_frequency                         -0.051595
rel_aoa * rel_letters_count                      0.045271
rel_aoa * rel_orthographic_density               0.035744
rel_aoa * rel_synonyms_count                    -0.119154
rel_clustering * rel_frequency                   0.013957
rel_clustering * rel_letters_count               0.172002
rel_clustering * rel_orthographic_density       -0.027873
rel_clustering * rel_synonyms_count             -0.487386
rel_frequency * rel_letters_count               -0.018512
rel_frequency * rel_orthographic_density        -0.007891
rel_frequency * rel_synonyms_count              -0.154536
rel_letters_count * rel_orthographic_density     0.105495
rel_letters_count * rel_synonyms_count           0.207980
rel_orthographic_density * rel_synonyms_count    0.577560
dtype: float64

Regressing rel aoa with 841 measures, no interactions
           ^^^^^^^
R^2 = 0.15588455179063665

intercept                   0.694092
rel_aoa                     0.443315
rel_clustering             -0.041964
rel_frequency              -0.108769
rel_letters_count          -0.009607
rel_orthographic_density    0.117915
rel_synonyms_count         -0.057243
dtype: float64

Regressing rel aoa with 841 measures, with interactions
           ^^^^^^^
R^2 = 0.1819812230741339

intercept                                        0.997306
rel_aoa                                          0.491671
rel_clustering                                  -0.384578
rel_frequency                                   -0.017534
rel_letters_count                               -0.088162
rel_orthographic_density                         0.224226
rel_synonyms_count                              -0.078117
rel_aoa * rel_clustering                        -0.008414
rel_aoa * rel_frequency                          0.027024
rel_aoa * rel_letters_count                      0.026348
rel_aoa * rel_orthographic_density               0.030141
rel_aoa * rel_synonyms_count                    -0.119024
rel_clustering * rel_frequency                  -0.015216
rel_clustering * rel_letters_count               0.189390
rel_clustering * rel_orthographic_density        0.138858
rel_clustering * rel_synonyms_count             -0.153816
rel_frequency * rel_letters_count               -0.005886
rel_frequency * rel_orthographic_density         0.110577
rel_frequency * rel_synonyms_count              -0.069939
rel_letters_count * rel_orthographic_density     0.045451
rel_letters_count * rel_synonyms_count           0.167125
rel_orthographic_density * rel_synonyms_count    0.429021
dtype: float64

Regressing global aoa with 841 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.10751921033320277

intercept                      2.497441
global_aoa                     0.416541
global_clustering             -0.211521
global_frequency              -0.069136
global_letters_count           0.238925
global_orthographic_density    0.108696
global_synonyms_count         -0.080524
rel_aoa                       -0.263884
rel_clustering                 0.268916
rel_frequency                 -0.073833
rel_letters_count             -0.193078
rel_orthographic_density      -0.106675
rel_synonyms_count            -0.085498
dtype: float64

Regressing global aoa with 841 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.20659213194073722

intercept                                                 76.320670
global_aoa                                                -1.022459
global_clustering                                         11.123282
global_frequency                                          -0.256493
global_letters_count                                      -6.118472
global_orthographic_density                              -15.862959
global_synonyms_count                                    -10.270347
rel_aoa                                                    0.621242
rel_clustering                                            -6.811778
rel_frequency                                              1.724373
rel_letters_count                                          4.967062
rel_orthographic_density                                  12.746744
rel_synonyms_count                                        -0.756432
global_aoa * global_clustering                            -0.049735
global_aoa * global_frequency                             -0.040386
global_aoa * global_letters_count                          0.166655
global_aoa * global_orthographic_density                   0.331813
global_aoa * global_synonyms_count                         0.054216
global_aoa * rel_aoa                                       0.026585
global_aoa * rel_clustering                                0.031451
global_aoa * rel_frequency                                 0.016093
global_aoa * rel_letters_count                            -0.108411
global_aoa * rel_orthographic_density                     -0.303300
global_aoa * rel_synonyms_count                            0.126709
global_clustering * global_frequency                      -0.084613
global_clustering * global_letters_count                  -0.528676
global_clustering * global_orthographic_density           -3.332025
global_clustering * global_synonyms_count                 -0.547312
global_clustering * rel_aoa                                0.063799
global_clustering * rel_clustering                         0.110359
global_clustering * rel_frequency                          0.241164
global_clustering * rel_letters_count                      0.338747
global_clustering * rel_orthographic_density               2.869620
global_clustering * rel_synonyms_count                    -0.283334
global_frequency * global_letters_count                    0.225863
global_frequency * global_orthographic_density            -0.471183
global_frequency * global_synonyms_count                   0.205971
global_frequency * rel_aoa                                 0.052779
global_frequency * rel_clustering                         -0.135692
global_frequency * rel_frequency                          -0.051915
global_frequency * rel_letters_count                      -0.258342
global_frequency * rel_orthographic_density                0.479491
global_frequency * rel_synonyms_count                      0.204615
global_letters_count * global_orthographic_density        -0.306299
global_letters_count * global_synonyms_count               0.535922
global_letters_count * rel_aoa                            -0.149465
global_letters_count * rel_clustering                      0.479117
global_letters_count * rel_frequency                      -0.117918
global_letters_count * rel_letters_count                   0.045536
global_letters_count * rel_orthographic_density            0.207133
global_letters_count * rel_synonyms_count                 -0.310552
global_orthographic_density * global_synonyms_count        0.860329
global_orthographic_density * rel_aoa                     -0.237010
global_orthographic_density * rel_clustering               2.853630
global_orthographic_density * rel_frequency                0.326763
global_orthographic_density * rel_letters_count            0.183566
global_orthographic_density * rel_orthographic_density     0.183805
global_orthographic_density * rel_synonyms_count          -1.214164
global_synonyms_count * rel_aoa                           -0.158070
global_synonyms_count * rel_clustering                     0.328440
global_synonyms_count * rel_frequency                     -0.423026
global_synonyms_count * rel_letters_count                 -0.978827
global_synonyms_count * rel_orthographic_density          -1.400735
global_synonyms_count * rel_synonyms_count                -0.229836
rel_aoa * rel_clustering                                  -0.079729
rel_aoa * rel_frequency                                   -0.032942
rel_aoa * rel_letters_count                                0.108804
rel_aoa * rel_orthographic_density                         0.214029
rel_aoa * rel_synonyms_count                              -0.104744
rel_clustering * rel_frequency                            -0.104489
rel_clustering * rel_letters_count                        -0.108271
rel_clustering * rel_orthographic_density                 -2.228730
rel_clustering * rel_synonyms_count                       -0.034284
rel_frequency * rel_letters_count                          0.079698
rel_frequency * rel_orthographic_density                  -0.273334
rel_frequency * rel_synonyms_count                        -0.064083
rel_letters_count * rel_orthographic_density               0.124145
rel_letters_count * rel_synonyms_count                     0.872691
rel_orthographic_density * rel_synonyms_count              2.288452
dtype: float64

Regressing rel aoa with 841 measures, no interactions
           ^^^^^^^
R^2 = 0.20194311186946068

intercept                      1.580856
global_aoa                    -0.383326
global_clustering             -0.215339
global_frequency              -0.065002
global_letters_count           0.132345
global_orthographic_density   -0.000845
global_synonyms_count          0.125358
rel_aoa                        0.702634
rel_clustering                 0.246584
rel_frequency                 -0.060967
rel_letters_count             -0.093912
rel_orthographic_density      -0.054382
rel_synonyms_count            -0.247716
dtype: float64

Regressing rel aoa with 841 measures, with interactions
           ^^^^^^^
R^2 = 0.283954286464574

intercept                                                 65.423393
global_aoa                                                -2.377606
global_clustering                                          8.978900
global_frequency                                          -0.930886
global_letters_count                                      -3.503707
global_orthographic_density                              -14.400246
global_synonyms_count                                     -7.609491
rel_aoa                                                    2.248635
rel_clustering                                            -2.866486
rel_frequency                                              1.839298
rel_letters_count                                          3.251422
rel_orthographic_density                                  12.192878
rel_synonyms_count                                        -0.856286
global_aoa * global_clustering                            -0.150350
global_aoa * global_frequency                             -0.016425
global_aoa * global_letters_count                          0.090537
global_aoa * global_orthographic_density                   0.331617
global_aoa * global_synonyms_count                         0.243384
global_aoa * rel_aoa                                      -0.000995
global_aoa * rel_clustering                                0.051461
global_aoa * rel_frequency                                -0.006389
global_aoa * rel_letters_count                            -0.067540
global_aoa * rel_orthographic_density                     -0.328124
global_aoa * rel_synonyms_count                           -0.074559
global_clustering * global_frequency                      -0.115088
global_clustering * global_letters_count                  -0.254393
global_clustering * global_orthographic_density           -2.528835
global_clustering * global_synonyms_count                 -0.756785
global_clustering * rel_aoa                                0.140969
global_clustering * rel_clustering                         0.152151
global_clustering * rel_frequency                          0.176491
global_clustering * rel_letters_count                      0.120927
global_clustering * rel_orthographic_density               2.075664
global_clustering * rel_synonyms_count                     0.107979
global_frequency * global_letters_count                    0.179766
global_frequency * global_orthographic_density            -0.187598
global_frequency * global_synonyms_count                   0.039756
global_frequency * rel_aoa                                 0.038328
global_frequency * rel_clustering                         -0.184690
global_frequency * rel_frequency                          -0.036609
global_frequency * rel_letters_count                      -0.230082
global_frequency * rel_orthographic_density                0.139518
global_frequency * rel_synonyms_count                      0.272182
global_letters_count * global_orthographic_density        -0.241262
global_letters_count * global_synonyms_count               0.049062
global_letters_count * rel_aoa                            -0.083682
global_letters_count * rel_clustering                      0.190106
global_letters_count * rel_frequency                      -0.108809
global_letters_count * rel_letters_count                   0.029166
global_letters_count * rel_orthographic_density            0.169763
global_letters_count * rel_synonyms_count                  0.028731
global_orthographic_density * global_synonyms_count        0.486496
global_orthographic_density * rel_aoa                     -0.266628
global_orthographic_density * rel_clustering               2.034863
global_orthographic_density * rel_frequency                0.022502
global_orthographic_density * rel_letters_count            0.097243
global_orthographic_density * rel_orthographic_density     0.093747
global_orthographic_density * rel_synonyms_count          -0.679841
global_synonyms_count * rel_aoa                           -0.345029
global_synonyms_count * rel_clustering                     0.425691
global_synonyms_count * rel_frequency                     -0.150078
global_synonyms_count * rel_letters_count                 -0.375388
global_synonyms_count * rel_orthographic_density          -0.894972
global_synonyms_count * rel_synonyms_count                -0.172856
rel_aoa * rel_clustering                                  -0.078117
rel_aoa * rel_frequency                                   -0.012841
rel_aoa * rel_letters_count                                0.068978
rel_aoa * rel_orthographic_density                         0.263272
rel_aoa * rel_synonyms_count                               0.077073
rel_clustering * rel_frequency                             0.078258
rel_clustering * rel_letters_count                         0.121856
rel_clustering * rel_orthographic_density                 -1.457830
rel_clustering * rel_synonyms_count                       -0.290318
rel_frequency * rel_letters_count                          0.104631
rel_frequency * rel_orthographic_density                   0.083719
rel_frequency * rel_synonyms_count                        -0.238721
rel_letters_count * rel_orthographic_density               0.110837
rel_letters_count * rel_synonyms_count                     0.389291
rel_orthographic_density * rel_synonyms_count              1.501772
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.033730328904089

intercept                     -4.436121
global_aoa                    -0.008306
global_clustering              0.151910
global_frequency              -0.040571
global_letters_count          -0.005362
global_orthographic_density    0.002957
global_synonyms_count         -0.053389
dtype: float64

Regressing global clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.08080864743351346

intercept                                             -6.987878
global_aoa                                             0.286701
global_clustering                                     -0.215154
global_frequency                                      -0.348813
global_letters_count                                   0.480794
global_orthographic_density                            0.803546
global_synonyms_count                                  0.141755
global_aoa * global_clustering                         0.053173
global_aoa * global_frequency                          0.003123
global_aoa * global_letters_count                     -0.001725
global_aoa * global_orthographic_density               0.005312
global_aoa * global_synonyms_count                    -0.013954
global_clustering * global_frequency                  -0.050369
global_clustering * global_letters_count               0.064931
global_clustering * global_orthographic_density        0.108270
global_clustering * global_synonyms_count             -0.034317
global_frequency * global_letters_count               -0.004643
global_frequency * global_orthographic_density         0.001929
global_frequency * global_synonyms_count               0.029003
global_letters_count * global_orthographic_density    -0.030204
global_letters_count * global_synonyms_count          -0.065707
global_orthographic_density * global_synonyms_count   -0.139476
dtype: float64

Regressing rel clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.018982053382418873

intercept                      1.316497
global_aoa                    -0.007069
global_clustering              0.124599
global_frequency              -0.019243
global_letters_count          -0.001006
global_orthographic_density    0.022652
global_synonyms_count         -0.047436
dtype: float64

Regressing rel clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.04791932744078408

intercept                                              0.460935
global_aoa                                             0.134267
global_clustering                                     -0.246821
global_frequency                                      -0.423982
global_letters_count                                   0.254815
global_orthographic_density                            0.571793
global_synonyms_count                                  0.521196
global_aoa * global_clustering                         0.035405
global_aoa * global_frequency                          0.004859
global_aoa * global_letters_count                      0.003917
global_aoa * global_orthographic_density               0.005772
global_aoa * global_synonyms_count                    -0.020965
global_clustering * global_frequency                  -0.041749
global_clustering * global_letters_count               0.067549
global_clustering * global_orthographic_density        0.105660
global_clustering * global_synonyms_count              0.025629
global_frequency * global_letters_count                0.015891
global_frequency * global_orthographic_density         0.021121
global_frequency * global_synonyms_count               0.009339
global_letters_count * global_orthographic_density    -0.022306
global_letters_count * global_synonyms_count          -0.037823
global_orthographic_density * global_synonyms_count   -0.107934
dtype: float64

Regressing global clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.02060880217404526

intercept                  -5.850840
rel_aoa                    -0.002264
rel_clustering              0.146917
rel_frequency              -0.007073
rel_letters_count          -0.006038
rel_orthographic_density    0.008765
rel_synonyms_count         -0.080507
dtype: float64

Regressing global clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04274384013046817

intercept                                       -5.823156
rel_aoa                                         -0.009745
rel_clustering                                   0.050422
rel_frequency                                   -0.007545
rel_letters_count                               -0.057242
rel_orthographic_density                        -0.014541
rel_synonyms_count                              -0.088330
rel_aoa * rel_clustering                         0.049171
rel_aoa * rel_frequency                         -0.005094
rel_aoa * rel_letters_count                     -0.014105
rel_aoa * rel_orthographic_density              -0.006698
rel_aoa * rel_synonyms_count                    -0.003759
rel_clustering * rel_frequency                   0.004898
rel_clustering * rel_letters_count               0.039039
rel_clustering * rel_orthographic_density        0.010779
rel_clustering * rel_synonyms_count              0.072285
rel_frequency * rel_letters_count               -0.011188
rel_frequency * rel_orthographic_density        -0.025675
rel_frequency * rel_synonyms_count              -0.024319
rel_letters_count * rel_orthographic_density    -0.018157
rel_letters_count * rel_synonyms_count          -0.026912
rel_orthographic_density * rel_synonyms_count    0.016116
dtype: float64

Regressing rel clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1215223170604658

intercept                   0.283518
rel_aoa                    -0.019006
rel_clustering              0.378733
rel_frequency               0.007634
rel_letters_count           0.011272
rel_orthographic_density    0.029462
rel_synonyms_count         -0.001505
dtype: float64

Regressing rel clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.14458106694100492

intercept                                        0.283497
rel_aoa                                         -0.033884
rel_clustering                                   0.254020
rel_frequency                                   -0.009604
rel_letters_count                               -0.024937
rel_orthographic_density                         0.006478
rel_synonyms_count                              -0.015412
rel_aoa * rel_clustering                         0.038624
rel_aoa * rel_frequency                         -0.004778
rel_aoa * rel_letters_count                     -0.013382
rel_aoa * rel_orthographic_density              -0.015476
rel_aoa * rel_synonyms_count                     0.000246
rel_clustering * rel_frequency                  -0.002550
rel_clustering * rel_letters_count               0.052361
rel_clustering * rel_orthographic_density        0.022928
rel_clustering * rel_synonyms_count              0.025730
rel_frequency * rel_letters_count               -0.002921
rel_frequency * rel_orthographic_density        -0.026112
rel_frequency * rel_synonyms_count              -0.027800
rel_letters_count * rel_orthographic_density    -0.018810
rel_letters_count * rel_synonyms_count          -0.006847
rel_orthographic_density * rel_synonyms_count    0.051121
dtype: float64

Regressing global clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04379376854692674

intercept                     -3.394943
global_aoa                    -0.011083
global_clustering              0.136117
global_frequency              -0.104628
global_letters_count          -0.059721
global_orthographic_density   -0.070458
global_synonyms_count          0.052051
rel_aoa                       -0.000526
rel_clustering                 0.030890
rel_frequency                  0.070369
rel_letters_count              0.051789
rel_orthographic_density       0.073970
rel_synonyms_count            -0.133517
dtype: float64

Regressing global clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16220341788160542

intercept                                                 8.211730
global_aoa                                               -0.169378
global_clustering                                         2.557407
global_frequency                                         -1.560905
global_letters_count                                      1.629610
global_orthographic_density                               1.224887
global_synonyms_count                                    -5.213973
rel_aoa                                                  -0.115670
rel_clustering                                           -3.066821
rel_frequency                                             0.369351
rel_letters_count                                        -1.237897
rel_orthographic_density                                 -0.296975
rel_synonyms_count                                        2.209841
global_aoa * global_clustering                           -0.097244
global_aoa * global_frequency                            -0.007576
global_aoa * global_letters_count                        -0.052769
global_aoa * global_orthographic_density                 -0.086332
global_aoa * global_synonyms_count                        0.032151
global_aoa * rel_aoa                                      0.007215
global_aoa * rel_clustering                               0.216226
global_aoa * rel_frequency                                0.025394
global_aoa * rel_letters_count                            0.068691
global_aoa * rel_orthographic_density                     0.083653
global_aoa * rel_synonyms_count                           0.029186
global_clustering * global_frequency                     -0.249533
global_clustering * global_letters_count                  0.177286
global_clustering * global_orthographic_density           0.111188
global_clustering * global_synonyms_count                -0.470271
global_clustering * rel_aoa                               0.023696
global_clustering * rel_clustering                       -0.068733
global_clustering * rel_frequency                         0.087094
global_clustering * rel_letters_count                    -0.085217
global_clustering * rel_orthographic_density              0.016809
global_clustering * rel_synonyms_count                    0.301432
global_frequency * global_letters_count                  -0.026917
global_frequency * global_orthographic_density            0.000091
global_frequency * global_synonyms_count                  0.281507
global_frequency * rel_aoa                               -0.010194
global_frequency * rel_clustering                         0.201131
global_frequency * rel_frequency                          0.007624
global_frequency * rel_letters_count                      0.041293
global_frequency * rel_orthographic_density               0.009666
global_frequency * rel_synonyms_count                    -0.146692
global_letters_count * global_orthographic_density        0.028101
global_letters_count * global_synonyms_count             -0.082335
global_letters_count * rel_aoa                            0.025889
global_letters_count * rel_clustering                    -0.167277
global_letters_count * rel_frequency                      0.027583
global_letters_count * rel_letters_count                 -0.004444
global_letters_count * rel_orthographic_density          -0.046684
global_letters_count * rel_synonyms_count                 0.120573
global_orthographic_density * global_synonyms_count      -0.414609
global_orthographic_density * rel_aoa                     0.131999
global_orthographic_density * rel_clustering             -0.070358
global_orthographic_density * rel_frequency               0.014266
global_orthographic_density * rel_letters_count          -0.042192
global_orthographic_density * rel_orthographic_density    0.034201
global_orthographic_density * rel_synonyms_count          0.368629
global_synonyms_count * rel_aoa                          -0.045827
global_synonyms_count * rel_clustering                    0.262612
global_synonyms_count * rel_frequency                    -0.291454
global_synonyms_count * rel_letters_count                -0.134623
global_synonyms_count * rel_orthographic_density         -0.108140
global_synonyms_count * rel_synonyms_count                0.013759
rel_aoa * rel_clustering                                 -0.075921
rel_aoa * rel_frequency                                  -0.001638
rel_aoa * rel_letters_count                              -0.049451
rel_aoa * rel_orthographic_density                       -0.125973
rel_aoa * rel_synonyms_count                             -0.010646
rel_clustering * rel_frequency                           -0.076999
rel_clustering * rel_letters_count                        0.133280
rel_clustering * rel_orthographic_density                 0.058370
rel_clustering * rel_synonyms_count                      -0.098299
rel_frequency * rel_letters_count                        -0.052178
rel_frequency * rel_orthographic_density                 -0.027889
rel_frequency * rel_synonyms_count                        0.162942
rel_letters_count * rel_orthographic_density              0.047803
rel_letters_count * rel_synonyms_count                    0.044804
rel_orthographic_density * rel_synonyms_count             0.124926
dtype: float64

Regressing rel clustering with 739 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2200374198789605

intercept                     -2.123757
global_aoa                    -0.005720
global_clustering             -0.596742
global_frequency              -0.086370
global_letters_count          -0.058922
global_orthographic_density   -0.020262
global_synonyms_count         -0.026044
rel_aoa                       -0.010891
rel_clustering                 0.885818
rel_frequency                  0.062721
rel_letters_count              0.057982
rel_orthographic_density       0.016538
rel_synonyms_count            -0.028515
dtype: float64

Regressing rel clustering with 739 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.3047999474005013

intercept                                                 5.633636
global_aoa                                               -0.013802
global_clustering                                         0.980463
global_frequency                                         -1.015184
global_letters_count                                      0.968462
global_orthographic_density                               0.535277
global_synonyms_count                                    -3.354422
rel_aoa                                                  -0.247823
rel_clustering                                           -1.784743
rel_frequency                                             0.164052
rel_letters_count                                        -0.894162
rel_orthographic_density                                 -0.190006
rel_synonyms_count                                        0.685329
global_aoa * global_clustering                           -0.053283
global_aoa * global_frequency                            -0.004552
global_aoa * global_letters_count                        -0.038032
global_aoa * global_orthographic_density                 -0.078095
global_aoa * global_synonyms_count                       -0.012601
global_aoa * rel_aoa                                      0.007916
global_aoa * rel_clustering                               0.154438
global_aoa * rel_frequency                                0.016379
global_aoa * rel_letters_count                            0.053935
global_aoa * rel_orthographic_density                     0.067010
global_aoa * rel_synonyms_count                           0.079257
global_clustering * global_frequency                     -0.157038
global_clustering * global_letters_count                  0.092584
global_clustering * global_orthographic_density           0.080801
global_clustering * global_synonyms_count                -0.354023
global_clustering * rel_aoa                              -0.003480
global_clustering * rel_clustering                       -0.084968
global_clustering * rel_frequency                         0.045236
global_clustering * rel_letters_count                    -0.055477
global_clustering * rel_orthographic_density             -0.004096
global_clustering * rel_synonyms_count                    0.218326
global_frequency * global_letters_count                  -0.027534
global_frequency * global_orthographic_density            0.031073
global_frequency * global_synonyms_count                  0.160097
global_frequency * rel_aoa                               -0.011724
global_frequency * rel_clustering                         0.141305
global_frequency * rel_frequency                          0.008844
global_frequency * rel_letters_count                      0.037925
global_frequency * rel_orthographic_density              -0.009943
global_frequency * rel_synonyms_count                    -0.043258
global_letters_count * global_orthographic_density        0.052875
global_letters_count * global_synonyms_count             -0.058599
global_letters_count * rel_aoa                            0.022939
global_letters_count * rel_clustering                    -0.063566
global_letters_count * rel_frequency                      0.026186
global_letters_count * rel_letters_count                 -0.002651
global_letters_count * rel_orthographic_density          -0.040438
global_letters_count * rel_synonyms_count                 0.097107
global_orthographic_density * global_synonyms_count      -0.309811
global_orthographic_density * rel_aoa                     0.126476
global_orthographic_density * rel_clustering             -0.060349
global_orthographic_density * rel_frequency              -0.013991
global_orthographic_density * rel_letters_count          -0.060193
global_orthographic_density * rel_orthographic_density    0.038737
global_orthographic_density * rel_synonyms_count          0.280120
global_synonyms_count * rel_aoa                          -0.040837
global_synonyms_count * rel_clustering                    0.291735
global_synonyms_count * rel_frequency                    -0.197948
global_synonyms_count * rel_letters_count                -0.104043
global_synonyms_count * rel_orthographic_density         -0.171349
global_synonyms_count * rel_synonyms_count                0.026827
rel_aoa * rel_clustering                                 -0.051035
rel_aoa * rel_frequency                                   0.001848
rel_aoa * rel_letters_count                              -0.041424
rel_aoa * rel_orthographic_density                       -0.108168
rel_aoa * rel_synonyms_count                             -0.020202
rel_clustering * rel_frequency                           -0.047661
rel_clustering * rel_letters_count                        0.085314
rel_clustering * rel_orthographic_density                 0.068524
rel_clustering * rel_synonyms_count                      -0.171169
rel_frequency * rel_letters_count                        -0.042393
rel_frequency * rel_orthographic_density                 -0.014445
rel_frequency * rel_synonyms_count                        0.075390
rel_letters_count * rel_orthographic_density              0.046357
rel_letters_count * rel_synonyms_count                    0.015414
rel_orthographic_density * rel_synonyms_count             0.165213
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09399136096016436

intercept                      4.353917
global_aoa                     0.027751
global_clustering             -0.053081
global_frequency              -0.009910
global_letters_count           0.307711
global_orthographic_density   -0.041760
global_synonyms_count         -0.338089
dtype: float64

Regressing global letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11587525201389182

intercept                                              5.021519
global_aoa                                            -0.092879
global_clustering                                     -1.120805
global_frequency                                      -0.105459
global_letters_count                                  -0.154892
global_orthographic_density                           -2.469330
global_synonyms_count                                 -0.324237
global_aoa * global_clustering                         0.137327
global_aoa * global_frequency                          0.071154
global_aoa * global_letters_count                      0.034892
global_aoa * global_orthographic_density               0.074453
global_aoa * global_synonyms_count                    -0.040246
global_clustering * global_frequency                   0.062565
global_clustering * global_letters_count              -0.076167
global_clustering * global_orthographic_density       -0.088578
global_clustering * global_synonyms_count              0.411143
global_frequency * global_letters_count               -0.035201
global_frequency * global_orthographic_density         0.122486
global_frequency * global_synonyms_count               0.119588
global_letters_count * global_orthographic_density     0.020375
global_letters_count * global_synonyms_count           0.180086
global_orthographic_density * global_synonyms_count    0.433983
dtype: float64

Regressing rel letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.046913631008842493

intercept                      1.001153
global_aoa                     0.000390
global_clustering             -0.133900
global_frequency              -0.028971
global_letters_count           0.217226
global_orthographic_density    0.002737
global_synonyms_count         -0.416629
dtype: float64

Regressing rel letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07035257526847372

intercept                                              4.092780
global_aoa                                             0.015633
global_clustering                                     -1.078573
global_frequency                                      -0.252576
global_letters_count                                  -0.665660
global_orthographic_density                           -3.333126
global_synonyms_count                                 -0.898141
global_aoa * global_clustering                         0.139303
global_aoa * global_frequency                          0.058916
global_aoa * global_letters_count                      0.031032
global_aoa * global_orthographic_density               0.080287
global_aoa * global_synonyms_count                    -0.060515
global_clustering * global_frequency                   0.072864
global_clustering * global_letters_count              -0.095857
global_clustering * global_orthographic_density       -0.153697
global_clustering * global_synonyms_count              0.300460
global_frequency * global_letters_count                0.000832
global_frequency * global_orthographic_density         0.177044
global_frequency * global_synonyms_count               0.070107
global_letters_count * global_orthographic_density     0.009786
global_letters_count * global_synonyms_count           0.234659
global_orthographic_density * global_synonyms_count    0.515387
dtype: float64

Regressing global letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07051098176945825

intercept                   5.736624
rel_aoa                    -0.079858
rel_clustering              0.174029
rel_frequency               0.034973
rel_letters_count           0.263722
rel_orthographic_density   -0.174157
rel_synonyms_count         -0.292798
dtype: float64

Regressing global letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08759934522285995

intercept                                        5.766470
rel_aoa                                         -0.145584
rel_clustering                                   0.147013
rel_frequency                                    0.142358
rel_letters_count                                0.382091
rel_orthographic_density                        -0.287997
rel_synonyms_count                              -0.301929
rel_aoa * rel_clustering                         0.083877
rel_aoa * rel_frequency                         -0.001654
rel_aoa * rel_letters_count                      0.024474
rel_aoa * rel_orthographic_density               0.028483
rel_aoa * rel_synonyms_count                    -0.062947
rel_clustering * rel_frequency                  -0.053184
rel_clustering * rel_letters_count              -0.117240
rel_clustering * rel_orthographic_density       -0.087474
rel_clustering * rel_synonyms_count             -0.001294
rel_frequency * rel_letters_count               -0.023041
rel_frequency * rel_orthographic_density         0.034808
rel_frequency * rel_synonyms_count              -0.033415
rel_letters_count * rel_orthographic_density     0.117331
rel_letters_count * rel_synonyms_count           0.176768
rel_orthographic_density * rel_synonyms_count    0.440381
dtype: float64

Regressing rel letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.13563678498175824

intercept                   1.437584
rel_aoa                    -0.073244
rel_clustering             -0.030519
rel_frequency              -0.164068
rel_letters_count           0.445910
rel_orthographic_density    0.119716
rel_synonyms_count         -0.278633
dtype: float64

Regressing rel letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.15876265002492884

intercept                                        1.501335
rel_aoa                                         -0.034543
rel_clustering                                  -0.093303
rel_frequency                                   -0.060412
rel_letters_count                                0.596087
rel_orthographic_density                         0.133701
rel_synonyms_count                              -0.199782
rel_aoa * rel_clustering                         0.140356
rel_aoa * rel_frequency                          0.035017
rel_aoa * rel_letters_count                     -0.032000
rel_aoa * rel_orthographic_density              -0.073078
rel_aoa * rel_synonyms_count                    -0.076267
rel_clustering * rel_frequency                  -0.035005
rel_clustering * rel_letters_count              -0.062866
rel_clustering * rel_orthographic_density        0.000143
rel_clustering * rel_synonyms_count              0.063656
rel_frequency * rel_letters_count               -0.015954
rel_frequency * rel_orthographic_density         0.075900
rel_frequency * rel_synonyms_count              -0.012771
rel_letters_count * rel_orthographic_density     0.115097
rel_letters_count * rel_synonyms_count           0.177738
rel_orthographic_density * rel_synonyms_count    0.461611
dtype: float64

Regressing global letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10781362451675591

intercept                      0.553744
global_aoa                     0.151266
global_clustering             -0.360402
global_frequency               0.089554
global_letters_count           0.363906
global_orthographic_density   -0.133891
global_synonyms_count         -0.340233
rel_aoa                       -0.179646
rel_clustering                 0.348284
rel_frequency                 -0.120547
rel_letters_count             -0.061763
rel_orthographic_density       0.129621
rel_synonyms_count             0.026252
dtype: float64

Regressing global letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19120439556463298

intercept                                                 36.551098
global_aoa                                                -1.015784
global_clustering                                          5.358314
global_frequency                                          -0.267451
global_letters_count                                      -2.400440
global_orthographic_density                               -7.043309
global_synonyms_count                                     -1.055762
rel_aoa                                                   -1.873534
rel_clustering                                            -9.381687
rel_frequency                                              1.014887
rel_letters_count                                          3.784836
rel_orthographic_density                                   3.347888
rel_synonyms_count                                        -1.153464
global_aoa * global_clustering                             0.072034
global_aoa * global_frequency                              0.028273
global_aoa * global_letters_count                          0.119966
global_aoa * global_orthographic_density                   0.401264
global_aoa * global_synonyms_count                        -0.089287
global_aoa * rel_aoa                                       0.034086
global_aoa * rel_clustering                                0.075176
global_aoa * rel_frequency                                -0.008021
global_aoa * rel_letters_count                            -0.121335
global_aoa * rel_orthographic_density                     -0.317227
global_aoa * rel_synonyms_count                            0.027262
global_clustering * global_frequency                      -0.090190
global_clustering * global_letters_count                  -0.225523
global_clustering * global_orthographic_density           -1.790443
global_clustering * global_synonyms_count                  0.333956
global_clustering * rel_aoa                               -0.007843
global_clustering * rel_clustering                         0.065214
global_clustering * rel_frequency                          0.210150
global_clustering * rel_letters_count                      0.235945
global_clustering * rel_orthographic_density               1.783651
global_clustering * rel_synonyms_count                     0.190723
global_frequency * global_letters_count                    0.132938
global_frequency * global_orthographic_density            -0.373238
global_frequency * global_synonyms_count                   0.107134
global_frequency * rel_aoa                                 0.152041
global_frequency * rel_clustering                          0.504145
global_frequency * rel_frequency                          -0.018914
global_frequency * rel_letters_count                      -0.193706
global_frequency * rel_orthographic_density                0.604508
global_frequency * rel_synonyms_count                      0.138252
global_letters_count * global_orthographic_density        -0.444555
global_letters_count * global_synonyms_count               0.342738
global_letters_count * rel_aoa                             0.016182
global_letters_count * rel_clustering                      0.331630
global_letters_count * rel_frequency                      -0.078712
global_letters_count * rel_letters_count                   0.042020
global_letters_count * rel_orthographic_density            0.495692
global_letters_count * rel_synonyms_count                  0.037546
global_orthographic_density * global_synonyms_count        0.604066
global_orthographic_density * rel_aoa                     -0.150481
global_orthographic_density * rel_clustering               1.083388
global_orthographic_density * rel_frequency                0.320802
global_orthographic_density * rel_letters_count            0.236999
global_orthographic_density * rel_orthographic_density     0.247914
global_orthographic_density * rel_synonyms_count           0.136809
global_synonyms_count * rel_aoa                            0.231139
global_synonyms_count * rel_clustering                     0.174196
global_synonyms_count * rel_frequency                      0.043966
global_synonyms_count * rel_letters_count                 -0.598927
global_synonyms_count * rel_orthographic_density          -0.800724
global_synonyms_count * rel_synonyms_count                -0.262051
rel_aoa * rel_clustering                                   0.034602
rel_aoa * rel_frequency                                   -0.097359
rel_aoa * rel_letters_count                               -0.047118
rel_aoa * rel_orthographic_density                         0.107536
rel_aoa * rel_synonyms_count                              -0.195063
rel_clustering * rel_frequency                            -0.522570
rel_clustering * rel_letters_count                        -0.437354
rel_clustering * rel_orthographic_density                 -1.019779
rel_clustering * rel_synonyms_count                       -0.351725
rel_frequency * rel_letters_count                          0.074926
rel_frequency * rel_orthographic_density                  -0.409326
rel_frequency * rel_synonyms_count                        -0.187227
rel_letters_count * rel_orthographic_density              -0.085049
rel_letters_count * rel_synonyms_count                     0.337010
rel_orthographic_density * rel_synonyms_count              0.529550
dtype: float64

Regressing rel letters_count with 908 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18333915846656046

intercept                      0.064934
global_aoa                     0.113765
global_clustering             -0.366920
global_frequency               0.096758
global_letters_count          -0.530998
global_orthographic_density   -0.104126
global_synonyms_count         -0.384578
rel_aoa                       -0.137894
rel_clustering                 0.344272
rel_frequency                 -0.145092
rel_letters_count              0.858391
rel_orthographic_density       0.071666
rel_synonyms_count             0.085390
dtype: float64

Regressing rel letters_count with 908 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.26135344412463524

intercept                                                 27.607358
global_aoa                                                -1.387738
global_clustering                                          3.225201
global_frequency                                          -0.279290
global_letters_count                                      -2.325418
global_orthographic_density                               -6.295999
global_synonyms_count                                      0.372457
rel_aoa                                                   -1.286028
rel_clustering                                            -8.004335
rel_frequency                                              0.884687
rel_letters_count                                          3.851576
rel_orthographic_density                                   2.458415
rel_synonyms_count                                        -2.886651
global_aoa * global_clustering                             0.057653
global_aoa * global_frequency                              0.052302
global_aoa * global_letters_count                          0.132478
global_aoa * global_orthographic_density                   0.339856
global_aoa * global_synonyms_count                        -0.134646
global_aoa * rel_aoa                                       0.031212
global_aoa * rel_clustering                                0.101116
global_aoa * rel_frequency                                -0.037150
global_aoa * rel_letters_count                            -0.129065
global_aoa * rel_orthographic_density                     -0.283699
global_aoa * rel_synonyms_count                            0.045299
global_clustering * global_frequency                      -0.034112
global_clustering * global_letters_count                  -0.067821
global_clustering * global_orthographic_density           -1.400250
global_clustering * global_synonyms_count                  0.435553
global_clustering * rel_aoa                                0.016814
global_clustering * rel_clustering                         0.050779
global_clustering * rel_frequency                          0.147140
global_clustering * rel_letters_count                      0.083335
global_clustering * rel_orthographic_density               1.371910
global_clustering * rel_synonyms_count                    -0.015987
global_frequency * global_letters_count                    0.104522
global_frequency * global_orthographic_density            -0.231485
global_frequency * global_synonyms_count                   0.091084
global_frequency * rel_aoa                                 0.118911
global_frequency * rel_clustering                          0.461722
global_frequency * rel_frequency                          -0.023764
global_frequency * rel_letters_count                      -0.165890
global_frequency * rel_orthographic_density                0.464721
global_frequency * rel_synonyms_count                      0.147263
global_letters_count * global_orthographic_density        -0.310596
global_letters_count * global_synonyms_count               0.329591
global_letters_count * rel_aoa                            -0.007208
global_letters_count * rel_clustering                      0.220495
global_letters_count * rel_frequency                      -0.044108
global_letters_count * rel_letters_count                   0.032644
global_letters_count * rel_orthographic_density            0.380144
global_letters_count * rel_synonyms_count                  0.045600
global_orthographic_density * global_synonyms_count        0.431243
global_orthographic_density * rel_aoa                     -0.081109
global_orthographic_density * rel_clustering               0.780284
global_orthographic_density * rel_frequency                0.223601
global_orthographic_density * rel_letters_count            0.104046
global_orthographic_density * rel_orthographic_density     0.251327
global_orthographic_density * rel_synonyms_count           0.292048
global_synonyms_count * rel_aoa                            0.296230
global_synonyms_count * rel_clustering                     0.104269
global_synonyms_count * rel_frequency                      0.064205
global_synonyms_count * rel_letters_count                 -0.566158
global_synonyms_count * rel_orthographic_density          -0.614659
global_synonyms_count * rel_synonyms_count                -0.248789
rel_aoa * rel_clustering                                  -0.004841
rel_aoa * rel_frequency                                   -0.059234
rel_aoa * rel_letters_count                               -0.031074
rel_aoa * rel_orthographic_density                         0.059013
rel_aoa * rel_synonyms_count                              -0.219041
rel_clustering * rel_frequency                            -0.490562
rel_clustering * rel_letters_count                        -0.329796
rel_clustering * rel_orthographic_density                 -0.709406
rel_clustering * rel_synonyms_count                       -0.213738
rel_frequency * rel_letters_count                          0.044449
rel_frequency * rel_orthographic_density                  -0.332067
rel_frequency * rel_synonyms_count                        -0.200708
rel_letters_count * rel_orthographic_density               0.039330
rel_letters_count * rel_synonyms_count                     0.308405
rel_orthographic_density * rel_synonyms_count              0.367322
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.01862454368918276

intercept                      0.765652
global_aoa                    -0.011632
global_clustering              0.024967
global_frequency              -0.011699
global_letters_count          -0.014853
global_orthographic_density   -0.023068
global_synonyms_count          0.125488
dtype: float64

Regressing global synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.028725029856128573

intercept                                              1.311701
global_aoa                                             0.120469
global_clustering                                      0.138934
global_frequency                                      -0.120251
global_letters_count                                  -0.053773
global_orthographic_density                           -0.080267
global_synonyms_count                                 -0.038661
global_aoa * global_clustering                         0.016224
global_aoa * global_frequency                         -0.001024
global_aoa * global_letters_count                     -0.006078
global_aoa * global_orthographic_density               0.005120
global_aoa * global_synonyms_count                     0.022543
global_clustering * global_frequency                  -0.015747
global_clustering * global_letters_count              -0.013452
global_clustering * global_orthographic_density       -0.001502
global_clustering * global_synonyms_count              0.044465
global_frequency * global_letters_count                0.001863
global_frequency * global_orthographic_density         0.010566
global_frequency * global_synonyms_count               0.011484
global_letters_count * global_orthographic_density    -0.017880
global_letters_count * global_synonyms_count           0.015999
global_orthographic_density * global_synonyms_count    0.064038
dtype: float64

Regressing rel synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.01405259430330452

intercept                      0.328904
global_aoa                    -0.008824
global_clustering              0.003907
global_frequency              -0.016393
global_letters_count          -0.010634
global_orthographic_density   -0.011638
global_synonyms_count          0.100772
dtype: float64

Regressing rel synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.027163186218870483

intercept                                              1.681120
global_aoa                                             0.028609
global_clustering                                      0.160509
global_frequency                                      -0.148505
global_letters_count                                  -0.120991
global_orthographic_density                           -0.200910
global_synonyms_count                                  0.120933
global_aoa * global_clustering                         0.014981
global_aoa * global_frequency                          0.006135
global_aoa * global_letters_count                     -0.003023
global_aoa * global_orthographic_density               0.009307
global_aoa * global_synonyms_count                     0.020816
global_clustering * global_frequency                  -0.012750
global_clustering * global_letters_count              -0.022108
global_clustering * global_orthographic_density       -0.018083
global_clustering * global_synonyms_count              0.073331
global_frequency * global_letters_count                0.001264
global_frequency * global_orthographic_density         0.011470
global_frequency * global_synonyms_count               0.010049
global_letters_count * global_orthographic_density    -0.016344
global_letters_count * global_synonyms_count           0.022605
global_orthographic_density * global_synonyms_count    0.040650
dtype: float64

Regressing global synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.014203199176267447

intercept                   0.355000
rel_aoa                    -0.012420
rel_clustering             -0.018799
rel_frequency              -0.010743
rel_letters_count          -0.014714
rel_orthographic_density   -0.019590
rel_synonyms_count          0.111269
dtype: float64

Regressing global synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02577161824020857

intercept                                        0.391976
rel_aoa                                         -0.038124
rel_clustering                                  -0.090849
rel_frequency                                   -0.004597
rel_letters_count                               -0.032977
rel_orthographic_density                         0.022865
rel_synonyms_count                               0.066503
rel_aoa * rel_clustering                         0.002686
rel_aoa * rel_frequency                         -0.003804
rel_aoa * rel_letters_count                      0.012799
rel_aoa * rel_orthographic_density               0.015285
rel_aoa * rel_synonyms_count                     0.017170
rel_clustering * rel_frequency                  -0.015171
rel_clustering * rel_letters_count               0.004243
rel_clustering * rel_orthographic_density       -0.022453
rel_clustering * rel_synonyms_count              0.022596
rel_frequency * rel_letters_count                0.004471
rel_frequency * rel_orthographic_density         0.009696
rel_frequency * rel_synonyms_count               0.003490
rel_letters_count * rel_orthographic_density    -0.012810
rel_letters_count * rel_synonyms_count           0.003296
rel_orthographic_density * rel_synonyms_count   -0.017355
dtype: float64

Regressing rel synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07853007264364209

intercept                   0.034484
rel_aoa                    -0.018762
rel_clustering              0.031071
rel_frequency              -0.003808
rel_letters_count          -0.011237
rel_orthographic_density   -0.025534
rel_synonyms_count          0.274544
dtype: float64

Regressing rel synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.09029591217280908

intercept                                        0.077491
rel_aoa                                         -0.021695
rel_clustering                                  -0.042616
rel_frequency                                    0.005559
rel_letters_count                               -0.026897
rel_orthographic_density                         0.018564
rel_synonyms_count                               0.304923
rel_aoa * rel_clustering                         0.016501
rel_aoa * rel_frequency                          0.005369
rel_aoa * rel_letters_count                      0.013587
rel_aoa * rel_orthographic_density               0.019782
rel_aoa * rel_synonyms_count                     0.009317
rel_clustering * rel_frequency                  -0.017655
rel_clustering * rel_letters_count               0.001104
rel_clustering * rel_orthographic_density       -0.014147
rel_clustering * rel_synonyms_count              0.011659
rel_frequency * rel_letters_count                0.002851
rel_frequency * rel_orthographic_density         0.015864
rel_frequency * rel_synonyms_count               0.018275
rel_letters_count * rel_orthographic_density    -0.007697
rel_letters_count * rel_synonyms_count           0.014356
rel_orthographic_density * rel_synonyms_count    0.022767
dtype: float64

Regressing global synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02386560669603144

intercept                      1.288189
global_aoa                    -0.002645
global_clustering              0.114250
global_frequency              -0.014205
global_letters_count          -0.016403
global_orthographic_density   -0.009902
global_synonyms_count          0.113332
rel_aoa                       -0.013266
rel_clustering                -0.108043
rel_frequency                  0.004043
rel_letters_count              0.001979
rel_orthographic_density      -0.010561
rel_synonyms_count             0.012290
dtype: float64

Regressing global synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10342408823916616

intercept                                                 7.965299
global_aoa                                               -0.381502
global_clustering                                         1.739802
global_frequency                                         -0.216545
global_letters_count                                      0.242636
global_orthographic_density                               0.298811
global_synonyms_count                                     1.540704
rel_aoa                                                   0.149740
rel_clustering                                           -0.824227
rel_frequency                                             0.331349
rel_letters_count                                        -0.507127
rel_orthographic_density                                 -0.273960
rel_synonyms_count                                       -3.414363
global_aoa * global_clustering                           -0.022852
global_aoa * global_frequency                             0.023819
global_aoa * global_letters_count                         0.009829
global_aoa * global_orthographic_density                 -0.030543
global_aoa * global_synonyms_count                       -0.005490
global_aoa * rel_aoa                                     -0.012401
global_aoa * rel_clustering                               0.041196
global_aoa * rel_frequency                               -0.041632
global_aoa * rel_letters_count                           -0.024376
global_aoa * rel_orthographic_density                     0.039425
global_aoa * rel_synonyms_count                           0.055282
global_clustering * global_frequency                     -0.081153
global_clustering * global_letters_count                 -0.014651
global_clustering * global_orthographic_density          -0.151013
global_clustering * global_synonyms_count                 0.068868
global_clustering * rel_aoa                              -0.002898
global_clustering * rel_clustering                        0.035991
global_clustering * rel_frequency                         0.082626
global_clustering * rel_letters_count                    -0.061005
global_clustering * rel_orthographic_density              0.136493
global_clustering * rel_synonyms_count                   -0.130024
global_frequency * global_letters_count                  -0.033413
global_frequency * global_orthographic_density           -0.092754
global_frequency * global_synonyms_count                 -0.124760
global_frequency * rel_aoa                               -0.000692
global_frequency * rel_clustering                         0.038111
global_frequency * rel_frequency                          0.012519
global_frequency * rel_letters_count                      0.013852
global_frequency * rel_orthographic_density               0.088754
global_frequency * rel_synonyms_count                     0.171575
global_letters_count * global_orthographic_density       -0.036243
global_letters_count * global_synonyms_count              0.055059
global_letters_count * rel_aoa                           -0.012099
global_letters_count * rel_clustering                    -0.033624
global_letters_count * rel_frequency                      0.025966
global_letters_count * rel_letters_count                  0.000591
global_letters_count * rel_orthographic_density           0.024940
global_letters_count * rel_synonyms_count                 0.034917
global_orthographic_density * global_synonyms_count       0.157855
global_orthographic_density * rel_aoa                    -0.038201
global_orthographic_density * rel_clustering              0.093844
global_orthographic_density * rel_frequency               0.064836
global_orthographic_density * rel_letters_count           0.060104
global_orthographic_density * rel_orthographic_density   -0.064211
global_orthographic_density * rel_synonyms_count          0.017650
global_synonyms_count * rel_aoa                           0.054248
global_synonyms_count * rel_clustering                    0.093402
global_synonyms_count * rel_frequency                     0.118694
global_synonyms_count * rel_letters_count                -0.039745
global_synonyms_count * rel_orthographic_density         -0.037052
global_synonyms_count * rel_synonyms_count                0.126793
rel_aoa * rel_clustering                                  0.011628
rel_aoa * rel_frequency                                   0.016251
rel_aoa * rel_letters_count                               0.032880
rel_aoa * rel_orthographic_density                        0.024530
rel_aoa * rel_synonyms_count                             -0.076510
rel_clustering * rel_frequency                           -0.026225
rel_clustering * rel_letters_count                        0.094520
rel_clustering * rel_orthographic_density                -0.080254
rel_clustering * rel_synonyms_count                      -0.026920
rel_frequency * rel_letters_count                         0.000365
rel_frequency * rel_orthographic_density                 -0.056915
rel_frequency * rel_synonyms_count                       -0.166516
rel_letters_count * rel_orthographic_density             -0.093481
rel_letters_count * rel_synonyms_count                   -0.013549
rel_orthographic_density * rel_synonyms_count            -0.060271
dtype: float64

Regressing rel synonyms_count with 881 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.15751770981369517

intercept                      0.767306
global_aoa                    -0.001191
global_clustering              0.061671
global_frequency              -0.017435
global_letters_count          -0.004272
global_orthographic_density    0.031137
global_synonyms_count         -0.619590
rel_aoa                       -0.011780
rel_clustering                -0.055638
rel_frequency                  0.006672
rel_letters_count             -0.006843
rel_orthographic_density      -0.045506
rel_synonyms_count             0.849242
dtype: float64

Regressing rel synonyms_count with 881 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.22426720105983647

intercept                                                 9.483724
global_aoa                                               -0.311298
global_clustering                                         1.799557
global_frequency                                         -0.465860
global_letters_count                                     -0.050971
global_orthographic_density                               0.469107
global_synonyms_count                                    -0.115210
rel_aoa                                                   0.240607
rel_clustering                                           -0.975250
rel_frequency                                             0.468539
rel_letters_count                                        -0.271403
rel_orthographic_density                                 -0.343414
rel_synonyms_count                                       -1.609889
global_aoa * global_clustering                           -0.018360
global_aoa * global_frequency                             0.021315
global_aoa * global_letters_count                         0.012195
global_aoa * global_orthographic_density                 -0.040726
global_aoa * global_synonyms_count                       -0.003109
global_aoa * rel_aoa                                     -0.007049
global_aoa * rel_clustering                               0.031877
global_aoa * rel_frequency                               -0.033360
global_aoa * rel_letters_count                           -0.021354
global_aoa * rel_orthographic_density                     0.046241
global_aoa * rel_synonyms_count                           0.064261
global_clustering * global_frequency                     -0.101092
global_clustering * global_letters_count                 -0.037066
global_clustering * global_orthographic_density          -0.105835
global_clustering * global_synonyms_count                 0.135675
global_clustering * rel_aoa                               0.007350
global_clustering * rel_clustering                        0.033588
global_clustering * rel_frequency                         0.083174
global_clustering * rel_letters_count                    -0.031098
global_clustering * rel_orthographic_density              0.114862
global_clustering * rel_synonyms_count                   -0.158522
global_frequency * global_letters_count                  -0.022087
global_frequency * global_orthographic_density           -0.074067
global_frequency * global_synonyms_count                 -0.019845
global_frequency * rel_aoa                               -0.003546
global_frequency * rel_clustering                         0.074352
global_frequency * rel_frequency                          0.009252
global_frequency * rel_letters_count                      0.009052
global_frequency * rel_orthographic_density               0.075397
global_frequency * rel_synonyms_count                     0.095117
global_letters_count * global_orthographic_density       -0.030263
global_letters_count * global_synonyms_count              0.115388
global_letters_count * rel_aoa                           -0.017402
global_letters_count * rel_clustering                    -0.014695
global_letters_count * rel_frequency                      0.012820
global_letters_count * rel_letters_count                  0.001273
global_letters_count * rel_orthographic_density           0.024181
global_letters_count * rel_synonyms_count                -0.029506
global_orthographic_density * global_synonyms_count       0.116986
global_orthographic_density * rel_aoa                    -0.028851
global_orthographic_density * rel_clustering              0.015958
global_orthographic_density * rel_frequency               0.040248
global_orthographic_density * rel_letters_count           0.058435
global_orthographic_density * rel_orthographic_density   -0.054315
global_orthographic_density * rel_synonyms_count         -0.003930
global_synonyms_count * rel_aoa                           0.037448
global_synonyms_count * rel_clustering                   -0.005150
global_synonyms_count * rel_frequency                     0.041136
global_synonyms_count * rel_letters_count                -0.122440
global_synonyms_count * rel_orthographic_density         -0.080185
global_synonyms_count * rel_synonyms_count                0.118076
rel_aoa * rel_clustering                                  0.005653
rel_aoa * rel_frequency                                   0.018538
rel_aoa * rel_letters_count                               0.031745
rel_aoa * rel_orthographic_density                        0.027660
rel_aoa * rel_synonyms_count                             -0.082005
rel_clustering * rel_frequency                           -0.046433
rel_clustering * rel_letters_count                        0.060810
rel_clustering * rel_orthographic_density                -0.040979
rel_clustering * rel_synonyms_count                       0.024287
rel_frequency * rel_letters_count                         0.003030
rel_frequency * rel_orthographic_density                 -0.032098
rel_frequency * rel_synonyms_count                       -0.105927
rel_letters_count * rel_orthographic_density             -0.083369
rel_letters_count * rel_synonyms_count                    0.062893
rel_orthographic_density * rel_synonyms_count             0.029785
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08678265971384702

intercept                      1.774423
global_aoa                    -0.035464
global_clustering              0.013683
global_frequency               0.017255
global_letters_count          -0.079792
global_orthographic_density    0.107818
global_synonyms_count          0.014431
dtype: float64

Regressing global orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10218441348430618

intercept                                              1.825544
global_aoa                                            -0.061133
global_clustering                                      0.207708
global_frequency                                       0.093089
global_letters_count                                  -0.136820
global_orthographic_density                            0.533162
global_synonyms_count                                  0.189298
global_aoa * global_clustering                         0.004102
global_aoa * global_frequency                          0.008759
global_aoa * global_letters_count                     -0.009111
global_aoa * global_orthographic_density               0.018478
global_aoa * global_synonyms_count                     0.025446
global_clustering * global_frequency                   0.000921
global_clustering * global_letters_count              -0.043546
global_clustering * global_orthographic_density        0.019413
global_clustering * global_synonyms_count             -0.010652
global_frequency * global_letters_count               -0.010781
global_frequency * global_orthographic_density        -0.037887
global_frequency * global_synonyms_count              -0.008902
global_letters_count * global_orthographic_density    -0.006333
global_letters_count * global_synonyms_count          -0.054917
global_orthographic_density * global_synonyms_count    0.016963
dtype: float64

Regressing rel orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.062300511225591104

intercept                     -0.608698
global_aoa                    -0.020103
global_clustering              0.020676
global_frequency               0.023462
global_letters_count          -0.071200
global_orthographic_density    0.077255
global_synonyms_count          0.020458
dtype: float64

Regressing rel orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07890759245597734

intercept                                              0.185524
global_aoa                                            -0.039844
global_clustering                                      0.394951
global_frequency                                       0.169850
global_letters_count                                  -0.240017
global_orthographic_density                            0.186110
global_synonyms_count                                  0.088614
global_aoa * global_clustering                         0.015300
global_aoa * global_frequency                          0.012069
global_aoa * global_letters_count                     -0.005134
global_aoa * global_orthographic_density               0.023513
global_aoa * global_synonyms_count                     0.015496
global_clustering * global_frequency                   0.001588
global_clustering * global_letters_count              -0.079726
global_clustering * global_orthographic_density       -0.010194
global_clustering * global_synonyms_count             -0.055468
global_frequency * global_letters_count               -0.026564
global_frequency * global_orthographic_density        -0.034102
global_frequency * global_synonyms_count              -0.020204
global_letters_count * global_orthographic_density     0.008296
global_letters_count * global_synonyms_count          -0.046306
global_orthographic_density * global_synonyms_count   -0.015819
dtype: float64

Regressing global orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06006555828015647

intercept                   1.624856
rel_aoa                     0.008299
rel_clustering             -0.069706
rel_frequency              -0.002238
rel_letters_count          -0.073358
rel_orthographic_density    0.148661
rel_synonyms_count          0.024007
dtype: float64

Regressing global orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.078840300099565

intercept                                        1.608390
rel_aoa                                          0.088005
rel_clustering                                   0.058847
rel_frequency                                   -0.019756
rel_letters_count                               -0.068430
rel_orthographic_density                         0.223485
rel_synonyms_count                               0.249597
rel_aoa * rel_clustering                         0.057844
rel_aoa * rel_frequency                          0.021160
rel_aoa * rel_letters_count                     -0.005788
rel_aoa * rel_orthographic_density               0.035755
rel_aoa * rel_synonyms_count                     0.047501
rel_clustering * rel_frequency                   0.022738
rel_clustering * rel_letters_count              -0.025331
rel_clustering * rel_orthographic_density        0.066244
rel_clustering * rel_synonyms_count              0.000840
rel_frequency * rel_letters_count                0.006266
rel_frequency * rel_orthographic_density         0.025092
rel_frequency * rel_synonyms_count               0.050721
rel_letters_count * rel_orthographic_density    -0.026093
rel_letters_count * rel_synonyms_count          -0.044353
rel_orthographic_density * rel_synonyms_count   -0.009262
dtype: float64

Regressing rel orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10999070103065157

intercept                  -0.490380
rel_aoa                     0.016622
rel_clustering             -0.037461
rel_frequency               0.043965
rel_letters_count          -0.064735
rel_orthographic_density    0.220313
rel_synonyms_count          0.008645
dtype: float64

Regressing rel orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1249495358920536

intercept                                       -0.473533
rel_aoa                                          0.076767
rel_clustering                                   0.065979
rel_frequency                                    0.058242
rel_letters_count                               -0.043633
rel_orthographic_density                         0.280066
rel_synonyms_count                               0.147342
rel_aoa * rel_clustering                         0.037081
rel_aoa * rel_frequency                          0.008882
rel_aoa * rel_letters_count                     -0.000632
rel_aoa * rel_orthographic_density               0.051438
rel_aoa * rel_synonyms_count                     0.053368
rel_clustering * rel_frequency                   0.008258
rel_clustering * rel_letters_count              -0.037316
rel_clustering * rel_orthographic_density        0.031790
rel_clustering * rel_synonyms_count             -0.051776
rel_frequency * rel_letters_count                0.001416
rel_frequency * rel_orthographic_density         0.030402
rel_frequency * rel_synonyms_count               0.027006
rel_letters_count * rel_orthographic_density    -0.005893
rel_letters_count * rel_synonyms_count          -0.047622
rel_orthographic_density * rel_synonyms_count   -0.040380
dtype: float64

Regressing global orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09716316181005347

intercept                      3.232900
global_aoa                    -0.080048
global_clustering              0.151769
global_frequency              -0.020772
global_letters_count          -0.099187
global_orthographic_density    0.192990
global_synonyms_count         -0.039260
rel_aoa                        0.064679
rel_clustering                -0.160120
rel_frequency                  0.045642
rel_letters_count              0.020033
rel_orthographic_density      -0.106725
rel_synonyms_count             0.056705
dtype: float64

Regressing global orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1550278646618154

intercept                                                -5.940377
global_aoa                                               -0.058693
global_clustering                                        -0.161640
global_frequency                                         -0.089416
global_letters_count                                      0.710502
global_orthographic_density                               5.790332
global_synonyms_count                                     2.704387
rel_aoa                                                   0.243409
rel_clustering                                            1.761593
rel_frequency                                             0.125672
rel_letters_count                                        -0.637752
rel_orthographic_density                                 -4.550559
rel_synonyms_count                                        1.032877
global_aoa * global_clustering                           -0.098308
global_aoa * global_frequency                            -0.010248
global_aoa * global_letters_count                        -0.053140
global_aoa * global_orthographic_density                 -0.107533
global_aoa * global_synonyms_count                        0.022252
global_aoa * rel_aoa                                     -0.002387
global_aoa * rel_clustering                               0.036484
global_aoa * rel_frequency                                0.011465
global_aoa * rel_letters_count                            0.044530
global_aoa * rel_orthographic_density                     0.115085
global_aoa * rel_synonyms_count                          -0.089533
global_clustering * global_frequency                     -0.056556
global_clustering * global_letters_count                 -0.019460
global_clustering * global_orthographic_density           0.729416
global_clustering * global_synonyms_count                -0.143046
global_clustering * rel_aoa                               0.088006
global_clustering * rel_clustering                        0.004150
global_clustering * rel_frequency                         0.021457
global_clustering * rel_letters_count                     0.029477
global_clustering * rel_orthographic_density             -0.667768
global_clustering * rel_synonyms_count                    0.242440
global_frequency * global_letters_count                  -0.035708
global_frequency * global_orthographic_density           -0.015477
global_frequency * global_synonyms_count                 -0.234350
global_frequency * rel_aoa                                0.021952
global_frequency * rel_clustering                        -0.056150
global_frequency * rel_frequency                         -0.004969
global_frequency * rel_letters_count                      0.041567
global_frequency * rel_orthographic_density              -0.054132
global_frequency * rel_synonyms_count                     0.087594
global_letters_count * global_orthographic_density       -0.045866
global_letters_count * global_synonyms_count             -0.204888
global_letters_count * rel_aoa                            0.035893
global_letters_count * rel_clustering                     0.010326
global_letters_count * rel_frequency                      0.020446
global_letters_count * rel_letters_count                 -0.001723
global_letters_count * rel_orthographic_density           0.064273
global_letters_count * rel_synonyms_count                 0.059670
global_orthographic_density * global_synonyms_count      -0.090855
global_orthographic_density * rel_aoa                     0.010161
global_orthographic_density * rel_clustering             -0.655092
global_orthographic_density * rel_frequency              -0.040567
global_orthographic_density * rel_letters_count           0.007981
global_orthographic_density * rel_orthographic_density   -0.061590
global_orthographic_density * rel_synonyms_count         -0.029512
global_synonyms_count * rel_aoa                          -0.056820
global_synonyms_count * rel_clustering                    0.132559
global_synonyms_count * rel_frequency                     0.149145
global_synonyms_count * rel_letters_count                 0.132214
global_synonyms_count * rel_orthographic_density          0.137000
global_synonyms_count * rel_synonyms_count               -0.072875
rel_aoa * rel_clustering                                 -0.003713
rel_aoa * rel_frequency                                  -0.010420
rel_aoa * rel_letters_count                              -0.033729
rel_aoa * rel_orthographic_density                        0.026676
rel_aoa * rel_synonyms_count                              0.149392
rel_clustering * rel_frequency                            0.064296
rel_clustering * rel_letters_count                       -0.063039
rel_clustering * rel_orthographic_density                 0.607658
rel_clustering * rel_synonyms_count                      -0.232328
rel_frequency * rel_letters_count                        -0.024825
rel_frequency * rel_orthographic_density                  0.098646
rel_frequency * rel_synonyms_count                        0.005062
rel_letters_count * rel_orthographic_density             -0.062827
rel_letters_count * rel_synonyms_count                   -0.085099
rel_orthographic_density * rel_synonyms_count            -0.126719
dtype: float64

Regressing rel orthographic_density with 745 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1424678701543104

intercept                      2.083248
global_aoa                    -0.062995
global_clustering              0.126611
global_frequency              -0.007306
global_letters_count          -0.061208
global_orthographic_density   -0.507360
global_synonyms_count         -0.012887
rel_aoa                        0.050453
rel_clustering                -0.121985
rel_frequency                  0.044333
rel_letters_count             -0.019713
rel_orthographic_density       0.656926
rel_synonyms_count             0.025425
dtype: float64

Regressing rel orthographic_density with 745 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1982399172621354

intercept                                                -7.088358
global_aoa                                                0.223699
global_clustering                                         0.132132
global_frequency                                          0.114768
global_letters_count                                      0.384295
global_orthographic_density                               4.821599
global_synonyms_count                                     2.977811
rel_aoa                                                  -0.001161
rel_clustering                                            1.146017
rel_frequency                                            -0.235869
rel_letters_count                                        -0.493761
rel_orthographic_density                                 -3.672083
rel_synonyms_count                                        0.202176
global_aoa * global_clustering                           -0.069395
global_aoa * global_frequency                            -0.010703
global_aoa * global_letters_count                        -0.055106
global_aoa * global_orthographic_density                 -0.127209
global_aoa * global_synonyms_count                       -0.026094
global_aoa * rel_aoa                                      0.000999
global_aoa * rel_clustering                               0.022602
global_aoa * rel_frequency                                0.021415
global_aoa * rel_letters_count                            0.052183
global_aoa * rel_orthographic_density                     0.142727
global_aoa * rel_synonyms_count                          -0.017070
global_clustering * global_frequency                     -0.052704
global_clustering * global_letters_count                 -0.069433
global_clustering * global_orthographic_density           0.598313
global_clustering * global_synonyms_count                -0.175791
global_clustering * rel_aoa                               0.065141
global_clustering * rel_clustering                        0.005703
global_clustering * rel_frequency                         0.021323
global_clustering * rel_letters_count                     0.061321
global_clustering * rel_orthographic_density             -0.532255
global_clustering * rel_synonyms_count                    0.268244
global_frequency * global_letters_count                  -0.037492
global_frequency * global_orthographic_density           -0.072252
global_frequency * global_synonyms_count                 -0.245647
global_frequency * rel_aoa                                0.023707
global_frequency * rel_clustering                        -0.052413
global_frequency * rel_frequency                         -0.000318
global_frequency * rel_letters_count                      0.041640
global_frequency * rel_orthographic_density               0.014623
global_frequency * rel_synonyms_count                     0.102839
global_letters_count * global_orthographic_density        0.011930
global_letters_count * global_synonyms_count             -0.205714
global_letters_count * rel_aoa                            0.042029
global_letters_count * rel_clustering                     0.069878
global_letters_count * rel_frequency                      0.031786
global_letters_count * rel_letters_count                 -0.004217
global_letters_count * rel_orthographic_density           0.021298
global_letters_count * rel_synonyms_count                 0.075404
global_orthographic_density * global_synonyms_count      -0.113604
global_orthographic_density * rel_aoa                     0.014367
global_orthographic_density * rel_clustering             -0.474222
global_orthographic_density * rel_frequency               0.054886
global_orthographic_density * rel_letters_count          -0.023434
global_orthographic_density * rel_orthographic_density   -0.040743
global_orthographic_density * rel_synonyms_count          0.097778
global_synonyms_count * rel_aoa                          -0.044801
global_synonyms_count * rel_clustering                    0.252651
global_synonyms_count * rel_frequency                     0.176489
global_synonyms_count * rel_letters_count                 0.128367
global_synonyms_count * rel_orthographic_density          0.089288
global_synonyms_count * rel_synonyms_count               -0.068617
rel_aoa * rel_clustering                                 -0.003471
rel_aoa * rel_frequency                                  -0.019879
rel_aoa * rel_letters_count                              -0.044889
rel_aoa * rel_orthographic_density                        0.016065
rel_aoa * rel_synonyms_count                              0.119547
rel_clustering * rel_frequency                            0.059315
rel_clustering * rel_letters_count                       -0.108098
rel_clustering * rel_orthographic_density                 0.416077
rel_clustering * rel_synonyms_count                      -0.354674
rel_frequency * rel_letters_count                        -0.039685
rel_frequency * rel_orthographic_density                  0.001130
rel_frequency * rel_synonyms_count                       -0.033588
rel_letters_count * rel_orthographic_density             -0.054650
rel_letters_count * rel_synonyms_count                   -0.090946
rel_orthographic_density * rel_synonyms_count            -0.174748
dtype: float64