Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
SAVE_FIGURES = False
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
N_COMPONENTS = 3
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
model = Model(time=Time.discrete, source=Source.all, past=Past.all, durl=Durl.exclude_past, max_distance=1)
data = []

with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

original_variations = pd.DataFrame(data)
del data


Got 15119 substitutions for model Model(time=Time.discrete, source=Source.all, past=Past.all, durl=Durl.exclude_past, max_distance=1)
100% (15119 of 15119) |####################| Elapsed Time: 0:03:42 Time: 0:03:42

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)\
    ['source', 'source_rel', 'destination', 'destination_rel', 'feature',
     'h0', 'h0_rel', 'h0n', 'h0n_rel'].mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               ['source', 'source_rel', 'destination', 'destination_rel',
                'h0', 'h0_rel', 'h0n', 'h0n_rel']] = np.nan

Prepare feature ordering.


In [5]:
ordered_features = sorted(
    Substitution.__features__,
    key=lambda f: Substitution._transformed_feature(f).__doc__
)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    bin_count = bins.max() + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))
    
    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_values = np.zeros(bin_count)
        bin_nulls = np.zeros(bin_count)
        cis = np.zeros((bin_count, 3))

        for i in range(bin_count):
            indices = bins == i
            n = (indices).sum()
            s = values[indices].std(ddof=1)

            bin_values[i] = values[indices].mean()
            bin_nulls[i] = nulls[indices].mean()
            for j, alpha in enumerate([.05, .01, .001]):
                cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                             * values[indices].std(ddof=1)
                             / np.sqrt(n - 1))

        print(null_name + ' |', end='')
        differences = ((bin_values[:,np.newaxis]
                        < bin_nulls[:,np.newaxis] - cis)
                       | (bin_values[:,np.newaxis]
                          > bin_nulls[:,np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]
    
    # Compute binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    middles = (bins[:-1] + bins[1:]) / 2
    
    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))
    
    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    g = sb.FacetGrid(data=data[data[feature_field]
                               .map(lambda f: f in features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]
    
    # Compute binning.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    middles = (bins[:-1] + bins[1:]) / 2
    
    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))
    
    # Plot.
    scale = abs(h0s.mean())
    ax.plot(np.linspace(0, 1, bin_count),
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(np.linspace(0, 1, bin_count),
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | **  |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *   | ns. |
H_00 | ns. | ns. | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | **  | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False});



In [13]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | ns. |

In [14]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)')\
    .set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | ns. |
H_00 | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | **  | ns. | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *   | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | *   | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [16]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *   | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | *   | *   |

In [18]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'quantiles': True})\
    .set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | ns. | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | ns. | *   | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | **  | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True});



In [21]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | **  | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | ns. | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

In [22]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True})\
    .set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *   |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | ns. | **  | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [24]:
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *   |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [26]:
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how do their effects interact. Especially, we want to know who wins between cognitive bias, attraction to sentence average, or attraction to global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note we set the arrow size to something that turns out to be huge here, but gives normal sizes in the figures saves. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']
    
    # Compute binning.
    bin_count = 4
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2
    
    # Compute bin values.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            u_values[y, x] = (
                dest[(x_bins == x) & (y_bins == y)] -
                source[(x_bins == x) & (y_bins == y)]
            ).mean()
            v_values[y, x] = (
                dest_rel[(x_bins == x) & (y_bins == y)] -
                source_rel[(x_bins == x) & (y_bins == y)]
            ).mean()
            strength[y, x] = np.sqrt(
                (dest[(x_bins == x) & (y_bins == y)] - 
                 source[(x_bins == x) & (y_bins == y)]) ** 2 +
                (dest_rel[(x_bins == x) & (y_bins == y)] - 
                 source_rel[(x_bins == x) & (y_bins == y)]) ** 2
            ).mean()
    
    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2), 
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
g = sb.FacetGrid(data=variations,
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=ordered_features, hue_order=ordered_features)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
g = sb.FacetGrid(data=variations[variations['feature']
                                 .map(lambda f: f in PAPER_FEATURES)],
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations.pivot(index='cluster_id',
                                 columns='feature', values='variation')
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show 
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)])


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.54225853  0.16320232  0.08232772  0.06717044  0.03678721  0.0319946
  0.02048765  0.01835907  0.01581637  0.00898108  0.00738444]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.443599 0.316367 -0.081647 0.245961 0.223270 -0.426827 0.221315 0.288217 -0.411705 0.276457 -0.161091 0.003120
Component-1 0.299908 -0.403115 0.139620 -0.275936 -0.287068 -0.417796 0.167611 -0.294412 -0.452079 0.216897 -0.169097 0.013129
Component-2 -0.685533 -0.090121 0.120051 -0.070612 -0.692904 0.088525 0.009312 -0.042703 0.062507 -0.077001 -0.007284 0.052660

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
del data


100% (15119 of 15119) |####################| Elapsed Time: 0:02:48 Time: 0:02:48

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | **  | *** | ns. |

4.2 On a subset of relevant features


In [34]:
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature']
                           .map(lambda f: f in pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show 
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)])


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67271316  0.18552666]

Out[35]:
aoa frequency letters_count
Component-0 -0.738035 0.358249 -0.571805
Component-1 0.450823 -0.368725 -0.812896

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
del data


100% (15119 of 15119) |####################| Elapsed Time: 0:01:36 Time: 0:01:36

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    ['source', 'destination', 'component', 'h0', 'h0n'].mean()

Plot the actual variations of components


In [38]:
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(Substitution
                               ._transformed_feature(feature)())))

# Compute the number of words that have all PAPER_FEATURES defined.
words = set()
for tfeature in [Substitution._transformed_feature(feature)
                 for feature in relevant_features]:
    words.update(tfeature())

data = dict((feature, []) for feature in relevant_features)
words_list = []
for word in words:
    words_list.append(word)
    for feature in relevant_features:
        data[feature].append(Substitution
                             ._transformed_feature(feature)(word))
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1115 (cluster-unique) substitutions, but the PCA is in fact computed on 907 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    if p < .001:
        return '***'
    elif p < .01:
        return ' **'
    elif p < .05:
        return '  *'
    else:
        return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significative.


In [41]:
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel1)[feature1]
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]

                # Compute binning.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  *** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel1)[feature1]
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]\
                    - variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel2)[feature2]

                # Compute binning.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product([False, True, 'both'],
                                        [False, True]):
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel)
        print()
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=True)
        print()


----------------------------------------------------------------------
Regressing global frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06768116254566503

intercept                      6.235912
global_aoa                    -0.011847
global_clustering              0.087012
global_frequency               0.346727
global_letters_count          -0.023213
global_orthographic_density   -0.066673
global_synonyms_count         -0.098932
dtype: float64

Regressing global frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08229120970813819

intercept                                              13.810942
global_aoa                                             -0.404011
global_clustering                                       0.358266
global_frequency                                        0.105180
global_letters_count                                   -1.286151
global_orthographic_density                            -0.809256
global_synonyms_count                                  -0.421776
global_aoa * global_clustering                          0.049650
global_aoa * global_frequency                           0.029966
global_aoa * global_letters_count                       0.055385
global_aoa * global_orthographic_density                0.040053
global_aoa * global_synonyms_count                      0.063634
global_clustering * global_frequency                    0.005529
global_clustering * global_letters_count               -0.119875
global_clustering * global_orthographic_density         0.057949
global_clustering * global_synonyms_count               0.064237
global_frequency * global_letters_count                 0.008838
global_frequency * global_orthographic_density          0.037285
global_frequency * global_synonyms_count               -0.018481
global_letters_count * global_orthographic_density      0.077663
global_letters_count * global_synonyms_count            0.050813
global_orthographic_density * global_synonyms_count     0.146826
dtype: float64

Regressing rel frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03180437798475133

intercept                     -5.621690
global_aoa                    -0.013954
global_clustering              0.101904
global_frequency               0.284680
global_letters_count           0.050651
global_orthographic_density   -0.070604
global_synonyms_count          0.016534
dtype: float64

Regressing rel frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.04402748669607848

intercept                                             -8.260316
global_aoa                                            -0.274753
global_clustering                                     -1.218468
global_frequency                                       0.306544
global_letters_count                                  -0.213618
global_orthographic_density                            0.616509
global_synonyms_count                                 -0.035598
global_aoa * global_clustering                         0.061393
global_aoa * global_frequency                          0.037268
global_aoa * global_letters_count                      0.038275
global_aoa * global_orthographic_density               0.005924
global_aoa * global_synonyms_count                     0.133439
global_clustering * global_frequency                   0.055268
global_clustering * global_letters_count               0.023299
global_clustering * global_orthographic_density        0.201470
global_clustering * global_synonyms_count              0.152549
global_frequency * global_letters_count                0.009471
global_frequency * global_orthographic_density         0.018148
global_frequency * global_synonyms_count              -0.015915
global_letters_count * global_orthographic_density     0.033665
global_letters_count * global_synonyms_count          -0.004110
global_orthographic_density * global_synonyms_count    0.266572
dtype: float64

Regressing global frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.03888686389659979

intercept                   9.318972
rel_aoa                     0.055328
rel_clustering              0.009002
rel_frequency               0.221201
rel_letters_count          -0.054771
rel_orthographic_density   -0.041349
rel_synonyms_count         -0.105414
dtype: float64

Regressing global frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.051360549482052924

intercept                                        9.264453
rel_aoa                                          0.179116
rel_clustering                                   0.207187
rel_frequency                                    0.213509
rel_letters_count                                0.046597
rel_orthographic_density                         0.047073
rel_synonyms_count                              -0.095525
rel_aoa * rel_clustering                         0.056537
rel_aoa * rel_frequency                          0.040401
rel_aoa * rel_letters_count                     -0.007114
rel_aoa * rel_orthographic_density               0.013283
rel_aoa * rel_synonyms_count                     0.031596
rel_clustering * rel_frequency                   0.014212
rel_clustering * rel_letters_count              -0.065132
rel_clustering * rel_orthographic_density        0.042740
rel_clustering * rel_synonyms_count              0.211784
rel_frequency * rel_letters_count                0.016745
rel_frequency * rel_orthographic_density         0.055657
rel_frequency * rel_synonyms_count               0.050779
rel_letters_count * rel_orthographic_density     0.017311
rel_letters_count * rel_synonyms_count          -0.053943
rel_orthographic_density * rel_synonyms_count   -0.141171
dtype: float64

Regressing rel frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.2439595550620587

intercept                  -1.771436
rel_aoa                     0.043228
rel_clustering              0.241247
rel_frequency               0.634760
rel_letters_count          -0.131439
rel_orthographic_density   -0.232933
rel_synonyms_count         -0.042782
dtype: float64

Regressing rel frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.25477205790186264

intercept                                       -1.787000
rel_aoa                                          0.053716
rel_clustering                                   0.394875
rel_frequency                                    0.645470
rel_letters_count                               -0.070647
rel_orthographic_density                        -0.188680
rel_synonyms_count                               0.026510
rel_aoa * rel_clustering                        -0.017702
rel_aoa * rel_frequency                         -0.011810
rel_aoa * rel_letters_count                      0.030360
rel_aoa * rel_orthographic_density               0.101403
rel_aoa * rel_synonyms_count                     0.153574
rel_clustering * rel_frequency                   0.007162
rel_clustering * rel_letters_count              -0.082836
rel_clustering * rel_orthographic_density       -0.074205
rel_clustering * rel_synonyms_count              0.011538
rel_frequency * rel_letters_count                0.018264
rel_frequency * rel_orthographic_density         0.043096
rel_frequency * rel_synonyms_count               0.003286
rel_letters_count * rel_orthographic_density     0.005982
rel_letters_count * rel_synonyms_count          -0.033454
rel_orthographic_density * rel_synonyms_count    0.107850
dtype: float64

Regressing global frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07580067583287708

intercept                      5.110271
global_aoa                    -0.094857
global_clustering              0.052375
global_frequency               0.367556
global_letters_count           0.170230
global_orthographic_density    0.119351
global_synonyms_count         -0.148586
rel_aoa                        0.121436
rel_clustering                 0.036302
rel_frequency                 -0.023078
rel_letters_count             -0.204276
rel_orthographic_density      -0.203291
rel_synonyms_count             0.056536
dtype: float64

Regressing global frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.15588101836814117

intercept                                                -85.568551
global_aoa                                                 2.237900
global_clustering                                        -14.383939
global_frequency                                           3.061129
global_letters_count                                       2.027819
global_orthographic_density                               16.505349
global_synonyms_count                                     19.987982
rel_aoa                                                   -2.517479
rel_clustering                                            16.320461
rel_frequency                                             -2.543914
rel_letters_count                                         -4.154434
rel_orthographic_density                                 -18.112475
rel_synonyms_count                                       -15.742970
global_aoa * global_clustering                             0.456454
global_aoa * global_frequency                              0.131833
global_aoa * global_letters_count                          0.011937
global_aoa * global_orthographic_density                  -0.403708
global_aoa * global_synonyms_count                        -0.414236
global_aoa * rel_aoa                                       0.023093
global_aoa * rel_clustering                               -0.383427
global_aoa * rel_frequency                                -0.061578
global_aoa * rel_letters_count                             0.032297
global_aoa * rel_orthographic_density                      0.458185
global_aoa * rel_synonyms_count                            0.394743
global_clustering * global_frequency                       0.474856
global_clustering * global_letters_count                   0.331196
global_clustering * global_orthographic_density            2.325595
global_clustering * global_synonyms_count                  1.380340
global_clustering * rel_aoa                               -0.400703
global_clustering * rel_clustering                         0.001195
global_clustering * rel_frequency                         -0.385833
global_clustering * rel_letters_count                     -0.532780
global_clustering * rel_orthographic_density              -2.142896
global_clustering * rel_synonyms_count                    -1.083088
global_frequency * global_letters_count                   -0.013510
global_frequency * global_orthographic_density            -0.063326
global_frequency * global_synonyms_count                  -0.478618
global_frequency * rel_aoa                                -0.084385
global_frequency * rel_clustering                         -0.689512
global_frequency * rel_frequency                          -0.004674
global_frequency * rel_letters_count                      -0.010435
global_frequency * rel_orthographic_density                0.175344
global_frequency * rel_synonyms_count                      0.290983
global_letters_count * global_orthographic_density         0.208840
global_letters_count * global_synonyms_count              -0.243840
global_letters_count * rel_aoa                             0.125398
global_letters_count * rel_clustering                     -0.440823
global_letters_count * rel_frequency                       0.050700
global_letters_count * rel_letters_count                   0.040286
global_letters_count * rel_orthographic_density           -0.016086
global_letters_count * rel_synonyms_count                  0.148226
global_orthographic_density * global_synonyms_count       -1.318666
global_orthographic_density * rel_aoa                      0.167820
global_orthographic_density * rel_clustering              -1.961446
global_orthographic_density * rel_frequency                0.017892
global_orthographic_density * rel_letters_count            0.038246
global_orthographic_density * rel_orthographic_density     0.027694
global_orthographic_density * rel_synonyms_count           1.355235
global_synonyms_count * rel_aoa                            0.365439
global_synonyms_count * rel_clustering                    -1.193159
global_synonyms_count * rel_frequency                      0.551362
global_synonyms_count * rel_letters_count                  0.721581
global_synonyms_count * rel_orthographic_density           1.752205
global_synonyms_count * rel_synonyms_count                 0.053460
rel_aoa * rel_clustering                                   0.283631
rel_aoa * rel_frequency                                    0.017654
rel_aoa * rel_letters_count                               -0.153932
rel_aoa * rel_orthographic_density                        -0.176700
rel_aoa * rel_synonyms_count                              -0.318751
rel_clustering * rel_frequency                             0.555427
rel_clustering * rel_letters_count                         0.531558
rel_clustering * rel_orthographic_density                  1.668832
rel_clustering * rel_synonyms_count                        1.055495
rel_frequency * rel_letters_count                          0.000647
rel_frequency * rel_orthographic_density                  -0.046938
rel_frequency * rel_synonyms_count                        -0.383320
rel_letters_count * rel_orthographic_density              -0.087493
rel_letters_count * rel_synonyms_count                    -0.561976
rel_orthographic_density * rel_synonyms_count             -1.703577
dtype: float64

Regressing rel frequency with 680 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3225125520180617

intercept                      4.626631
global_aoa                    -0.094272
global_clustering              0.100595
global_frequency              -0.567740
global_letters_count           0.214186
global_orthographic_density    0.116025
global_synonyms_count         -0.174924
rel_aoa                        0.106286
rel_clustering                 0.015749
rel_frequency                  0.954471
rel_letters_count             -0.252354
rel_orthographic_density      -0.194680
rel_synonyms_count             0.073381
dtype: float64

Regressing rel frequency with 680 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.38192642433472723

intercept                                                -83.455459
global_aoa                                                 2.024487
global_clustering                                        -14.192028
global_frequency                                           1.875405
global_letters_count                                       1.892444
global_orthographic_density                               16.717436
global_synonyms_count                                     18.188303
rel_aoa                                                   -2.362906
rel_clustering                                            16.589242
rel_frequency                                             -1.460726
rel_letters_count                                         -3.807140
rel_orthographic_density                                 -18.130280
rel_synonyms_count                                       -14.069127
global_aoa * global_clustering                             0.446993
global_aoa * global_frequency                              0.134744
global_aoa * global_letters_count                          0.023610
global_aoa * global_orthographic_density                  -0.377799
global_aoa * global_synonyms_count                        -0.394070
global_aoa * rel_aoa                                       0.023215
global_aoa * rel_clustering                               -0.376475
global_aoa * rel_frequency                                -0.064526
global_aoa * rel_letters_count                             0.023814
global_aoa * rel_orthographic_density                      0.435018
global_aoa * rel_synonyms_count                            0.377352
global_clustering * global_frequency                       0.460583
global_clustering * global_letters_count                   0.362768
global_clustering * global_orthographic_density            2.340769
global_clustering * global_synonyms_count                  1.297565
global_clustering * rel_aoa                               -0.393194
global_clustering * rel_clustering                        -0.039603
global_clustering * rel_frequency                         -0.341876
global_clustering * rel_letters_count                     -0.516880
global_clustering * rel_orthographic_density              -2.105241
global_clustering * rel_synonyms_count                    -0.959867
global_frequency * global_letters_count                    0.030921
global_frequency * global_orthographic_density            -0.059572
global_frequency * global_synonyms_count                  -0.397113
global_frequency * rel_aoa                                -0.083111
global_frequency * rel_clustering                         -0.734752
global_frequency * rel_frequency                           0.014117
global_frequency * rel_letters_count                      -0.046093
global_frequency * rel_orthographic_density                0.178068
global_frequency * rel_synonyms_count                      0.241902
global_letters_count * global_orthographic_density         0.153522
global_letters_count * global_synonyms_count              -0.230534
global_letters_count * rel_aoa                             0.101164
global_letters_count * rel_clustering                     -0.456762
global_letters_count * rel_frequency                       0.046331
global_letters_count * rel_letters_count                   0.039019
global_letters_count * rel_orthographic_density            0.056950
global_letters_count * rel_synonyms_count                  0.138594
global_orthographic_density * global_synonyms_count       -1.291792
global_orthographic_density * rel_aoa                      0.148135
global_orthographic_density * rel_clustering              -1.970614
global_orthographic_density * rel_frequency                0.052201
global_orthographic_density * rel_letters_count            0.083829
global_orthographic_density * rel_orthographic_density     0.066961
global_orthographic_density * rel_synonyms_count           1.317142
global_synonyms_count * rel_aoa                            0.327706
global_synonyms_count * rel_clustering                    -1.180797
global_synonyms_count * rel_frequency                      0.445452
global_synonyms_count * rel_letters_count                  0.666615
global_synonyms_count * rel_orthographic_density           1.618416
global_synonyms_count * rel_synonyms_count                 0.060246
rel_aoa * rel_clustering                                   0.296595
rel_aoa * rel_frequency                                    0.007823
rel_aoa * rel_letters_count                               -0.132498
rel_aoa * rel_orthographic_density                        -0.148362
rel_aoa * rel_synonyms_count                              -0.280626
rel_clustering * rel_frequency                             0.569934
rel_clustering * rel_letters_count                         0.524333
rel_clustering * rel_orthographic_density                  1.658843
rel_clustering * rel_synonyms_count                        1.005056
rel_frequency * rel_letters_count                         -0.003575
rel_frequency * rel_orthographic_density                  -0.085274
rel_frequency * rel_synonyms_count                        -0.301795
rel_letters_count * rel_orthographic_density              -0.151515
rel_letters_count * rel_synonyms_count                    -0.515126
rel_orthographic_density * rel_synonyms_count             -1.546422
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 629 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.06969429924371007

intercept                      7.741006
global_aoa                     0.225327
global_clustering              0.155118
global_frequency              -0.123153
global_letters_count           0.025970
global_orthographic_density   -0.075833
global_synonyms_count          0.013282
dtype: float64

Regressing global aoa with 629 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.09152251202280393

intercept                                              10.545960
global_aoa                                             -0.822726
global_clustering                                       0.364884
global_frequency                                       -0.278221
global_letters_count                                    1.085694
global_orthographic_density                            -1.636934
global_synonyms_count                                  -0.031892
global_aoa * global_clustering                         -0.045587
global_aoa * global_frequency                           0.058587
global_aoa * global_letters_count                       0.034682
global_aoa * global_orthographic_density                0.031899
global_aoa * global_synonyms_count                     -0.031338
global_clustering * global_frequency                   -0.035129
global_clustering * global_letters_count                0.117282
global_clustering * global_orthographic_density        -0.206307
global_clustering * global_synonyms_count              -0.019174
global_frequency * global_letters_count                -0.071979
global_frequency * global_orthographic_density         -0.011052
global_frequency * global_synonyms_count                0.033113
global_letters_count * global_orthographic_density      0.020173
global_letters_count * global_synonyms_count           -0.044098
global_orthographic_density * global_synonyms_count     0.100601
dtype: float64

Regressing rel aoa with 629 measures, no interactions
           ^^^^^^^
R^2 = 0.022342793140904815

intercept                      1.756621
global_aoa                     0.091810
global_clustering              0.019999
global_frequency              -0.113456
global_letters_count           0.012262
global_orthographic_density   -0.013848
global_synonyms_count          0.062413
dtype: float64

Regressing rel aoa with 629 measures, with interactions
           ^^^^^^^
R^2 = 0.04459314165016004

intercept                                              9.458368
global_aoa                                            -0.513429
global_clustering                                      1.377439
global_frequency                                      -0.295723
global_letters_count                                   0.295550
global_orthographic_density                           -2.466756
global_synonyms_count                                 -0.818346
global_aoa * global_clustering                        -0.029843
global_aoa * global_frequency                          0.029213
global_aoa * global_letters_count                      0.010888
global_aoa * global_orthographic_density               0.093295
global_aoa * global_synonyms_count                    -0.060347
global_clustering * global_frequency                  -0.055538
global_clustering * global_letters_count              -0.027677
global_clustering * global_orthographic_density       -0.306361
global_clustering * global_synonyms_count             -0.288037
global_frequency * global_letters_count               -0.055138
global_frequency * global_orthographic_density         0.006443
global_frequency * global_synonyms_count              -0.015130
global_letters_count * global_orthographic_density    -0.010851
global_letters_count * global_synonyms_count          -0.056350
global_orthographic_density * global_synonyms_count    0.054342
dtype: float64

Regressing global aoa with 629 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.026589575700896484

intercept                   6.512758
rel_aoa                    -0.042402
rel_clustering              0.287466
rel_frequency              -0.047139
rel_letters_count           0.061654
rel_orthographic_density   -0.285894
rel_synonyms_count          0.002846
dtype: float64

Regressing global aoa with 629 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.06017002905581237

intercept                                        6.581028
rel_aoa                                         -0.140539
rel_clustering                                  -0.045755
rel_frequency                                   -0.058418
rel_letters_count                               -0.072499
rel_orthographic_density                        -0.519822
rel_synonyms_count                               0.043831
rel_aoa * rel_clustering                        -0.200208
rel_aoa * rel_frequency                         -0.042534
rel_aoa * rel_letters_count                      0.034028
rel_aoa * rel_orthographic_density               0.031986
rel_aoa * rel_synonyms_count                    -0.203629
rel_clustering * rel_frequency                   0.048851
rel_clustering * rel_letters_count               0.213636
rel_clustering * rel_orthographic_density       -0.077403
rel_clustering * rel_synonyms_count             -0.431857
rel_frequency * rel_letters_count               -0.024071
rel_frequency * rel_orthographic_density        -0.050866
rel_frequency * rel_synonyms_count              -0.198078
rel_letters_count * rel_orthographic_density     0.052953
rel_letters_count * rel_synonyms_count           0.055804
rel_orthographic_density * rel_synonyms_count    0.378627
dtype: float64

Regressing rel aoa with 629 measures, no interactions
           ^^^^^^^
R^2 = 0.12559584150466663

intercept                   0.785829
rel_aoa                     0.393179
rel_clustering             -0.067477
rel_frequency              -0.111731
rel_letters_count          -0.016207
rel_orthographic_density    0.035639
rel_synonyms_count          0.083561
dtype: float64

Regressing rel aoa with 629 measures, with interactions
           ^^^^^^^
R^2 = 0.15152828516439798

intercept                                        0.964967
rel_aoa                                          0.517175
rel_clustering                                  -0.427713
rel_frequency                                   -0.104797
rel_letters_count                               -0.036504
rel_orthographic_density                         0.218507
rel_synonyms_count                               0.179849
rel_aoa * rel_clustering                        -0.131376
rel_aoa * rel_frequency                          0.029730
rel_aoa * rel_letters_count                      0.032241
rel_aoa * rel_orthographic_density               0.068830
rel_aoa * rel_synonyms_count                    -0.164719
rel_clustering * rel_frequency                   0.027013
rel_clustering * rel_letters_count               0.232750
rel_clustering * rel_orthographic_density        0.069858
rel_clustering * rel_synonyms_count              0.032375
rel_frequency * rel_letters_count                0.027292
rel_frequency * rel_orthographic_density         0.121753
rel_frequency * rel_synonyms_count              -0.081781
rel_letters_count * rel_orthographic_density     0.030884
rel_letters_count * rel_synonyms_count           0.104041
rel_orthographic_density * rel_synonyms_count    0.431969
dtype: float64

Regressing global aoa with 629 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.10148133958855354

intercept                      5.932032
global_aoa                     0.491608
global_clustering              0.204626
global_frequency              -0.082749
global_letters_count           0.023860
global_orthographic_density    0.038906
global_synonyms_count         -0.114958
rel_aoa                       -0.388191
rel_clustering                -0.102687
rel_frequency                 -0.047283
rel_letters_count              0.008019
rel_orthographic_density      -0.060943
rel_synonyms_count             0.156231
dtype: float64

Regressing global aoa with 629 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.20666533454822578

intercept                                                 91.355861
global_aoa                                                -0.381367
global_clustering                                         14.667461
global_frequency                                          -1.126638
global_letters_count                                      -6.655980
global_orthographic_density                              -15.290332
global_synonyms_count                                    -14.376152
rel_aoa                                                    0.361130
rel_clustering                                           -11.016509
rel_frequency                                              2.899314
rel_letters_count                                          6.801163
rel_orthographic_density                                  11.203375
rel_synonyms_count                                         4.155096
global_aoa * global_clustering                             0.038515
global_aoa * global_frequency                             -0.020597
global_aoa * global_letters_count                          0.083383
global_aoa * global_orthographic_density                   0.441410
global_aoa * global_synonyms_count                         0.228858
global_aoa * rel_aoa                                       0.029423
global_aoa * rel_clustering                               -0.006344
global_aoa * rel_frequency                                 0.041580
global_aoa * rel_letters_count                            -0.109480
global_aoa * rel_orthographic_density                     -0.498167
global_aoa * rel_synonyms_count                           -0.052206
global_clustering * global_frequency                      -0.271778
global_clustering * global_letters_count                  -0.739291
global_clustering * global_orthographic_density           -3.663951
global_clustering * global_synonyms_count                 -0.384088
global_clustering * rel_aoa                                0.067664
global_clustering * rel_clustering                         0.222999
global_clustering * rel_frequency                          0.455447
global_clustering * rel_letters_count                      0.557286
global_clustering * rel_orthographic_density               3.227417
global_clustering * rel_synonyms_count                    -0.166415
global_frequency * global_letters_count                    0.233893
global_frequency * global_orthographic_density            -0.644839
global_frequency * global_synonyms_count                   0.371771
global_frequency * rel_aoa                                 0.050412
global_frequency * rel_clustering                          0.111247
global_frequency * rel_frequency                          -0.058988
global_frequency * rel_letters_count                      -0.311696
global_frequency * rel_orthographic_density                0.698249
global_frequency * rel_synonyms_count                      0.014563
global_letters_count * global_orthographic_density        -0.617171
global_letters_count * global_synonyms_count               0.747885
global_letters_count * rel_aoa                            -0.109077
global_letters_count * rel_clustering                      0.775320
global_letters_count * rel_frequency                      -0.236315
global_letters_count * rel_letters_count                   0.063743
global_letters_count * rel_orthographic_density            0.756954
global_letters_count * rel_synonyms_count                 -0.377485
global_orthographic_density * global_synonyms_count        1.730455
global_orthographic_density * rel_aoa                     -0.232911
global_orthographic_density * rel_clustering               3.193939
global_orthographic_density * rel_frequency                0.493812
global_orthographic_density * rel_letters_count            0.348641
global_orthographic_density * rel_orthographic_density     0.187475
global_orthographic_density * rel_synonyms_count          -1.642353
global_synonyms_count * rel_aoa                           -0.309780
global_synonyms_count * rel_clustering                     0.422567
global_synonyms_count * rel_frequency                     -0.445379
global_synonyms_count * rel_letters_count                 -1.291405
global_synonyms_count * rel_orthographic_density          -2.213996
global_synonyms_count * rel_synonyms_count                -0.105014
rel_aoa * rel_clustering                                  -0.249101
rel_aoa * rel_frequency                                   -0.039963
rel_aoa * rel_letters_count                                0.102101
rel_aoa * rel_orthographic_density                         0.258873
rel_aoa * rel_synonyms_count                              -0.008660
rel_clustering * rel_frequency                            -0.329146
rel_clustering * rel_letters_count                        -0.441269
rel_clustering * rel_orthographic_density                 -2.752056
rel_clustering * rel_synonyms_count                       -0.053445
rel_frequency * rel_letters_count                          0.223593
rel_frequency * rel_orthographic_density                  -0.553574
rel_frequency * rel_synonyms_count                         0.030075
rel_letters_count * rel_orthographic_density              -0.259937
rel_letters_count * rel_synonyms_count                     1.002779
rel_orthographic_density * rel_synonyms_count              2.602924
dtype: float64

Regressing rel aoa with 629 measures, no interactions
           ^^^^^^^
R^2 = 0.1582600288888818

intercept                      4.675375
global_aoa                    -0.330892
global_clustering              0.137302
global_frequency              -0.082597
global_letters_count          -0.022356
global_orthographic_density   -0.125048
global_synonyms_count          0.087569
rel_aoa                        0.622669
rel_clustering                -0.069027
rel_frequency                 -0.025978
rel_letters_count              0.035397
rel_orthographic_density       0.011472
rel_synonyms_count            -0.020595
dtype: float64

Regressing rel aoa with 629 measures, with interactions
           ^^^^^^^
R^2 = 0.26151239937189474

intercept                                                 68.968082
global_aoa                                                -1.722709
global_clustering                                         10.873616
global_frequency                                          -0.912384
global_letters_count                                      -3.208290
global_orthographic_density                              -14.028800
global_synonyms_count                                    -11.850214
rel_aoa                                                    2.150120
rel_clustering                                            -5.204249
rel_frequency                                              2.362469
rel_letters_count                                          4.290251
rel_orthographic_density                                  11.198617
rel_synonyms_count                                         3.721372
global_aoa * global_clustering                            -0.086401
global_aoa * global_frequency                             -0.008349
global_aoa * global_letters_count                          0.006690
global_aoa * global_orthographic_density                   0.417842
global_aoa * global_synonyms_count                         0.280820
global_aoa * rel_aoa                                      -0.006458
global_aoa * rel_clustering                                0.001482
global_aoa * rel_frequency                                 0.023918
global_aoa * rel_letters_count                            -0.044964
global_aoa * rel_orthographic_density                     -0.490081
global_aoa * rel_synonyms_count                           -0.063478
global_clustering * global_frequency                      -0.202853
global_clustering * global_letters_count                  -0.380138
global_clustering * global_orthographic_density           -2.749586
global_clustering * global_synonyms_count                 -0.739519
global_clustering * rel_aoa                                0.146779
global_clustering * rel_clustering                         0.241916
global_clustering * rel_frequency                          0.289927
global_clustering * rel_letters_count                      0.324697
global_clustering * rel_orthographic_density               2.464212
global_clustering * rel_synonyms_count                     0.214919
global_frequency * global_letters_count                    0.126422
global_frequency * global_orthographic_density            -0.342603
global_frequency * global_synonyms_count                   0.087878
global_frequency * rel_aoa                                 0.021048
global_frequency * rel_clustering                         -0.060761
global_frequency * rel_frequency                          -0.053190
global_frequency * rel_letters_count                      -0.203022
global_frequency * rel_orthographic_density                0.385876
global_frequency * rel_synonyms_count                      0.166042
global_letters_count * global_orthographic_density        -0.387272
global_letters_count * global_synonyms_count               0.574799
global_letters_count * rel_aoa                            -0.022891
global_letters_count * rel_clustering                      0.451004
global_letters_count * rel_frequency                      -0.162749
global_letters_count * rel_letters_count                   0.036673
global_letters_count * rel_orthographic_density            0.545974
global_letters_count * rel_synonyms_count                 -0.384412
global_orthographic_density * global_synonyms_count        1.306527
global_orthographic_density * rel_aoa                     -0.259094
global_orthographic_density * rel_clustering               2.193845
global_orthographic_density * rel_frequency                0.122699
global_orthographic_density * rel_letters_count            0.107654
global_orthographic_density * rel_orthographic_density     0.041439
global_orthographic_density * rel_synonyms_count          -1.241440
global_synonyms_count * rel_aoa                           -0.419059
global_synonyms_count * rel_clustering                     0.496189
global_synonyms_count * rel_frequency                     -0.182419
global_synonyms_count * rel_letters_count                 -0.835024
global_synonyms_count * rel_orthographic_density          -1.435959
global_synonyms_count * rel_synonyms_count                -0.065542
rel_aoa * rel_clustering                                  -0.244953
rel_aoa * rel_frequency                                   -0.025295
rel_aoa * rel_letters_count                                0.046904
rel_aoa * rel_orthographic_density                         0.326997
rel_aoa * rel_synonyms_count                               0.066780
rel_clustering * rel_frequency                            -0.031996
rel_clustering * rel_letters_count                        -0.125050
rel_clustering * rel_orthographic_density                 -1.814073
rel_clustering * rel_synonyms_count                       -0.228513
rel_frequency * rel_letters_count                          0.191133
rel_frequency * rel_orthographic_density                  -0.126725
rel_frequency * rel_synonyms_count                        -0.192319
rel_letters_count * rel_orthographic_density              -0.143691
rel_letters_count * rel_synonyms_count                     0.733135
rel_orthographic_density * rel_synonyms_count              1.878985
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.025660863683973356

intercept                     -4.602434
global_aoa                     0.000026
global_clustering              0.124596
global_frequency              -0.039994
global_letters_count          -0.013083
global_orthographic_density    0.002965
global_synonyms_count         -0.040497
dtype: float64

Regressing global clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06119654764284921

intercept                                             -7.891858
global_aoa                                             0.188954
global_clustering                                     -0.062549
global_frequency                                      -0.084219
global_letters_count                                   0.538300
global_orthographic_density                            0.842107
global_synonyms_count                                  1.045205
global_aoa * global_clustering                         0.017611
global_aoa * global_frequency                         -0.000151
global_aoa * global_letters_count                     -0.012545
global_aoa * global_orthographic_density              -0.004482
global_aoa * global_synonyms_count                     0.002725
global_clustering * global_frequency                  -0.025670
global_clustering * global_letters_count               0.051109
global_clustering * global_orthographic_density        0.024550
global_clustering * global_synonyms_count             -0.007516
global_frequency * global_letters_count               -0.009178
global_frequency * global_orthographic_density        -0.039817
global_frequency * global_synonyms_count              -0.009169
global_letters_count * global_orthographic_density    -0.037887
global_letters_count * global_synonyms_count          -0.130699
global_orthographic_density * global_synonyms_count   -0.212236
dtype: float64

Regressing rel clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.016645369753290118

intercept                      1.127583
global_aoa                     0.001158
global_clustering              0.098056
global_frequency              -0.021547
global_letters_count           0.001155
global_orthographic_density    0.032062
global_synonyms_count         -0.061683
dtype: float64

Regressing rel clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.03725821085715564

intercept                                             -2.496811
global_aoa                                             0.125913
global_clustering                                     -0.295477
global_frequency                                       0.020668
global_letters_count                                   0.395108
global_orthographic_density                            0.757969
global_synonyms_count                                  1.349382
global_aoa * global_clustering                         0.002072
global_aoa * global_frequency                         -0.005561
global_aoa * global_letters_count                     -0.007863
global_aoa * global_orthographic_density              -0.007067
global_aoa * global_synonyms_count                    -0.007320
global_clustering * global_frequency                  -0.000284
global_clustering * global_letters_count               0.051255
global_clustering * global_orthographic_density        0.045313
global_clustering * global_synonyms_count              0.061033
global_frequency * global_letters_count                0.002458
global_frequency * global_orthographic_density        -0.019709
global_frequency * global_synonyms_count              -0.013061
global_letters_count * global_orthographic_density    -0.028257
global_letters_count * global_synonyms_count          -0.105579
global_orthographic_density * global_synonyms_count   -0.198555
dtype: float64

Regressing global clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.012077482047862964

intercept                  -5.835532
rel_aoa                    -0.008120
rel_clustering              0.093959
rel_frequency              -0.013119
rel_letters_count          -0.012524
rel_orthographic_density    0.003122
rel_synonyms_count         -0.090599
dtype: float64

Regressing global clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04603943121971488

intercept                                       -5.878477
rel_aoa                                         -0.002344
rel_clustering                                   0.055432
rel_frequency                                   -0.071727
rel_letters_count                               -0.042668
rel_orthographic_density                         0.008213
rel_synonyms_count                              -0.164416
rel_aoa * rel_clustering                        -0.010542
rel_aoa * rel_frequency                         -0.013472
rel_aoa * rel_letters_count                     -0.012459
rel_aoa * rel_orthographic_density               0.006687
rel_aoa * rel_synonyms_count                     0.035080
rel_clustering * rel_frequency                   0.036766
rel_clustering * rel_letters_count               0.064870
rel_clustering * rel_orthographic_density        0.010656
rel_clustering * rel_synonyms_count             -0.082078
rel_frequency * rel_letters_count                0.012289
rel_frequency * rel_orthographic_density        -0.025208
rel_frequency * rel_synonyms_count              -0.087264
rel_letters_count * rel_orthographic_density    -0.038144
rel_letters_count * rel_synonyms_count          -0.040713
rel_orthographic_density * rel_synonyms_count    0.046305
dtype: float64

Regressing rel clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1073472225976333

intercept                   0.299990
rel_aoa                    -0.024189
rel_clustering              0.358451
rel_frequency               0.003932
rel_letters_count           0.007638
rel_orthographic_density    0.016630
rel_synonyms_count         -0.014809
dtype: float64

Regressing rel clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1383545291591859

intercept                                        0.246098
rel_aoa                                         -0.019256
rel_clustering                                   0.333000
rel_frequency                                   -0.050492
rel_letters_count                               -0.010733
rel_orthographic_density                         0.010726
rel_synonyms_count                              -0.077460
rel_aoa * rel_clustering                         0.005458
rel_aoa * rel_frequency                         -0.008185
rel_aoa * rel_letters_count                     -0.011313
rel_aoa * rel_orthographic_density               0.000327
rel_aoa * rel_synonyms_count                     0.053528
rel_clustering * rel_frequency                   0.026452
rel_clustering * rel_letters_count               0.025010
rel_clustering * rel_orthographic_density       -0.039579
rel_clustering * rel_synonyms_count             -0.014490
rel_frequency * rel_letters_count                0.009281
rel_frequency * rel_orthographic_density        -0.031529
rel_frequency * rel_synonyms_count              -0.072376
rel_letters_count * rel_orthographic_density    -0.032774
rel_letters_count * rel_synonyms_count          -0.025460
rel_orthographic_density * rel_synonyms_count    0.089808
dtype: float64

Regressing global clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03987735151543015

intercept                     -3.638919
global_aoa                     0.012810
global_clustering              0.173857
global_frequency              -0.085658
global_letters_count          -0.042736
global_orthographic_density   -0.034650
global_synonyms_count          0.108932
rel_aoa                       -0.021010
rel_clustering                -0.058674
rel_frequency                  0.049151
rel_letters_count              0.026148
rel_orthographic_density       0.043187
rel_synonyms_count            -0.187273
dtype: float64

Regressing global clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1604944877680884

intercept                                                 10.506050
global_aoa                                                -0.053834
global_clustering                                          3.423307
global_frequency                                          -1.534465
global_letters_count                                       1.664589
global_orthographic_density                                1.046477
global_synonyms_count                                     -2.883369
rel_aoa                                                    0.026588
rel_clustering                                            -3.849098
rel_frequency                                              1.004466
rel_letters_count                                         -0.956581
rel_orthographic_density                                  -0.411253
rel_synonyms_count                                        -0.596975
global_aoa * global_clustering                            -0.067624
global_aoa * global_frequency                             -0.009690
global_aoa * global_letters_count                         -0.053460
global_aoa * global_orthographic_density                  -0.040567
global_aoa * global_synonyms_count                         0.049244
global_aoa * rel_aoa                                       0.010343
global_aoa * rel_clustering                                0.175039
global_aoa * rel_frequency                                 0.022735
global_aoa * rel_letters_count                             0.039747
global_aoa * rel_orthographic_density                     -0.006351
global_aoa * rel_synonyms_count                            0.042041
global_clustering * global_frequency                      -0.285983
global_clustering * global_letters_count                   0.121874
global_clustering * global_orthographic_density           -0.018759
global_clustering * global_synonyms_count                 -0.257416
global_clustering * rel_aoa                                0.022143
global_clustering * rel_clustering                        -0.108098
global_clustering * rel_frequency                          0.201396
global_clustering * rel_letters_count                     -0.034187
global_clustering * rel_orthographic_density               0.052086
global_clustering * rel_synonyms_count                     0.051734
global_frequency * global_letters_count                   -0.045318
global_frequency * global_orthographic_density            -0.051839
global_frequency * global_synonyms_count                   0.181714
global_frequency * rel_aoa                                -0.005410
global_frequency * rel_clustering                          0.252890
global_frequency * rel_frequency                           0.012398
global_frequency * rel_letters_count                       0.047222
global_frequency * rel_orthographic_density                0.033203
global_frequency * rel_synonyms_count                     -0.052594
global_letters_count * global_orthographic_density        -0.042499
global_letters_count * global_synonyms_count              -0.128898
global_letters_count * rel_aoa                             0.008787
global_letters_count * rel_clustering                     -0.149355
global_letters_count * rel_frequency                       0.009760
global_letters_count * rel_letters_count                   0.004750
global_letters_count * rel_orthographic_density            0.086260
global_letters_count * rel_synonyms_count                  0.150909
global_orthographic_density * global_synonyms_count       -0.340148
global_orthographic_density * rel_aoa                      0.084351
global_orthographic_density * rel_clustering              -0.064629
global_orthographic_density * rel_frequency                0.029700
global_orthographic_density * rel_letters_count            0.008909
global_orthographic_density * rel_orthographic_density     0.065835
global_orthographic_density * rel_synonyms_count           0.365414
global_synonyms_count * rel_aoa                           -0.092688
global_synonyms_count * rel_clustering                     0.192827
global_synonyms_count * rel_frequency                     -0.266091
global_synonyms_count * rel_letters_count                 -0.157496
global_synonyms_count * rel_orthographic_density          -0.264492
global_synonyms_count * rel_synonyms_count                 0.070072
rel_aoa * rel_clustering                                  -0.096506
rel_aoa * rel_frequency                                   -0.001850
rel_aoa * rel_letters_count                               -0.022983
rel_aoa * rel_orthographic_density                        -0.042916
rel_aoa * rel_synonyms_count                               0.059273
rel_clustering * rel_frequency                            -0.160374
rel_clustering * rel_letters_count                         0.121836
rel_clustering * rel_orthographic_density                  0.103996
rel_clustering * rel_synonyms_count                       -0.061592
rel_frequency * rel_letters_count                         -0.015894
rel_frequency * rel_orthographic_density                  -0.053221
rel_frequency * rel_synonyms_count                         0.094852
rel_letters_count * rel_orthographic_density              -0.057787
rel_letters_count * rel_synonyms_count                     0.053349
rel_orthographic_density * rel_synonyms_count              0.230062
dtype: float64

Regressing rel clustering with 542 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.19511803864237642

intercept                     -2.389147
global_aoa                     0.017901
global_clustering             -0.556179
global_frequency              -0.069293
global_letters_count          -0.036893
global_orthographic_density    0.026559
global_synonyms_count          0.018223
rel_aoa                       -0.031733
rel_clustering                 0.821872
rel_frequency                  0.043043
rel_letters_count              0.034211
rel_orthographic_density      -0.019220
rel_synonyms_count            -0.083026
dtype: float64

Regressing rel clustering with 542 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2928718395037192

intercept                                                 3.240900
global_aoa                                                0.212444
global_clustering                                         1.637579
global_frequency                                         -0.653082
global_letters_count                                      1.524425
global_orthographic_density                               1.001090
global_synonyms_count                                    -1.512759
rel_aoa                                                  -0.305622
rel_clustering                                           -2.041735
rel_frequency                                             0.354577
rel_letters_count                                        -0.974482
rel_orthographic_density                                 -0.314328
rel_synonyms_count                                       -1.803543
global_aoa * global_clustering                           -0.053116
global_aoa * global_frequency                            -0.025207
global_aoa * global_letters_count                        -0.055708
global_aoa * global_orthographic_density                 -0.040080
global_aoa * global_synonyms_count                       -0.001090
global_aoa * rel_aoa                                      0.012542
global_aoa * rel_clustering                               0.143109
global_aoa * rel_frequency                                0.031274
global_aoa * rel_letters_count                            0.045841
global_aoa * rel_orthographic_density                    -0.017296
global_aoa * rel_synonyms_count                           0.093625
global_clustering * global_frequency                     -0.176661
global_clustering * global_letters_count                  0.086934
global_clustering * global_orthographic_density          -0.054916
global_clustering * global_synonyms_count                -0.282965
global_clustering * rel_aoa                               0.011444
global_clustering * rel_clustering                       -0.139830
global_clustering * rel_frequency                         0.116965
global_clustering * rel_letters_count                    -0.026668
global_clustering * rel_orthographic_density              0.094912
global_clustering * rel_synonyms_count                    0.118464
global_frequency * global_letters_count                  -0.050832
global_frequency * global_orthographic_density           -0.067197
global_frequency * global_synonyms_count                  0.038980
global_frequency * rel_aoa                                0.009345
global_frequency * rel_clustering                         0.149110
global_frequency * rel_frequency                          0.011569
global_frequency * rel_letters_count                      0.049499
global_frequency * rel_orthographic_density               0.045842
global_frequency * rel_synonyms_count                     0.079741
global_letters_count * global_orthographic_density       -0.039743
global_letters_count * global_synonyms_count             -0.130062
global_letters_count * rel_aoa                            0.025628
global_letters_count * rel_clustering                    -0.096494
global_letters_count * rel_frequency                      0.016139
global_letters_count * rel_letters_count                  0.003944
global_letters_count * rel_orthographic_density           0.095171
global_letters_count * rel_synonyms_count                 0.176675
global_orthographic_density * global_synonyms_count      -0.246483
global_orthographic_density * rel_aoa                     0.090920
global_orthographic_density * rel_clustering             -0.069663
global_orthographic_density * rel_frequency               0.037643
global_orthographic_density * rel_letters_count           0.013278
global_orthographic_density * rel_orthographic_density    0.066915
global_orthographic_density * rel_synonyms_count          0.312271
global_synonyms_count * rel_aoa                          -0.084461
global_synonyms_count * rel_clustering                    0.311895
global_synonyms_count * rel_frequency                    -0.145579
global_synonyms_count * rel_letters_count                -0.086737
global_synonyms_count * rel_orthographic_density         -0.308329
global_synonyms_count * rel_synonyms_count                0.071343
rel_aoa * rel_clustering                                 -0.085164
rel_aoa * rel_frequency                                  -0.011647
rel_aoa * rel_letters_count                              -0.032940
rel_aoa * rel_orthographic_density                       -0.032019
rel_aoa * rel_synonyms_count                              0.047461
rel_clustering * rel_frequency                           -0.083434
rel_clustering * rel_letters_count                        0.087181
rel_clustering * rel_orthographic_density                 0.084063
rel_clustering * rel_synonyms_count                      -0.213541
rel_frequency * rel_letters_count                        -0.015806
rel_frequency * rel_orthographic_density                 -0.051016
rel_frequency * rel_synonyms_count                       -0.012947
rel_letters_count * rel_orthographic_density             -0.061152
rel_letters_count * rel_synonyms_count                   -0.029130
rel_orthographic_density * rel_synonyms_count             0.217373
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0748920326571535

intercept                      3.785297
global_aoa                     0.114613
global_clustering             -0.109936
global_frequency               0.010054
global_letters_count           0.220641
global_orthographic_density   -0.038784
global_synonyms_count         -0.134034
dtype: float64

Regressing global letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09363164210437469

intercept                                             -12.919457
global_aoa                                              1.051481
global_clustering                                      -2.212995
global_frequency                                        1.246920
global_letters_count                                    0.911096
global_orthographic_density                            -0.922308
global_synonyms_count                                   2.469539
global_aoa * global_clustering                          0.157339
global_aoa * global_frequency                           0.018556
global_aoa * global_letters_count                      -0.030709
global_aoa * global_orthographic_density                0.017747
global_aoa * global_synonyms_count                     -0.094680
global_clustering * global_frequency                    0.151941
global_clustering * global_letters_count               -0.023788
global_clustering * global_orthographic_density        -0.202562
global_clustering * global_synonyms_count               0.330474
global_frequency * global_letters_count                -0.067306
global_frequency * global_orthographic_density         -0.048449
global_frequency * global_synonyms_count               -0.033979
global_letters_count * global_orthographic_density      0.000611
global_letters_count * global_synonyms_count            0.025144
global_orthographic_density * global_synonyms_count     0.089078
dtype: float64

Regressing rel letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0426535386026653

intercept                     -0.202352
global_aoa                     0.085715
global_clustering             -0.220583
global_frequency              -0.000962
global_letters_count           0.198243
global_orthographic_density    0.060455
global_synonyms_count         -0.224053
dtype: float64

Regressing rel letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06329579555957832

intercept                                             -11.710874
global_aoa                                              1.002597
global_clustering                                      -1.664200
global_frequency                                        1.226910
global_letters_count                                    0.160191
global_orthographic_density                            -1.948830
global_synonyms_count                                   1.441926
global_aoa * global_clustering                          0.141693
global_aoa * global_frequency                          -0.003689
global_aoa * global_letters_count                      -0.013097
global_aoa * global_orthographic_density                0.032029
global_aoa * global_synonyms_count                     -0.113911
global_clustering * global_frequency                    0.152381
global_clustering * global_letters_count               -0.087839
global_clustering * global_orthographic_density        -0.313801
global_clustering * global_synonyms_count               0.213850
global_frequency * global_letters_count                -0.046010
global_frequency * global_orthographic_density         -0.019240
global_frequency * global_synonyms_count               -0.080716
global_letters_count * global_orthographic_density      0.000868
global_letters_count * global_synonyms_count            0.118855
global_orthographic_density * global_synonyms_count     0.269536
dtype: float64

Regressing global letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05271558757698813

intercept                   5.885235
rel_aoa                    -0.040034
rel_clustering              0.093366
rel_frequency               0.043614
rel_letters_count           0.231264
rel_orthographic_density   -0.153808
rel_synonyms_count         -0.122497
dtype: float64

Regressing global letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07203245563769778

intercept                                        5.977566
rel_aoa                                         -0.071988
rel_clustering                                  -0.032284
rel_frequency                                    0.119700
rel_letters_count                                0.192309
rel_orthographic_density                        -0.307205
rel_synonyms_count                              -0.323044
rel_aoa * rel_clustering                         0.093289
rel_aoa * rel_frequency                         -0.024752
rel_aoa * rel_letters_count                     -0.033304
rel_aoa * rel_orthographic_density              -0.005996
rel_aoa * rel_synonyms_count                    -0.179404
rel_clustering * rel_frequency                   0.000720
rel_clustering * rel_letters_count              -0.022544
rel_clustering * rel_orthographic_density       -0.120968
rel_clustering * rel_synonyms_count             -0.240224
rel_frequency * rel_letters_count               -0.045440
rel_frequency * rel_orthographic_density        -0.053965
rel_frequency * rel_synonyms_count              -0.156432
rel_letters_count * rel_orthographic_density     0.033373
rel_letters_count * rel_synonyms_count           0.032096
rel_orthographic_density * rel_synonyms_count    0.074379
dtype: float64

Regressing rel letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1293762286852178

intercept                   1.617149
rel_aoa                    -0.044938
rel_clustering             -0.127368
rel_frequency              -0.154551
rel_letters_count           0.439565
rel_orthographic_density    0.148388
rel_synonyms_count         -0.101345
dtype: float64

Regressing rel letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.15697856984289227

intercept                                        1.726137
rel_aoa                                          0.056090
rel_clustering                                  -0.288084
rel_frequency                                   -0.074362
rel_letters_count                                0.489023
rel_orthographic_density                         0.123786
rel_synonyms_count                              -0.198953
rel_aoa * rel_clustering                         0.149008
rel_aoa * rel_frequency                          0.007452
rel_aoa * rel_letters_count                     -0.089012
rel_aoa * rel_orthographic_density              -0.082669
rel_aoa * rel_synonyms_count                    -0.174210
rel_clustering * rel_frequency                   0.023202
rel_clustering * rel_letters_count               0.003164
rel_clustering * rel_orthographic_density       -0.108404
rel_clustering * rel_synonyms_count             -0.092187
rel_frequency * rel_letters_count               -0.036048
rel_frequency * rel_orthographic_density        -0.010166
rel_frequency * rel_synonyms_count              -0.125533
rel_letters_count * rel_orthographic_density     0.050573
rel_letters_count * rel_synonyms_count           0.075828
rel_orthographic_density * rel_synonyms_count    0.240905
dtype: float64

Regressing global letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09079338125142511

intercept                      1.610996
global_aoa                     0.275993
global_clustering             -0.270023
global_frequency               0.071863
global_letters_count           0.166393
global_orthographic_density   -0.153940
global_synonyms_count         -0.127140
rel_aoa                       -0.237969
rel_clustering                 0.174980
rel_frequency                 -0.075727
rel_letters_count              0.059946
rel_orthographic_density       0.150885
rel_synonyms_count             0.002813
dtype: float64

Regressing global letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18060454161472106

intercept                                                 40.819677
global_aoa                                                 0.953728
global_clustering                                          5.859875
global_frequency                                          -0.563385
global_letters_count                                      -3.070325
global_orthographic_density                               -8.993636
global_synonyms_count                                     -0.388950
rel_aoa                                                   -1.437896
rel_clustering                                           -11.230143
rel_frequency                                              2.666882
rel_letters_count                                          3.736308
rel_orthographic_density                                   6.233838
rel_synonyms_count                                         1.037503
global_aoa * global_clustering                             0.278948
global_aoa * global_frequency                             -0.012018
global_aoa * global_letters_count                          0.025477
global_aoa * global_orthographic_density                   0.552998
global_aoa * global_synonyms_count                        -0.303327
global_aoa * rel_aoa                                       0.020249
global_aoa * rel_clustering                               -0.105556
global_aoa * rel_frequency                                 0.019221
global_aoa * rel_letters_count                            -0.127272
global_aoa * rel_orthographic_density                     -0.645810
global_aoa * rel_synonyms_count                            0.252504
global_clustering * global_frequency                      -0.147198
global_clustering * global_letters_count                  -0.463167
global_clustering * global_orthographic_density           -1.692936
global_clustering * global_synonyms_count                  0.346194
global_clustering * rel_aoa                               -0.176765
global_clustering * rel_clustering                         0.117589
global_clustering * rel_frequency                          0.331043
global_clustering * rel_letters_count                      0.278698
global_clustering * rel_orthographic_density               1.387623
global_clustering * rel_synonyms_count                     0.338060
global_frequency * global_letters_count                    0.061562
global_frequency * global_orthographic_density            -0.269527
global_frequency * global_synonyms_count                   0.168765
global_frequency * rel_aoa                                 0.038764
global_frequency * rel_clustering                          0.645224
global_frequency * rel_frequency                          -0.031212
global_frequency * rel_letters_count                      -0.146130
global_frequency * rel_orthographic_density                0.276460
global_frequency * rel_synonyms_count                      0.005796
global_letters_count * global_orthographic_density        -0.373762
global_letters_count * global_synonyms_count               0.608556
global_letters_count * rel_aoa                             0.051916
global_letters_count * rel_clustering                      0.611244
global_letters_count * rel_frequency                      -0.119238
global_letters_count * rel_letters_count                   0.075079
global_letters_count * rel_orthographic_density            0.578388
global_letters_count * rel_synonyms_count                 -0.314743
global_orthographic_density * global_synonyms_count       -0.112805
global_orthographic_density * rel_aoa                     -0.310468
global_orthographic_density * rel_clustering               1.058475
global_orthographic_density * rel_frequency                0.064254
global_orthographic_density * rel_letters_count            0.155421
global_orthographic_density * rel_orthographic_density     0.152627
global_orthographic_density * rel_synonyms_count           0.414178
global_synonyms_count * rel_aoa                            0.239654
global_synonyms_count * rel_clustering                     0.233478
global_synonyms_count * rel_frequency                     -0.332016
global_synonyms_count * rel_letters_count                 -0.836147
global_synonyms_count * rel_orthographic_density           0.006195
global_synonyms_count * rel_synonyms_count                -0.431498
rel_aoa * rel_clustering                                   0.160387
rel_aoa * rel_frequency                                   -0.043745
rel_aoa * rel_letters_count                               -0.065506
rel_aoa * rel_orthographic_density                         0.342102
rel_aoa * rel_synonyms_count                              -0.354698
rel_clustering * rel_frequency                            -0.661124
rel_clustering * rel_letters_count                        -0.459356
rel_clustering * rel_orthographic_density                 -0.806439
rel_clustering * rel_synonyms_count                       -0.833443
rel_frequency * rel_letters_count                          0.110992
rel_frequency * rel_orthographic_density                  -0.109727
rel_frequency * rel_synonyms_count                         0.139238
rel_letters_count * rel_orthographic_density              -0.171119
rel_letters_count * rel_synonyms_count                     0.581425
rel_orthographic_density * rel_synonyms_count             -0.147206
dtype: float64

Regressing rel letters_count with 680 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.19119162287909597

intercept                      0.828714
global_aoa                     0.223421
global_clustering             -0.278828
global_frequency               0.099437
global_letters_count          -0.697392
global_orthographic_density   -0.125789
global_synonyms_count         -0.202424
rel_aoa                       -0.185621
rel_clustering                 0.195223
rel_frequency                 -0.110302
rel_letters_count              0.962642
rel_orthographic_density       0.092529
rel_synonyms_count             0.091921
dtype: float64

Regressing rel letters_count with 680 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2677108551235694

intercept                                                 32.220045
global_aoa                                                 0.111733
global_clustering                                          4.212682
global_frequency                                          -0.513693
global_letters_count                                      -2.950019
global_orthographic_density                               -6.741517
global_synonyms_count                                      0.774715
rel_aoa                                                   -0.617129
rel_clustering                                            -9.722406
rel_frequency                                              2.414919
rel_letters_count                                          3.799665
rel_orthographic_density                                   3.678298
rel_synonyms_count                                         0.208943
global_aoa * global_clustering                             0.222182
global_aoa * global_frequency                              0.024424
global_aoa * global_letters_count                          0.064694
global_aoa * global_orthographic_density                   0.455499
global_aoa * global_synonyms_count                        -0.348997
global_aoa * rel_aoa                                       0.014367
global_aoa * rel_clustering                               -0.039496
global_aoa * rel_frequency                                -0.022401
global_aoa * rel_letters_count                            -0.142860
global_aoa * rel_orthographic_density                     -0.556494
global_aoa * rel_synonyms_count                            0.249034
global_clustering * global_frequency                      -0.114288
global_clustering * global_letters_count                  -0.309327
global_clustering * global_orthographic_density           -1.290674
global_clustering * global_synonyms_count                  0.422910
global_clustering * rel_aoa                               -0.124018
global_clustering * rel_clustering                         0.096474
global_clustering * rel_frequency                          0.274410
global_clustering * rel_letters_count                      0.147086
global_clustering * rel_orthographic_density               0.999897
global_clustering * rel_synonyms_count                     0.214347
global_frequency * global_letters_count                    0.027091
global_frequency * global_orthographic_density            -0.217655
global_frequency * global_synonyms_count                   0.136861
global_frequency * rel_aoa                                 0.010722
global_frequency * rel_clustering                          0.597897
global_frequency * rel_frequency                          -0.036120
global_frequency * rel_letters_count                      -0.111382
global_frequency * rel_orthographic_density                0.254720
global_frequency * rel_synonyms_count                      0.024003
global_letters_count * global_orthographic_density        -0.280204
global_letters_count * global_synonyms_count               0.621054
global_letters_count * rel_aoa                             0.006690
global_letters_count * rel_clustering                      0.471973
global_letters_count * rel_frequency                      -0.073630
global_letters_count * rel_letters_count                   0.059117
global_letters_count * rel_orthographic_density            0.491867
global_letters_count * rel_synonyms_count                 -0.352871
global_orthographic_density * global_synonyms_count       -0.221217
global_orthographic_density * rel_aoa                     -0.232273
global_orthographic_density * rel_clustering               0.678578
global_orthographic_density * rel_frequency                0.043485
global_orthographic_density * rel_letters_count            0.061538
global_orthographic_density * rel_orthographic_density     0.149526
global_orthographic_density * rel_synonyms_count           0.511646
global_synonyms_count * rel_aoa                            0.323327
global_synonyms_count * rel_clustering                     0.142799
global_synonyms_count * rel_frequency                     -0.301779
global_synonyms_count * rel_letters_count                 -0.815452
global_synonyms_count * rel_orthographic_density           0.157431
global_synonyms_count * rel_synonyms_count                -0.425638
rel_aoa * rel_clustering                                   0.103569
rel_aoa * rel_frequency                                   -0.006290
rel_aoa * rel_letters_count                               -0.039697
rel_aoa * rel_orthographic_density                         0.272109
rel_aoa * rel_synonyms_count                              -0.376450
rel_clustering * rel_frequency                            -0.619609
rel_clustering * rel_letters_count                        -0.363070
rel_clustering * rel_orthographic_density                 -0.474717
rel_clustering * rel_synonyms_count                       -0.666748
rel_frequency * rel_letters_count                          0.064380
rel_frequency * rel_orthographic_density                  -0.128917
rel_frequency * rel_synonyms_count                         0.135784
rel_letters_count * rel_orthographic_density              -0.081741
rel_letters_count * rel_synonyms_count                     0.580738
rel_orthographic_density * rel_synonyms_count             -0.283463
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.007288942468654969

intercept                      0.488711
global_aoa                    -0.009783
global_clustering              0.012410
global_frequency              -0.003711
global_letters_count           0.000482
global_orthographic_density    0.001609
global_synonyms_count          0.083465
dtype: float64

Regressing global synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03336962350803763

intercept                                             -4.139577
global_aoa                                             0.497794
global_clustering                                     -0.362156
global_frequency                                       0.125560
global_letters_count                                   0.282588
global_orthographic_density                            0.675267
global_synonyms_count                                 -0.042773
global_aoa * global_clustering                         0.043376
global_aoa * global_frequency                         -0.008936
global_aoa * global_letters_count                     -0.024283
global_aoa * global_orthographic_density              -0.019505
global_aoa * global_synonyms_count                     0.022827
global_clustering * global_frequency                   0.005170
global_clustering * global_letters_count               0.002728
global_clustering * global_orthographic_density        0.025628
global_clustering * global_synonyms_count              0.040998
global_frequency * global_letters_count               -0.005390
global_frequency * global_orthographic_density        -0.016216
global_frequency * global_synonyms_count               0.034134
global_letters_count * global_orthographic_density    -0.039020
global_letters_count * global_synonyms_count          -0.006749
global_orthographic_density * global_synonyms_count   -0.040883
dtype: float64

Regressing rel synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.004351144771157278

intercept                      0.057047
global_aoa                    -0.006745
global_clustering             -0.008234
global_frequency              -0.007153
global_letters_count           0.001961
global_orthographic_density    0.009756
global_synonyms_count          0.054166
dtype: float64

Regressing rel synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.032169698297463745

intercept                                             -3.952641
global_aoa                                             0.383587
global_clustering                                     -0.438355
global_frequency                                       0.118211
global_letters_count                                   0.200609
global_orthographic_density                            0.486202
global_synonyms_count                                  0.028532
global_aoa * global_clustering                         0.047956
global_aoa * global_frequency                          0.001463
global_aoa * global_letters_count                     -0.018381
global_aoa * global_orthographic_density              -0.010079
global_aoa * global_synonyms_count                     0.024122
global_clustering * global_frequency                   0.014707
global_clustering * global_letters_count              -0.005126
global_clustering * global_orthographic_density        0.011555
global_clustering * global_synonyms_count              0.060333
global_frequency * global_letters_count               -0.006624
global_frequency * global_orthographic_density        -0.011381
global_frequency * global_synonyms_count               0.027732
global_letters_count * global_orthographic_density    -0.038077
global_letters_count * global_synonyms_count           0.005544
global_orthographic_density * global_synonyms_count   -0.046577
dtype: float64

Regressing global synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0049711065687053635

intercept                   0.354177
rel_aoa                    -0.014334
rel_clustering             -0.016019
rel_frequency              -0.003032
rel_letters_count           0.001853
rel_orthographic_density    0.002987
rel_synonyms_count          0.057944
dtype: float64

Regressing global synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.030206002966692957

intercept                                        0.398731
rel_aoa                                         -0.046757
rel_clustering                                  -0.055815
rel_frequency                                    0.005444
rel_letters_count                               -0.017653
rel_orthographic_density                         0.087006
rel_synonyms_count                              -0.074031
rel_aoa * rel_clustering                         0.021106
rel_aoa * rel_frequency                         -0.006719
rel_aoa * rel_letters_count                     -0.004925
rel_aoa * rel_orthographic_density              -0.018448
rel_aoa * rel_synonyms_count                     0.020305
rel_clustering * rel_frequency                  -0.007800
rel_clustering * rel_letters_count               0.007720
rel_clustering * rel_orthographic_density        0.015846
rel_clustering * rel_synonyms_count             -0.040036
rel_frequency * rel_letters_count                0.003753
rel_frequency * rel_orthographic_density         0.010707
rel_frequency * rel_synonyms_count               0.000856
rel_letters_count * rel_orthographic_density    -0.025666
rel_letters_count * rel_synonyms_count          -0.002698
rel_orthographic_density * rel_synonyms_count   -0.123193
dtype: float64

Regressing rel synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.054755203232281446

intercept                   0.034727
rel_aoa                    -0.018053
rel_clustering              0.048061
rel_frequency               0.006391
rel_letters_count           0.001891
rel_orthographic_density   -0.011649
rel_synonyms_count          0.228804
dtype: float64

Regressing rel synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0772651889549798

intercept                                        0.079121
rel_aoa                                         -0.023881
rel_clustering                                   0.011574
rel_frequency                                    0.008238
rel_letters_count                               -0.023183
rel_orthographic_density                         0.066524
rel_synonyms_count                               0.172703
rel_aoa * rel_clustering                         0.042332
rel_aoa * rel_frequency                          0.002808
rel_aoa * rel_letters_count                     -0.000108
rel_aoa * rel_orthographic_density               0.000858
rel_aoa * rel_synonyms_count                     0.016168
rel_clustering * rel_frequency                   0.001195
rel_clustering * rel_letters_count               0.008426
rel_clustering * rel_orthographic_density        0.007894
rel_clustering * rel_synonyms_count             -0.039501
rel_frequency * rel_letters_count                0.003034
rel_frequency * rel_orthographic_density         0.012141
rel_frequency * rel_synonyms_count               0.007191
rel_letters_count * rel_orthographic_density    -0.025311
rel_letters_count * rel_synonyms_count           0.006880
rel_orthographic_density * rel_synonyms_count   -0.055192
dtype: float64

Regressing global synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.010758055495362906

intercept                      0.811400
global_aoa                     0.002577
global_clustering              0.053641
global_frequency              -0.012907
global_letters_count          -0.015008
global_orthographic_density    0.011851
global_synonyms_count          0.133607
rel_aoa                       -0.017297
rel_clustering                -0.052373
rel_frequency                  0.010555
rel_letters_count              0.015730
rel_orthographic_density      -0.010645
rel_synonyms_count            -0.062289
dtype: float64

Regressing global synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1285744074664571

intercept                                                 10.675221
global_aoa                                                -0.301679
global_clustering                                          1.940651
global_frequency                                          -0.287274
global_letters_count                                      -0.102582
global_orthographic_density                               -0.498603
global_synonyms_count                                      1.002815
rel_aoa                                                    0.238129
rel_clustering                                            -2.095787
rel_frequency                                              0.763463
rel_letters_count                                          0.165904
rel_orthographic_density                                   1.067071
rel_synonyms_count                                        -3.262466
global_aoa * global_clustering                            -0.013093
global_aoa * global_frequency                              0.020139
global_aoa * global_letters_count                          0.013903
global_aoa * global_orthographic_density                  -0.035672
global_aoa * global_synonyms_count                        -0.049895
global_aoa * rel_aoa                                      -0.011646
global_aoa * rel_clustering                                0.068787
global_aoa * rel_frequency                                -0.055347
global_aoa * rel_letters_count                            -0.045686
global_aoa * rel_orthographic_density                      0.030142
global_aoa * rel_synonyms_count                            0.107446
global_clustering * global_frequency                      -0.087199
global_clustering * global_letters_count                  -0.039941
global_clustering * global_orthographic_density           -0.248446
global_clustering * global_synonyms_count                  0.033643
global_clustering * rel_aoa                               -0.010510
global_clustering * rel_clustering                        -0.021399
global_clustering * rel_frequency                          0.091413
global_clustering * rel_letters_count                     -0.009495
global_clustering * rel_orthographic_density               0.176257
global_clustering * rel_synonyms_count                    -0.082195
global_frequency * global_letters_count                   -0.035092
global_frequency * global_orthographic_density            -0.093470
global_frequency * global_synonyms_count                  -0.121193
global_frequency * rel_aoa                                -0.001533
global_frequency * rel_clustering                          0.048557
global_frequency * rel_frequency                           0.005058
global_frequency * rel_letters_count                       0.007534
global_frequency * rel_orthographic_density                0.025595
global_frequency * rel_synonyms_count                      0.214567
global_letters_count * global_orthographic_density        -0.006780
global_letters_count * global_synonyms_count               0.176389
global_letters_count * rel_aoa                            -0.030111
global_letters_count * rel_clustering                      0.027027
global_letters_count * rel_frequency                       0.007441
global_letters_count * rel_letters_count                   0.008478
global_letters_count * rel_orthographic_density           -0.029878
global_letters_count * rel_synonyms_count                 -0.126068
global_orthographic_density * global_synonyms_count        0.164316
global_orthographic_density * rel_aoa                     -0.064800
global_orthographic_density * rel_clustering               0.356749
global_orthographic_density * rel_frequency                0.009827
global_orthographic_density * rel_letters_count            0.016141
global_orthographic_density * rel_orthographic_density    -0.063907
global_orthographic_density * rel_synonyms_count          -0.014886
global_synonyms_count * rel_aoa                            0.086163
global_synonyms_count * rel_clustering                     0.118146
global_synonyms_count * rel_frequency                      0.062116
global_synonyms_count * rel_letters_count                 -0.161964
global_synonyms_count * rel_orthographic_density           0.014168
global_synonyms_count * rel_synonyms_count                 0.098705
rel_aoa * rel_clustering                                   0.009694
rel_aoa * rel_frequency                                    0.020989
rel_aoa * rel_letters_count                                0.042084
rel_aoa * rel_orthographic_density                         0.029840
rel_aoa * rel_synonyms_count                              -0.104532
rel_clustering * rel_frequency                            -0.023923
rel_clustering * rel_letters_count                         0.036809
rel_clustering * rel_orthographic_density                 -0.244203
rel_clustering * rel_synonyms_count                       -0.109045
rel_frequency * rel_letters_count                          0.022722
rel_frequency * rel_orthographic_density                   0.030884
rel_frequency * rel_synonyms_count                        -0.147722
rel_letters_count * rel_orthographic_density              -0.033945
rel_letters_count * rel_synonyms_count                     0.112956
rel_orthographic_density * rel_synonyms_count             -0.211183
dtype: float64

Regressing rel synonyms_count with 659 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.13301993515793398

intercept                      0.457952
global_aoa                    -0.000853
global_clustering              0.011345
global_frequency              -0.023144
global_letters_count          -0.006864
global_orthographic_density    0.058469
global_synonyms_count         -0.587298
rel_aoa                       -0.012097
rel_clustering                 0.000017
rel_frequency                  0.019112
rel_letters_count              0.008277
rel_orthographic_density      -0.057212
rel_synonyms_count             0.768252
dtype: float64

Regressing rel synonyms_count with 659 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.23092097861462124

intercept                                                 14.165999
global_aoa                                                -0.396858
global_clustering                                          2.242136
global_frequency                                          -0.655699
global_letters_count                                      -0.388972
global_orthographic_density                               -0.347682
global_synonyms_count                                     -0.938988
rel_aoa                                                    0.420110
rel_clustering                                            -2.268360
rel_frequency                                              0.932468
rel_letters_count                                          0.268894
rel_orthographic_density                                   0.779775
rel_synonyms_count                                        -1.067703
global_aoa * global_clustering                            -0.020272
global_aoa * global_frequency                              0.021113
global_aoa * global_letters_count                          0.019034
global_aoa * global_orthographic_density                  -0.031939
global_aoa * global_synonyms_count                        -0.027003
global_aoa * rel_aoa                                      -0.008160
global_aoa * rel_clustering                                0.070148
global_aoa * rel_frequency                                -0.043827
global_aoa * rel_letters_count                            -0.037778
global_aoa * rel_orthographic_density                      0.022807
global_aoa * rel_synonyms_count                            0.100922
global_clustering * global_frequency                      -0.121523
global_clustering * global_letters_count                  -0.065834
global_clustering * global_orthographic_density           -0.183261
global_clustering * global_synonyms_count                  0.091229
global_clustering * rel_aoa                                0.015275
global_clustering * rel_clustering                        -0.015062
global_clustering * rel_frequency                          0.108200
global_clustering * rel_letters_count                      0.009490
global_clustering * rel_orthographic_density               0.121328
global_clustering * rel_synonyms_count                    -0.080687
global_frequency * global_letters_count                   -0.024475
global_frequency * global_orthographic_density            -0.065930
global_frequency * global_synonyms_count                  -0.011181
global_frequency * rel_aoa                                -0.004355
global_frequency * rel_clustering                          0.091462
global_frequency * rel_frequency                           0.003120
global_frequency * rel_letters_count                       0.003482
global_frequency * rel_orthographic_density                0.013103
global_frequency * rel_synonyms_count                      0.130916
global_letters_count * global_orthographic_density        -0.006269
global_letters_count * global_synonyms_count               0.230403
global_letters_count * rel_aoa                            -0.028035
global_letters_count * rel_clustering                      0.037533
global_letters_count * rel_frequency                       0.000563
global_letters_count * rel_letters_count                   0.006730
global_letters_count * rel_orthographic_density           -0.015641
global_letters_count * rel_synonyms_count                 -0.185151
global_orthographic_density * global_synonyms_count        0.160615
global_orthographic_density * rel_aoa                     -0.055035
global_orthographic_density * rel_clustering               0.247436
global_orthographic_density * rel_frequency               -0.008567
global_orthographic_density * rel_letters_count            0.023274
global_orthographic_density * rel_orthographic_density    -0.043622
global_orthographic_density * rel_synonyms_count          -0.054090
global_synonyms_count * rel_aoa                            0.043791
global_synonyms_count * rel_clustering                     0.031469
global_synonyms_count * rel_frequency                     -0.012691
global_synonyms_count * rel_letters_count                 -0.234996
global_synonyms_count * rel_orthographic_density          -0.080140
global_synonyms_count * rel_synonyms_count                 0.102507
rel_aoa * rel_clustering                                  -0.011640
rel_aoa * rel_frequency                                    0.018999
rel_aoa * rel_letters_count                                0.034745
rel_aoa * rel_orthographic_density                         0.044201
rel_aoa * rel_synonyms_count                              -0.091913
rel_clustering * rel_frequency                            -0.055332
rel_clustering * rel_letters_count                         0.020468
rel_clustering * rel_orthographic_density                 -0.165442
rel_clustering * rel_synonyms_count                       -0.090104
rel_frequency * rel_letters_count                          0.019966
rel_frequency * rel_orthographic_density                   0.045827
rel_frequency * rel_synonyms_count                        -0.090289
rel_letters_count * rel_orthographic_density              -0.038924
rel_letters_count * rel_synonyms_count                     0.181525
rel_orthographic_density * rel_synonyms_count             -0.074755
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08207138640060474

intercept                      2.222301
global_aoa                    -0.042051
global_clustering              0.038466
global_frequency               0.004772
global_letters_count          -0.096881
global_orthographic_density    0.067304
global_synonyms_count         -0.008289
dtype: float64

Regressing global orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10171284893220624

intercept                                              3.664054
global_aoa                                            -0.158509
global_clustering                                      0.118363
global_frequency                                      -0.107880
global_letters_count                                  -0.293435
global_orthographic_density                            0.607351
global_synonyms_count                                 -0.725250
global_aoa * global_clustering                         0.026241
global_aoa * global_frequency                          0.023996
global_aoa * global_letters_count                      0.003008
global_aoa * global_orthographic_density               0.015832
global_aoa * global_synonyms_count                     0.038295
global_clustering * global_frequency                  -0.007295
global_clustering * global_letters_count              -0.038435
global_clustering * global_orthographic_density        0.064843
global_clustering * global_synonyms_count             -0.081966
global_frequency * global_letters_count               -0.004197
global_frequency * global_orthographic_density        -0.024666
global_frequency * global_synonyms_count              -0.026533
global_letters_count * global_orthographic_density    -0.011617
global_letters_count * global_synonyms_count           0.009145
global_orthographic_density * global_synonyms_count    0.172827
dtype: float64

Regressing rel orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06799145607727608

intercept                      0.019080
global_aoa                    -0.023267
global_clustering              0.069250
global_frequency               0.016953
global_letters_count          -0.106970
global_orthographic_density    0.018401
global_synonyms_count          0.009881
dtype: float64

Regressing rel orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08556023561183479

intercept                                              1.677134
global_aoa                                            -0.054774
global_clustering                                      0.304708
global_frequency                                      -0.034533
global_letters_count                                  -0.372178
global_orthographic_density                            0.245824
global_synonyms_count                                 -0.405347
global_aoa * global_clustering                         0.043335
global_aoa * global_frequency                          0.030042
global_aoa * global_letters_count                     -0.001689
global_aoa * global_orthographic_density               0.010452
global_aoa * global_synonyms_count                     0.019527
global_clustering * global_frequency                  -0.004743
global_clustering * global_letters_count              -0.081341
global_clustering * global_orthographic_density        0.027769
global_clustering * global_synonyms_count             -0.083157
global_frequency * global_letters_count               -0.021262
global_frequency * global_orthographic_density        -0.016479
global_frequency * global_synonyms_count              -0.024934
global_letters_count * global_orthographic_density     0.005311
global_letters_count * global_synonyms_count          -0.004131
global_orthographic_density * global_synonyms_count    0.071738
dtype: float64

Regressing global orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0633425572716182

intercept                   1.625834
rel_aoa                     0.023987
rel_clustering             -0.012609
rel_frequency              -0.003586
rel_letters_count          -0.105012
rel_orthographic_density    0.112699
rel_synonyms_count         -0.019251
dtype: float64

Regressing global orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08278991565401661

intercept                                        1.627978
rel_aoa                                          0.113830
rel_clustering                                   0.120167
rel_frequency                                   -0.012139
rel_letters_count                               -0.103987
rel_orthographic_density                         0.168682
rel_synonyms_count                               0.165712
rel_aoa * rel_clustering                         0.074428
rel_aoa * rel_frequency                          0.028690
rel_aoa * rel_letters_count                      0.005522
rel_aoa * rel_orthographic_density               0.049820
rel_aoa * rel_synonyms_count                     0.041400
rel_clustering * rel_frequency                   0.016930
rel_clustering * rel_letters_count              -0.033414
rel_clustering * rel_orthographic_density        0.074974
rel_clustering * rel_synonyms_count              0.017621
rel_frequency * rel_letters_count                0.003220
rel_frequency * rel_orthographic_density         0.029208
rel_frequency * rel_synonyms_count               0.034945
rel_letters_count * rel_orthographic_density    -0.015877
rel_letters_count * rel_synonyms_count           0.012223
rel_orthographic_density * rel_synonyms_count    0.138782
dtype: float64

Regressing rel orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11550190217928913

intercept                  -0.505363
rel_aoa                     0.023370
rel_clustering              0.027463
rel_frequency               0.042353
rel_letters_count          -0.098633
rel_orthographic_density    0.176058
rel_synonyms_count         -0.031718
dtype: float64

Regressing rel orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1271408056282055

intercept                                       -0.470176
rel_aoa                                          0.085615
rel_clustering                                   0.139858
rel_frequency                                    0.067098
rel_letters_count                               -0.091225
rel_orthographic_density                         0.211149
rel_synonyms_count                               0.047654
rel_aoa * rel_clustering                         0.056185
rel_aoa * rel_frequency                          0.015811
rel_aoa * rel_letters_count                      0.002671
rel_aoa * rel_orthographic_density               0.042872
rel_aoa * rel_synonyms_count                     0.037361
rel_clustering * rel_frequency                   0.005444
rel_clustering * rel_letters_count              -0.041113
rel_clustering * rel_orthographic_density        0.052601
rel_clustering * rel_synonyms_count             -0.012825
rel_frequency * rel_letters_count               -0.004920
rel_frequency * rel_orthographic_density         0.029095
rel_frequency * rel_synonyms_count               0.015501
rel_letters_count * rel_orthographic_density     0.001772
rel_letters_count * rel_synonyms_count          -0.004046
rel_orthographic_density * rel_synonyms_count    0.047944
dtype: float64

Regressing global orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0957335201837517

intercept                      2.912812
global_aoa                    -0.112776
global_clustering              0.060351
global_frequency              -0.030868
global_letters_count          -0.061116
global_orthographic_density    0.125407
global_synonyms_count          0.027035
rel_aoa                        0.105518
rel_clustering                -0.014548
rel_frequency                  0.043155
rel_letters_count             -0.043129
rel_orthographic_density      -0.077074
rel_synonyms_count            -0.050072
dtype: float64

Regressing global orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18504065620485888

intercept                                                -16.908536
global_aoa                                                -0.648772
global_clustering                                         -0.973090
global_frequency                                           0.417242
global_letters_count                                       1.645683
global_orthographic_density                                9.008390
global_synonyms_count                                      7.317795
rel_aoa                                                    0.640687
rel_clustering                                             3.810518
rel_frequency                                             -0.516784
rel_letters_count                                         -1.248459
rel_orthographic_density                                  -7.868137
rel_synonyms_count                                        -4.589014
global_aoa * global_clustering                            -0.191636
global_aoa * global_frequency                              0.005467
global_aoa * global_letters_count                         -0.027882
global_aoa * global_orthographic_density                  -0.254127
global_aoa * global_synonyms_count                         0.018406
global_aoa * rel_aoa                                      -0.008462
global_aoa * rel_clustering                                0.176901
global_aoa * rel_frequency                                 0.002676
global_aoa * rel_letters_count                             0.044212
global_aoa * rel_orthographic_density                      0.300433
global_aoa * rel_synonyms_count                           -0.061949
global_clustering * global_frequency                      -0.029043
global_clustering * global_letters_count                   0.107715
global_clustering * global_orthographic_density            0.836487
global_clustering * global_synonyms_count                 -0.072345
global_clustering * rel_aoa                                0.219898
global_clustering * rel_clustering                         0.067354
global_clustering * rel_frequency                         -0.004969
global_clustering * rel_letters_count                     -0.018844
global_clustering * rel_orthographic_density              -0.729224
global_clustering * rel_synonyms_count                    -0.064718
global_frequency * global_letters_count                   -0.030357
global_frequency * global_orthographic_density            -0.158597
global_frequency * global_synonyms_count                  -0.456441
global_frequency * rel_aoa                                 0.060599
global_frequency * rel_clustering                         -0.127099
global_frequency * rel_frequency                           0.001135
global_frequency * rel_letters_count                       0.022084
global_frequency * rel_orthographic_density                0.106939
global_frequency * rel_synonyms_count                      0.246349
global_letters_count * global_orthographic_density        -0.110175
global_letters_count * global_synonyms_count              -0.525030
global_letters_count * rel_aoa                             0.054031
global_letters_count * rel_clustering                     -0.219600
global_letters_count * rel_frequency                       0.050224
global_letters_count * rel_letters_count                  -0.011243
global_letters_count * rel_orthographic_density            0.105787
global_letters_count * rel_synonyms_count                  0.353431
global_orthographic_density * global_synonyms_count       -0.115381
global_orthographic_density * rel_aoa                      0.009599
global_orthographic_density * rel_clustering              -0.811684
global_orthographic_density * rel_frequency                0.057010
global_orthographic_density * rel_letters_count            0.143411
global_orthographic_density * rel_orthographic_density    -0.062084
global_orthographic_density * rel_synonyms_count           0.067250
global_synonyms_count * rel_aoa                            0.050963
global_synonyms_count * rel_clustering                     0.274340
global_synonyms_count * rel_frequency                      0.483722
global_synonyms_count * rel_letters_count                  0.519953
global_synonyms_count * rel_orthographic_density           0.256604
global_synonyms_count * rel_synonyms_count                -0.115369
rel_aoa * rel_clustering                                  -0.148204
rel_aoa * rel_frequency                                   -0.034273
rel_aoa * rel_letters_count                               -0.047368
rel_aoa * rel_orthographic_density                         0.028270
rel_aoa * rel_synonyms_count                               0.034394
rel_clustering * rel_frequency                             0.164491
rel_clustering * rel_letters_count                         0.069537
rel_clustering * rel_orthographic_density                  0.714689
rel_clustering * rel_synonyms_count                       -0.164498
rel_frequency * rel_letters_count                         -0.044180
rel_frequency * rel_orthographic_density                   0.005026
rel_frequency * rel_synonyms_count                        -0.320886
rel_letters_count * rel_orthographic_density              -0.193196
rel_letters_count * rel_synonyms_count                    -0.403000
rel_orthographic_density * rel_synonyms_count             -0.178490
dtype: float64

Regressing rel orthographic_density with 551 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15546604426989175

intercept                      1.947392
global_aoa                    -0.087384
global_clustering              0.058367
global_frequency              -0.027300
global_letters_count          -0.039244
global_orthographic_density   -0.533117
global_synonyms_count          0.059811
rel_aoa                        0.075326
rel_clustering                 0.000839
rel_frequency                  0.052811
rel_letters_count             -0.064830
rel_orthographic_density       0.649756
rel_synonyms_count            -0.089038
dtype: float64

Regressing rel orthographic_density with 551 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2395899814657326

intercept                                                -15.419021
global_aoa                                                -0.269878
global_clustering                                         -0.747392
global_frequency                                           0.601844
global_letters_count                                       0.918876
global_orthographic_density                                6.449438
global_synonyms_count                                      7.345995
rel_aoa                                                    0.326433
rel_clustering                                             3.153855
rel_frequency                                             -0.790567
rel_letters_count                                         -0.805492
rel_orthographic_density                                  -5.332108
rel_synonyms_count                                        -4.585434
global_aoa * global_clustering                            -0.151145
global_aoa * global_frequency                              0.002961
global_aoa * global_letters_count                         -0.030296
global_aoa * global_orthographic_density                  -0.273960
global_aoa * global_synonyms_count                        -0.002009
global_aoa * rel_aoa                                      -0.003299
global_aoa * rel_clustering                                0.146153
global_aoa * rel_frequency                                 0.018342
global_aoa * rel_letters_count                             0.051954
global_aoa * rel_orthographic_density                      0.324393
global_aoa * rel_synonyms_count                           -0.014824
global_clustering * global_frequency                      -0.001887
global_clustering * global_letters_count                   0.033696
global_clustering * global_orthographic_density            0.636800
global_clustering * global_synonyms_count                 -0.083780
global_clustering * rel_aoa                                0.181004
global_clustering * rel_clustering                         0.069170
global_clustering * rel_frequency                         -0.021129
global_clustering * rel_letters_count                      0.028618
global_clustering * rel_orthographic_density              -0.539747
global_clustering * rel_synonyms_count                    -0.018419
global_frequency * global_letters_count                   -0.032318
global_frequency * global_orthographic_density            -0.150010
global_frequency * global_synonyms_count                  -0.461161
global_frequency * rel_aoa                                 0.061490
global_frequency * rel_clustering                         -0.152652
global_frequency * rel_frequency                           0.005072
global_frequency * rel_letters_count                       0.022700
global_frequency * rel_orthographic_density                0.100562
global_frequency * rel_synonyms_count                      0.238961
global_letters_count * global_orthographic_density         0.035701
global_letters_count * global_synonyms_count              -0.518098
global_letters_count * rel_aoa                             0.048806
global_letters_count * rel_clustering                     -0.123744
global_letters_count * rel_frequency                       0.051847
global_letters_count * rel_letters_count                  -0.012192
global_letters_count * rel_orthographic_density           -0.031413
global_letters_count * rel_synonyms_count                  0.351855
global_orthographic_density * global_synonyms_count       -0.095796
global_orthographic_density * rel_aoa                      0.012199
global_orthographic_density * rel_clustering              -0.505073
global_orthographic_density * rel_frequency                0.088662
global_orthographic_density * rel_letters_count            0.043620
global_orthographic_density * rel_orthographic_density    -0.041712
global_orthographic_density * rel_synonyms_count           0.103301
global_synonyms_count * rel_aoa                            0.025767
global_synonyms_count * rel_clustering                     0.408610
global_synonyms_count * rel_frequency                      0.489577
global_synonyms_count * rel_letters_count                  0.474612
global_synonyms_count * rel_orthographic_density           0.122207
global_synonyms_count * rel_synonyms_count                -0.117298
rel_aoa * rel_clustering                                  -0.134761
rel_aoa * rel_frequency                                   -0.049554
rel_aoa * rel_letters_count                               -0.052681
rel_aoa * rel_orthographic_density                         0.006840
rel_aoa * rel_synonyms_count                               0.026531
rel_clustering * rel_frequency                             0.174314
rel_clustering * rel_letters_count                        -0.002941
rel_clustering * rel_orthographic_density                  0.407791
rel_clustering * rel_synonyms_count                       -0.334379
rel_frequency * rel_letters_count                         -0.053465
rel_frequency * rel_orthographic_density                  -0.028992
rel_frequency * rel_synonyms_count                        -0.323487
rel_letters_count * rel_orthographic_density              -0.105085
rel_letters_count * rel_synonyms_count                    -0.375663
rel_orthographic_density * rel_synonyms_count             -0.141100
dtype: float64