Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, plot_grid() and plot_overlay() also write each figure to disk
# using the settings.FIGURE path template.
SAVE_FIGURES = False
# Subset of features shown in the "paper-*" figures.
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
# NOTE(review): not referenced in the visible part of this notebook;
# presumably the number of PCA components for a later section -- confirm.
N_COMPONENTS = 3
# Maximum number of bins along the source-feature axis; the plotting helpers
# retry with fewer bins when binning with this many raises a ValueError.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Initialise the database (helper from brainscopypaste.utils) and keep the
# returned handle; the session_scope() blocks below rely on this setup.
# NOTE(review): assumed from the import and naming -- confirm against init_db().
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed in this notebook.
model = Model(time=Time.discrete, source=Source.majority, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
# One dict per (substitution, feature) pair; turned into a DataFrame below.
data = []

# First collect only the ids in a short-lived session, so that the long loop
# below can open a fresh session for each substitution.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            # Feature values of the source and destination words, both as
            # returned by default and with sentence_relative='median'.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            # 'h0*' and 'h0n*' hold the feature averages used as the
            # H_0 and H_00 null hypotheses in the plots further down
            # ('h0n*' is the average restricted to source synonyms).
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

# Freeze the raw records into a DataFrame and release the list of dicts.
original_variations = pd.DataFrame(data)
del data


Got 4417 substitutions for model Model(time=Time.discrete, source=Source.majority, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)
100% (4417 of 4417) |######################| Elapsed Time: 0:01:24 Time: 0:01:24

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Numeric columns that are averaged per cluster (and later cropped for AoA).
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-stage averaging: first over rows sharing the same destination
# occurrence, position and feature, then over each (cluster, feature) pair,
# so that a cluster counts only once in the confidence intervals.
# The columns are selected with a *list*: selecting with an implicit tuple
# (`gb['a', 'b']`) is deprecated in pandas and rejected by recent versions.
# The grouping keys ('cluster_id', 'feature') are kept as columns by
# `as_index=False`, so 'feature' is not repeated in the selection.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Sort the feature names by the docstring of their transformed-feature
# function (the same docstring is used as the title/label in the plots).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For every bin, the mean of `values` is compared with the mean of `h0`
    (row ``H_0``) and with the mean of `h0n` (row ``H_00``) in that bin.
    A cell shows ``*``, ``**`` or ``***`` when the null mean lies outside
    the two-sided Student-t confidence interval of the bin mean at the 5%,
    1% or 0.1% level respectively, and ``ns.`` otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like (typically pandas.Series)
        Zero-based bin label of each observation. NaN labels (observations
        that fell in no bin) are ignored.
    h0, h0n : array-like
        Null-hypothesis values of each observation, aligned with `bins`.
    values : array-like
        Observed values of each observation, aligned with `bins`.
    """
    alphas = [.05, .01, .001]
    # Labels come back as floats when some observations are unbinned (NaN);
    # make the count a real int so that range() and np.zeros() accept it.
    bin_count = int(np.nanmax(np.asarray(bins, dtype=float))) + 1

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and CI half-widths depend only on `values`, so compute them
    # once rather than once per null hypothesis.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        binned = values[indices]
        bin_values[i] = binned.mean()
        if n < 2:
            # No spread can be estimated; NaN makes every comparison below
            # False, so the bin is reported as 'ns.'.
            cis[i, :] = np.nan
            continue
        # Standard error of the mean: the sample standard deviation
        # (ddof=1) divided by sqrt(n). Dividing by sqrt(n - 1) would apply
        # Bessel's correction a second time.
        sem = binned.std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        print(null_name + ' |', end='')
        # differences[i, j] is True when the bin mean is further than the
        # j-th CI half-width away from the null mean.
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level passed -> number of stars,
                # padded to a fixed width of three characters.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot destination feature values against binned source feature values.

    Meant to be used through `FacetGrid.map_dataframe`, hence the
    keyword-only interface. Draws on the current matplotlib axes:

    * the mean destination value per source bin (nu_phi) with its 95%
      Student-t confidence band,
    * the per-bin means of the 'h0' and 'h0n' columns (nu_phi^0 and
      nu_phi^00),
    * the identity line y = x,

    and finally prints the significance table for the same binning.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows for a single feature, with columns 'source', 'destination',
        'h0', 'h0n' (or their '_rel' variants) and `feature_field`.
    color : matplotlib colour, default 'blue'
    relative : bool, default False
        Use the '_rel' (sentence-relative) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` (quantile bins) instead of `pd.cut`
        (fixed-width bins).
    feature_field : str, default 'feature'
        Column holding the feature name used as the table title.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and fall back to fewer
    # bins whenever the cut fails (typically pd.qcut on heavily repeated
    # values, where quantile edges are not unique).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of an unbound-name error
        # a few lines further down.
        raise ValueError("Could not bin 'source{}' values into {} or fewer "
                         "bins".format(rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample std (ddof=1) over sqrt(n) -- not
        # sqrt(n - 1), which would apply Bessel's correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    null_color = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=null_color, alpha=0.2)
    plt.plot(middles, h0s, '--', color=null_color,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=null_color,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=null_color,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. The two sides are checked
    # independently so that a null curve leaving the range on both sides
    # is not clipped at the top.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function` and style the grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to plot; only rows whose `feature_field` is in `features`
        are kept.
    features : list of str
        Features to plot, in facet (and hue) order.
    filename : str
        Name formatted into `settings.FIGURE` when SAVE_FIGURES is set.
    plot_function : callable
        Function handed to `FacetGrid.map_dataframe`.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, default 'feature'
        Column of `data` holding the feature name.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a mutable default argument shared between calls.
    if plot_kws is None:
        plot_kws = {}
    # NOTE(review): `size` is the facet-height argument of the seaborn
    # version this notebook was written for; later seaborn releases call
    # it `height`.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; replace it
        # with the docstring of the corresponding transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (nu_phi - nu_phi^00) of one feature on `ax`.

    The source values are binned, and for each bin the difference between
    the mean destination value and the mean 'h0n' value is drawn, divided
    by the absolute mean of the per-bin 'h0' values. Bins are spread evenly
    over [0, 1] on the x axis so that several features can be overlaid.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature (the name is read from the first row's
        'feature' column), with columns 'source', 'destination', 'h0',
        'h0n' or their '_rel' variants.
    color : matplotlib colour
    ci : bool, default True
        Also draw the 95% Student-t confidence band.
    relative : bool, default False
        Use the '_rel' (sentence-relative) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` (quantile bins) instead of `pd.cut`
        (fixed-width bins).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and fall back to fewer
    # bins whenever the cut fails (typically pd.qcut on heavily repeated
    # values, where quantile edges are not unique). Only the bin labels
    # are needed here: the x axis is normalised to [0, 1].
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins = cut(x, bin_count, labels=False, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of an unbound-name error
        # a few lines further down.
        raise ValueError("Could not bin 'source{}' values into {} or fewer "
                         "bins".format(rel, BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample std (ddof=1) over sqrt(n) -- not
        # sqrt(n - 1), which would apply Bessel's correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # An empty fixed-width bin yields NaN in h0s; ignore it when computing
    # the normalisation so that one empty bin does not blank the whole
    # curve.
    scale = abs(np.nanmean(h0s))
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions,
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single set of axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Data with a 'feature' column; rows containing NaN are dropped
        before each feature is plotted.
    features : list of str
        Features to overlay; each gets its own palette colour.
    filename : str
        Name formatted into `settings.FIGURE` when SAVE_FIGURES is set.
    palette_name : str
        Seaborn palette name used to pick the colours.
    plot_function : callable
        Called as ``plot_function(ax, feature_data, color=..., **plot_kws)``.
    title, xlabel, ylabel : str
        Title and axis labels of the figure.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes that were drawn on, so that callers can tweak them
    further (e.g. set y-limits).
    """
    # Avoid a mutable default argument shared between calls.
    if plot_kws is None:
        plot_kws = {}
    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for j, feature in enumerate(features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=palette[j], **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins (plot_variation's default), absolute
# feature values.
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | **  | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | ns. | ns. | *   | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | **  |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay for all features, fixed-width bins, absolute values;
# confidence bands are turned off.
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False});



In [13]:
# Same grid as for all features, restricted to PAPER_FEATURES.
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | ns. | ns. | *   | ns. |

In [14]:
# Bias overlay for PAPER_FEATURES only (with confidence bands), fixed-width
# bins, absolute values. The y-axis is then clamped to hard-coded limits.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)')\
    .set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | **  | **  | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | ns. |
H_00 | *   | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | **  | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | **  |

In [16]:
# Bias overlay for all features, quantile bins, absolute values;
# confidence bands are turned off.
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_global',
             'husl', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# PAPER_FEATURES only, quantile bins, absolute feature values.
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_global', plot_variation,
          r'$\phi($disappearing word$)$', r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

In [18]:
# Bias overlay for PAPER_FEATURES only (with confidence bands), quantile
# bins, absolute values. The y-axis is then clamped to hard-coded limits.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_global',
             'deep', plot_bias, 'Measured bias for all features',
             r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                 '\n(normalised to feature average)',
             plot_kws={'quantiles': True})\
    .set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(variations, ordered_features,
          'all-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | **  |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *   |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *   | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | ns. | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *   | **  | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *   | *** | *   |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | *   |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
# Bias overlay for all features, fixed-width bins, sentence-relative values;
# confidence bands are turned off.
plot_overlay(variations, ordered_features,
             'all-variations_bias-fixedbins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True});



In [21]:
# PAPER_FEATURES only, fixed-width bins, sentence-relative feature values.
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-fixedbins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *   | **  | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *   | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

In [22]:
# Bias overlay for PAPER_FEATURES only (with confidence bands), fixed-width
# bins, sentence-relative values. The y-axis is then clamped to hard-coded
# limits.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-fixedbins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True})\
    .set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(variations, ordered_features,
          'all-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | ns. | **  | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *   | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | ns. | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [24]:
# Bias overlay for all features, quantile bins, sentence-relative values;
# confidence bands are turned off.
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# PAPER_FEATURES only, quantile bins, sentence-relative feature values.
plot_grid(variations, PAPER_FEATURES,
          'paper-variations-quantilebins_sentencerel', plot_variation,
          r'$\phi($disappearing word$) - \phi($sentence$)$',
          r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | ns. | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | ns. | **  | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [26]:
# Bias overlay for PAPER_FEATURES only (with confidence bands), quantile
# bins, sentence-relative values.
# NOTE(review): the three other paper-feature overlays use the 'deep'
# palette and clamp the y-axis with .set_ylim(); this one uses 'husl' and
# leaves the limits automatic -- confirm whether that is intentional.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which effect wins: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Streamplot of substitutions in (absolute, relative) feature space.

    Meant to be called through `FacetGrid.map_dataframe`, which provides
    the `data` and `color` keyword arguments.

    Source values are cut into a `bin_count` x `bin_count` grid of
    fixed-width bins (absolute value on the x axis, sentence-relative value
    on the y axis). In each grid cell, the arrow is the mean
    (destination - source) displacement and the colour is the mean norm of
    that displacement. The y = 0 line and the vertical line at the `h0`
    value are overlaid for reference.

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : matplotlib color, optional
        Base colour of the reference lines (desaturated); 'blue' if absent.
    bin_count : int, optional
        Number of bins along each axis; defaults to 4.

    """

    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Only the first `h0` value is used: it is drawn
    # as a single vertical line.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask once and reuse it for all three grids.
            in_cell = (x_bins == x) & (y_bins == y)
            du = dest[in_cell] - source[in_cell]
            dv = dest_rel[in_cell] - source_rel[in_cell]
            # Empty cells yield NaN means, i.e. holes in the stream.
            u_values[y, x] = du.mean()
            v_values[y, x] = dv.mean()
            strength[y, x] = np.sqrt(du ** 2 + dv ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream facet per feature, each with its own axes ranges.
g = sb.FacetGrid(
    data=variations,
    col='feature', col_order=ordered_features, col_wrap=3,
    hue='feature', hue_order=ordered_features,
    sharex=False, sharey=False,
    aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: style the legend box and
        # swap the raw feature name in the title for the docstring of
        # the corresponding transformed feature.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream facets, restricted to the features exposed in the paper.
in_paper = variations['feature'].isin(PAPER_FEATURES)
g = sb.FacetGrid(
    data=variations[in_paper],
    col='feature', col_order=PAPER_FEATURES, col_wrap=3,
    hue='feature', hue_order=PAPER_FEATURES,
    sharex=False, sharey=False,
    aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: style the legend box and
        # swap the raw feature name in the title for the docstring of
        # the corresponding transformed feature.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
# Fit the PCA on the per-cluster feature variations, keeping only the
# clusters for which every feature variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = (
    variations
    .pivot(index='cluster_id', columns='feature', values='variation')
    .dropna()
)
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the feature loadings of the components we go on to plot.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
component_labels = ['Component-{}'.format(i) for i in range(N_COMPONENTS)]
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures, index=component_labels)


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.54135829  0.17744755  0.07176496  0.07130394  0.03529199  0.0285309
  0.02038716  0.01818248  0.01400208  0.00971662]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.463937 0.242983 -0.091234 0.236037 0.221445 -0.458947 0.218019 0.264778 -0.421182 0.280782 -0.167703 -0.004282
Component-1 0.386310 -0.363836 0.144710 -0.277832 -0.283946 -0.396128 0.121129 -0.294111 -0.461663 0.203369 -0.163799 0.014763
Component-2 -0.706621 -0.321573 0.048700 -0.071202 -0.596543 0.058636 0.071099 -0.096863 0.065361 -0.086535 0.043194 0.049717

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# One row per (substitution, component): the source and destination
# component values plus the two null-hypothesis averages.
data = []
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Fill the row in the same order as its columns are listed.
            row = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            row['h0'] = substitution.component_average(
                component, pca, pcafeatures)
            row['h0n'] = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data


100% (4417 of 4417) |######################| Elapsed Time: 0:01:17 Time: 0:01:17

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two steps: first over the substitutions that share the same
# (destination, occurrence, position, component), then over each
# (cluster, component) pair, so that a cluster counts only once.
#
# The value columns are selected with a *list*: tuple-style selection on a
# GroupBy (`gb['a', 'b']`) is deprecated and rejected by recent pandas.
# NOTE(review): 'component' is both a grouping key and an averaged column;
# the selection is kept as it was to preserve the resulting frame.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']]
    .mean()
)

Plot the actual variations of components (see the caveat section below)


In [33]:
# One variation facet per PCA component, each with its own axes ranges.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Something was plotted on these axes: style the legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | *   | *** | *   |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *   | **  | ns. |

4.2 On a subset of relevant features


In [34]:
# Features on which the reduced PCA of this section is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA.
# Fit the PCA on the per-cluster variations of the relevant features only,
# keeping the clusters for which all of those variations are defined.
pcafeatures = tuple(sorted(relevant_features))
is_relevant = variations['feature'].isin(pcafeatures)
pcavariations = (
    variations[is_relevant]
    .pivot(index='cluster_id', columns='feature', values='variation')
    .dropna()
)
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Show the feature loadings of every component.
component_labels = ['Component-{}'.format(i)
                    for i in range(pca.n_components_)]
pd.DataFrame(pca.components_, columns=pcafeatures, index=component_labels)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.64406974  0.21304137]

Out[35]:
aoa frequency letters_count
Component-0 -0.745493 0.338447 -0.574189
Component-1 0.411159 -0.444515 -0.795836

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
data = []
# One row per (substitution, component): the source and destination
# component values plus the two null-hypothesis averages.
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Fill the row in the same order as its columns are listed.
            row = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            row['h0'] = substitution.component_average(
                component, pca, pcafeatures)
            row['h0n'] = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data


100% (4417 of 4417) |######################| Elapsed Time: 0:00:30 Time: 0:00:30

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two steps: first over the substitutions that share the same
# (destination, occurrence, position, component), then over each
# (cluster, component) pair, so that a cluster counts only once.
#
# The value columns are selected with a *list*: tuple-style selection on a
# GroupBy (`gb['a', 'b']`) is deprecated and rejected by recent pandas.
# NOTE(review): 'component' is both a grouping key and an averaged column;
# the selection is kept as it was to preserve the resulting frame.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']]
    .mean()
)

Plot the actual variations of components


In [38]:
# One variation facet per PCA component, each with its own axes ranges.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: style the legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Resolve each transformed feature once, and fetch its word collection
# once, instead of doing so again for every word below.
tfeatures = {feature: Substitution._transformed_feature(feature)
             for feature in relevant_features}
feature_words = {feature: tfeatures[feature]()
                 for feature in relevant_features}

for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(feature_words[feature])))

# Compute the number of words that have all `relevant_features` defined:
# take the union of the words known to any of the features...
words = set()
for feature in relevant_features:
    words.update(feature_words[feature])

# ...then evaluate every feature on every word of that union. Rows with a
# missing value are dropped when counting below.
words_list = list(words)
data = {feature: [tfeatures[feature](word) for word in words_list]
        for feature in relevant_features}
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 620 (cluster-unique) substitutions, but the PCA is in fact computed on 471 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to try, as (label, cutting function) pairs. Only
# fixed-width binning is enabled; the quantile entry is commented out.
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
# Feature value flavours, as (label, column-name suffix) pairs.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the significance marker for p-value `p`.

    '***' is for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    for anything else. All markers are three characters wide.

    """

    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and all enabled types of binning (fixed width, and optionally quantiles; quantile binning is currently disabled in `cuts`). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# Pivot each value column into a (cluster_id x feature) table once, rather
# than re-pivoting `variations` for every combination in the loops below.
value_columns = [kind + rel
                 for kind in ('source', 'destination')
                 for (_, rel) in rels]
pivots = {column: variations.pivot(index='cluster_id', columns='feature',
                                   values=column)
          for column in value_columns}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It only depends on the source values, so it
            # is done once for both destination flavours. Fall back to
            # fewer bins when the cutting function rejects a bin count.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Without this, the bins of a previous combination would
                # silently be reused.
                raise ValueError(
                    "Could not bin the {} source values of '{}' "
                    "with any bin count from {} down to 1"
                    .format(rel1_label, feature1, BIN_COUNT))

            for (rel2_label, rel2) in rels:
                destination = pivots['destination' + rel2][feature2]

                # One-way ANOVA of the destination values across the bins
                # of source values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
   ** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
    * global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot each value column into a (cluster_id x feature) table once, rather
# than re-pivoting `variations` for every combination in the loops below.
value_columns = [kind + rel
                 for kind in ('source', 'destination')
                 for (_, rel) in rels]
pivots = {column: variations.pivot(index='cluster_id', columns='feature',
                                   values=column)
          for column in value_columns}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning. It only depends on the source values, so it
            # is done once for both destination flavours. Fall back to
            # fewer bins when the cutting function rejects a bin count.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Without this, the bins of a previous combination would
                # silently be reused.
                raise ValueError(
                    "Could not bin the {} source values of '{}' "
                    "with any bin count from {} down to 1"
                    .format(rel1_label, feature1, BIN_COUNT))

            for (rel2_label, rel2) in rels:
                # Variation of feature2, i.e. destination minus source
                # (the name `destination` is kept from the previous cell).
                destination = (pivots['destination' + rel2][feature2]
                               - pivots['source' + rel2][feature2])

                # One-way ANOVA of the variations across the bins of
                # source values.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps "is the value sentence-relative?" to a (name prefix, column-name
# suffix) pair.
# NOTE(review): this rebinds `rels`, which the ANOVA section defines as a
# list of (label, suffix) pairs; re-running those cells after this one
# would not work as written -- consider using a distinct name.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For every paper feature taken as target, run the regression for each
# combination of source relativeness (global, relative, both) and destination
# relativeness, first without and then with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel in (False, True, 'both'):
        for dest_rel in (False, True):
            for with_interactions in (False, True):
                regress(variations, PAPER_FEATURES, target,
                        source_rel=source_rel, dest_rel=dest_rel,
                        interactions=with_interactions)
                print()


----------------------------------------------------------------------
Regressing global frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.04921646667188051

intercept                      8.305603
global_aoa                    -0.039234
global_clustering              0.257282
global_frequency               0.291896
global_letters_count          -0.008305
global_orthographic_density   -0.170180
global_synonyms_count          0.052503
dtype: float64

Regressing global frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1281874942778527

intercept                                              9.952529
global_aoa                                            -1.014514
global_clustering                                      0.202186
global_frequency                                       0.686849
global_letters_count                                  -0.447581
global_orthographic_density                            1.257273
global_synonyms_count                                  1.675972
global_aoa * global_clustering                        -0.021971
global_aoa * global_frequency                          0.021926
global_aoa * global_letters_count                      0.072685
global_aoa * global_orthographic_density               0.177891
global_aoa * global_synonyms_count                    -0.060273
global_clustering * global_frequency                   0.009250
global_clustering * global_letters_count              -0.043175
global_clustering * global_orthographic_density        0.272688
global_clustering * global_synonyms_count             -0.076008
global_frequency * global_letters_count               -0.043416
global_frequency * global_orthographic_density        -0.143719
global_frequency * global_synonyms_count              -0.118815
global_letters_count * global_orthographic_density     0.068224
global_letters_count * global_synonyms_count          -0.060369
global_orthographic_density * global_synonyms_count   -0.226604
dtype: float64

Regressing rel frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.05061894590241933

intercept                     -3.390280
global_aoa                    -0.026089
global_clustering              0.311293
global_frequency               0.297908
global_letters_count           0.046180
global_orthographic_density   -0.262341
global_synonyms_count          0.147998
dtype: float64

Regressing rel frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.11299441722068082

intercept                                              6.832626
global_aoa                                            -0.917671
global_clustering                                      2.096198
global_frequency                                      -0.013856
global_letters_count                                  -0.522664
global_orthographic_density                            1.193379
global_synonyms_count                                 -0.090726
global_aoa * global_clustering                        -0.015770
global_aoa * global_frequency                          0.029356
global_aoa * global_letters_count                      0.059878
global_aoa * global_orthographic_density               0.129867
global_aoa * global_synonyms_count                     0.022158
global_clustering * global_frequency                  -0.135967
global_clustering * global_letters_count              -0.127894
global_clustering * global_orthographic_density        0.282361
global_clustering * global_synonyms_count             -0.315380
global_frequency * global_letters_count               -0.073983
global_frequency * global_orthographic_density        -0.130163
global_frequency * global_synonyms_count              -0.123707
global_letters_count * global_orthographic_density     0.114837
global_letters_count * global_synonyms_count          -0.074664
global_orthographic_density * global_synonyms_count   -0.155196
dtype: float64

Regressing global frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.033908710712241

intercept                   9.389448
rel_aoa                     0.031147
rel_clustering              0.063129
rel_frequency               0.203192
rel_letters_count          -0.031171
rel_orthographic_density   -0.142929
rel_synonyms_count          0.027972
dtype: float64

Regressing global frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09024675554246853

intercept                                        9.369011
rel_aoa                                          0.201024
rel_clustering                                   0.037547
rel_frequency                                    0.212769
rel_letters_count                               -0.007371
rel_orthographic_density                        -0.459983
rel_synonyms_count                               0.117942
rel_aoa * rel_clustering                        -0.007403
rel_aoa * rel_frequency                          0.023289
rel_aoa * rel_letters_count                     -0.010368
rel_aoa * rel_orthographic_density               0.066919
rel_aoa * rel_synonyms_count                    -0.134371
rel_clustering * rel_frequency                  -0.087294
rel_clustering * rel_letters_count               0.048640
rel_clustering * rel_orthographic_density        0.418665
rel_clustering * rel_synonyms_count             -0.005774
rel_frequency * rel_letters_count               -0.027904
rel_frequency * rel_orthographic_density        -0.045852
rel_frequency * rel_synonyms_count              -0.014942
rel_letters_count * rel_orthographic_density     0.065150
rel_letters_count * rel_synonyms_count          -0.172395
rel_orthographic_density * rel_synonyms_count   -0.373317
dtype: float64

Regressing rel frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.23189618511126198

intercept                  -1.461264
rel_aoa                    -0.000549
rel_clustering              0.332553
rel_frequency               0.591861
rel_letters_count          -0.103399
rel_orthographic_density   -0.393357
rel_synonyms_count          0.103124
dtype: float64

Regressing rel frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.28361471180600273

intercept                                       -1.547735
rel_aoa                                          0.028037
rel_clustering                                   0.161659
rel_frequency                                    0.621403
rel_letters_count                               -0.012036
rel_orthographic_density                        -0.721877
rel_synonyms_count                               0.144298
rel_aoa * rel_clustering                        -0.053798
rel_aoa * rel_frequency                         -0.055197
rel_aoa * rel_letters_count                      0.014644
rel_aoa * rel_orthographic_density               0.182359
rel_aoa * rel_synonyms_count                    -0.052435
rel_clustering * rel_frequency                  -0.173814
rel_clustering * rel_letters_count              -0.026338
rel_clustering * rel_orthographic_density        0.266210
rel_clustering * rel_synonyms_count             -0.162459
rel_frequency * rel_letters_count               -0.001088
rel_frequency * rel_orthographic_density        -0.058190
rel_frequency * rel_synonyms_count              -0.054272
rel_letters_count * rel_orthographic_density     0.045692
rel_letters_count * rel_synonyms_count          -0.125596
rel_orthographic_density * rel_synonyms_count   -0.164865
dtype: float64

Regressing global frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.05914066634739201

intercept                      10.204979
global_aoa                     -0.091899
global_clustering               0.545759
global_frequency                0.234654
global_letters_count            0.080349
global_orthographic_density     0.096321
global_synonyms_count          -0.126541
rel_aoa                         0.080559
rel_clustering                 -0.327024
rel_frequency                   0.064421
rel_letters_count              -0.101848
rel_orthographic_density       -0.292566
rel_synonyms_count              0.184790
dtype: float64

Regressing global frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.26517120837153085

intercept                                                 17.451836
global_aoa                                                -0.041363
global_clustering                                         -5.733456
global_frequency                                          -1.388242
global_letters_count                                      -5.367415
global_orthographic_density                               -0.848862
global_synonyms_count                                     -7.989103
rel_aoa                                                    1.050615
rel_clustering                                             1.800979
rel_frequency                                             -0.952678
rel_letters_count                                          2.991464
rel_orthographic_density                                   8.312012
rel_synonyms_count                                        21.773513
global_aoa * global_clustering                             0.355633
global_aoa * global_frequency                              0.008442
global_aoa * global_letters_count                          0.285183
global_aoa * global_orthographic_density                   0.495123
global_aoa * global_synonyms_count                        -0.353179
global_aoa * rel_aoa                                      -0.046377
global_aoa * rel_clustering                               -0.408929
global_aoa * rel_frequency                                 0.006883
global_aoa * rel_letters_count                            -0.176315
global_aoa * rel_orthographic_density                     -0.273546
global_aoa * rel_synonyms_count                            0.260875
global_clustering * global_frequency                       0.147125
global_clustering * global_letters_count                   0.060840
global_clustering * global_orthographic_density            0.811641
global_clustering * global_synonyms_count                  1.224427
global_clustering * rel_aoa                               -0.313888
global_clustering * rel_clustering                         0.200668
global_clustering * rel_frequency                         -0.405312
global_clustering * rel_letters_count                     -0.161470
global_clustering * rel_orthographic_density               0.026183
global_clustering * rel_synonyms_count                    -0.965303
global_frequency * global_letters_count                    0.305412
global_frequency * global_orthographic_density             0.222843
global_frequency * global_synonyms_count                   0.619644
global_frequency * rel_aoa                                -0.060576
global_frequency * rel_clustering                          0.298920
global_frequency * rel_frequency                          -0.007082
global_frequency * rel_letters_count                      -0.248713
global_frequency * rel_orthographic_density               -0.451197
global_frequency * rel_synonyms_count                     -1.427280
global_letters_count * global_orthographic_density        -0.094582
global_letters_count * global_synonyms_count               1.663528
global_letters_count * rel_aoa                            -0.233050
global_letters_count * rel_clustering                      0.044869
global_letters_count * rel_frequency                      -0.082293
global_letters_count * rel_letters_count                   0.033433
global_letters_count * rel_orthographic_density           -0.229375
global_letters_count * rel_synonyms_count                 -2.351607
global_orthographic_density * global_synonyms_count        1.164542
global_orthographic_density * rel_aoa                     -0.467460
global_orthographic_density * rel_clustering              -0.891126
global_orthographic_density * rel_frequency               -0.242671
global_orthographic_density * rel_letters_count           -0.053348
global_orthographic_density * rel_orthographic_density    -0.651949
global_orthographic_density * rel_synonyms_count          -0.982213
global_synonyms_count * rel_aoa                           -0.029261
global_synonyms_count * rel_clustering                    -0.932133
global_synonyms_count * rel_frequency                     -0.645129
global_synonyms_count * rel_letters_count                 -1.024602
global_synonyms_count * rel_orthographic_density          -1.175992
global_synonyms_count * rel_synonyms_count                -0.191286
rel_aoa * rel_clustering                                   0.266055
rel_aoa * rel_frequency                                    0.026987
rel_aoa * rel_letters_count                                0.130475
rel_aoa * rel_orthographic_density                         0.237875
rel_aoa * rel_synonyms_count                               0.029580
rel_clustering * rel_frequency                            -0.038663
rel_clustering * rel_letters_count                         0.182673
rel_clustering * rel_orthographic_density                  0.321611
rel_clustering * rel_synonyms_count                        0.457985
rel_frequency * rel_letters_count                          0.030306
rel_frequency * rel_orthographic_density                   0.330666
rel_frequency * rel_synonyms_count                         1.288586
rel_letters_count * rel_orthographic_density               0.276804
rel_letters_count * rel_synonyms_count                     1.453969
rel_orthographic_density * rel_synonyms_count              0.575347
dtype: float64

Regressing rel frequency with 363 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.32464272894601953

intercept                      8.634722
global_aoa                    -0.070612
global_clustering              0.663690
global_frequency              -0.609588
global_letters_count           0.151780
global_orthographic_density    0.194437
global_synonyms_count         -0.217786
rel_aoa                        0.047723
rel_clustering                -0.362160
rel_frequency                  0.957020
rel_letters_count             -0.157995
rel_orthographic_density      -0.352470
rel_synonyms_count             0.281904
dtype: float64

Regressing rel frequency with 363 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4768587718178876

intercept                                                 13.092356
global_aoa                                                 0.617633
global_clustering                                         -5.106817
global_frequency                                          -2.460141
global_letters_count                                      -6.050267
global_orthographic_density                                4.104504
global_synonyms_count                                     -7.462766
rel_aoa                                                    0.761614
rel_clustering                                             1.549935
rel_frequency                                              0.362455
rel_letters_count                                          3.195183
rel_orthographic_density                                   2.594602
rel_synonyms_count                                        22.371761
global_aoa * global_clustering                             0.306769
global_aoa * global_frequency                             -0.044882
global_aoa * global_letters_count                          0.260137
global_aoa * global_orthographic_density                   0.377898
global_aoa * global_synonyms_count                        -0.275078
global_aoa * rel_aoa                                      -0.030946
global_aoa * rel_clustering                               -0.370783
global_aoa * rel_frequency                                 0.061832
global_aoa * rel_letters_count                            -0.139361
global_aoa * rel_orthographic_density                     -0.137738
global_aoa * rel_synonyms_count                            0.158074
global_clustering * global_frequency                       0.076947
global_clustering * global_letters_count                   0.045811
global_clustering * global_orthographic_density            1.167793
global_clustering * global_synonyms_count                  1.342928
global_clustering * rel_aoa                               -0.323084
global_clustering * rel_clustering                         0.209663
global_clustering * rel_frequency                         -0.305579
global_clustering * rel_letters_count                     -0.160740
global_clustering * rel_orthographic_density              -0.341354
global_clustering * rel_synonyms_count                    -1.123481
global_frequency * global_letters_count                    0.402223
global_frequency * global_orthographic_density             0.079912
global_frequency * global_synonyms_count                   0.628910
global_frequency * rel_aoa                                -0.030832
global_frequency * rel_clustering                          0.341507
global_frequency * rel_frequency                          -0.014963
global_frequency * rel_letters_count                      -0.306821
global_frequency * rel_orthographic_density               -0.275020
global_frequency * rel_synonyms_count                     -1.503311
global_letters_count * global_orthographic_density        -0.140989
global_letters_count * global_synonyms_count               1.614648
global_letters_count * rel_aoa                            -0.287867
global_letters_count * rel_clustering                      0.044410
global_letters_count * rel_frequency                      -0.154902
global_letters_count * rel_letters_count                   0.030094
global_letters_count * rel_orthographic_density           -0.129049
global_letters_count * rel_synonyms_count                 -2.350768
global_orthographic_density * global_synonyms_count        1.122916
global_orthographic_density * rel_aoa                     -0.489186
global_orthographic_density * rel_clustering              -1.152693
global_orthographic_density * rel_frequency               -0.118217
global_orthographic_density * rel_letters_count            0.001695
global_orthographic_density * rel_orthographic_density    -0.563296
global_orthographic_density * rel_synonyms_count          -1.088294
global_synonyms_count * rel_aoa                            0.021906
global_synonyms_count * rel_clustering                    -1.071457
global_synonyms_count * rel_frequency                     -0.674527
global_synonyms_count * rel_letters_count                 -0.990457
global_synonyms_count * rel_orthographic_density          -0.835599
global_synonyms_count * rel_synonyms_count                -0.171652
rel_aoa * rel_clustering                                   0.250252
rel_aoa * rel_frequency                                   -0.017265
rel_aoa * rel_letters_count                                0.147367
rel_aoa * rel_orthographic_density                         0.219252
rel_aoa * rel_synonyms_count                              -0.027209
rel_clustering * rel_frequency                            -0.153256
rel_clustering * rel_letters_count                         0.145497
rel_clustering * rel_orthographic_density                  0.591292
rel_clustering * rel_synonyms_count                        0.623961
rel_frequency * rel_letters_count                          0.076005
rel_frequency * rel_orthographic_density                   0.203341
rel_frequency * rel_synonyms_count                         1.359934
rel_letters_count * rel_orthographic_density               0.183508
rel_letters_count * rel_synonyms_count                     1.452625
rel_orthographic_density * rel_synonyms_count              0.329870
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 325 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.08765891670551129

intercept                      3.104063
global_aoa                     0.365139
global_clustering             -0.301137
global_frequency              -0.087691
global_letters_count           0.048081
global_orthographic_density    0.128011
global_synonyms_count          0.192992
dtype: float64

Regressing global aoa with 325 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.18206122843092754

intercept                                             -5.104405
global_aoa                                             1.844127
global_clustering                                     -0.640242
global_frequency                                       0.687574
global_letters_count                                   0.761261
global_orthographic_density                           -4.561062
global_synonyms_count                                 -5.044618
global_aoa * global_clustering                         0.187103
global_aoa * global_frequency                         -0.009031
global_aoa * global_letters_count                     -0.075769
global_aoa * global_orthographic_density               0.020652
global_aoa * global_synonyms_count                     0.276283
global_clustering * global_frequency                   0.059813
global_clustering * global_letters_count              -0.078303
global_clustering * global_orthographic_density       -0.669320
global_clustering * global_synonyms_count             -0.365908
global_frequency * global_letters_count               -0.059938
global_frequency * global_orthographic_density         0.061949
global_frequency * global_synonyms_count              -0.042414
global_letters_count * global_orthographic_density    -0.060826
global_letters_count * global_synonyms_count           0.079459
global_orthographic_density * global_synonyms_count    0.978645
dtype: float64

Regressing rel aoa with 325 measures, no interactions
           ^^^^^^^
R^2 = 0.04863705710258115

intercept                      0.030572
global_aoa                     0.122744
global_clustering             -0.226698
global_frequency              -0.261447
global_letters_count           0.095510
global_orthographic_density    0.238730
global_synonyms_count          0.190229
dtype: float64

Regressing rel aoa with 325 measures, with interactions
           ^^^^^^^
R^2 = 0.15832116329448964

intercept                                             -3.721923
global_aoa                                             2.093858
global_clustering                                     -0.724134
global_frequency                                       0.295843
global_letters_count                                  -0.921193
global_orthographic_density                           -5.328107
global_synonyms_count                                 -2.429560
global_aoa * global_clustering                         0.184203
global_aoa * global_frequency                         -0.093859
global_aoa * global_letters_count                     -0.048658
global_aoa * global_orthographic_density               0.124655
global_aoa * global_synonyms_count                     0.165770
global_clustering * global_frequency                   0.086542
global_clustering * global_letters_count              -0.135370
global_clustering * global_orthographic_density       -0.532070
global_clustering * global_synonyms_count             -0.161640
global_frequency * global_letters_count                0.072168
global_frequency * global_orthographic_density         0.187135
global_frequency * global_synonyms_count              -0.132205
global_letters_count * global_orthographic_density    -0.100656
global_letters_count * global_synonyms_count           0.099152
global_orthographic_density * global_synonyms_count    0.931890
dtype: float64

Regressing global aoa with 325 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.021877547712287226

intercept                   6.980006
rel_aoa                     0.129237
rel_clustering              0.022984
rel_frequency               0.098973
rel_letters_count          -0.016628
rel_orthographic_density   -0.260613
rel_synonyms_count          0.223466
dtype: float64

Regressing global aoa with 325 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1497026448594715

intercept                                        7.012000
rel_aoa                                         -0.129800
rel_clustering                                   0.369906
rel_frequency                                    0.146947
rel_letters_count                               -0.148043
rel_orthographic_density                        -0.050661
rel_synonyms_count                               0.968431
rel_aoa * rel_clustering                         0.256825
rel_aoa * rel_frequency                         -0.119039
rel_aoa * rel_letters_count                      0.021393
rel_aoa * rel_orthographic_density               0.264639
rel_aoa * rel_synonyms_count                     0.406748
rel_clustering * rel_frequency                   0.272947
rel_clustering * rel_letters_count              -0.021581
rel_clustering * rel_orthographic_density       -0.335997
rel_clustering * rel_synonyms_count             -0.096754
rel_frequency * rel_letters_count               -0.003884
rel_frequency * rel_orthographic_density         0.053895
rel_frequency * rel_synonyms_count              -0.025264
rel_letters_count * rel_orthographic_density    -0.126025
rel_letters_count * rel_synonyms_count           0.018118
rel_orthographic_density * rel_synonyms_count    1.159029
dtype: float64

Regressing rel aoa with 325 measures, no interactions
           ^^^^^^^
R^2 = 0.19137974936996438

intercept                   0.403828
rel_aoa                     0.501534
rel_clustering             -0.237181
rel_frequency              -0.134934
rel_letters_count           0.022662
rel_orthographic_density    0.283275
rel_synonyms_count          0.065212
dtype: float64

Regressing rel aoa with 325 measures, with interactions
           ^^^^^^^
R^2 = 0.27726331117091463

intercept                                        0.696979
rel_aoa                                          0.510367
rel_clustering                                   0.124436
rel_frequency                                   -0.016917
rel_letters_count                                0.009420
rel_orthographic_density                         1.012642
rel_synonyms_count                               0.345112
rel_aoa * rel_clustering                         0.134598
rel_aoa * rel_frequency                         -0.054652
rel_aoa * rel_letters_count                     -0.025095
rel_aoa * rel_orthographic_density               0.192707
rel_aoa * rel_synonyms_count                     0.170765
rel_clustering * rel_frequency                   0.199827
rel_clustering * rel_letters_count              -0.114837
rel_clustering * rel_orthographic_density       -0.361825
rel_clustering * rel_synonyms_count             -0.027691
rel_frequency * rel_letters_count                0.023933
rel_frequency * rel_orthographic_density         0.220772
rel_frequency * rel_synonyms_count              -0.123513
rel_letters_count * rel_orthographic_density    -0.132854
rel_letters_count * rel_synonyms_count           0.006692
rel_orthographic_density * rel_synonyms_count    0.722675
dtype: float64

Regressing global aoa with 325 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.11522682203969636

intercept                      2.877697
global_aoa                     0.478136
global_clustering             -0.406021
global_frequency              -0.269138
global_letters_count           0.168485
global_orthographic_density    0.359186
global_synonyms_count          0.401911
rel_aoa                       -0.186878
rel_clustering                 0.100053
rel_frequency                  0.159398
rel_letters_count             -0.133112
rel_orthographic_density      -0.200539
rel_synonyms_count            -0.278908
dtype: float64

Regressing global aoa with 325 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.3504405793764501

intercept                                                 31.557588
global_aoa                                                 3.627129
global_clustering                                          9.441600
global_frequency                                           0.826839
global_letters_count                                      -0.223904
global_orthographic_density                              -15.210884
global_synonyms_count                                    -25.988039
rel_aoa                                                   -2.474572
rel_clustering                                             5.360760
rel_frequency                                              3.602362
rel_letters_count                                          5.452400
rel_orthographic_density                                   9.841188
rel_synonyms_count                                        14.495816
global_aoa * global_clustering                             0.271953
global_aoa * global_frequency                              0.056513
global_aoa * global_letters_count                         -0.341632
global_aoa * global_orthographic_density                  -0.274552
global_aoa * global_synonyms_count                         0.310278
global_aoa * rel_aoa                                       0.072147
global_aoa * rel_clustering                               -0.316984
global_aoa * rel_frequency                                 0.007648
global_aoa * rel_letters_count                             0.109091
global_aoa * rel_orthographic_density                     -0.021519
global_aoa * rel_synonyms_count                           -0.224171
global_clustering * global_frequency                      -0.169706
global_clustering * global_letters_count                  -0.889983
global_clustering * global_orthographic_density           -2.647198
global_clustering * global_synonyms_count                 -2.875337
global_clustering * rel_aoa                               -0.208481
global_clustering * rel_clustering                         0.075761
global_clustering * rel_frequency                          0.529692
global_clustering * rel_letters_count                      1.003690
global_clustering * rel_orthographic_density               1.349769
global_clustering * rel_synonyms_count                     2.735131
global_frequency * global_letters_count                   -0.362424
global_frequency * global_orthographic_density            -0.221918
global_frequency * global_synonyms_count                   0.282079
global_frequency * rel_aoa                                -0.038109
global_frequency * rel_clustering                         -0.599966
global_frequency * rel_frequency                          -0.063635
global_frequency * rel_letters_count                       0.091181
global_frequency * rel_orthographic_density                0.013347
global_frequency * rel_synonyms_count                      0.215904
global_letters_count * global_orthographic_density         0.738497
global_letters_count * global_synonyms_count               0.328103
global_letters_count * rel_aoa                             0.084289
global_letters_count * rel_clustering                      0.274586
global_letters_count * rel_frequency                       0.101212
global_letters_count * rel_letters_count                   0.031681
global_letters_count * rel_orthographic_density           -0.389232
global_letters_count * rel_synonyms_count                  0.879452
global_orthographic_density * global_synonyms_count        1.630334
global_orthographic_density * rel_aoa                      0.221339
global_orthographic_density * rel_clustering               1.563152
global_orthographic_density * rel_frequency               -0.142630
global_orthographic_density * rel_letters_count           -0.670530
global_orthographic_density * rel_orthographic_density     0.654575
global_orthographic_density * rel_synonyms_count          -1.443654
global_synonyms_count * rel_aoa                            0.301045
global_synonyms_count * rel_clustering                     1.469982
global_synonyms_count * rel_frequency                     -0.505684
global_synonyms_count * rel_letters_count                 -0.952904
global_synonyms_count * rel_orthographic_density          -1.378314
global_synonyms_count * rel_synonyms_count                 0.255870
rel_aoa * rel_clustering                                   0.514694
rel_aoa * rel_frequency                                   -0.047572
rel_aoa * rel_letters_count                                0.024160
rel_aoa * rel_orthographic_density                         0.294399
rel_aoa * rel_synonyms_count                              -0.074473
rel_clustering * rel_frequency                             0.385499
rel_clustering * rel_letters_count                        -0.519782
rel_clustering * rel_orthographic_density                 -0.618670
rel_clustering * rel_synonyms_count                       -1.346605
rel_frequency * rel_letters_count                          0.024882
rel_frequency * rel_orthographic_density                   0.447017
rel_frequency * rel_synonyms_count                         0.050333
rel_letters_count * rel_orthographic_density               0.521514
rel_letters_count * rel_synonyms_count                     0.093017
rel_orthographic_density * rel_synonyms_count              2.686834
dtype: float64

Regressing rel aoa with 325 measures, no interactions
           ^^^^^^^
R^2 = 0.23702840276663106

intercept                      0.829290
global_aoa                    -0.361209
global_clustering             -0.418694
global_frequency              -0.198431
global_letters_count           0.166569
global_orthographic_density    0.227145
global_synonyms_count          0.637630
rel_aoa                        0.756550
rel_clustering                 0.234645
rel_frequency                  0.034306
rel_letters_count             -0.083411
rel_orthographic_density      -0.031019
rel_synonyms_count            -0.498588
dtype: float64

Regressing rel aoa with 325 measures, with interactions
           ^^^^^^^
R^2 = 0.4449323230664066

intercept                                                 17.957292
global_aoa                                                 1.227183
global_clustering                                          5.055182
global_frequency                                           0.095500
global_letters_count                                       0.011958
global_orthographic_density                              -10.243494
global_synonyms_count                                    -22.124655
rel_aoa                                                   -1.039662
rel_clustering                                             8.347419
rel_frequency                                              2.054879
rel_letters_count                                          5.760051
rel_orthographic_density                                   6.508185
rel_synonyms_count                                        12.237177
global_aoa * global_clustering                             0.210683
global_aoa * global_frequency                              0.075963
global_aoa * global_letters_count                         -0.198655
global_aoa * global_orthographic_density                  -0.162756
global_aoa * global_synonyms_count                         0.559986
global_aoa * rel_aoa                                       0.031555
global_aoa * rel_clustering                               -0.128117
global_aoa * rel_frequency                                -0.050326
global_aoa * rel_letters_count                            -0.014744
global_aoa * rel_orthographic_density                     -0.051345
global_aoa * rel_synonyms_count                           -0.437040
global_clustering * global_frequency                      -0.157592
global_clustering * global_letters_count                  -0.568268
global_clustering * global_orthographic_density           -1.374018
global_clustering * global_synonyms_count                 -2.895097
global_clustering * rel_aoa                               -0.287641
global_clustering * rel_clustering                         0.139073
global_clustering * rel_frequency                          0.258103
global_clustering * rel_letters_count                      0.919979
global_clustering * rel_orthographic_density               0.683673
global_clustering * rel_synonyms_count                     2.633626
global_frequency * global_letters_count                   -0.275055
global_frequency * global_orthographic_density             0.018427
global_frequency * global_synonyms_count                  -0.093302
global_frequency * rel_aoa                                 0.009392
global_frequency * rel_clustering                         -0.578502
global_frequency * rel_frequency                          -0.001939
global_frequency * rel_letters_count                       0.074402
global_frequency * rel_orthographic_density               -0.087524
global_frequency * rel_synonyms_count                      0.420038
global_letters_count * global_orthographic_density         0.591678
global_letters_count * global_synonyms_count               0.169853
global_letters_count * rel_aoa                            -0.052014
global_letters_count * rel_clustering                     -0.003604
global_letters_count * rel_frequency                       0.047382
global_letters_count * rel_letters_count                   0.052203
global_letters_count * rel_orthographic_density           -0.248913
global_letters_count * rel_synonyms_count                  0.726559
global_orthographic_density * global_synonyms_count        1.427624
global_orthographic_density * rel_aoa                     -0.045154
global_orthographic_density * rel_clustering               0.484658
global_orthographic_density * rel_frequency               -0.265465
global_orthographic_density * rel_letters_count           -0.559099
global_orthographic_density * rel_orthographic_density     0.451350
global_orthographic_density * rel_synonyms_count          -1.131064
global_synonyms_count * rel_aoa                            0.205374
global_synonyms_count * rel_clustering                     1.118140
global_synonyms_count * rel_frequency                     -0.155958
global_synonyms_count * rel_letters_count                 -0.674140
global_synonyms_count * rel_orthographic_density          -0.945495
global_synonyms_count * rel_synonyms_count                 0.147588
rel_aoa * rel_clustering                                   0.451838
rel_aoa * rel_frequency                                   -0.041961
rel_aoa * rel_letters_count                                0.099896
rel_aoa * rel_orthographic_density                         0.326020
rel_aoa * rel_synonyms_count                              -0.155643
rel_clustering * rel_frequency                             0.565730
rel_clustering * rel_letters_count                        -0.406534
rel_clustering * rel_orthographic_density                  0.057227
rel_clustering * rel_synonyms_count                       -1.009667
rel_frequency * rel_letters_count                          0.055362
rel_frequency * rel_orthographic_density                   0.446669
rel_frequency * rel_synonyms_count                        -0.268571
rel_letters_count * rel_orthographic_density               0.436974
rel_letters_count * rel_synonyms_count                     0.017837
rel_orthographic_density * rel_synonyms_count              1.757045
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.050638473371792614

intercept                     -4.233109
global_aoa                    -0.032028
global_clustering              0.023386
global_frequency              -0.117915
global_letters_count          -0.008697
global_orthographic_density   -0.016726
global_synonyms_count         -0.036282
dtype: float64

Regressing global clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1416179786385382

intercept                                             -6.090148
global_aoa                                             0.294430
global_clustering                                      0.271469
global_frequency                                       0.128402
global_letters_count                                   0.208182
global_orthographic_density                           -0.077131
global_synonyms_count                                 -1.070096
global_aoa * global_clustering                         0.000066
global_aoa * global_frequency                         -0.045742
global_aoa * global_letters_count                      0.003709
global_aoa * global_orthographic_density               0.032887
global_aoa * global_synonyms_count                     0.069887
global_clustering * global_frequency                  -0.026531
global_clustering * global_letters_count               0.031576
global_clustering * global_orthographic_density       -0.088335
global_clustering * global_synonyms_count             -0.211564
global_frequency * global_letters_count               -0.002416
global_frequency * global_orthographic_density        -0.054799
global_frequency * global_synonyms_count               0.020403
global_letters_count * global_orthographic_density    -0.031343
global_letters_count * global_synonyms_count          -0.097898
global_orthographic_density * global_synonyms_count   -0.262359
dtype: float64

Regressing rel clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.05243656278608666

intercept                      1.663380
global_aoa                    -0.019656
global_clustering             -0.019650
global_frequency              -0.113447
global_letters_count          -0.025919
global_orthographic_density   -0.030702
global_synonyms_count         -0.109129
dtype: float64

Regressing rel clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11857556641408407

intercept                                              1.867427
global_aoa                                             0.180458
global_clustering                                      0.555839
global_frequency                                       0.027494
global_letters_count                                   0.067805
global_orthographic_density                            0.104038
global_synonyms_count                                 -1.566321
global_aoa * global_clustering                        -0.016167
global_aoa * global_frequency                         -0.035059
global_aoa * global_letters_count                     -0.002858
global_aoa * global_orthographic_density               0.014642
global_aoa * global_synonyms_count                     0.066251
global_clustering * global_frequency                  -0.035144
global_clustering * global_letters_count              -0.000924
global_clustering * global_orthographic_density       -0.044654
global_clustering * global_synonyms_count             -0.264951
global_frequency * global_letters_count               -0.006582
global_frequency * global_orthographic_density        -0.049818
global_frequency * global_synonyms_count               0.016592
global_letters_count * global_orthographic_density    -0.000140
global_letters_count * global_synonyms_count          -0.075376
global_orthographic_density * global_synonyms_count   -0.220161
dtype: float64

Regressing global clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0062361309487181105

intercept                  -5.796136
rel_aoa                    -0.005406
rel_clustering              0.014352
rel_frequency              -0.029364
rel_letters_count          -0.020461
rel_orthographic_density   -0.007555
rel_synonyms_count         -0.020792
dtype: float64

Regressing global clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0911623941309816

intercept                                       -5.789236
rel_aoa                                         -0.006813
rel_clustering                                  -0.014985
rel_frequency                                   -0.034715
rel_letters_count                                0.009228
rel_orthographic_density                         0.081638
rel_synonyms_count                              -0.070085
rel_aoa * rel_clustering                        -0.004117
rel_aoa * rel_frequency                         -0.033235
rel_aoa * rel_letters_count                     -0.011558
rel_aoa * rel_orthographic_density               0.067577
rel_aoa * rel_synonyms_count                     0.116639
rel_clustering * rel_frequency                   0.011615
rel_clustering * rel_letters_count              -0.012795
rel_clustering * rel_orthographic_density       -0.089373
rel_clustering * rel_synonyms_count             -0.208791
rel_frequency * rel_letters_count                0.015970
rel_frequency * rel_orthographic_density         0.013080
rel_frequency * rel_synonyms_count              -0.021744
rel_letters_count * rel_orthographic_density    -0.031768
rel_letters_count * rel_synonyms_count          -0.058519
rel_orthographic_density * rel_synonyms_count   -0.088862
dtype: float64

Regressing rel clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.0562417365160538

intercept                   0.254823
rel_aoa                    -0.029885
rel_clustering              0.222561
rel_frequency              -0.022388
rel_letters_count          -0.007515
rel_orthographic_density   -0.014645
rel_synonyms_count         -0.019609
dtype: float64

Regressing rel clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11092621276261183

intercept                                        0.286082
rel_aoa                                         -0.008795
rel_clustering                                   0.168344
rel_frequency                                   -0.010816
rel_letters_count                                0.010252
rel_orthographic_density                         0.022339
rel_synonyms_count                              -0.055970
rel_aoa * rel_clustering                        -0.021617
rel_aoa * rel_frequency                         -0.018276
rel_aoa * rel_letters_count                     -0.030167
rel_aoa * rel_orthographic_density              -0.004054
rel_aoa * rel_synonyms_count                     0.109241
rel_clustering * rel_frequency                  -0.006168
rel_clustering * rel_letters_count               0.016114
rel_clustering * rel_orthographic_density       -0.012093
rel_clustering * rel_synonyms_count             -0.133483
rel_frequency * rel_letters_count                0.002788
rel_frequency * rel_orthographic_density        -0.006340
rel_frequency * rel_synonyms_count              -0.007918
rel_letters_count * rel_orthographic_density    -0.017608
rel_letters_count * rel_synonyms_count          -0.030258
rel_orthographic_density * rel_synonyms_count   -0.030219
dtype: float64

Regressing global clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09133309370846443

intercept                     -1.150625
global_aoa                    -0.050539
global_clustering              0.208582
global_frequency              -0.241651
global_letters_count          -0.059052
global_orthographic_density   -0.056761
global_synonyms_count         -0.134229
rel_aoa                        0.030288
rel_clustering                -0.198923
rel_frequency                  0.135692
rel_letters_count              0.044271
rel_orthographic_density       0.065264
rel_synonyms_count             0.070925
dtype: float64

Regressing global clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3382618422221638

intercept                                                 20.172679
global_aoa                                                -0.290638
global_clustering                                          2.038190
global_frequency                                          -2.066535
global_letters_count                                       0.213710
global_orthographic_density                               -5.051673
global_synonyms_count                                     -4.032025
rel_aoa                                                    0.266521
rel_clustering                                            -2.168345
rel_frequency                                              1.740580
rel_letters_count                                          0.305776
rel_orthographic_density                                   5.638380
rel_synonyms_count                                         1.351339
global_aoa * global_clustering                            -0.068565
global_aoa * global_frequency                              0.014342
global_aoa * global_letters_count                         -0.031479
global_aoa * global_orthographic_density                  -0.170047
global_aoa * global_synonyms_count                        -0.005408
global_aoa * rel_aoa                                       0.001976
global_aoa * rel_clustering                                0.145286
global_aoa * rel_frequency                                -0.048112
global_aoa * rel_letters_count                             0.047875
global_aoa * rel_orthographic_density                      0.156061
global_aoa * rel_synonyms_count                            0.119168
global_clustering * global_frequency                      -0.149256
global_clustering * global_letters_count                   0.195437
global_clustering * global_orthographic_density           -0.254829
global_clustering * global_synonyms_count                 -0.929072
global_clustering * rel_aoa                                0.084135
global_clustering * rel_clustering                        -0.205380
global_clustering * rel_frequency                          0.118282
global_clustering * rel_letters_count                     -0.041356
global_clustering * rel_orthographic_density               0.209857
global_clustering * rel_synonyms_count                     0.981127
global_frequency * global_letters_count                    0.058521
global_frequency * global_orthographic_density             0.313889
global_frequency * global_synonyms_count                   0.136976
global_frequency * rel_aoa                                -0.016457
global_frequency * rel_clustering                         -0.046189
global_frequency * rel_frequency                           0.013158
global_frequency * rel_letters_count                      -0.048326
global_frequency * rel_orthographic_density               -0.338831
global_frequency * rel_synonyms_count                      0.052805
global_letters_count * global_orthographic_density         0.239212
global_letters_count * global_synonyms_count              -0.286827
global_letters_count * rel_aoa                             0.063113
global_letters_count * rel_clustering                     -0.099756
global_letters_count * rel_frequency                      -0.020775
global_letters_count * rel_letters_count                   0.002357
global_letters_count * rel_orthographic_density           -0.343698
global_letters_count * rel_synonyms_count                  0.407429
global_orthographic_density * global_synonyms_count       -1.047591
global_orthographic_density * rel_aoa                      0.153554
global_orthographic_density * rel_clustering               0.481278
global_orthographic_density * rel_frequency               -0.284535
global_orthographic_density * rel_letters_count           -0.143753
global_orthographic_density * rel_orthographic_density    -0.031824
global_orthographic_density * rel_synonyms_count           0.871810
global_synonyms_count * rel_aoa                           -0.071566
global_synonyms_count * rel_clustering                     0.387857
global_synonyms_count * rel_frequency                     -0.186083
global_synonyms_count * rel_letters_count                 -0.032325
global_synonyms_count * rel_orthographic_density           0.235367
global_synonyms_count * rel_synonyms_count                 0.001331
rel_aoa * rel_clustering                                  -0.065434
rel_aoa * rel_frequency                                    0.011473
rel_aoa * rel_letters_count                               -0.075325
rel_aoa * rel_orthographic_density                        -0.061865
rel_aoa * rel_synonyms_count                               0.098155
rel_clustering * rel_frequency                             0.043428
rel_clustering * rel_letters_count                        -0.026848
rel_clustering * rel_orthographic_density                 -0.407255
rel_clustering * rel_synonyms_count                       -0.637130
rel_frequency * rel_letters_count                          0.014206
rel_frequency * rel_orthographic_density                   0.280631
rel_frequency * rel_synonyms_count                        -0.010170
rel_letters_count * rel_orthographic_density               0.227560
rel_letters_count * rel_synonyms_count                    -0.163438
rel_orthographic_density * rel_synonyms_count             -0.077318
dtype: float64

Regressing rel clustering with 284 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2037254635011716

intercept                     -0.648258
global_aoa                    -0.036706
global_clustering             -0.646265
global_frequency              -0.214807
global_letters_count          -0.068775
global_orthographic_density   -0.068555
global_synonyms_count         -0.136477
rel_aoa                        0.018773
rel_clustering                 0.747233
rel_frequency                  0.114750
rel_letters_count              0.059500
rel_orthographic_density       0.066358
rel_synonyms_count             0.074675
dtype: float64

Regressing rel clustering with 284 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.40170239678938613

intercept                                                 20.077489
global_aoa                                                -0.464624
global_clustering                                          0.521824
global_frequency                                          -1.929998
global_letters_count                                      -0.254693
global_orthographic_density                               -5.273310
global_synonyms_count                                     -5.230788
rel_aoa                                                    0.345916
rel_clustering                                            -0.502356
rel_frequency                                              1.393646
rel_letters_count                                          0.738850
rel_orthographic_density                                   6.045720
rel_synonyms_count                                         2.493078
global_aoa * global_clustering                            -0.047576
global_aoa * global_frequency                              0.019480
global_aoa * global_letters_count                          0.005977
global_aoa * global_orthographic_density                  -0.109346
global_aoa * global_synonyms_count                        -0.010120
global_aoa * rel_aoa                                       0.002255
global_aoa * rel_clustering                                0.070135
global_aoa * rel_frequency                                -0.057475
global_aoa * rel_letters_count                             0.013014
global_aoa * rel_orthographic_density                      0.111146
global_aoa * rel_synonyms_count                            0.100492
global_clustering * global_frequency                      -0.100675
global_clustering * global_letters_count                   0.216310
global_clustering * global_orthographic_density           -0.263604
global_clustering * global_synonyms_count                 -1.030905
global_clustering * rel_aoa                                0.050793
global_clustering * rel_clustering                        -0.237027
global_clustering * rel_frequency                          0.057305
global_clustering * rel_letters_count                     -0.052563
global_clustering * rel_orthographic_density               0.306046
global_clustering * rel_synonyms_count                     1.081268
global_frequency * global_letters_count                    0.095360
global_frequency * global_orthographic_density             0.302920
global_frequency * global_synonyms_count                   0.018228
global_frequency * rel_aoa                                -0.026170
global_frequency * rel_clustering                         -0.083341
global_frequency * rel_frequency                           0.006609
global_frequency * rel_letters_count                      -0.066709
global_frequency * rel_orthographic_density               -0.312387
global_frequency * rel_synonyms_count                      0.181350
global_letters_count * global_orthographic_density         0.220545
global_letters_count * global_synonyms_count              -0.080668
global_letters_count * rel_aoa                             0.031927
global_letters_count * rel_clustering                     -0.097503
global_letters_count * rel_frequency                      -0.024433
global_letters_count * rel_letters_count                  -0.001072
global_letters_count * rel_orthographic_density           -0.294026
global_letters_count * rel_synonyms_count                  0.200581
global_orthographic_density * global_synonyms_count       -0.560442
global_orthographic_density * rel_aoa                      0.120904
global_orthographic_density * rel_clustering               0.419518
global_orthographic_density * rel_frequency               -0.239090
global_orthographic_density * rel_letters_count           -0.156616
global_orthographic_density * rel_orthographic_density    -0.022136
global_orthographic_density * rel_synonyms_count           0.395982
global_synonyms_count * rel_aoa                           -0.054915
global_synonyms_count * rel_clustering                     0.478000
global_synonyms_count * rel_frequency                     -0.122608
global_synonyms_count * rel_letters_count                 -0.166741
global_synonyms_count * rel_orthographic_density          -0.070887
global_synonyms_count * rel_synonyms_count                 0.007757
rel_aoa * rel_clustering                                  -0.017815
rel_aoa * rel_frequency                                    0.028197
rel_aoa * rel_letters_count                               -0.049933
rel_aoa * rel_orthographic_density                        -0.056488
rel_aoa * rel_synonyms_count                               0.066351
rel_clustering * rel_frequency                             0.082444
rel_clustering * rel_letters_count                        -0.019071
rel_clustering * rel_orthographic_density                 -0.434248
rel_clustering * rel_synonyms_count                       -0.702522
rel_frequency * rel_letters_count                          0.015632
rel_frequency * rel_orthographic_density                   0.245945
rel_frequency * rel_synonyms_count                        -0.076991
rel_letters_count * rel_orthographic_density               0.212477
rel_letters_count * rel_synonyms_count                    -0.004037
rel_orthographic_density * rel_synonyms_count              0.191352
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04510038459319465

intercept                      3.101288
global_aoa                     0.084913
global_clustering             -0.256353
global_frequency              -0.005323
global_letters_count           0.168921
global_orthographic_density    0.025663
global_synonyms_count         -0.086008
dtype: float64

Regressing global letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1062088321230984

intercept                                             -14.619790
global_aoa                                              1.363074
global_clustering                                      -2.274034
global_frequency                                        1.445045
global_letters_count                                    0.803535
global_orthographic_density                            -1.899625
global_synonyms_count                                   1.066185
global_aoa * global_clustering                          0.139604
global_aoa * global_frequency                           0.029613
global_aoa * global_letters_count                      -0.107346
global_aoa * global_orthographic_density               -0.117741
global_aoa * global_synonyms_count                      0.142233
global_clustering * global_frequency                    0.250875
global_clustering * global_letters_count               -0.133284
global_clustering * global_orthographic_density        -0.337942
global_clustering * global_synonyms_count               0.149001
global_frequency * global_letters_count                -0.055328
global_frequency * global_orthographic_density          0.120393
global_frequency * global_synonyms_count               -0.053061
global_letters_count * global_orthographic_density     -0.060886
global_letters_count * global_synonyms_count           -0.111233
global_orthographic_density * global_synonyms_count    -0.068963
dtype: float64

Regressing rel letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.02161022688703429

intercept                      0.164985
global_aoa                     0.015665
global_clustering             -0.258595
global_frequency              -0.070203
global_letters_count           0.154623
global_orthographic_density    0.112882
global_synonyms_count         -0.152628
dtype: float64

Regressing rel letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09327713034980634

intercept                                             -18.206701
global_aoa                                              1.326274
global_clustering                                      -3.674324
global_frequency                                        1.089939
global_letters_count                                    0.320409
global_orthographic_density                            -3.101788
global_synonyms_count                                   2.132430
global_aoa * global_clustering                          0.145687
global_aoa * global_frequency                           0.018049
global_aoa * global_letters_count                      -0.097388
global_aoa * global_orthographic_density               -0.060516
global_aoa * global_synonyms_count                      0.076616
global_clustering * global_frequency                    0.327542
global_clustering * global_letters_count               -0.039188
global_clustering * global_orthographic_density        -0.293091
global_clustering * global_synonyms_count               0.207356
global_frequency * global_letters_count                 0.049451
global_frequency * global_orthographic_density          0.265320
global_frequency * global_synonyms_count               -0.103779
global_letters_count * global_orthographic_density     -0.094191
global_letters_count * global_synonyms_count           -0.090363
global_orthographic_density * global_synonyms_count    -0.095940
dtype: float64

Regressing global letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.025526616729713018

intercept                   5.904757
rel_aoa                     0.004892
rel_clustering              0.026567
rel_frequency               0.055541
rel_letters_count           0.127637
rel_orthographic_density   -0.105122
rel_synonyms_count         -0.057063
dtype: float64

Regressing global letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0588414451331526

intercept                                        5.887647
rel_aoa                                         -0.206471
rel_clustering                                   0.226402
rel_frequency                                    0.106920
rel_letters_count                                0.254037
rel_orthographic_density                         0.114434
rel_synonyms_count                              -0.141521
rel_aoa * rel_clustering                        -0.010696
rel_aoa * rel_frequency                         -0.050903
rel_aoa * rel_letters_count                     -0.006578
rel_aoa * rel_orthographic_density              -0.075454
rel_aoa * rel_synonyms_count                     0.143779
rel_clustering * rel_frequency                   0.090993
rel_clustering * rel_letters_count              -0.063973
rel_clustering * rel_orthographic_density       -0.256394
rel_clustering * rel_synonyms_count              0.100600
rel_frequency * rel_letters_count                0.017212
rel_frequency * rel_orthographic_density         0.079690
rel_frequency * rel_synonyms_count              -0.031970
rel_letters_count * rel_orthographic_density     0.041429
rel_letters_count * rel_synonyms_count          -0.013639
rel_orthographic_density * rel_synonyms_count    0.123105
dtype: float64

Regressing rel letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.10602139260778887

intercept                   1.339289
rel_aoa                     0.015978
rel_clustering             -0.125450
rel_frequency              -0.163702
rel_letters_count           0.338854
rel_orthographic_density    0.265768
rel_synonyms_count         -0.113754
dtype: float64

Regressing rel letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.14634444534307456

intercept                                        1.395768
rel_aoa                                         -0.063158
rel_clustering                                   0.122211
rel_frequency                                   -0.115825
rel_letters_count                                0.486801
rel_orthographic_density                         0.603464
rel_synonyms_count                              -0.228183
rel_aoa * rel_clustering                         0.022921
rel_aoa * rel_frequency                         -0.007477
rel_aoa * rel_letters_count                     -0.057329
rel_aoa * rel_orthographic_density              -0.170863
rel_aoa * rel_synonyms_count                     0.162358
rel_clustering * rel_frequency                   0.152198
rel_clustering * rel_letters_count               0.006574
rel_clustering * rel_orthographic_density       -0.168219
rel_clustering * rel_synonyms_count              0.192199
rel_frequency * rel_letters_count                0.019901
rel_frequency * rel_orthographic_density         0.123523
rel_frequency * rel_synonyms_count              -0.015576
rel_letters_count * rel_orthographic_density     0.046397
rel_letters_count * rel_synonyms_count          -0.017027
rel_orthographic_density * rel_synonyms_count    0.085029
dtype: float64

Regressing global letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.061017628945866376

intercept                     -1.734606
global_aoa                     0.111275
global_clustering             -0.886008
global_frequency               0.034194
global_letters_count           0.258856
global_orthographic_density   -0.034220
global_synonyms_count          0.151912
rel_aoa                       -0.054342
rel_clustering                 0.715481
rel_frequency                 -0.050679
rel_letters_count             -0.085921
rel_orthographic_density       0.060427
rel_synonyms_count            -0.234822
dtype: float64

Regressing global letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.25246190660300616

intercept                                                -41.049176
global_aoa                                                 4.788521
global_clustering                                         -0.684638
global_frequency                                           4.277714
global_letters_count                                      -0.272334
global_orthographic_density                                0.696768
global_synonyms_count                                      3.297308
rel_aoa                                                   -4.434209
rel_clustering                                            -0.604654
rel_frequency                                             -0.354158
rel_letters_count                                          3.022071
rel_orthographic_density                                  -5.731204
rel_synonyms_count                                        -6.703788
global_aoa * global_clustering                             0.609656
global_aoa * global_frequency                              0.091857
global_aoa * global_letters_count                         -0.299623
global_aoa * global_orthographic_density                  -0.213797
global_aoa * global_synonyms_count                         0.568583
global_aoa * rel_aoa                                       0.037271
global_aoa * rel_clustering                               -0.329450
global_aoa * rel_frequency                                 0.039640
global_aoa * rel_letters_count                             0.018940
global_aoa * rel_orthographic_density                     -0.023709
global_aoa * rel_synonyms_count                           -0.473078
global_clustering * global_frequency                       0.278357
global_clustering * global_letters_count                  -1.195768
global_clustering * global_orthographic_density           -0.658841
global_clustering * global_synonyms_count                 -0.722261
global_clustering * rel_aoa                               -0.356689
global_clustering * rel_clustering                        -0.006816
global_clustering * rel_frequency                          0.230689
global_clustering * rel_letters_count                      1.081557
global_clustering * rel_orthographic_density              -0.191420
global_clustering * rel_synonyms_count                     0.221439
global_frequency * global_letters_count                   -0.459438
global_frequency * global_orthographic_density            -0.289452
global_frequency * global_synonyms_count                  -0.725276
global_frequency * rel_aoa                                 0.061813
global_frequency * rel_clustering                         -0.101905
global_frequency * rel_frequency                           0.030158
global_frequency * rel_letters_count                       0.311617
global_frequency * rel_orthographic_density                0.362942
global_frequency * rel_synonyms_count                      0.816860
global_letters_count * global_orthographic_density         0.045611
global_letters_count * global_synonyms_count              -0.659460
global_letters_count * rel_aoa                             0.250408
global_letters_count * rel_clustering                      0.914668
global_letters_count * rel_frequency                       0.159811
global_letters_count * rel_letters_count                   0.088747
global_letters_count * rel_orthographic_density            0.001331
global_letters_count * rel_synonyms_count                  0.825750
global_orthographic_density * global_synonyms_count        0.075547
global_orthographic_density * rel_aoa                     -0.004782
global_orthographic_density * rel_clustering               0.237327
global_orthographic_density * rel_frequency                0.129637
global_orthographic_density * rel_letters_count           -0.081281
global_orthographic_density * rel_orthographic_density     0.407829
global_orthographic_density * rel_synonyms_count          -0.953366
global_synonyms_count * rel_aoa                           -0.401366
global_synonyms_count * rel_clustering                     0.789508
global_synonyms_count * rel_frequency                      0.389836
global_synonyms_count * rel_letters_count                  0.181516
global_synonyms_count * rel_orthographic_density           0.293697
global_synonyms_count * rel_synonyms_count                -0.217170
rel_aoa * rel_clustering                                   0.273334
rel_aoa * rel_frequency                                   -0.159212
rel_aoa * rel_letters_count                               -0.106877
rel_aoa * rel_orthographic_density                         0.157981
rel_aoa * rel_synonyms_count                               0.498265
rel_clustering * rel_frequency                            -0.126046
rel_clustering * rel_letters_count                        -0.836722
rel_clustering * rel_orthographic_density                  0.516530
rel_clustering * rel_synonyms_count                       -0.442671
rel_frequency * rel_letters_count                         -0.036721
rel_frequency * rel_orthographic_density                   0.027066
rel_frequency * rel_synonyms_count                        -0.543720
rel_letters_count * rel_orthographic_density               0.314478
rel_letters_count * rel_synonyms_count                    -0.420111
rel_orthographic_density * rel_synonyms_count              0.625753
dtype: float64

Regressing rel letters_count with 363 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17455413011886922

intercept                     -2.116392
global_aoa                     0.055438
global_clustering             -0.746333
global_frequency               0.091964
global_letters_count          -0.519189
global_orthographic_density   -0.072471
global_synonyms_count          0.292167
rel_aoa                       -0.010262
rel_clustering                 0.594513
rel_frequency                 -0.112695
rel_letters_count              0.721090
rel_orthographic_density       0.076754
rel_synonyms_count            -0.391063
dtype: float64

Regressing rel letters_count with 363 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3347042074563037

intercept                                                -40.236852
global_aoa                                                 3.448342
global_clustering                                         -4.199598
global_frequency                                           3.612891
global_letters_count                                      -1.221804
global_orthographic_density                               -4.501475
global_synonyms_count                                      5.099285
rel_aoa                                                   -3.688395
rel_clustering                                             2.857380
rel_frequency                                             -0.521418
rel_letters_count                                          3.379815
rel_orthographic_density                                  -3.572707
rel_synonyms_count                                        -6.675621
global_aoa * global_clustering                             0.585506
global_aoa * global_frequency                              0.153761
global_aoa * global_letters_count                         -0.231915
global_aoa * global_orthographic_density                  -0.157429
global_aoa * global_synonyms_count                         0.478684
global_aoa * rel_aoa                                       0.030094
global_aoa * rel_clustering                               -0.296660
global_aoa * rel_frequency                                -0.031597
global_aoa * rel_letters_count                            -0.022594
global_aoa * rel_orthographic_density                     -0.059853
global_aoa * rel_synonyms_count                           -0.450677
global_clustering * global_frequency                       0.472278
global_clustering * global_letters_count                  -0.837887
global_clustering * global_orthographic_density           -0.682078
global_clustering * global_synonyms_count                 -0.443219
global_clustering * rel_aoa                               -0.439898
global_clustering * rel_clustering                        -0.116392
global_clustering * rel_frequency                         -0.002035
global_clustering * rel_letters_count                      0.674660
global_clustering * rel_orthographic_density              -0.384472
global_clustering * rel_synonyms_count                    -0.131966
global_frequency * global_letters_count                   -0.277661
global_frequency * global_orthographic_density             0.116141
global_frequency * global_synonyms_count                  -0.709331
global_frequency * rel_aoa                                -0.002554
global_frequency * rel_clustering                         -0.343474
global_frequency * rel_frequency                           0.038156
global_frequency * rel_letters_count                       0.154805
global_frequency * rel_orthographic_density                0.077583
global_frequency * rel_synonyms_count                      0.673683
global_letters_count * global_orthographic_density         0.103850
global_letters_count * global_synonyms_count              -0.508329
global_letters_count * rel_aoa                             0.185028
global_letters_count * rel_clustering                      0.583375
global_letters_count * rel_frequency                       0.072099
global_letters_count * rel_letters_count                   0.064765
global_letters_count * rel_orthographic_density            0.002283
global_letters_count * rel_synonyms_count                  0.591859
global_orthographic_density * global_synonyms_count       -0.028487
global_orthographic_density * rel_aoa                     -0.047957
global_orthographic_density * rel_clustering               0.134833
global_orthographic_density * rel_frequency               -0.171760
global_orthographic_density * rel_letters_count           -0.085156
global_orthographic_density * rel_orthographic_density     0.406598
global_orthographic_density * rel_synonyms_count          -0.917172
global_synonyms_count * rel_aoa                           -0.299191
global_synonyms_count * rel_clustering                     0.625194
global_synonyms_count * rel_frequency                      0.394592
global_synonyms_count * rel_letters_count                  0.056798
global_synonyms_count * rel_orthographic_density           0.434252
global_synonyms_count * rel_synonyms_count                -0.238765
rel_aoa * rel_clustering                                   0.341005
rel_aoa * rel_frequency                                   -0.075570
rel_aoa * rel_letters_count                               -0.052189
rel_aoa * rel_orthographic_density                         0.206809
rel_aoa * rel_synonyms_count                               0.479201
rel_clustering * rel_frequency                             0.126666
rel_clustering * rel_letters_count                        -0.462003
rel_clustering * rel_orthographic_density                  0.768364
rel_clustering * rel_synonyms_count                       -0.142583
rel_frequency * rel_letters_count                          0.030101
rel_frequency * rel_orthographic_density                   0.202995
rel_frequency * rel_synonyms_count                        -0.417557
rel_letters_count * rel_orthographic_density               0.252395
rel_letters_count * rel_synonyms_count                    -0.195269
rel_orthographic_density * rel_synonyms_count              0.599341
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07102730992049866

intercept                      0.551113
global_aoa                     0.004304
global_clustering             -0.018776
global_frequency              -0.017437
global_letters_count          -0.035587
global_orthographic_density    0.022279
global_synonyms_count          0.197838
dtype: float64

Regressing global synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10150456090149362

intercept                                             -0.314684
global_aoa                                             0.276878
global_clustering                                      0.295852
global_frequency                                       0.052537
global_letters_count                                   0.132992
global_orthographic_density                            0.149330
global_synonyms_count                                 -0.591193
global_aoa * global_clustering                        -0.015769
global_aoa * global_frequency                         -0.026165
global_aoa * global_letters_count                     -0.015914
global_aoa * global_orthographic_density              -0.027867
global_aoa * global_synonyms_count                     0.019086
global_clustering * global_frequency                  -0.022070
global_clustering * global_letters_count               0.007354
global_clustering * global_orthographic_density       -0.023575
global_clustering * global_synonyms_count             -0.054552
global_frequency * global_letters_count               -0.001631
global_frequency * global_orthographic_density        -0.013532
global_frequency * global_synonyms_count               0.016772
global_letters_count * global_orthographic_density     0.008277
global_letters_count * global_synonyms_count           0.020379
global_orthographic_density * global_synonyms_count    0.043172
dtype: float64

Regressing rel synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.04686141801965693

intercept                      0.148693
global_aoa                     0.001599
global_clustering             -0.035451
global_frequency              -0.010200
global_letters_count          -0.036715
global_orthographic_density    0.007484
global_synonyms_count          0.133297
dtype: float64

Regressing rel synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07843476033584207

intercept                                             -0.122151
global_aoa                                             0.241019
global_clustering                                      0.312447
global_frequency                                      -0.069587
global_letters_count                                   0.159496
global_orthographic_density                            0.302809
global_synonyms_count                                 -0.734377
global_aoa * global_clustering                        -0.015278
global_aoa * global_frequency                         -0.020439
global_aoa * global_letters_count                     -0.016893
global_aoa * global_orthographic_density              -0.034524
global_aoa * global_synonyms_count                     0.017857
global_clustering * global_frequency                  -0.034292
global_clustering * global_letters_count               0.011502
global_clustering * global_orthographic_density        0.012301
global_clustering * global_synonyms_count             -0.056366
global_frequency * global_letters_count               -0.001142
global_frequency * global_orthographic_density        -0.002180
global_frequency * global_synonyms_count               0.019995
global_letters_count * global_orthographic_density     0.005464
global_letters_count * global_synonyms_count           0.027635
global_orthographic_density * global_synonyms_count    0.053196
dtype: float64

Regressing global synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07751444634080118

intercept                   0.491320
rel_aoa                     0.043657
rel_clustering             -0.058681
rel_frequency              -0.011159
rel_letters_count          -0.044532
rel_orthographic_density    0.035357
rel_synonyms_count          0.180042
dtype: float64

Regressing global synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11740592538489314

intercept                                        0.529129
rel_aoa                                          0.020173
rel_clustering                                  -0.125597
rel_frequency                                   -0.000131
rel_letters_count                               -0.084406
rel_orthographic_density                         0.023506
rel_synonyms_count                               0.100359
rel_aoa * rel_clustering                        -0.028024
rel_aoa * rel_frequency                         -0.019186
rel_aoa * rel_letters_count                      0.009811
rel_aoa * rel_orthographic_density               0.037828
rel_aoa * rel_synonyms_count                     0.017429
rel_clustering * rel_frequency                  -0.013750
rel_clustering * rel_letters_count               0.026755
rel_clustering * rel_orthographic_density        0.006224
rel_clustering * rel_synonyms_count             -0.078032
rel_frequency * rel_letters_count               -0.001621
rel_frequency * rel_orthographic_density        -0.011219
rel_frequency * rel_synonyms_count               0.008470
rel_letters_count * rel_orthographic_density    -0.018762
rel_letters_count * rel_synonyms_count           0.057870
rel_orthographic_density * rel_synonyms_count    0.039015
dtype: float64

Regressing rel synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.11482056174791921

intercept                   0.157510
rel_aoa                     0.018048
rel_clustering             -0.014963
rel_frequency              -0.005337
rel_letters_count          -0.042612
rel_orthographic_density   -0.005212
rel_synonyms_count          0.289492
dtype: float64

Regressing rel synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.15570005065418902

intercept                                        0.212750
rel_aoa                                         -0.010562
rel_clustering                                  -0.133763
rel_frequency                                    0.018497
rel_letters_count                               -0.085121
rel_orthographic_density                         0.001873
rel_synonyms_count                               0.327418
rel_aoa * rel_clustering                        -0.013855
rel_aoa * rel_frequency                         -0.014158
rel_aoa * rel_letters_count                      0.006673
rel_aoa * rel_orthographic_density               0.013465
rel_aoa * rel_synonyms_count                    -0.013026
rel_clustering * rel_frequency                  -0.030910
rel_clustering * rel_letters_count               0.031059
rel_clustering * rel_orthographic_density        0.013010
rel_clustering * rel_synonyms_count             -0.057598
rel_frequency * rel_letters_count               -0.004872
rel_frequency * rel_orthographic_density        -0.004961
rel_frequency * rel_synonyms_count               0.030187
rel_letters_count * rel_orthographic_density    -0.012471
rel_letters_count * rel_synonyms_count           0.068075
rel_orthographic_density * rel_synonyms_count    0.085656
dtype: float64

Regressing global synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09666810714534846

intercept                      1.481036
global_aoa                    -0.041266
global_clustering              0.080408
global_frequency              -0.033489
global_letters_count          -0.002706
global_orthographic_density    0.044650
global_synonyms_count          0.141714
rel_aoa                        0.069456
rel_clustering                -0.108504
rel_frequency                  0.021948
rel_letters_count             -0.036075
rel_orthographic_density      -0.017401
rel_synonyms_count             0.049662
dtype: float64

Regressing global synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3077507182619169

intercept                                                  1.889708
global_aoa                                                 1.442009
global_clustering                                          2.191221
global_frequency                                          -0.485791
global_letters_count                                       0.246265
global_orthographic_density                                2.959773
global_synonyms_count                                      5.693639
rel_aoa                                                   -0.229408
rel_clustering                                            -3.266521
rel_frequency                                              0.773391
rel_letters_count                                         -0.002797
rel_orthographic_density                                  -1.779343
rel_synonyms_count                                       -11.624813
global_aoa * global_clustering                             0.011248
global_aoa * global_frequency                             -0.062197
global_aoa * global_letters_count                         -0.090099
global_aoa * global_orthographic_density                  -0.162162
global_aoa * global_synonyms_count                        -0.051618
global_aoa * rel_aoa                                      -0.018916
global_aoa * rel_clustering                                0.021687
global_aoa * rel_frequency                                 0.048971
global_aoa * rel_letters_count                             0.055230
global_aoa * rel_orthographic_density                      0.096468
global_aoa * rel_synonyms_count                            0.139470
global_clustering * global_frequency                      -0.187998
global_clustering * global_letters_count                  -0.090557
global_clustering * global_orthographic_density            0.129593
global_clustering * global_synonyms_count                  0.527707
global_clustering * rel_aoa                                0.002793
global_clustering * rel_clustering                        -0.020752
global_clustering * rel_frequency                          0.185816
global_clustering * rel_letters_count                      0.086934
global_clustering * rel_orthographic_density              -0.073106
global_clustering * rel_synonyms_count                    -0.912540
global_frequency * global_letters_count                   -0.022289
global_frequency * global_orthographic_density            -0.120600
global_frequency * global_synonyms_count                   0.137746
global_frequency * rel_aoa                                 0.024973
global_frequency * rel_clustering                          0.190542
global_frequency * rel_frequency                          -0.012010
global_frequency * rel_letters_count                      -0.011933
global_frequency * rel_orthographic_density                0.077341
global_frequency * rel_synonyms_count                      0.038515
global_letters_count * global_orthographic_density         0.076320
global_letters_count * global_synonyms_count              -0.394681
global_letters_count * rel_aoa                             0.049621
global_letters_count * rel_clustering                      0.160062
global_letters_count * rel_frequency                       0.015150
global_letters_count * rel_letters_count                   0.006046
global_letters_count * rel_orthographic_density           -0.046922
global_letters_count * rel_synonyms_count                  0.598629
global_orthographic_density * global_synonyms_count       -0.898330
global_orthographic_density * rel_aoa                     -0.026053
global_orthographic_density * rel_clustering               0.000887
global_orthographic_density * rel_frequency                0.071595
global_orthographic_density * rel_letters_count            0.122841
global_orthographic_density * rel_orthographic_density     0.168283
global_orthographic_density * rel_synonyms_count           1.013839
global_synonyms_count * rel_aoa                           -0.140488
global_synonyms_count * rel_clustering                    -0.313070
global_synonyms_count * rel_frequency                     -0.095283
global_synonyms_count * rel_letters_count                  0.186899
global_synonyms_count * rel_orthographic_density           0.309540
global_synonyms_count * rel_synonyms_count                 0.174803
rel_aoa * rel_clustering                                  -0.051507
rel_aoa * rel_frequency                                   -0.051038
rel_aoa * rel_letters_count                               -0.005274
rel_aoa * rel_orthographic_density                         0.092807
rel_aoa * rel_synonyms_count                               0.086177
rel_clustering * rel_frequency                            -0.216466
rel_clustering * rel_letters_count                        -0.116944
rel_clustering * rel_orthographic_density                  0.003290
rel_clustering * rel_synonyms_count                        0.603596
rel_frequency * rel_letters_count                          0.023610
rel_frequency * rel_orthographic_density                  -0.025523
rel_frequency * rel_synonyms_count                        -0.057795
rel_letters_count * rel_orthographic_density              -0.069398
rel_letters_count * rel_synonyms_count                    -0.279814
rel_orthographic_density * rel_synonyms_count             -0.304256
dtype: float64

Regressing rel synonyms_count with 353 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.19850337570925303

intercept                      0.830449
global_aoa                    -0.039603
global_clustering              0.022541
global_frequency              -0.026354
global_letters_count           0.006256
global_orthographic_density    0.059201
global_synonyms_count         -0.552820
rel_aoa                        0.055413
rel_clustering                -0.039761
rel_frequency                  0.017204
rel_letters_count             -0.043188
rel_orthographic_density      -0.066144
rel_synonyms_count             0.812803
dtype: float64

Regressing rel synonyms_count with 353 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.3928768799899093

intercept                                                 -2.869870
global_aoa                                                 1.036140
global_clustering                                          1.347739
global_frequency                                          -0.268956
global_letters_count                                       0.273281
global_orthographic_density                                4.213503
global_synonyms_count                                      6.317736
rel_aoa                                                    0.059147
rel_clustering                                            -2.328268
rel_frequency                                              0.187479
rel_letters_count                                         -0.397754
rel_orthographic_density                                  -3.464210
rel_synonyms_count                                       -11.679325
global_aoa * global_clustering                             0.012991
global_aoa * global_frequency                             -0.035981
global_aoa * global_letters_count                         -0.063759
global_aoa * global_orthographic_density                  -0.155857
global_aoa * global_synonyms_count                        -0.020465
global_aoa * rel_aoa                                      -0.015178
global_aoa * rel_clustering                                0.038210
global_aoa * rel_frequency                                 0.038214
global_aoa * rel_letters_count                             0.050232
global_aoa * rel_orthographic_density                      0.125435
global_aoa * rel_synonyms_count                            0.109797
global_clustering * global_frequency                      -0.145257
global_clustering * global_letters_count                  -0.068086
global_clustering * global_orthographic_density            0.212150
global_clustering * global_synonyms_count                  0.529926
global_clustering * rel_aoa                               -0.017018
global_clustering * rel_clustering                        -0.017498
global_clustering * rel_frequency                          0.115075
global_clustering * rel_letters_count                      0.061214
global_clustering * rel_orthographic_density              -0.112424
global_clustering * rel_synonyms_count                    -0.891335
global_frequency * global_letters_count                   -0.012847
global_frequency * global_orthographic_density            -0.162080
global_frequency * global_synonyms_count                  -0.024001
global_frequency * rel_aoa                                -0.006196
global_frequency * rel_clustering                          0.125448
global_frequency * rel_frequency                          -0.013374
global_frequency * rel_letters_count                       0.000203
global_frequency * rel_orthographic_density                0.161790
global_frequency * rel_synonyms_count                      0.178598
global_letters_count * global_orthographic_density         0.003414
global_letters_count * global_synonyms_count              -0.381238
global_letters_count * rel_aoa                             0.021188
global_letters_count * rel_clustering                      0.117356
global_letters_count * rel_frequency                       0.018123
global_letters_count * rel_letters_count                   0.006364
global_letters_count * rel_orthographic_density            0.014294
global_letters_count * rel_synonyms_count                  0.528510
global_orthographic_density * global_synonyms_count       -0.733173
global_orthographic_density * rel_aoa                     -0.029910
global_orthographic_density * rel_clustering              -0.018298
global_orthographic_density * rel_frequency                0.150200
global_orthographic_density * rel_letters_count            0.172690
global_orthographic_density * rel_orthographic_density     0.152338
global_orthographic_density * rel_synonyms_count           0.909698
global_synonyms_count * rel_aoa                           -0.093246
global_synonyms_count * rel_clustering                    -0.254447
global_synonyms_count * rel_frequency                      0.080900
global_synonyms_count * rel_letters_count                  0.249153
global_synonyms_count * rel_orthographic_density           0.394196
global_synonyms_count * rel_synonyms_count                 0.190539
rel_aoa * rel_clustering                                  -0.048486
rel_aoa * rel_frequency                                   -0.031783
rel_aoa * rel_letters_count                                0.004298
rel_aoa * rel_orthographic_density                         0.067192
rel_aoa * rel_synonyms_count                               0.025559
rel_clustering * rel_frequency                            -0.141207
rel_clustering * rel_letters_count                        -0.092348
rel_clustering * rel_orthographic_density                 -0.027408
rel_clustering * rel_synonyms_count                        0.520015
rel_frequency * rel_letters_count                          0.006209
rel_frequency * rel_orthographic_density                  -0.117671
rel_frequency * rel_synonyms_count                        -0.215751
rel_letters_count * rel_orthographic_density              -0.118661
rel_letters_count * rel_synonyms_count                    -0.289796
rel_orthographic_density * rel_synonyms_count             -0.446359
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0796733420303598

intercept                      1.212593
global_aoa                    -0.003989
global_clustering              0.112266
global_frequency               0.022672
global_letters_count           0.023361
global_orthographic_density    0.305927
global_synonyms_count          0.120550
dtype: float64

Regressing global orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1203054333374246

intercept                                              4.478995
global_aoa                                            -0.401452
global_clustering                                      0.821128
global_frequency                                       0.151999
global_letters_count                                  -0.266499
global_orthographic_density                            1.298247
global_synonyms_count                                 -0.160868
global_aoa * global_clustering                        -0.057861
global_aoa * global_frequency                         -0.023496
global_aoa * global_letters_count                      0.034692
global_aoa * global_orthographic_density               0.056257
global_aoa * global_synonyms_count                    -0.018783
global_clustering * global_frequency                  -0.033022
global_clustering * global_letters_count              -0.011347
global_clustering * global_orthographic_density        0.070610
global_clustering * global_synonyms_count             -0.096115
global_frequency * global_letters_count               -0.004756
global_frequency * global_orthographic_density        -0.103639
global_frequency * global_synonyms_count              -0.016358
global_letters_count * global_orthographic_density    -0.000953
global_letters_count * global_synonyms_count           0.005977
global_orthographic_density * global_synonyms_count   -0.049639
dtype: float64

Regressing rel orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.065759883875253

intercept                     -1.201429
global_aoa                     0.017047
global_clustering              0.099839
global_frequency               0.050772
global_letters_count          -0.002938
global_orthographic_density    0.226524
global_synonyms_count          0.137351
dtype: float64

Regressing rel orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10977873014967798

intercept                                              4.109347
global_aoa                                            -0.300388
global_clustering                                      1.331622
global_frequency                                       0.152252
global_letters_count                                  -0.501706
global_orthographic_density                            1.046582
global_synonyms_count                                 -0.383433
global_aoa * global_clustering                        -0.031241
global_aoa * global_frequency                         -0.012118
global_aoa * global_letters_count                      0.035590
global_aoa * global_orthographic_density               0.027831
global_aoa * global_synonyms_count                    -0.025827
global_clustering * global_frequency                  -0.055328
global_clustering * global_letters_count              -0.081460
global_clustering * global_orthographic_density        0.012096
global_clustering * global_synonyms_count             -0.135004
global_frequency * global_letters_count               -0.030101
global_frequency * global_orthographic_density        -0.116477
global_frequency * global_synonyms_count              -0.042302
global_letters_count * global_orthographic_density     0.023652
global_letters_count * global_synonyms_count           0.047933
global_orthographic_density * global_synonyms_count    0.001461
dtype: float64

Regressing global orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06939168947070296

intercept                   1.467427
rel_aoa                     0.025361
rel_clustering              0.038560
rel_frequency              -0.009216
rel_letters_count           0.062538
rel_orthographic_density    0.366426
rel_synonyms_count          0.164209
dtype: float64

Regressing global orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1117488498618351

intercept                                        1.392167
rel_aoa                                          0.123118
rel_clustering                                   0.094030
rel_frequency                                   -0.085402
rel_letters_count                               -0.011549
rel_orthographic_density                         0.245747
rel_synonyms_count                               0.149044
rel_aoa * rel_clustering                        -0.022239
rel_aoa * rel_frequency                          0.020661
rel_aoa * rel_letters_count                     -0.007210
rel_aoa * rel_orthographic_density               0.007316
rel_aoa * rel_synonyms_count                    -0.053454
rel_clustering * rel_frequency                  -0.002767
rel_clustering * rel_letters_count               0.001869
rel_clustering * rel_orthographic_density        0.067243
rel_clustering * rel_synonyms_count             -0.020029
rel_frequency * rel_letters_count               -0.003760
rel_frequency * rel_orthographic_density        -0.080312
rel_frequency * rel_synonyms_count               0.018518
rel_letters_count * rel_orthographic_density    -0.052917
rel_letters_count * rel_synonyms_count           0.041222
rel_orthographic_density * rel_synonyms_count   -0.026695
dtype: float64

Regressing rel orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1228602585329396

intercept                  -0.531163
rel_aoa                     0.022203
rel_clustering              0.042020
rel_frequency               0.047060
rel_letters_count           0.056337
rel_orthographic_density    0.423725
rel_synonyms_count          0.133092
dtype: float64

Regressing rel orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15434380583948015

intercept                                       -0.567231
rel_aoa                                          0.086733
rel_clustering                                   0.064794
rel_frequency                                    0.013438
rel_letters_count                               -0.005763
rel_orthographic_density                         0.288550
rel_synonyms_count                              -0.023885
rel_aoa * rel_clustering                        -0.018520
rel_aoa * rel_frequency                          0.010308
rel_aoa * rel_letters_count                      0.002462
rel_aoa * rel_orthographic_density               0.024826
rel_aoa * rel_synonyms_count                    -0.084700
rel_clustering * rel_frequency                  -0.021120
rel_clustering * rel_letters_count              -0.021855
rel_clustering * rel_orthographic_density        0.038758
rel_clustering * rel_synonyms_count             -0.040398
rel_frequency * rel_letters_count               -0.014268
rel_frequency * rel_orthographic_density        -0.067022
rel_frequency * rel_synonyms_count              -0.010795
rel_letters_count * rel_orthographic_density    -0.023890
rel_letters_count * rel_synonyms_count           0.045017
rel_orthographic_density * rel_synonyms_count   -0.098404
dtype: float64

Regressing global orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10892338921433986

intercept                      3.948600
global_aoa                    -0.013719
global_clustering              0.305726
global_frequency              -0.022671
global_letters_count          -0.155321
global_orthographic_density    0.224636
global_synonyms_count         -0.096163
rel_aoa                        0.025764
rel_clustering                -0.197672
rel_frequency                  0.064466
rel_letters_count              0.195216
rel_orthographic_density       0.086981
rel_synonyms_count             0.265767
dtype: float64

Regressing global orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2946937164356933

intercept                                                -2.549255
global_aoa                                               -0.029654
global_clustering                                         0.986358
global_frequency                                         -0.070905
global_letters_count                                      1.594180
global_orthographic_density                               5.034105
global_synonyms_count                                     0.028281
rel_aoa                                                  -0.953657
rel_clustering                                           -0.176725
rel_frequency                                            -0.868701
rel_letters_count                                        -1.482381
rel_orthographic_density                                 -3.438566
rel_synonyms_count                                        3.700232
global_aoa * global_clustering                           -0.128512
global_aoa * global_frequency                            -0.097910
global_aoa * global_letters_count                         0.002779
global_aoa * global_orthographic_density                  0.112809
global_aoa * global_synonyms_count                       -0.225331
global_aoa * rel_aoa                                      0.002392
global_aoa * rel_clustering                               0.083682
global_aoa * rel_frequency                                0.024801
global_aoa * rel_letters_count                            0.064350
global_aoa * rel_orthographic_density                    -0.058157
global_aoa * rel_synonyms_count                           0.345276
global_clustering * global_frequency                     -0.157227
global_clustering * global_letters_count                  0.274951
global_clustering * global_orthographic_density           0.317039
global_clustering * global_synonyms_count                -0.598656
global_clustering * rel_aoa                              -0.081174
global_clustering * rel_clustering                        0.103126
global_clustering * rel_frequency                        -0.120066
global_clustering * rel_letters_count                    -0.064865
global_clustering * rel_orthographic_density              0.298744
global_clustering * rel_synonyms_count                    0.998600
global_frequency * global_letters_count                   0.046674
global_frequency * global_orthographic_density           -0.249329
global_frequency * global_synonyms_count                  0.056224
global_frequency * rel_aoa                                0.082142
global_frequency * rel_clustering                         0.209397
global_frequency * rel_frequency                         -0.004477
global_frequency * rel_letters_count                      0.013937
global_frequency * rel_orthographic_density               0.384441
global_frequency * rel_synonyms_count                    -0.087802
global_letters_count * global_orthographic_density       -0.206708
global_letters_count * global_synonyms_count             -0.206711
global_letters_count * rel_aoa                           -0.003193
global_letters_count * rel_clustering                    -0.297428
global_letters_count * rel_frequency                     -0.027130
global_letters_count * rel_letters_count                 -0.003319
global_letters_count * rel_orthographic_density           0.343183
global_letters_count * rel_synonyms_count                -0.134614
global_orthographic_density * global_synonyms_count      -1.040771
global_orthographic_density * rel_aoa                    -0.154668
global_orthographic_density * rel_clustering             -0.434602
global_orthographic_density * rel_frequency               0.038942
global_orthographic_density * rel_letters_count           0.175847
global_orthographic_density * rel_orthographic_density   -0.163181
global_orthographic_density * rel_synonyms_count          1.004750
global_synonyms_count * rel_aoa                           0.306164
global_synonyms_count * rel_clustering                    0.334783
global_synonyms_count * rel_frequency                     0.021769
global_synonyms_count * rel_letters_count                 0.289811
global_synonyms_count * rel_orthographic_density          0.905218
global_synonyms_count * rel_synonyms_count               -0.122960
rel_aoa * rel_clustering                                 -0.035022
rel_aoa * rel_frequency                                  -0.011229
rel_aoa * rel_letters_count                              -0.061974
rel_aoa * rel_orthographic_density                        0.082218
rel_aoa * rel_synonyms_count                             -0.496914
rel_clustering * rel_frequency                            0.040943
rel_clustering * rel_letters_count                        0.111802
rel_clustering * rel_orthographic_density                -0.063082
rel_clustering * rel_synonyms_count                      -0.777453
rel_frequency * rel_letters_count                        -0.042543
rel_frequency * rel_orthographic_density                 -0.285138
rel_frequency * rel_synonyms_count                        0.026278
rel_letters_count * rel_orthographic_density             -0.391140
rel_letters_count * rel_synonyms_count                    0.022444
rel_orthographic_density * rel_synonyms_count            -0.972993
dtype: float64

Regressing rel orthographic_density with 301 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16324109343048954

intercept                      2.704373
global_aoa                     0.022017
global_clustering              0.311356
global_frequency              -0.004483
global_letters_count          -0.128292
global_orthographic_density   -0.401565
global_synonyms_count         -0.028706
rel_aoa                       -0.005729
rel_clustering                -0.219738
rel_frequency                  0.052893
rel_letters_count              0.150584
rel_orthographic_density       0.770351
rel_synonyms_count             0.164606
dtype: float64

Regressing rel orthographic_density with 301 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3409455297557312

intercept                                                -4.518858
global_aoa                                                0.502380
global_clustering                                         0.676386
global_frequency                                          0.087469
global_letters_count                                      1.179503
global_orthographic_density                               2.756896
global_synonyms_count                                     0.443655
rel_aoa                                                  -1.188507
rel_clustering                                           -0.819364
rel_frequency                                            -1.188307
rel_letters_count                                        -1.057457
rel_orthographic_density                                 -0.986245
rel_synonyms_count                                        3.713221
global_aoa * global_clustering                           -0.057317
global_aoa * global_frequency                            -0.086914
global_aoa * global_letters_count                         0.020091
global_aoa * global_orthographic_density                 -0.005163
global_aoa * global_synonyms_count                       -0.155763
global_aoa * rel_aoa                                      0.003896
global_aoa * rel_clustering                               0.068656
global_aoa * rel_frequency                                0.038960
global_aoa * rel_letters_count                            0.060968
global_aoa * rel_orthographic_density                     0.094008
global_aoa * rel_synonyms_count                           0.290060
global_clustering * global_frequency                     -0.122916
global_clustering * global_letters_count                  0.259977
global_clustering * global_orthographic_density           0.119300
global_clustering * global_synonyms_count                -0.629148
global_clustering * rel_aoa                              -0.092592
global_clustering * rel_clustering                        0.065859
global_clustering * rel_frequency                        -0.130498
global_clustering * rel_letters_count                    -0.007644
global_clustering * rel_orthographic_density              0.583655
global_clustering * rel_synonyms_count                    1.036050
global_frequency * global_letters_count                   0.038208
global_frequency * global_orthographic_density           -0.212943
global_frequency * global_synonyms_count                  0.009415
global_frequency * rel_aoa                                0.085980
global_frequency * rel_clustering                         0.177590
global_frequency * rel_frequency                          0.002553
global_frequency * rel_letters_count                      0.027710
global_frequency * rel_orthographic_density               0.368851
global_frequency * rel_synonyms_count                    -0.097502
global_letters_count * global_orthographic_density       -0.039070
global_letters_count * global_synonyms_count             -0.380663
global_letters_count * rel_aoa                           -0.018533
global_letters_count * rel_clustering                    -0.314772
global_letters_count * rel_frequency                     -0.039647
global_letters_count * rel_letters_count                 -0.001680
global_letters_count * rel_orthographic_density           0.173292
global_letters_count * rel_synonyms_count                 0.008103
global_orthographic_density * global_synonyms_count      -0.876964
global_orthographic_density * rel_aoa                    -0.083743
global_orthographic_density * rel_clustering             -0.010289
global_orthographic_density * rel_frequency               0.095735
global_orthographic_density * rel_letters_count           0.060402
global_orthographic_density * rel_orthographic_density   -0.103510
global_orthographic_density * rel_synonyms_count          0.962859
global_synonyms_count * rel_aoa                           0.198824
global_synonyms_count * rel_clustering                    0.524563
global_synonyms_count * rel_frequency                     0.084479
global_synonyms_count * rel_letters_count                 0.361282
global_synonyms_count * rel_orthographic_density          0.713269
global_synonyms_count * rel_synonyms_count               -0.159107
rel_aoa * rel_clustering                                 -0.047242
rel_aoa * rel_frequency                                  -0.030614
rel_aoa * rel_letters_count                              -0.061526
rel_aoa * rel_orthographic_density                       -0.028741
rel_aoa * rel_synonyms_count                             -0.419988
rel_clustering * rel_frequency                            0.059944
rel_clustering * rel_letters_count                        0.068978
rel_clustering * rel_orthographic_density                -0.578974
rel_clustering * rel_synonyms_count                      -1.043384
rel_frequency * rel_letters_count                        -0.039494
rel_frequency * rel_orthographic_density                 -0.346074
rel_frequency * rel_synonyms_count                        0.000603
rel_letters_count * rel_orthographic_density             -0.266752
rel_letters_count * rel_synonyms_count                   -0.013030
rel_orthographic_density * rel_synonyms_count            -0.949412
dtype: float64