Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also write each figure to disk.
SAVE_FIGURES = False

# Subset of features used for the "paper-*" figures.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# Not referenced in the visible cells; presumably the number of PCA
# components used further down -- confirm.
N_COMPONENTS = 3

# Number of bins first attempted when binning source feature values.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Initialise the database through the project's `init_db` helper and keep
# its return value around (presumably the SQLAlchemy engine -- confirm in
# brainscopypaste.utils).
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed in this notebook.
model = Model(time=Time.continuous, source=Source.majority,
              past=Past.all, durl=Durl.exclude_past, max_distance=1)
data = []

# First collect only the substitution ids, inside one short-lived session.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [found_id for (found_id,) in substitutions]

# Then load each substitution in a session of its own, and emit one record
# per (substitution, feature) pair.
progress = ProgressBar(term_width=80)
for substitution_id in progress(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            # Identification of the substitution, then its feature values.
            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
            }
            # Reference averages: plain (h0) and restricted to synonyms of
            # the source word (h0n), each also in sentence-relative form.
            record['h0'] = substitution.feature_average(feature)
            record['h0_rel'] = substitution.feature_average(
                feature, sentence_relative='median')
            record['h0n'] = substitution.feature_average(
                feature, source_synonyms=True)
            record['h0n_rel'] = substitution.feature_average(
                feature, source_synonyms=True, sentence_relative='median')
            data.append(record)

original_variations = pd.DataFrame(data)
del data


Got 1339 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.all, durl=Durl.exclude_past, max_distance=1)
100% (1339 of 1339) |######################| Elapsed Time: 0:00:45 Time: 0:00:45

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values; these are averaged per cluster and are the
# ones blanked out by the AoA crop below.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-level averaging: first over rows sharing the same
# (destination, occurrence, position, feature), then over each
# (cluster, feature) pair.
# The columns are selected with a *list*: indexing a GroupBy with a bare
# tuple of keys is rejected by recent pandas releases. 'feature' is a
# grouping key, so `as_index=False` already returns it as a column and it
# is not repeated in the selection.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
aoa_source_max = 15
variations.loc[(variations.feature == 'aoa')
               & (variations.source > aoa_source_max),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
def _feature_label(feature):
    """Return the docstring of the transformed feature, used as sort key."""
    return Substitution._transformed_feature(feature).__doc__


# Features sorted by the docstring of their transformed version.
ordered_features = sorted(Substitution.__features__, key=_feature_label)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin and each null (`h0`, then `h0n`), the mean of `values` in
    the bin is compared to the mean of the null in the same bin. The bin is
    marked `*`, `**` or `***` when the mean of `values` lies outside
    ``null mean +/- interval`` at alpha = .05, .01 and .001 respectively
    (the most stringent level reached is shown), and `ns.` otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like of 0-based bin labels, one per observation
        Labels may be floats and may contain NaN (observations without a
        bin); such observations are ignored.
    h0, h0n, values : pandas.Series aligned with `bins`
        Null values and observed values, one per observation.
    """
    # Bin labels are floats as soon as one of them is NaN, so the bin count
    # is computed on the non-NaN labels and explicitly converted to int
    # before being used by `range` and `np.zeros`.
    labels = np.asarray(bins, dtype=float)
    labels = labels[~np.isnan(labels)]
    bin_count = int(labels.max()) + 1 if labels.size else 0
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_values = np.zeros(bin_count)
        bin_nulls = np.zeros(bin_count)
        cis = np.zeros((bin_count, len(alphas)))

        for i in range(bin_count):
            indices = bins == i
            n = indices.sum()
            # The spread does not depend on alpha: compute it once per bin.
            s = values[indices].std(ddof=1)

            bin_values[i] = values[indices].mean()
            bin_nulls[i] = nulls[indices].mean()
            for j, alpha in enumerate(alphas):
                # NOTE(review): std(ddof=1) is divided by sqrt(n - 1), not
                # sqrt(n); kept as is so the table stays consistent with
                # the confidence bands drawn by the plotting functions --
                # confirm that the slightly wider interval is intended.
                cis[i, j] = (stats.t.ppf(1 - alpha/2, n - 1)
                             * s / np.sqrt(n - 1))

        print(null_name + ' |', end='')
        # differences[i, j]: bin i is outside the interval at alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the binned destination feature against the source feature.

    Draws on the current pyplot axes: the mean destination value per source
    bin with a 95% band, the per-bin means of both nulls (`h0`, `h0n`) and
    the `y = x` line; then prints the matching significance table.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Must hold the columns `source`, `destination`, `h0`, `h0n` (or
        their `_rel` variants when `relative` is true) and `feature_field`.
    color : matplotlib color, default 'blue'
    relative : bool, default False
        Use the `_rel` columns instead of the absolute ones.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.
    feature_field : str, default 'feature'
        Column whose first value names the significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails
    # (presumably because of duplicate quantile edges -- confirm).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a NameError on `bins`.
        raise ValueError("could not bin the 'source{}' values into "
                         "1 to {} bins".format(rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    faded = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=faded, alpha=0.2)
    plt.plot(middles, h0s, '--', color=faded,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=faded,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=faded,
             label='$y = x$')

    # Rescale limits if we're touching H0 or H00. Empty bins give NaN
    # means, so NaN-aware extrema are used, and both ends are checked
    # independently so that nulls overflowing on both sides stay visible.
    lmin, lmax = middles[0], middles[-1]
    nulls = np.concatenate([h0s, h0ns])
    h0min, h0max = np.nanmin(nulls), np.nanmax(nulls)
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function`, and maybe save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows whose `feature_field` value is not in `features` are dropped.
    features : list of str
        Features to plot; also fixes the facet and hue order.
    filename : str
        Inserted into `settings.FIGURE` when `SAVE_FIGURES` is true.
    plot_function : callable
        Passed to `FacetGrid.map_dataframe`, together with `plot_kws`.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, default 'feature'
        Column of `data` holding the feature name.
    plot_kws : dict, optional
        Extra keyword arguments for `plot_function` (none by default).
    """
    # A `None` sentinel avoids sharing one mutable default between calls.
    if plot_kws is None:
        plot_kws = {}

    selected = data[data[feature_field].isin(features)]
    # NOTE(review): newer seaborn releases name the `size` parameter
    # `height` -- adapt if the environment is upgraded.
    g = sb.FacetGrid(data=selected,
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; replace it
        # with the docstring of the transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the binned bias (destination minus `h0n`) of one feature.

    The per-bin mean destination value minus the per-bin mean of `h0n` is
    divided by the absolute mean of the per-bin `h0` values, and drawn on
    `ax` against bin positions spread evenly over [0, 1].

    Parameters
    ----------
    ax : matplotlib axes to draw on
    data : pandas.DataFrame
        Rows of a single feature, with the columns `feature`, `source`,
        `destination`, `h0`, `h0n` (or their `_rel` variants).
    color : matplotlib color
    ci : bool, default True
        Also draw the 95% band around the curve.
    relative : bool, default False
        Use the `_rel` columns instead of the absolute ones.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails
    # (presumably because of duplicate quantile edges -- confirm). Only the
    # bin labels are needed here, not the bin edges.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins = cut(x, bin_count, labels=False, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a later NameError.
        raise ValueError("could not bin the 'source{}' values of {!r} "
                         "into 1 to {} bins".format(rel, feature, BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n - 1))

    # Plot. An empty bin has a NaN `h0` mean; a NaN-aware average keeps one
    # empty bin from turning the scale, and thus the whole curve, into NaN.
    scale = abs(np.nanmean(h0s))
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single figure, and maybe save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a `feature` column; for each feature, the matching rows
        (with rows containing NaN dropped) are handed to `plot_function`.
    features : list of str
        Features to draw, in order; one palette color each.
    filename : str
        Inserted into `settings.FIGURE` when `SAVE_FIGURES` is true.
    palette_name : str
        Name of the seaborn palette to take the colors from.
    plot_function : callable
        Called as `plot_function(ax, rows, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plot_kws : dict, optional
        Extra keyword arguments for `plot_function` (none by default).

    Returns
    -------
    The matplotlib axes holding the overlay.
    """
    # A `None` sentinel avoids sharing one mutable default between calls.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature_color, feature in zip(palette, features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=feature_color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *** | *** | *** | *   |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | ns. |
H_00 | **  | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | ns. | *** | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *   |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# Bias overlay: all features, fixed-width bins, absolute values, no CI band.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:
# Paper features, fixed-width bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | ns. | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | ns. | *** | ns. |

In [14]:
# Bias overlay: paper features, fixed-width bins, absolute values; the
# y-axis is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | **  |
H_00 | **  | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | **  | **  |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *   |

In [16]:
# Bias overlay: all features, quantile bins, absolute values, no CI band.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:
# Paper features, quantile bins, absolute feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | **  | **  |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | ns. | ns. | ns. | **  |

In [18]:
# Bias overlay: paper features, quantile bins, absolute values; the y-axis
# is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True},
).set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | **  | ns. |
H_00 | ns. | *   | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | **  | ns. |
H_00 | ns. | *** | *** | **  |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *   | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | **  | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | *   | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [20]:
# Bias overlay: all features, fixed-width bins, sentence-relative values,
# no CI band.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:
# Paper features, fixed-width bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *   | **  | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | **  | ns. |
H_00 | ns. | *** | *** | **  |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | **  |
H_00 | ns. | *   | ns. | ns. |

In [22]:
# Bias overlay: paper features, fixed-width bins, sentence-relative values;
# the y-axis is then cropped by hand.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True},
).set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *   | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | ns. | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | *** |
H_00 | ns. | *   | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [24]:
# Bias overlay: all features, quantile bins, sentence-relative values,
# no CI band.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:
# Paper features, quantile bins, sentence-relative feature values.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | ns. | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | ns. | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | *** |
H_00 | ns. | *   | ns. | ns. |

In [26]:
# Bias overlay: paper features, quantile bins, sentence-relative values.
# Uses the 'deep' palette like the other three paper-features overlays
# (this call used 'husl', the palette of the all-features overlays).
# NOTE(review): unlike its three siblings, this overlay has no manual
# `set_ylim` crop -- confirm whether one is wanted.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True, 'quantiles': True},
);


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of the move from source to destination words, given a particular (absolute, relative) source feature pair. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) feature space.

    Source words are binned on a regular grid along their absolute feature
    value (x axis) and their sentence-relative feature value (y axis). For
    each grid cell, the arrow is the mean displacement from source to
    destination along both axes, and the color is the mean Euclidean length
    of those displacements. Two reference lines are overlaid: `y = 0`
    (labelled as the sentence average) and `x = h0` (labelled as
    $\\nu_{\\phi}^0$).

    The function only takes keyword arguments so that it can be handed to
    `FacetGrid.map_dataframe`.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Must provide the 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0' columns. Only the first 'h0' value is
        used.
    color : optional
        Base color of the reference lines (desaturated before plotting);
        defaults to 'blue'.
    bin_count : int, optional
        Number of fixed-width bins along each axis; defaults to 4, the
        value that used to be hard-coded here.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Per-substitution displacement along each axis, and its length. These
    # do not depend on the grid cell, so compute them once.
    delta = dest - source
    delta_rel = dest_rel - source_rel
    length = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute bin values. Arrays are indexed [y, x], as streamplot expects
    # rows to follow the y coordinate. An empty cell gets a NaN mean.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_x = x_bins == x
        for y in range(bin_count):
            in_cell = in_x & (y_bins == y)
            u_values[y, x] = delta[in_cell].mean()
            v_values[y, x] = delta_rel[in_cell].mean()
            strength[y, x] = length[in_cell].mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature, laid out three to a row.
g = sb.FacetGrid(data=variations, col='feature', hue='feature',
                 col_order=ordered_features, hue_order=ordered_features,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only decorate axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; replace it
        # with the docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the features exposed in the paper.
paper_rows = variations['feature'].isin(PAPER_FEATURES)
g = sb.FacetGrid(data=variations[paper_rows], col='feature', hue='feature',
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES,
                 col_wrap=3, sharex=False, sharey=False,
                 aspect=1, size=4.5)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only decorate axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The facet title is the raw feature name at this point; replace it
        # with the docstring of the corresponding transformed feature.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
# Fit the PCA on a (cluster x feature) table of variations, keeping only the
# clusters for which every feature variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = (variations
                 .pivot(index='cluster_id', columns='feature',
                        values='variation')
                 .dropna())
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report how many components were kept and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Display the loadings of the components plotted afterwards.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
component_labels = ['Component-{}'.format(i) for i in range(N_COMPONENTS)]
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures, index=component_labels)


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.53320712  0.17207545  0.08660655  0.06916266  0.03472511  0.02820126
  0.01923263  0.01884692  0.01718276  0.0084874   0.00716216]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.452739 0.303477 -0.084487 0.241345 0.225124 -0.426287 0.241328 0.292942 -0.392105 0.291327 -0.149862 -0.003428
Component-1 0.280266 -0.417758 0.138758 -0.288747 -0.277323 -0.408413 0.172332 -0.299298 -0.432574 0.258819 -0.163807 0.019325
Component-2 -0.649400 -0.077016 0.118020 -0.052190 -0.728462 0.099367 0.017990 0.002987 0.059497 -0.077130 0.005896 0.072857

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one row per (substitution, component) with the source and destination
# component values and both null-hypothesis averages.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identifiers shared by every component row of this substitution.
        identifiers = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(identifiers,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (1339 of 1339) |######################| Elapsed Time: 0:00:53 Time: 0:00:53

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows that share a destination,
# occurrence, position and component, then over each cluster, so that every
# cluster contributes a single point per component.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names was deprecated in pandas 1.0 and raises from pandas 2.0 on.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One variation plot per PCA component.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Only decorate axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *   | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | ns. | **  | ns. |

4.2 On a subset of relevant features


In [34]:
# Features the restricted PCA below is computed on.
relevant_features = [
    'frequency',
    'aoa',
    'letters_count',
]

Compute the actual PCA


In [35]:
# Compute the PCA.
# Fit the PCA on a (cluster x feature) table of variations restricted to the
# relevant features, keeping only clusters where all of them are defined.
pcafeatures = tuple(sorted(relevant_features))
is_pcafeature = variations['feature'].isin(pcafeatures)
pcavariations = (variations[is_pcafeature]
                 .pivot(index='cluster_id', columns='feature',
                        values='variation')
                 .dropna())
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report how many components were kept and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Display the loadings of every retained component.
component_labels = ['Component-{}'.format(i)
                    for i in range(pca.n_components_)]
pd.DataFrame(pca.components_, columns=pcafeatures, index=component_labels)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67080117  0.18712907]

Out[35]:
aoa frequency letters_count
Component-0 -0.742691 0.361478 -0.563688
Component-1 0.455320 -0.344640 -0.820918

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one row per (substitution, component) with the source and destination
# component values and both null-hypothesis averages.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Identifiers shared by every component row of this substitution.
        identifiers = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(identifiers,
                             component=component,
                             source=source,
                             destination=destination,
                             h0=h0,
                             h0n=h0n))

original_component_variations = pd.DataFrame(data)
del data


100% (1339 of 1339) |######################| Elapsed Time: 0:00:10 Time: 0:00:10

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows that share a destination,
# occurrence, position and component, then over each cluster, so that every
# cluster contributes a single point per component.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names was deprecated in pandas 1.0 and raises from pandas 2.0 on.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One variation plot per PCA component.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 aspect=1.5, size=3)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only decorate axes on which something was plotted.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up each transformed feature once instead of once per word. As used
# below, calling it without arguments gives the collection of words the
# feature is based on, and calling it with a word gives that word's value.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined:
# gather every word known to at least one feature, tabulate the feature
# values, and count the rows without any missing value.
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

words_list = list(words)
wordsdf = pd.DataFrame(
    {feature: [tfeature(word) for word in words_list]
     for feature, tfeature in tfeatures})
wordsdf['words'] = words_list
del words_list

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1202 (cluster-unique) substitutions, but the PCA is in fact computed on 946 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies as (label, binning function) pairs. Only fixed-width
# bins are enabled; append ('quantiles', pd.qcut) to also use quantile bins.
cuts = [
    ('fixed bins', pd.cut),
]
# Value flavours as (label, column-name suffix) pairs.
rels = [
    ('global', ''),
    ('sentence-relative', '_rel'),
]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (which includes a NaN p-value, since no comparison holds).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for upper_bound, marker in thresholds:
        if p < upper_bound:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# One-way ANOVA of feature2's destination value across bins of feature1's
# source value, for every pair of paper features and every combination of
# global / sentence-relative values.

# The (cluster x feature) tables only depend on the value flavour, so pivot
# once per flavour instead of once per feature pair.
source_tables = {rel: variations.pivot(index='cluster_id',
                                       columns='feature',
                                       values='source' + rel)
                 for _, rel in rels}
destination_tables = {rel: variations.pivot(index='cluster_id',
                                            columns='feature',
                                            values='destination' + rel)
                      for _, rel in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = source_tables[rel1][feature1]

            # Compute binning, falling back to fewer bins when the binning
            # function rejects the requested count. `source_bins` stays None
            # if every attempt fails, so that bins left over from a previous
            # iteration are never reused by mistake.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No binning could be computed: report it explicitly
                    # rather than testing against stale bins.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                destination = destination_tables[rel2][feature2]
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
   ** global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way ANOVA of feature2's variation (destination - source) across bins of
# feature1's source value, for every pair of paper features and every
# combination of global / sentence-relative values.

# The (cluster x feature) tables only depend on the value flavour, so pivot
# once per flavour instead of once per feature pair.
source_tables = {rel: variations.pivot(index='cluster_id',
                                       columns='feature',
                                       values='source' + rel)
                 for _, rel in rels}
destination_tables = {rel: variations.pivot(index='cluster_id',
                                            columns='feature',
                                            values='destination' + rel)
                      for _, rel in rels}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = source_tables[rel1][feature1]

            # Compute binning, falling back to fewer bins when the binning
            # function rejects the requested count. `source_bins` stays None
            # if every attempt fails, so that bins left over from a previous
            # iteration are never reused by mistake.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No binning could be computed: report it explicitly
                    # rather than testing against stale bins.
                    print('  n/a {} -> {}'.format(rel1_label, rel2_label))
                    continue

                # Variation of feature2, in the same flavour on both sides.
                variation = (destination_tables[rel2][feature2]
                             - source_tables[rel2][feature2])
                _, p = stats.f_oneway(*[variation[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Map a "relative?" flag to its (name prefix, column-name suffix) pair.
# Note this rebinds `rels`, which the Anova section defined as a list.
rels = {
    False: ('global', ''),
    True: ('rel', '_rel'),
}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For each target feature, run every source/destination relativeness
# combination, each time first without and then with pairwise interaction
# terms (innermost axis of the product, so the output order is unchanged).
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel, with_interactions in product(
            [False, True, 'both'], [False, True], [False, True]):
        regress(variations, PAPER_FEATURES, target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=with_interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.057351369132081616

intercept                      5.148375
global_aoa                     0.036491
global_clustering             -0.084663
global_frequency               0.316077
global_letters_count          -0.009004
global_orthographic_density   -0.050470
global_synonyms_count          0.001106
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07229490977250796

intercept                                              5.381044
global_aoa                                             0.257646
global_clustering                                      0.766508
global_frequency                                       0.812318
global_letters_count                                  -0.472796
global_orthographic_density                            0.830475
global_synonyms_count                                  0.136857
global_aoa * global_clustering                        -0.011621
global_aoa * global_frequency                         -0.016892
global_aoa * global_letters_count                     -0.009925
global_aoa * global_orthographic_density              -0.078321
global_aoa * global_synonyms_count                     0.019096
global_clustering * global_frequency                   0.005972
global_clustering * global_letters_count              -0.149409
global_clustering * global_orthographic_density       -0.001348
global_clustering * global_synonyms_count              0.080750
global_frequency * global_letters_count               -0.043670
global_frequency * global_orthographic_density        -0.082273
global_frequency * global_synonyms_count              -0.019359
global_letters_count * global_orthographic_density     0.069896
global_letters_count * global_synonyms_count           0.044321
global_orthographic_density * global_synonyms_count    0.078747
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03764585520297492

intercept                     -7.662716
global_aoa                     0.043283
global_clustering             -0.056521
global_frequency               0.288980
global_letters_count           0.151879
global_orthographic_density    0.100372
global_synonyms_count          0.139852
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.05352261131088143

intercept                                             -10.287499
global_aoa                                              0.444066
global_clustering                                       0.501854
global_frequency                                        0.757918
global_letters_count                                    0.095420
global_orthographic_density                             1.931460
global_synonyms_count                                  -1.006563
global_aoa * global_clustering                          0.017722
global_aoa * global_frequency                          -0.020399
global_aoa * global_letters_count                       0.000778
global_aoa * global_orthographic_density               -0.129492
global_aoa * global_synonyms_count                      0.102161
global_clustering * global_frequency                   -0.024661
global_clustering * global_letters_count               -0.087569
global_clustering * global_orthographic_density         0.051848
global_clustering * global_synonyms_count               0.060381
global_frequency * global_letters_count                -0.058996
global_frequency * global_orthographic_density         -0.135120
global_frequency * global_synonyms_count                0.067133
global_letters_count * global_orthographic_density      0.097713
global_letters_count * global_synonyms_count            0.011107
global_orthographic_density * global_synonyms_count     0.113595
dtype: float64

Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.048731487735675594

intercept                   9.409707
rel_aoa                     0.037628
rel_clustering             -0.102222
rel_frequency               0.231636
rel_letters_count          -0.062454
rel_orthographic_density   -0.083874
rel_synonyms_count         -0.136857
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.0625622900554037

intercept                                        9.218600
rel_aoa                                          0.104855
rel_clustering                                   0.101966
rel_frequency                                    0.158111
rel_letters_count                                0.012183
rel_orthographic_density                        -0.177362
rel_synonyms_count                               0.126124
rel_aoa * rel_clustering                         0.032587
rel_aoa * rel_frequency                          0.022410
rel_aoa * rel_letters_count                     -0.013422
rel_aoa * rel_orthographic_density              -0.001023
rel_aoa * rel_synonyms_count                     0.049150
rel_clustering * rel_frequency                   0.015037
rel_clustering * rel_letters_count              -0.098339
rel_clustering * rel_orthographic_density       -0.009343
rel_clustering * rel_synonyms_count              0.235814
rel_frequency * rel_letters_count                0.008163
rel_frequency * rel_orthographic_density        -0.039637
rel_frequency * rel_synonyms_count               0.095545
rel_letters_count * rel_orthographic_density     0.000651
rel_letters_count * rel_synonyms_count          -0.049250
rel_orthographic_density * rel_synonyms_count   -0.023321
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.229122849534243

intercept                  -1.678934
rel_aoa                     0.036116
rel_clustering              0.126170
rel_frequency               0.624868
rel_letters_count          -0.110558
rel_orthographic_density   -0.200695
rel_synonyms_count         -0.006214
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.25107276611884943

intercept                                       -1.896473
rel_aoa                                         -0.019989
rel_clustering                                   0.181190
rel_frequency                                    0.560604
rel_letters_count                                0.002630
rel_orthographic_density                        -0.319727
rel_synonyms_count                               0.232824
rel_aoa * rel_clustering                        -0.010550
rel_aoa * rel_frequency                         -0.037549
rel_aoa * rel_letters_count                      0.029257
rel_aoa * rel_orthographic_density               0.118695
rel_aoa * rel_synonyms_count                     0.173497
rel_clustering * rel_frequency                  -0.003378
rel_clustering * rel_letters_count              -0.116602
rel_clustering * rel_orthographic_density       -0.210603
rel_clustering * rel_synonyms_count              0.218071
rel_frequency * rel_letters_count                0.027176
rel_frequency * rel_orthographic_density        -0.051743
rel_frequency * rel_synonyms_count               0.064433
rel_letters_count * rel_orthographic_density    -0.007904
rel_letters_count * rel_synonyms_count          -0.031920
rel_orthographic_density * rel_synonyms_count    0.186186
dtype: float64

Regressing global frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06716461096168747

intercept                      3.784304
global_aoa                     0.008320
global_clustering             -0.160401
global_frequency               0.320034
global_letters_count           0.202912
global_orthographic_density    0.025644
global_synonyms_count          0.305772
rel_aoa                        0.033821
rel_clustering                 0.074226
rel_frequency                 -0.011380
rel_letters_count             -0.246746
rel_orthographic_density      -0.091352
rel_synonyms_count            -0.398634
dtype: float64

Regressing global frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.14925669881478987

intercept                                                -32.207040
global_aoa                                                 1.454503
global_clustering                                         -8.260949
global_frequency                                           0.333190
global_letters_count                                       0.577501
global_orthographic_density                                5.827971
global_synonyms_count                                      9.453936
rel_aoa                                                    0.030199
rel_clustering                                            10.470172
rel_frequency                                              0.970850
rel_letters_count                                         -1.631407
rel_orthographic_density                                  -5.602373
rel_synonyms_count                                        -8.854185
global_aoa * global_clustering                             0.437269
global_aoa * global_frequency                              0.159192
global_aoa * global_letters_count                         -0.025678
global_aoa * global_orthographic_density                  -0.143642
global_aoa * global_synonyms_count                        -0.463468
global_aoa * rel_aoa                                      -0.011404
global_aoa * rel_clustering                               -0.437698
global_aoa * rel_frequency                                -0.074901
global_aoa * rel_letters_count                             0.027313
global_aoa * rel_orthographic_density                      0.007763
global_aoa * rel_synonyms_count                            0.382361
global_clustering * global_frequency                       0.211484
global_clustering * global_letters_count                   0.014033
global_clustering * global_orthographic_density            1.628313
global_clustering * global_synonyms_count                  0.615411
global_clustering * rel_aoa                               -0.411517
global_clustering * rel_clustering                         0.216180
global_clustering * rel_frequency                         -0.079392
global_clustering * rel_letters_count                     -0.179209
global_clustering * rel_orthographic_density              -1.326067
global_clustering * rel_synonyms_count                    -0.240959
global_frequency * global_letters_count                   -0.064479
global_frequency * global_orthographic_density             0.367651
global_frequency * global_synonyms_count                  -0.239481
global_frequency * rel_aoa                                -0.281387
global_frequency * rel_clustering                         -0.247536
global_frequency * rel_frequency                          -0.008167
global_frequency * rel_letters_count                       0.044160
global_frequency * rel_orthographic_density               -0.253449
global_frequency * rel_synonyms_count                      0.292161
global_letters_count * global_orthographic_density         0.207894
global_letters_count * global_synonyms_count               0.258470
global_letters_count * rel_aoa                             0.051848
global_letters_count * rel_clustering                     -0.248854
global_letters_count * rel_frequency                      -0.075535
global_letters_count * rel_letters_count                   0.002240
global_letters_count * rel_orthographic_density            0.009781
global_letters_count * rel_synonyms_count                  0.035062
global_orthographic_density * global_synonyms_count       -0.873299
global_orthographic_density * rel_aoa                      0.155190
global_orthographic_density * rel_clustering              -1.348247
global_orthographic_density * rel_frequency               -0.380072
global_orthographic_density * rel_letters_count           -0.119567
global_orthographic_density * rel_orthographic_density     0.001117
global_orthographic_density * rel_synonyms_count           1.025298
global_synonyms_count * rel_aoa                            0.222088
global_synonyms_count * rel_clustering                    -0.952418
global_synonyms_count * rel_frequency                     -0.015121
global_synonyms_count * rel_letters_count                  0.140909
global_synonyms_count * rel_orthographic_density           0.905638
global_synonyms_count * rel_synonyms_count                 0.085215
rel_aoa * rel_clustering                                   0.343245
rel_aoa * rel_frequency                                    0.146872
rel_aoa * rel_letters_count                               -0.026478
rel_aoa * rel_orthographic_density                        -0.019729
rel_aoa * rel_synonyms_count                              -0.125917
rel_clustering * rel_frequency                             0.155245
rel_clustering * rel_letters_count                         0.228747
rel_clustering * rel_orthographic_density                  0.898529
rel_clustering * rel_synonyms_count                        0.922366
rel_frequency * rel_letters_count                          0.092627
rel_frequency * rel_orthographic_density                   0.246640
rel_frequency * rel_synonyms_count                         0.059191
rel_letters_count * rel_orthographic_density              -0.037856
rel_letters_count * rel_synonyms_count                    -0.276799
rel_orthographic_density * rel_synonyms_count             -0.821146
dtype: float64

Regressing rel frequency with 682 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.32811836267387606

intercept                      3.585208
global_aoa                     0.011454
global_clustering             -0.087449
global_frequency              -0.617930
global_letters_count           0.217929
global_orthographic_density    0.025785
global_synonyms_count          0.245742
rel_aoa                        0.014987
rel_clustering                 0.040814
rel_frequency                  0.971664
rel_letters_count             -0.266122
rel_orthographic_density      -0.073718
rel_synonyms_count            -0.335994
dtype: float64

Regressing rel frequency with 682 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.38385547438502676

intercept                                                -35.378994
global_aoa                                                 1.697579
global_clustering                                         -8.407137
global_frequency                                          -0.420807
global_letters_count                                       0.250329
global_orthographic_density                                6.803893
global_synonyms_count                                      7.538644
rel_aoa                                                   -0.392622
rel_clustering                                            11.519736
rel_frequency                                              1.751999
rel_letters_count                                         -1.252681
rel_orthographic_density                                  -6.616563
rel_synonyms_count                                        -6.627923
global_aoa * global_clustering                             0.440953
global_aoa * global_frequency                              0.140293
global_aoa * global_letters_count                         -0.012330
global_aoa * global_orthographic_density                  -0.167627
global_aoa * global_synonyms_count                        -0.456633
global_aoa * rel_aoa                                      -0.013026
global_aoa * rel_clustering                               -0.454373
global_aoa * rel_frequency                                -0.061748
global_aoa * rel_letters_count                             0.024867
global_aoa * rel_orthographic_density                      0.056280
global_aoa * rel_synonyms_count                            0.365849
global_clustering * global_frequency                       0.222842
global_clustering * global_letters_count                   0.041140
global_clustering * global_orthographic_density            1.623692
global_clustering * global_synonyms_count                  0.545649
global_clustering * rel_aoa                               -0.431797
global_clustering * rel_clustering                         0.196329
global_clustering * rel_frequency                         -0.061895
global_clustering * rel_letters_count                     -0.157173
global_clustering * rel_orthographic_density              -1.308042
global_clustering * rel_synonyms_count                    -0.172176
global_frequency * global_letters_count                   -0.019878
global_frequency * global_orthographic_density             0.298133
global_frequency * global_synonyms_count                  -0.156028
global_frequency * rel_aoa                                -0.249554
global_frequency * rel_clustering                         -0.331890
global_frequency * rel_frequency                           0.008535
global_frequency * rel_letters_count                       0.015005
global_frequency * rel_orthographic_density               -0.191806
global_frequency * rel_synonyms_count                      0.207367
global_letters_count * global_orthographic_density         0.203958
global_letters_count * global_synonyms_count               0.349969
global_letters_count * rel_aoa                             0.031803
global_letters_count * rel_clustering                     -0.281670
global_letters_count * rel_frequency                      -0.085647
global_letters_count * rel_letters_count                   0.002923
global_letters_count * rel_orthographic_density            0.038381
global_letters_count * rel_synonyms_count                 -0.086851
global_orthographic_density * global_synonyms_count       -0.874075
global_orthographic_density * rel_aoa                      0.148316
global_orthographic_density * rel_clustering              -1.361359
global_orthographic_density * rel_frequency               -0.305562
global_orthographic_density * rel_letters_count           -0.093974
global_orthographic_density * rel_orthographic_density     0.046792
global_orthographic_density * rel_synonyms_count           0.965443
global_synonyms_count * rel_aoa                            0.210823
global_synonyms_count * rel_clustering                    -0.920962
global_synonyms_count * rel_frequency                     -0.120944
global_synonyms_count * rel_letters_count                  0.030016
global_synonyms_count * rel_orthographic_density           0.870719
global_synonyms_count * rel_synonyms_count                 0.096468
rel_aoa * rel_clustering                                   0.378541
rel_aoa * rel_frequency                                    0.114184
rel_aoa * rel_letters_count                               -0.017402
rel_aoa * rel_orthographic_density                        -0.039094
rel_aoa * rel_synonyms_count                              -0.095155
rel_clustering * rel_frequency                             0.204698
rel_clustering * rel_letters_count                         0.236478
rel_clustering * rel_orthographic_density                  0.905879
rel_clustering * rel_synonyms_count                        0.889690
rel_frequency * rel_letters_count                          0.094611
rel_frequency * rel_orthographic_density                   0.184319
rel_frequency * rel_synonyms_count                         0.161963
rel_letters_count * rel_orthographic_density              -0.070179
rel_letters_count * rel_synonyms_count                    -0.146622
rel_orthographic_density * rel_synonyms_count             -0.728987
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 625 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.05900835937179594

intercept                      7.152716
global_aoa                     0.162955
global_clustering              0.121683
global_frequency              -0.096384
global_letters_count           0.093025
global_orthographic_density   -0.007571
global_synonyms_count         -0.277976
dtype: float64

Regressing global aoa with 625 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.09359051714618716

intercept                                             -2.311086
global_aoa                                             0.127713
global_clustering                                     -1.590173
global_frequency                                       0.259600
global_letters_count                                   1.381944
global_orthographic_density                            0.152900
global_synonyms_count                                 -4.321143
global_aoa * global_clustering                         0.038381
global_aoa * global_frequency                          0.010001
global_aoa * global_letters_count                      0.032532
global_aoa * global_orthographic_density              -0.041604
global_aoa * global_synonyms_count                     0.053188
global_clustering * global_frequency                   0.042441
global_clustering * global_letters_count               0.229497
global_clustering * global_orthographic_density       -0.004617
global_clustering * global_synonyms_count             -0.696699
global_frequency * global_letters_count               -0.025963
global_frequency * global_orthographic_density        -0.028513
global_frequency * global_synonyms_count              -0.006361
global_letters_count * global_orthographic_density     0.044601
global_letters_count * global_synonyms_count          -0.038816
global_orthographic_density * global_synonyms_count   -0.035608
dtype: float64

Regressing rel aoa with 625 measures, no interactions
           ^^^^^^^
R^2 = 0.014237422247225129

intercept                      1.570758
global_aoa                     0.051694
global_clustering              0.014326
global_frequency              -0.115112
global_letters_count           0.042379
global_orthographic_density    0.089190
global_synonyms_count         -0.173617
dtype: float64

Regressing rel aoa with 625 measures, with interactions
           ^^^^^^^
R^2 = 0.03973100547773068

intercept                                             -4.012494
global_aoa                                             0.245457
global_clustering                                     -0.383465
global_frequency                                       0.394284
global_letters_count                                   0.891392
global_orthographic_density                           -0.960208
global_synonyms_count                                 -2.052342
global_aoa * global_clustering                         0.045437
global_aoa * global_frequency                          0.009103
global_aoa * global_letters_count                     -0.010645
global_aoa * global_orthographic_density               0.039427
global_aoa * global_synonyms_count                     0.014736
global_clustering * global_frequency                   0.034232
global_clustering * global_letters_count               0.050049
global_clustering * global_orthographic_density       -0.218670
global_clustering * global_synonyms_count             -0.592939
global_frequency * global_letters_count               -0.047391
global_frequency * global_orthographic_density        -0.044329
global_frequency * global_synonyms_count              -0.072214
global_letters_count * global_orthographic_density    -0.012573
global_letters_count * global_synonyms_count          -0.150874
global_orthographic_density * global_synonyms_count   -0.080948
dtype: float64

Regressing global aoa with 625 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.031764224789119666

intercept                   6.466991
rel_aoa                    -0.017071
rel_clustering              0.307693
rel_frequency              -0.034927
rel_letters_count           0.053225
rel_orthographic_density   -0.244345
rel_synonyms_count         -0.372797
dtype: float64

Regressing global aoa with 625 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.06927731488619515

intercept                                        6.503113
rel_aoa                                         -0.231114
rel_clustering                                  -0.135188
rel_frequency                                   -0.015785
rel_letters_count                                0.084598
rel_orthographic_density                        -0.434221
rel_synonyms_count                              -0.409388
rel_aoa * rel_clustering                        -0.034544
rel_aoa * rel_frequency                         -0.048644
rel_aoa * rel_letters_count                      0.070645
rel_aoa * rel_orthographic_density               0.039146
rel_aoa * rel_synonyms_count                    -0.040783
rel_clustering * rel_frequency                  -0.000015
rel_clustering * rel_letters_count               0.304657
rel_clustering * rel_orthographic_density        0.138942
rel_clustering * rel_synonyms_count             -0.703748
rel_frequency * rel_letters_count                0.007724
rel_frequency * rel_orthographic_density         0.019781
rel_frequency * rel_synonyms_count              -0.109722
rel_letters_count * rel_orthographic_density     0.131042
rel_letters_count * rel_synonyms_count           0.187824
rel_orthographic_density * rel_synonyms_count    0.422820
dtype: float64

Regressing rel aoa with 625 measures, no interactions
           ^^^^^^^
R^2 = 0.13579605043506449

intercept                   0.721686
rel_aoa                     0.422031
rel_clustering             -0.047113
rel_frequency              -0.103615
rel_letters_count           0.023596
rel_orthographic_density    0.207888
rel_synonyms_count         -0.210760
dtype: float64

Regressing rel aoa with 625 measures, with interactions
           ^^^^^^^
R^2 = 0.16698133021975828

intercept                                        1.016641
rel_aoa                                          0.483313
rel_clustering                                  -0.421473
rel_frequency                                   -0.015028
rel_letters_count                               -0.002357
rel_orthographic_density                         0.369442
rel_synonyms_count                              -0.254484
rel_aoa * rel_clustering                        -0.047379
rel_aoa * rel_frequency                          0.030616
rel_aoa * rel_letters_count                      0.017889
rel_aoa * rel_orthographic_density              -0.018603
rel_aoa * rel_synonyms_count                    -0.075132
rel_clustering * rel_frequency                  -0.011241
rel_clustering * rel_letters_count               0.303294
rel_clustering * rel_orthographic_density        0.288256
rel_clustering * rel_synonyms_count             -0.387988
rel_frequency * rel_letters_count                0.009106
rel_frequency * rel_orthographic_density         0.150100
rel_frequency * rel_synonyms_count              -0.061861
rel_letters_count * rel_orthographic_density     0.079304
rel_letters_count * rel_synonyms_count           0.096392
rel_orthographic_density * rel_synonyms_count    0.173346
dtype: float64

Regressing global aoa with 625 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.07413760499117072

intercept                      3.659024
global_aoa                     0.326050
global_clustering             -0.074919
global_frequency              -0.039964
global_letters_count           0.202932
global_orthographic_density    0.084938
global_synonyms_count          0.012728
rel_aoa                       -0.241305
rel_clustering                 0.199977
rel_frequency                 -0.078579
rel_letters_count             -0.127250
rel_orthographic_density      -0.056818
rel_synonyms_count            -0.351454
dtype: float64

Regressing global aoa with 625 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.20065854068792222

intercept                                                 19.851751
global_aoa                                                 1.516182
global_clustering                                          3.804786
global_frequency                                           4.050996
global_letters_count                                      -5.445866
global_orthographic_density                              -15.054140
global_synonyms_count                                     -5.439013
rel_aoa                                                   -0.238415
rel_clustering                                            -2.491585
rel_frequency                                             -1.830062
rel_letters_count                                          4.230703
rel_orthographic_density                                  15.032658
rel_synonyms_count                                        -9.902812
global_aoa * global_clustering                             0.037673
global_aoa * global_frequency                             -0.161942
global_aoa * global_letters_count                          0.076446
global_aoa * global_orthographic_density                   0.263439
global_aoa * global_synonyms_count                        -0.152459
global_aoa * rel_aoa                                       0.031492
global_aoa * rel_clustering                               -0.058746
global_aoa * rel_frequency                                 0.125288
global_aoa * rel_letters_count                            -0.096761
global_aoa * rel_orthographic_density                     -0.314350
global_aoa * rel_synonyms_count                            0.294110
global_clustering * global_frequency                       0.454575
global_clustering * global_letters_count                  -0.451842
global_clustering * global_orthographic_density           -3.074433
global_clustering * global_synonyms_count                 -1.356621
global_clustering * rel_aoa                                0.059270
global_clustering * rel_clustering                         0.033283
global_clustering * rel_frequency                         -0.221465
global_clustering * rel_letters_count                      0.080471
global_clustering * rel_orthographic_density               2.280749
global_clustering * rel_synonyms_count                     0.194475
global_frequency * global_letters_count                    0.189411
global_frequency * global_orthographic_density            -0.449793
global_frequency * global_synonyms_count                  -0.624177
global_frequency * rel_aoa                                 0.178317
global_frequency * rel_clustering                         -0.452263
global_frequency * rel_frequency                          -0.021110
global_frequency * rel_letters_count                      -0.260163
global_frequency * rel_orthographic_density                0.149146
global_frequency * rel_synonyms_count                      1.292867
global_letters_count * global_orthographic_density        -0.093026
global_letters_count * global_synonyms_count               0.702253
global_letters_count * rel_aoa                            -0.214334
global_letters_count * rel_clustering                      0.536413
global_letters_count * rel_frequency                      -0.127311
global_letters_count * rel_letters_count                   0.041666
global_letters_count * rel_orthographic_density           -0.205218
global_letters_count * rel_synonyms_count                 -0.588243
global_orthographic_density * global_synonyms_count        0.872670
global_orthographic_density * rel_aoa                     -0.400865
global_orthographic_density * rel_clustering               2.275894
global_orthographic_density * rel_frequency                0.247376
global_orthographic_density * rel_letters_count           -0.077254
global_orthographic_density * rel_orthographic_density     0.038010
global_orthographic_density * rel_synonyms_count          -1.281509
global_synonyms_count * rel_aoa                            0.044306
global_synonyms_count * rel_clustering                     1.409234
global_synonyms_count * rel_frequency                     -0.000508
global_synonyms_count * rel_letters_count                 -1.297742
global_synonyms_count * rel_orthographic_density          -1.607429
global_synonyms_count * rel_synonyms_count                -0.309934
rel_aoa * rel_clustering                                  -0.012817
rel_aoa * rel_frequency                                   -0.097109
rel_aoa * rel_letters_count                                0.190344
rel_aoa * rel_orthographic_density                         0.343682
rel_aoa * rel_synonyms_count                              -0.206241
rel_clustering * rel_frequency                             0.138629
rel_clustering * rel_letters_count                         0.124272
rel_clustering * rel_orthographic_density                 -1.218945
rel_clustering * rel_synonyms_count                       -0.992298
rel_frequency * rel_letters_count                          0.111225
rel_frequency * rel_orthographic_density                   0.046868
rel_frequency * rel_synonyms_count                        -0.649962
rel_letters_count * rel_orthographic_density               0.537084
rel_letters_count * rel_synonyms_count                     1.143356
rel_orthographic_density * rel_synonyms_count              2.133999
dtype: float64

Regressing rel aoa with 625 measures, no interactions
           ^^^^^^^
R^2 = 0.19512592635672885

intercept                      2.516994
global_aoa                    -0.443236
global_clustering             -0.045747
global_frequency              -0.024984
global_letters_count           0.122201
global_orthographic_density    0.004633
global_synonyms_count          0.241498
rel_aoa                        0.725560
rel_clustering                 0.143753
rel_frequency                 -0.067730
rel_letters_count             -0.059957
rel_orthographic_density      -0.031035
rel_synonyms_count            -0.515821
dtype: float64

Regressing rel aoa with 625 measures, with interactions
           ^^^^^^^
R^2 = 0.3013328956809689

intercept                                                 21.108346
global_aoa                                                -0.804353
global_clustering                                          3.208709
global_frequency                                           2.383052
global_letters_count                                      -2.833982
global_orthographic_density                              -12.682025
global_synonyms_count                                     -2.096506
rel_aoa                                                    1.579505
rel_clustering                                             0.877547
rel_frequency                                             -1.077795
rel_letters_count                                          2.361194
rel_orthographic_density                                  12.787578
rel_synonyms_count                                       -10.904673
global_aoa * global_clustering                            -0.106440
global_aoa * global_frequency                             -0.083588
global_aoa * global_letters_count                          0.020769
global_aoa * global_orthographic_density                   0.254748
global_aoa * global_synonyms_count                         0.062827
global_aoa * rel_aoa                                      -0.000147
global_aoa * rel_clustering                                0.005943
global_aoa * rel_frequency                                 0.063772
global_aoa * rel_letters_count                            -0.070217
global_aoa * rel_orthographic_density                     -0.355003
global_aoa * rel_synonyms_count                            0.066845
global_clustering * global_frequency                       0.301245
global_clustering * global_letters_count                  -0.201399
global_clustering * global_orthographic_density           -2.178590
global_clustering * global_synonyms_count                 -1.307504
global_clustering * rel_aoa                                0.138644
global_clustering * rel_clustering                         0.119243
global_clustering * rel_frequency                         -0.206419
global_clustering * rel_letters_count                     -0.131028
global_clustering * rel_orthographic_density               1.413170
global_clustering * rel_synonyms_count                     0.361952
global_frequency * global_letters_count                    0.127684
global_frequency * global_orthographic_density            -0.186029
global_frequency * global_synonyms_count                  -0.614523
global_frequency * rel_aoa                                 0.124768
global_frequency * rel_clustering                         -0.430693
global_frequency * rel_frequency                          -0.017681
global_frequency * rel_letters_count                      -0.225857
global_frequency * rel_orthographic_density               -0.114703
global_frequency * rel_synonyms_count                      1.176430
global_letters_count * global_orthographic_density        -0.021520
global_letters_count * global_synonyms_count               0.076772
global_letters_count * rel_aoa                            -0.121176
global_letters_count * rel_clustering                      0.203187
global_letters_count * rel_frequency                      -0.091269
global_letters_count * rel_letters_count                   0.039879
global_letters_count * rel_orthographic_density           -0.176263
global_letters_count * rel_synonyms_count                 -0.043572
global_orthographic_density * global_synonyms_count        0.227531
global_orthographic_density * rel_aoa                     -0.355151
global_orthographic_density * rel_clustering               1.423260
global_orthographic_density * rel_frequency               -0.022886
global_orthographic_density * rel_letters_count           -0.156352
global_orthographic_density * rel_orthographic_density    -0.021039
global_orthographic_density * rel_synonyms_count          -0.434220
global_synonyms_count * rel_aoa                           -0.120986
global_synonyms_count * rel_clustering                     1.339557
global_synonyms_count * rel_frequency                      0.202292
global_synonyms_count * rel_letters_count                 -0.576440
global_synonyms_count * rel_orthographic_density          -0.838468
global_synonyms_count * rel_synonyms_count                -0.310792
rel_aoa * rel_clustering                                  -0.048474
rel_aoa * rel_frequency                                   -0.066849
rel_aoa * rel_letters_count                                0.125940
rel_aoa * rel_orthographic_density                         0.350256
rel_aoa * rel_synonyms_count                              -0.062846
rel_clustering * rel_frequency                             0.279593
rel_clustering * rel_letters_count                         0.406262
rel_clustering * rel_orthographic_density                 -0.434403
rel_clustering * rel_synonyms_count                       -1.141639
rel_frequency * rel_letters_count                          0.122130
rel_frequency * rel_orthographic_density                   0.331986
rel_frequency * rel_synonyms_count                        -0.817088
rel_letters_count * rel_orthographic_density               0.485495
rel_letters_count * rel_synonyms_count                     0.494398
rel_orthographic_density * rel_synonyms_count              1.068073
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.030355880941795532

intercept                     -5.201199
global_aoa                     0.015189
global_clustering              0.090541
global_frequency              -0.044152
global_letters_count           0.027028
global_orthographic_density    0.071590
global_synonyms_count          0.008820
dtype: float64

Regressing global clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07553598658094474

intercept                                             -6.329461
global_aoa                                             0.211961
global_clustering                                      0.141477
global_frequency                                      -0.369460
global_letters_count                                   0.564472
global_orthographic_density                            1.143834
global_synonyms_count                                 -0.328208
global_aoa * global_clustering                        -0.008497
global_aoa * global_frequency                         -0.014188
global_aoa * global_letters_count                     -0.013963
global_aoa * global_orthographic_density              -0.024209
global_aoa * global_synonyms_count                    -0.011668
global_clustering * global_frequency                  -0.058250
global_clustering * global_letters_count               0.081704
global_clustering * global_orthographic_density        0.110717
global_clustering * global_synonyms_count             -0.180536
global_frequency * global_letters_count                0.010815
global_frequency * global_orthographic_density         0.003269
global_frequency * global_synonyms_count              -0.002472
global_letters_count * global_orthographic_density    -0.048500
global_letters_count * global_synonyms_count          -0.069060
global_orthographic_density * global_synonyms_count   -0.173489
dtype: float64

Regressing rel clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.020867670532995986

intercept                      0.489303
global_aoa                     0.018284
global_clustering              0.059071
global_frequency              -0.025412
global_letters_count           0.038568
global_orthographic_density    0.103441
global_synonyms_count          0.000431
dtype: float64

Regressing rel clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.04976884720480057

intercept                                              2.057372
global_aoa                                             0.085352
global_clustering                                      0.353656
global_frequency                                      -0.432850
global_letters_count                                   0.203577
global_orthographic_density                            0.863145
global_synonyms_count                                  0.149291
global_aoa * global_clustering                        -0.020657
global_aoa * global_frequency                         -0.011986
global_aoa * global_letters_count                     -0.006585
global_aoa * global_orthographic_density              -0.026057
global_aoa * global_synonyms_count                    -0.033974
global_clustering * global_frequency                  -0.056679
global_clustering * global_letters_count               0.049786
global_clustering * global_orthographic_density        0.092303
global_clustering * global_synonyms_count             -0.119472
global_frequency * global_letters_count                0.022644
global_frequency * global_orthographic_density         0.014739
global_frequency * global_synonyms_count              -0.021560
global_letters_count * global_orthographic_density    -0.027239
global_letters_count * global_synonyms_count          -0.040011
global_orthographic_density * global_synonyms_count   -0.157668
dtype: float64

Regressing global clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.012077196275130486

intercept                  -5.857826
rel_aoa                     0.016785
rel_clustering              0.073144
rel_frequency              -0.015438
rel_letters_count           0.012334
rel_orthographic_density    0.048412
rel_synonyms_count         -0.021684
dtype: float64

Regressing global clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03448131731654547

intercept                                       -5.857719
rel_aoa                                         -0.015780
rel_clustering                                  -0.018644
rel_frequency                                   -0.029155
rel_letters_count                               -0.022273
rel_orthographic_density                         0.075410
rel_synonyms_count                              -0.053472
rel_aoa * rel_clustering                         0.015814
rel_aoa * rel_frequency                         -0.014151
rel_aoa * rel_letters_count                     -0.014774
rel_aoa * rel_orthographic_density              -0.020351
rel_aoa * rel_synonyms_count                    -0.009589
rel_clustering * rel_frequency                  -0.004438
rel_clustering * rel_letters_count               0.064204
rel_clustering * rel_orthographic_density        0.052150
rel_clustering * rel_synonyms_count             -0.100257
rel_frequency * rel_letters_count                0.005330
rel_frequency * rel_orthographic_density        -0.010782
rel_frequency * rel_synonyms_count              -0.026093
rel_letters_count * rel_orthographic_density    -0.031813
rel_letters_count * rel_synonyms_count          -0.040718
rel_orthographic_density * rel_synonyms_count   -0.090421
dtype: float64

Regressing rel clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.09603724344495046

intercept                   0.303830
rel_aoa                     0.007980
rel_clustering              0.310171
rel_frequency               0.005961
rel_letters_count           0.034266
rel_orthographic_density    0.088101
rel_synonyms_count          0.047607
dtype: float64

Regressing rel clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.11974863950748092

intercept                                        0.273223
rel_aoa                                         -0.018479
rel_clustering                                   0.211999
rel_frequency                                   -0.015736
rel_letters_count                                0.036351
rel_orthographic_density                         0.106489
rel_synonyms_count                               0.026009
rel_aoa * rel_clustering                         0.017814
rel_aoa * rel_frequency                         -0.007726
rel_aoa * rel_letters_count                     -0.014961
rel_aoa * rel_orthographic_density              -0.029422
rel_aoa * rel_synonyms_count                    -0.016009
rel_clustering * rel_frequency                  -0.025739
rel_clustering * rel_letters_count               0.030813
rel_clustering * rel_orthographic_density        0.033200
rel_clustering * rel_synonyms_count             -0.171793
rel_frequency * rel_letters_count                0.010001
rel_frequency * rel_orthographic_density        -0.013370
rel_frequency * rel_synonyms_count              -0.024930
rel_letters_count * rel_orthographic_density    -0.021954
rel_letters_count * rel_synonyms_count          -0.003140
rel_orthographic_density * rel_synonyms_count   -0.029966
dtype: float64

Regressing global clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.039570567578087856

intercept                     -4.313525
global_aoa                     0.010736
global_clustering              0.158245
global_frequency              -0.062301
global_letters_count           0.026834
global_orthographic_density   -0.022801
global_synonyms_count          0.084980
rel_aoa                        0.005490
rel_clustering                -0.077191
rel_frequency                  0.021794
rel_letters_count             -0.004731
rel_orthographic_density       0.105074
rel_synonyms_count            -0.102574
dtype: float64

Regressing global clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18405542287682464

intercept                                                 1.836968
global_aoa                                                0.156005
global_clustering                                         3.215809
global_frequency                                         -0.730785
global_letters_count                                      2.494887
global_orthographic_density                               1.743919
global_synonyms_count                                    -4.978521
rel_aoa                                                   0.441443
rel_clustering                                           -3.742340
rel_frequency                                            -0.239474
rel_letters_count                                        -2.210627
rel_orthographic_density                                 -0.880947
rel_synonyms_count                                       -0.105594
global_aoa * global_clustering                           -0.160195
global_aoa * global_frequency                            -0.039031
global_aoa * global_letters_count                        -0.104376
global_aoa * global_orthographic_density                 -0.146297
global_aoa * global_synonyms_count                        0.148584
global_aoa * rel_aoa                                      0.010566
global_aoa * rel_clustering                               0.182705
global_aoa * rel_frequency                                0.045566
global_aoa * rel_letters_count                            0.090310
global_aoa * rel_orthographic_density                     0.111267
global_aoa * rel_synonyms_count                          -0.098179
global_clustering * global_frequency                     -0.202260
global_clustering * global_letters_count                  0.193751
global_clustering * global_orthographic_density          -0.206523
global_clustering * global_synonyms_count                -0.651033
global_clustering * rel_aoa                               0.062699
global_clustering * rel_clustering                       -0.043223
global_clustering * rel_frequency                         0.104279
global_clustering * rel_letters_count                    -0.134886
global_clustering * rel_orthographic_density              0.202653
global_clustering * rel_synonyms_count                    0.407078
global_frequency * global_letters_count                  -0.025731
global_frequency * global_orthographic_density           -0.113155
global_frequency * global_synonyms_count                  0.134627
global_frequency * rel_aoa                               -0.025688
global_frequency * rel_clustering                         0.161290
global_frequency * rel_frequency                          0.027264
global_frequency * rel_letters_count                      0.053583
global_frequency * rel_orthographic_density               0.089011
global_frequency * rel_synonyms_count                     0.078673
global_letters_count * global_orthographic_density       -0.140010
global_letters_count * global_synonyms_count             -0.213352
global_letters_count * rel_aoa                           -0.001394
global_letters_count * rel_clustering                    -0.075277
global_letters_count * rel_frequency                      0.043085
global_letters_count * rel_letters_count                  0.012957
global_letters_count * rel_orthographic_density           0.103924
global_letters_count * rel_synonyms_count                 0.425085
global_orthographic_density * global_synonyms_count      -0.285414
global_orthographic_density * rel_aoa                     0.130806
global_orthographic_density * rel_clustering              0.294703
global_orthographic_density * rel_frequency               0.145408
global_orthographic_density * rel_letters_count           0.065965
global_orthographic_density * rel_orthographic_density   -0.016705
global_orthographic_density * rel_synonyms_count          0.250297
global_synonyms_count * rel_aoa                          -0.200956
global_synonyms_count * rel_clustering                    0.471964
global_synonyms_count * rel_frequency                    -0.249760
global_synonyms_count * rel_letters_count                -0.104341
global_synonyms_count * rel_orthographic_density         -0.153208
global_synonyms_count * rel_synonyms_count               -0.054608
rel_aoa * rel_clustering                                 -0.073343
rel_aoa * rel_frequency                                   0.027778
rel_aoa * rel_letters_count                              -0.016590
rel_aoa * rel_orthographic_density                       -0.147177
rel_aoa * rel_synonyms_count                              0.133391
rel_clustering * rel_frequency                           -0.103156
rel_clustering * rel_letters_count                        0.117338
rel_clustering * rel_orthographic_density                -0.135713
rel_clustering * rel_synonyms_count                      -0.451172
rel_frequency * rel_letters_count                        -0.055056
rel_frequency * rel_orthographic_density                 -0.124100
rel_frequency * rel_synonyms_count                        0.029413
rel_letters_count * rel_orthographic_density             -0.064391
rel_letters_count * rel_synonyms_count                   -0.119352
rel_orthographic_density * rel_synonyms_count             0.082166
dtype: float64

Regressing rel clustering with 537 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1832431875393008

intercept                     -2.827832
global_aoa                     0.011113
global_clustering             -0.561658
global_frequency              -0.047700
global_letters_count           0.019094
global_orthographic_density    0.018267
global_synonyms_count         -0.035577
rel_aoa                       -0.000502
rel_clustering                 0.771477
rel_frequency                  0.019134
rel_letters_count              0.009687
rel_orthographic_density       0.053178
rel_synonyms_count             0.040923
dtype: float64

Regressing rel clustering with 537 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.29482012061729734

intercept                                                 0.642576
global_aoa                                                0.288773
global_clustering                                         1.437745
global_frequency                                         -0.366336
global_letters_count                                      1.630908
global_orthographic_density                               0.718649
global_synonyms_count                                    -3.438658
rel_aoa                                                   0.386986
rel_clustering                                           -2.225750
rel_frequency                                            -0.300085
rel_letters_count                                        -1.746337
rel_orthographic_density                                 -0.327046
rel_synonyms_count                                       -0.969313
global_aoa * global_clustering                           -0.086913
global_aoa * global_frequency                            -0.022194
global_aoa * global_letters_count                        -0.082591
global_aoa * global_orthographic_density                 -0.128135
global_aoa * global_synonyms_count                        0.076900
global_aoa * rel_aoa                                      0.009762
global_aoa * rel_clustering                               0.108488
global_aoa * rel_frequency                                0.031421
global_aoa * rel_letters_count                            0.074108
global_aoa * rel_orthographic_density                     0.085703
global_aoa * rel_synonyms_count                          -0.021569
global_clustering * global_frequency                     -0.113440
global_clustering * global_letters_count                  0.095404
global_clustering * global_orthographic_density          -0.175445
global_clustering * global_synonyms_count                -0.556080
global_clustering * rel_aoa                               0.023734
global_clustering * rel_clustering                       -0.076097
global_clustering * rel_frequency                         0.051550
global_clustering * rel_letters_count                    -0.092893
global_clustering * rel_orthographic_density              0.153007
global_clustering * rel_synonyms_count                    0.338371
global_frequency * global_letters_count                  -0.029997
global_frequency * global_orthographic_density           -0.041288
global_frequency * global_synonyms_count                  0.027856
global_frequency * rel_aoa                               -0.040716
global_frequency * rel_clustering                         0.093485
global_frequency * rel_frequency                          0.026365
global_frequency * rel_letters_count                      0.055367
global_frequency * rel_orthographic_density               0.028386
global_frequency * rel_synonyms_count                     0.141732
global_letters_count * global_orthographic_density       -0.073360
global_letters_count * global_synonyms_count             -0.157609
global_letters_count * rel_aoa                           -0.007731
global_letters_count * rel_clustering                     0.013758
global_letters_count * rel_frequency                      0.030703
global_letters_count * rel_letters_count                  0.016160
global_letters_count * rel_orthographic_density           0.074702
global_letters_count * rel_synonyms_count                 0.341416
global_orthographic_density * global_synonyms_count      -0.205659
global_orthographic_density * rel_aoa                     0.132504
global_orthographic_density * rel_clustering              0.258087
global_orthographic_density * rel_frequency               0.067303
global_orthographic_density * rel_letters_count           0.013612
global_orthographic_density * rel_orthographic_density    0.016683
global_orthographic_density * rel_synonyms_count          0.199323
global_synonyms_count * rel_aoa                          -0.181310
global_synonyms_count * rel_clustering                    0.485431
global_synonyms_count * rel_frequency                    -0.163934
global_synonyms_count * rel_letters_count                -0.075953
global_synonyms_count * rel_orthographic_density         -0.226915
global_synonyms_count * rel_synonyms_count               -0.022699
rel_aoa * rel_clustering                                 -0.040328
rel_aoa * rel_frequency                                   0.036057
rel_aoa * rel_letters_count                              -0.007192
rel_aoa * rel_orthographic_density                       -0.134747
rel_aoa * rel_synonyms_count                              0.117251
rel_clustering * rel_frequency                           -0.063501
rel_clustering * rel_letters_count                        0.059720
rel_clustering * rel_orthographic_density                -0.137299
rel_clustering * rel_synonyms_count                      -0.487902
rel_frequency * rel_letters_count                        -0.044761
rel_frequency * rel_orthographic_density                 -0.069861
rel_frequency * rel_synonyms_count                       -0.024001
rel_letters_count * rel_orthographic_density             -0.012940
rel_letters_count * rel_synonyms_count                   -0.115468
rel_orthographic_density * rel_synonyms_count             0.132423
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07173219300514122

intercept                      4.345157
global_aoa                     0.047461
global_clustering              0.074837
global_frequency               0.074845
global_letters_count           0.266662
global_orthographic_density    0.001971
global_synonyms_count         -0.303017
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08909374727616026

intercept                                             -11.629282
global_aoa                                              0.689290
global_clustering                                      -3.339435
global_frequency                                        0.659258
global_letters_count                                    0.784238
global_orthographic_density                            -0.824194
global_synonyms_count                                   0.965234
global_aoa * global_clustering                          0.185551
global_aoa * global_frequency                           0.050047
global_aoa * global_letters_count                      -0.008668
global_aoa * global_orthographic_density                0.052871
global_aoa * global_synonyms_count                     -0.076678
global_clustering * global_frequency                    0.170373
global_clustering * global_letters_count                0.062082
global_clustering * global_orthographic_density         0.093499
global_clustering * global_synonyms_count               0.327457
global_frequency * global_letters_count                -0.013734
global_frequency * global_orthographic_density          0.107356
global_frequency * global_synonyms_count                0.022874
global_letters_count * global_orthographic_density     -0.004037
global_letters_count * global_synonyms_count            0.117922
global_orthographic_density * global_synonyms_count     0.214759
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.030515384002004153

intercept                      1.621448
global_aoa                     0.017601
global_clustering              0.009856
global_frequency               0.034842
global_letters_count           0.140118
global_orthographic_density   -0.015571
global_synonyms_count         -0.414225
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05360976550229802

intercept                                             -14.510397
global_aoa                                              0.936186
global_clustering                                      -3.246537
global_frequency                                        0.792009
global_letters_count                                    0.428522
global_orthographic_density                            -1.569856
global_synonyms_count                                   1.205646
global_aoa * global_clustering                          0.196236
global_aoa * global_frequency                           0.042738
global_aoa * global_letters_count                      -0.032911
global_aoa * global_orthographic_density                0.054989
global_aoa * global_synonyms_count                     -0.125290
global_clustering * global_frequency                    0.202558
global_clustering * global_letters_count                0.002566
global_clustering * global_orthographic_density        -0.037081
global_clustering * global_synonyms_count               0.274951
global_frequency * global_letters_count                -0.004158
global_frequency * global_orthographic_density          0.124413
global_frequency * global_synonyms_count               -0.022727
global_letters_count * global_orthographic_density     -0.043375
global_letters_count * global_synonyms_count            0.131988
global_orthographic_density * global_synonyms_count     0.200108
dtype: float64

Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.055692206647310005

intercept                   5.860931
rel_aoa                    -0.039585
rel_clustering              0.221242
rel_frequency               0.046695
rel_letters_count           0.218640
rel_orthographic_density   -0.121863
rel_synonyms_count         -0.322385
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07213332909953274

intercept                                        5.962063
rel_aoa                                         -0.044467
rel_clustering                                   0.159596
rel_frequency                                    0.160857
rel_letters_count                                0.274611
rel_orthographic_density                        -0.280268
rel_synonyms_count                              -0.540072
rel_aoa * rel_clustering                         0.065289
rel_aoa * rel_frequency                         -0.028087
rel_aoa * rel_letters_count                     -0.041831
rel_aoa * rel_orthographic_density              -0.017383
rel_aoa * rel_synonyms_count                    -0.176819
rel_clustering * rel_frequency                  -0.029364
rel_clustering * rel_letters_count               0.030499
rel_clustering * rel_orthographic_density        0.127956
rel_clustering * rel_synonyms_count             -0.064554
rel_frequency * rel_letters_count               -0.034593
rel_frequency * rel_orthographic_density        -0.005230
rel_frequency * rel_synonyms_count              -0.084655
rel_letters_count * rel_orthographic_density     0.091062
rel_letters_count * rel_synonyms_count           0.085076
rel_orthographic_density * rel_synonyms_count    0.122926
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11044486158404365

intercept                   1.604895
rel_aoa                    -0.037950
rel_clustering              0.007499
rel_frequency              -0.160251
rel_letters_count           0.394746
rel_orthographic_density    0.172348
rel_synonyms_count         -0.338180
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1419094491432853

intercept                                        1.710703
rel_aoa                                          0.062885
rel_clustering                                  -0.029275
rel_frequency                                   -0.055571
rel_letters_count                                0.507540
rel_orthographic_density                         0.127012
rel_synonyms_count                              -0.378007
rel_aoa * rel_clustering                         0.118752
rel_aoa * rel_frequency                          0.012953
rel_aoa * rel_letters_count                     -0.111374
rel_aoa * rel_orthographic_density              -0.154347
rel_aoa * rel_synonyms_count                    -0.214907
rel_clustering * rel_frequency                  -0.000114
rel_clustering * rel_letters_count               0.071890
rel_clustering * rel_orthographic_density        0.224349
rel_clustering * rel_synonyms_count             -0.017369
rel_frequency * rel_letters_count               -0.029621
rel_frequency * rel_orthographic_density         0.037805
rel_frequency * rel_synonyms_count              -0.030224
rel_letters_count * rel_orthographic_density     0.108857
rel_letters_count * rel_synonyms_count           0.092097
rel_orthographic_density * rel_synonyms_count    0.123682
dtype: float64

Regressing global letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08494216445117564

intercept                     -0.863641
global_aoa                     0.133485
global_clustering             -0.202835
global_frequency               0.276734
global_letters_count           0.434985
global_orthographic_density   -0.047742
global_synonyms_count         -0.188013
rel_aoa                       -0.129695
rel_clustering                 0.307270
rel_frequency                 -0.233381
rel_letters_count             -0.171315
rel_orthographic_density       0.090674
rel_synonyms_count            -0.124913
dtype: float64

Regressing global letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18716157688158028

intercept                                                 4.937455
global_aoa                                                0.893207
global_clustering                                         1.426020
global_frequency                                          2.349790
global_letters_count                                     -2.748077
global_orthographic_density                              -7.000663
global_synonyms_count                                     4.350380
rel_aoa                                                  -1.883938
rel_clustering                                           -5.619401
rel_frequency                                            -0.324402
rel_letters_count                                         3.695828
rel_orthographic_density                                  5.062454
rel_synonyms_count                                       -6.495021
global_aoa * global_clustering                            0.279484
global_aoa * global_frequency                             0.032453
global_aoa * global_letters_count                         0.087655
global_aoa * global_orthographic_density                  0.228426
global_aoa * global_synonyms_count                       -0.336579
global_aoa * rel_aoa                                      0.039828
global_aoa * rel_clustering                              -0.109629
global_aoa * rel_frequency                               -0.012141
global_aoa * rel_letters_count                           -0.130938
global_aoa * rel_orthographic_density                    -0.121134
global_aoa * rel_synonyms_count                           0.297131
global_clustering * global_frequency                      0.254866
global_clustering * global_letters_count                 -0.293578
global_clustering * global_orthographic_density          -1.924032
global_clustering * global_synonyms_count                -0.064170
global_clustering * rel_aoa                              -0.037799
global_clustering * rel_clustering                        0.046716
global_clustering * rel_frequency                         0.071645
global_clustering * rel_letters_count                     0.111530
global_clustering * rel_orthographic_density              1.725487
global_clustering * rel_synonyms_count                    0.234040
global_frequency * global_letters_count                   0.125919
global_frequency * global_orthographic_density           -0.436557
global_frequency * global_synonyms_count                 -0.591878
global_frequency * rel_aoa                                0.191669
global_frequency * rel_clustering                         0.123128
global_frequency * rel_frequency                         -0.023399
global_frequency * rel_letters_count                     -0.266012
global_frequency * rel_orthographic_density               0.491059
global_frequency * rel_synonyms_count                     0.735094
global_letters_count * global_orthographic_density       -0.320504
global_letters_count * global_synonyms_count              0.389294
global_letters_count * rel_aoa                            0.006015
global_letters_count * rel_clustering                     0.486843
global_letters_count * rel_frequency                      0.013253
global_letters_count * rel_letters_count                  0.031318
global_letters_count * rel_orthographic_density           0.179440
global_letters_count * rel_synonyms_count                -0.216665
global_orthographic_density * global_synonyms_count       0.993264
global_orthographic_density * rel_aoa                    -0.378826
global_orthographic_density * rel_clustering              1.202487
global_orthographic_density * rel_frequency               0.343990
global_orthographic_density * rel_letters_count           0.223099
global_orthographic_density * rel_orthographic_density    0.063290
global_orthographic_density * rel_synonyms_count         -0.543133
global_synonyms_count * rel_aoa                           0.160196
global_synonyms_count * rel_clustering                    0.754443
global_synonyms_count * rel_frequency                     0.341408
global_synonyms_count * rel_letters_count                -0.496707
global_synonyms_count * rel_orthographic_density         -1.062784
global_synonyms_count * rel_synonyms_count               -0.400226
rel_aoa * rel_clustering                                  0.043464
rel_aoa * rel_frequency                                  -0.158240
rel_aoa * rel_letters_count                              -0.109739
rel_aoa * rel_orthographic_density                        0.250899
rel_aoa * rel_synonyms_count                             -0.270153
rel_clustering * rel_frequency                           -0.315870
rel_clustering * rel_letters_count                       -0.201579
rel_clustering * rel_orthographic_density                -0.682857
rel_clustering * rel_synonyms_count                      -0.619506
rel_frequency * rel_letters_count                         0.040184
rel_frequency * rel_orthographic_density                 -0.301867
rel_frequency * rel_synonyms_count                       -0.501613
rel_letters_count * rel_orthographic_density             -0.017742
rel_letters_count * rel_synonyms_count                    0.281652
rel_orthographic_density * rel_synonyms_count             0.734261
dtype: float64

Regressing rel letters_count with 682 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17064313059560443

intercept                     -1.166376
global_aoa                     0.109841
global_clustering             -0.194937
global_frequency               0.266804
global_letters_count          -0.473307
global_orthographic_density    0.003291
global_synonyms_count         -0.180119
rel_aoa                       -0.107001
rel_clustering                 0.269419
rel_frequency                 -0.251587
rel_letters_count              0.767493
rel_orthographic_density       0.022851
rel_synonyms_count            -0.106123
dtype: float64

Regressing rel letters_count with 682 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2647561221966668

intercept                                                -3.794199
global_aoa                                                0.732602
global_clustering                                        -0.408450
global_frequency                                          2.209054
global_letters_count                                     -2.349033
global_orthographic_density                              -6.174796
global_synonyms_count                                     6.509025
rel_aoa                                                  -1.653507
rel_clustering                                           -4.358113
rel_frequency                                            -0.354687
rel_letters_count                                         3.797284
rel_orthographic_density                                  4.418688
rel_synonyms_count                                       -7.997015
global_aoa * global_clustering                            0.281067
global_aoa * global_frequency                             0.047851
global_aoa * global_letters_count                         0.087357
global_aoa * global_orthographic_density                  0.189584
global_aoa * global_synonyms_count                       -0.345694
global_aoa * rel_aoa                                      0.033520
global_aoa * rel_clustering                              -0.107925
global_aoa * rel_frequency                               -0.036123
global_aoa * rel_letters_count                           -0.130220
global_aoa * rel_orthographic_density                    -0.101063
global_aoa * rel_synonyms_count                           0.281807
global_clustering * global_frequency                      0.282633
global_clustering * global_letters_count                 -0.124593
global_clustering * global_orthographic_density          -1.571328
global_clustering * global_synonyms_count                 0.016808
global_clustering * rel_aoa                              -0.053720
global_clustering * rel_clustering                        0.024698
global_clustering * rel_frequency                         0.029057
global_clustering * rel_letters_count                    -0.014202
global_clustering * rel_orthographic_density              1.402313
global_clustering * rel_synonyms_count                    0.127398
global_frequency * global_letters_count                   0.096277
global_frequency * global_orthographic_density           -0.295792
global_frequency * global_synonyms_count                 -0.626278
global_frequency * rel_aoa                                0.162820
global_frequency * rel_clustering                         0.110067
global_frequency * rel_frequency                         -0.027132
global_frequency * rel_letters_count                     -0.233469
global_frequency * rel_orthographic_density               0.359728
global_frequency * rel_synonyms_count                     0.739608
global_letters_count * global_orthographic_density       -0.261694
global_letters_count * global_synonyms_count              0.262601
global_letters_count * rel_aoa                           -0.010457
global_letters_count * rel_clustering                     0.337352
global_letters_count * rel_frequency                      0.033556
global_letters_count * rel_letters_count                  0.014082
global_letters_count * rel_orthographic_density           0.118971
global_letters_count * rel_synonyms_count                -0.116352
global_orthographic_density * global_synonyms_count       0.701559
global_orthographic_density * rel_aoa                    -0.294414
global_orthographic_density * rel_clustering              0.886567
global_orthographic_density * rel_frequency               0.260609
global_orthographic_density * rel_letters_count           0.144191
global_orthographic_density * rel_orthographic_density    0.076264
global_orthographic_density * rel_synonyms_count         -0.302187
global_synonyms_count * rel_aoa                           0.200932
global_synonyms_count * rel_clustering                    0.689474
global_synonyms_count * rel_frequency                     0.383074
global_synonyms_count * rel_letters_count                -0.442180
global_synonyms_count * rel_orthographic_density         -0.841739
global_synonyms_count * rel_synonyms_count               -0.389670
rel_aoa * rel_clustering                                  0.062343
rel_aoa * rel_frequency                                  -0.119226
rel_aoa * rel_letters_count                              -0.090722
rel_aoa * rel_orthographic_density                        0.192564
rel_aoa * rel_synonyms_count                             -0.294461
rel_clustering * rel_frequency                           -0.309305
rel_clustering * rel_letters_count                       -0.095334
rel_clustering * rel_orthographic_density                -0.412989
rel_clustering * rel_synonyms_count                      -0.557473
rel_frequency * rel_letters_count                         0.024245
rel_frequency * rel_orthographic_density                 -0.244760
rel_frequency * rel_synonyms_count                       -0.497497
rel_letters_count * rel_orthographic_density              0.053504
rel_letters_count * rel_synonyms_count                    0.238275
rel_orthographic_density * rel_synonyms_count             0.533381
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.013933758477240987

intercept                      0.432372
global_aoa                     0.000010
global_clustering              0.029989
global_frequency               0.009720
global_letters_count          -0.004125
global_orthographic_density    0.004063
global_synonyms_count          0.121588
dtype: float64

Regressing global synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03198114621878323

intercept                                              0.982956
global_aoa                                             0.159613
global_clustering                                      0.339560
global_frequency                                      -0.068185
global_letters_count                                   0.059374
global_orthographic_density                            0.029413
global_synonyms_count                                 -0.351833
global_aoa * global_clustering                         0.011221
global_aoa * global_frequency                         -0.003568
global_aoa * global_letters_count                     -0.011578
global_aoa * global_orthographic_density               0.003957
global_aoa * global_synonyms_count                     0.030026
global_clustering * global_frequency                  -0.027679
global_clustering * global_letters_count              -0.020805
global_clustering * global_orthographic_density       -0.003388
global_clustering * global_synonyms_count              0.025801
global_frequency * global_letters_count               -0.009823
global_frequency * global_orthographic_density        -0.001258
global_frequency * global_synonyms_count               0.021422
global_letters_count * global_orthographic_density    -0.015139
global_letters_count * global_synonyms_count           0.016429
global_orthographic_density * global_synonyms_count    0.108356
dtype: float64

Regressing rel synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.01030984161793258

intercept                      0.142753
global_aoa                    -0.001839
global_clustering              0.013513
global_frequency              -0.002122
global_letters_count          -0.002859
global_orthographic_density    0.012896
global_synonyms_count          0.093334
dtype: float64

Regressing rel synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.028398091162289307

intercept                                              1.920302
global_aoa                                             0.111699
global_clustering                                      0.348096
global_frequency                                      -0.222966
global_letters_count                                  -0.058490
global_orthographic_density                           -0.037414
global_synonyms_count                                 -0.148565
global_aoa * global_clustering                         0.019274
global_aoa * global_frequency                          0.005603
global_aoa * global_letters_count                     -0.010270
global_aoa * global_orthographic_density               0.010568
global_aoa * global_synonyms_count                     0.011024
global_clustering * global_frequency                  -0.032915
global_clustering * global_letters_count              -0.029121
global_clustering * global_orthographic_density        0.003384
global_clustering * global_synonyms_count              0.061832
global_frequency * global_letters_count               -0.003170
global_frequency * global_orthographic_density         0.009503
global_frequency * global_synonyms_count               0.027167
global_letters_count * global_orthographic_density    -0.016142
global_letters_count * global_synonyms_count           0.030123
global_orthographic_density * global_synonyms_count    0.087981
dtype: float64

Regressing global synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.013735191192611906

intercept                   0.407908
rel_aoa                     0.001017
rel_clustering             -0.028413
rel_frequency               0.008767
rel_letters_count           0.002254
rel_orthographic_density    0.017155
rel_synonyms_count          0.123170
dtype: float64

Regressing global synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.025218348406944258

intercept                                        0.466351
rel_aoa                                         -0.005102
rel_clustering                                  -0.124013
rel_frequency                                    0.036636
rel_letters_count                               -0.020626
rel_orthographic_density                         0.054125
rel_synonyms_count                               0.149120
rel_aoa * rel_clustering                        -0.012986
rel_aoa * rel_frequency                         -0.006953
rel_aoa * rel_letters_count                     -0.005037
rel_aoa * rel_orthographic_density              -0.004559
rel_aoa * rel_synonyms_count                     0.021311
rel_clustering * rel_frequency                  -0.028936
rel_clustering * rel_letters_count              -0.006266
rel_clustering * rel_orthographic_density       -0.039332
rel_clustering * rel_synonyms_count              0.025052
rel_frequency * rel_letters_count               -0.005383
rel_frequency * rel_orthographic_density         0.000886
rel_frequency * rel_synonyms_count               0.012341
rel_letters_count * rel_orthographic_density    -0.013372
rel_letters_count * rel_synonyms_count          -0.004201
rel_orthographic_density * rel_synonyms_count    0.009470
dtype: float64

Regressing rel synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07003369218382849

intercept                   0.086408
rel_aoa                    -0.007913
rel_clustering              0.027148
rel_frequency               0.013272
rel_letters_count           0.004476
rel_orthographic_density    0.012009
rel_synonyms_count          0.274606
dtype: float64

Regressing rel synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.08636508428493084

intercept                                        0.134887
rel_aoa                                         -0.006392
rel_clustering                                  -0.089921
rel_frequency                                    0.036770
rel_letters_count                               -0.003998
rel_orthographic_density                         0.065835
rel_synonyms_count                               0.383807
rel_aoa * rel_clustering                         0.011908
rel_aoa * rel_frequency                         -0.001567
rel_aoa * rel_letters_count                     -0.000812
rel_aoa * rel_orthographic_density               0.007183
rel_aoa * rel_synonyms_count                    -0.001171
rel_clustering * rel_frequency                  -0.037542
rel_clustering * rel_letters_count               0.001424
rel_clustering * rel_orthographic_density       -0.010324
rel_clustering * rel_synonyms_count              0.003776
rel_frequency * rel_letters_count               -0.000221
rel_frequency * rel_orthographic_density         0.011156
rel_frequency * rel_synonyms_count               0.027567
rel_letters_count * rel_orthographic_density    -0.010577
rel_letters_count * rel_synonyms_count           0.010665
rel_orthographic_density * rel_synonyms_count    0.054014
dtype: float64

Regressing global synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.026393199628291897

intercept                      1.803501
global_aoa                     0.001362
global_clustering              0.171848
global_frequency              -0.004907
global_letters_count          -0.035308
global_orthographic_density   -0.064110
global_synonyms_count          0.032980
rel_aoa                       -0.001497
rel_clustering                -0.166881
rel_frequency                  0.020315
rel_letters_count              0.033234
rel_orthographic_density       0.079522
rel_synonyms_count             0.100311
dtype: float64

Regressing global synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12331455413515136

intercept                                                 10.214294
global_aoa                                                -0.087511
global_clustering                                          2.407691
global_frequency                                          -0.517889
global_letters_count                                       0.707250
global_orthographic_density                                0.505735
global_synonyms_count                                      0.105314
rel_aoa                                                   -0.115674
rel_clustering                                            -2.393378
rel_frequency                                              0.370372
rel_letters_count                                         -1.049527
rel_orthographic_density                                  -0.391012
rel_synonyms_count                                        -2.645134
global_aoa * global_clustering                            -0.042086
global_aoa * global_frequency                              0.008294
global_aoa * global_letters_count                         -0.020876
global_aoa * global_orthographic_density                  -0.090376
global_aoa * global_synonyms_count                         0.069448
global_aoa * rel_aoa                                      -0.013781
global_aoa * rel_clustering                                0.068874
global_aoa * rel_frequency                                -0.027480
global_aoa * rel_letters_count                             0.013709
global_aoa * rel_orthographic_density                      0.111347
global_aoa * rel_synonyms_count                           -0.009010
global_clustering * global_frequency                      -0.141122
global_clustering * global_letters_count                   0.044145
global_clustering * global_orthographic_density           -0.176422
global_clustering * global_synonyms_count                  0.110878
global_clustering * rel_aoa                               -0.006840
global_clustering * rel_clustering                         0.044403
global_clustering * rel_frequency                          0.103722
global_clustering * rel_letters_count                     -0.143078
global_clustering * rel_orthographic_density               0.149339
global_clustering * rel_synonyms_count                    -0.208140
global_frequency * global_letters_count                   -0.032547
global_frequency * global_orthographic_density            -0.107570
global_frequency * global_synonyms_count                   0.054935
global_frequency * rel_aoa                                 0.005621
global_frequency * rel_clustering                          0.135362
global_frequency * rel_frequency                           0.007579
global_frequency * rel_letters_count                       0.002625
global_frequency * rel_orthographic_density                0.086361
global_frequency * rel_synonyms_count                      0.008716
global_letters_count * global_orthographic_density         0.001327
global_letters_count * global_synonyms_count               0.049149
global_letters_count * rel_aoa                             0.008530
global_letters_count * rel_clustering                     -0.023883
global_letters_count * rel_frequency                       0.043188
global_letters_count * rel_letters_count                  -0.001731
global_letters_count * rel_orthographic_density           -0.025525
global_letters_count * rel_synonyms_count                  0.063964
global_orthographic_density * global_synonyms_count       -0.101418
global_orthographic_density * rel_aoa                      0.016823
global_orthographic_density * rel_clustering               0.196690
global_orthographic_density * rel_frequency                0.083338
global_orthographic_density * rel_letters_count            0.037610
global_orthographic_density * rel_orthographic_density    -0.059237
global_orthographic_density * rel_synonyms_count           0.356717
global_synonyms_count * rel_aoa                            0.034994
global_synonyms_count * rel_clustering                     0.120125
global_synonyms_count * rel_frequency                      0.043050
global_synonyms_count * rel_letters_count                 -0.048410
global_synonyms_count * rel_orthographic_density           0.340272
global_synonyms_count * rel_synonyms_count                 0.148368
rel_aoa * rel_clustering                                   0.003593
rel_aoa * rel_frequency                                    0.008672
rel_aoa * rel_letters_count                               -0.007050
rel_aoa * rel_orthographic_density                        -0.065995
rel_aoa * rel_synonyms_count                              -0.072076
rel_clustering * rel_frequency                            -0.097389
rel_clustering * rel_letters_count                         0.106739
rel_clustering * rel_orthographic_density                 -0.146696
rel_clustering * rel_synonyms_count                       -0.021633
rel_frequency * rel_letters_count                         -0.017300
rel_frequency * rel_orthographic_density                  -0.071382
rel_frequency * rel_synonyms_count                        -0.088226
rel_letters_count * rel_orthographic_density              -0.058770
rel_letters_count * rel_synonyms_count                    -0.012735
rel_orthographic_density * rel_synonyms_count             -0.467954
dtype: float64

Regressing rel synonyms_count with 658 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.15851405236319593

intercept                      1.294803
global_aoa                    -0.000382
global_clustering              0.101161
global_frequency              -0.021605
global_letters_count          -0.025152
global_orthographic_density   -0.005219
global_synonyms_count         -0.680437
rel_aoa                       -0.001642
rel_clustering                -0.098403
rel_frequency                  0.028670
rel_letters_count              0.023558
rel_orthographic_density       0.021866
rel_synonyms_count             0.919046
dtype: float64

Regressing rel synonyms_count with 658 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.2423851963346041

intercept                                                 11.257286
global_aoa                                                -0.098473
global_clustering                                          2.328078
global_frequency                                          -0.648027
global_letters_count                                       0.372118
global_orthographic_density                                0.556975
global_synonyms_count                                     -1.727393
rel_aoa                                                    0.104884
rel_clustering                                            -2.606071
rel_frequency                                              0.523528
rel_letters_count                                         -0.831965
rel_orthographic_density                                  -0.543751
rel_synonyms_count                                        -0.778936
global_aoa * global_clustering                            -0.037909
global_aoa * global_frequency                              0.009729
global_aoa * global_letters_count                         -0.014918
global_aoa * global_orthographic_density                  -0.092582
global_aoa * global_synonyms_count                         0.068496
global_aoa * rel_aoa                                      -0.008117
global_aoa * rel_clustering                                0.063723
global_aoa * rel_frequency                                -0.021185
global_aoa * rel_letters_count                             0.009379
global_aoa * rel_orthographic_density                      0.102873
global_aoa * rel_synonyms_count                            0.001083
global_clustering * global_frequency                      -0.140245
global_clustering * global_letters_count                   0.012358
global_clustering * global_orthographic_density           -0.140325
global_clustering * global_synonyms_count                  0.124908
global_clustering * rel_aoa                                0.011199
global_clustering * rel_clustering                         0.041443
global_clustering * rel_frequency                          0.101873
global_clustering * rel_letters_count                     -0.119375
global_clustering * rel_orthographic_density               0.111961
global_clustering * rel_synonyms_count                    -0.209491
global_frequency * global_letters_count                   -0.025867
global_frequency * global_orthographic_density            -0.086633
global_frequency * global_synonyms_count                   0.140653
global_frequency * rel_aoa                                 0.000064
global_frequency * rel_clustering                          0.162735
global_frequency * rel_frequency                           0.008251
global_frequency * rel_letters_count                       0.001938
global_frequency * rel_orthographic_density                0.073973
global_frequency * rel_synonyms_count                     -0.046177
global_letters_count * global_orthographic_density         0.005208
global_letters_count * global_synonyms_count               0.107870
global_letters_count * rel_aoa                            -0.006334
global_letters_count * rel_clustering                      0.007958
global_letters_count * rel_frequency                       0.024421
global_letters_count * rel_letters_count                   0.000269
global_letters_count * rel_orthographic_density           -0.014874
global_letters_count * rel_synonyms_count                  0.000220
global_orthographic_density * global_synonyms_count       -0.132141
global_orthographic_density * rel_aoa                      0.019363
global_orthographic_density * rel_clustering               0.141406
global_orthographic_density * rel_frequency                0.045448
global_orthographic_density * rel_letters_count            0.027278
global_orthographic_density * rel_orthographic_density    -0.051261
global_orthographic_density * rel_synonyms_count           0.297690
global_synonyms_count * rel_aoa                            0.011889
global_synonyms_count * rel_clustering                     0.059149
global_synonyms_count * rel_frequency                     -0.047056
global_synonyms_count * rel_letters_count                 -0.112505
global_synonyms_count * rel_orthographic_density           0.294860
global_synonyms_count * rel_synonyms_count                 0.131980
rel_aoa * rel_clustering                                  -0.008514
rel_aoa * rel_frequency                                    0.012598
rel_aoa * rel_letters_count                                0.003044
rel_aoa * rel_orthographic_density                        -0.051327
rel_aoa * rel_synonyms_count                              -0.070530
rel_clustering * rel_frequency                            -0.119868
rel_clustering * rel_letters_count                         0.079042
rel_clustering * rel_orthographic_density                 -0.102415
rel_clustering * rel_synonyms_count                        0.016955
rel_frequency * rel_letters_count                         -0.007649
rel_frequency * rel_orthographic_density                  -0.039569
rel_frequency * rel_synonyms_count                        -0.018554
rel_letters_count * rel_orthographic_density              -0.048425
rel_letters_count * rel_synonyms_count                     0.046834
rel_orthographic_density * rel_synonyms_count             -0.349355
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05716945177544286

intercept                      1.910933
global_aoa                    -0.028591
global_clustering              0.017735
global_frequency              -0.007572
global_letters_count          -0.062625
global_orthographic_density    0.104523
global_synonyms_count          0.044995
dtype: float64

Regressing global orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07766466684642903

intercept                                              6.527974
global_aoa                                            -0.414313
global_clustering                                      0.711932
global_frequency                                      -0.121278
global_letters_count                                  -0.360124
global_orthographic_density                            0.016747
global_synonyms_count                                  0.585387
global_aoa * global_clustering                        -0.017207
global_aoa * global_frequency                          0.018698
global_aoa * global_letters_count                      0.009159
global_aoa * global_orthographic_density               0.050745
global_aoa * global_synonyms_count                     0.011625
global_clustering * global_frequency                  -0.020074
global_clustering * global_letters_count              -0.078464
global_clustering * global_orthographic_density        0.009433
global_clustering * global_synonyms_count              0.101370
global_frequency * global_letters_count               -0.021089
global_frequency * global_orthographic_density        -0.005480
global_frequency * global_synonyms_count               0.027133
global_letters_count * global_orthographic_density    -0.019157
global_letters_count * global_synonyms_count          -0.042205
global_orthographic_density * global_synonyms_count   -0.017684
dtype: float64

Regressing rel orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.039031857434117856

intercept                     -0.588845
global_aoa                    -0.013706
global_clustering              0.017508
global_frequency               0.003119
global_letters_count          -0.050304
global_orthographic_density    0.083873
global_synonyms_count          0.049281
dtype: float64

Regressing rel orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06337344314326643

intercept                                              4.610634
global_aoa                                            -0.350162
global_clustering                                      0.899321
global_frequency                                      -0.009738
global_letters_count                                  -0.497581
global_orthographic_density                           -0.218221
global_synonyms_count                                  0.275520
global_aoa * global_clustering                        -0.011042
global_aoa * global_frequency                          0.010648
global_aoa * global_letters_count                      0.019034
global_aoa * global_orthographic_density               0.050018
global_aoa * global_synonyms_count                     0.013179
global_clustering * global_frequency                  -0.023130
global_clustering * global_letters_count              -0.106174
global_clustering * global_orthographic_density       -0.004577
global_clustering * global_synonyms_count              0.031037
global_frequency * global_letters_count               -0.032968
global_frequency * global_orthographic_density        -0.004394
global_frequency * global_synonyms_count               0.004435
global_letters_count * global_orthographic_density     0.003901
global_letters_count * global_synonyms_count          -0.020373
global_orthographic_density * global_synonyms_count   -0.047421
dtype: float64

Regressing global orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.03920524539417902

intercept                   1.583560
rel_aoa                     0.017932
rel_clustering             -0.019135
rel_frequency              -0.005860
rel_letters_count          -0.063644
rel_orthographic_density    0.122287
rel_synonyms_count          0.065616
dtype: float64

Regressing global orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0667415344696144

intercept                                        1.582807
rel_aoa                                          0.106416
rel_clustering                                   0.128270
rel_frequency                                   -0.024193
rel_letters_count                               -0.060070
rel_orthographic_density                         0.301062
rel_synonyms_count                               0.280194
rel_aoa * rel_clustering                         0.064416
rel_aoa * rel_frequency                          0.024937
rel_aoa * rel_letters_count                      0.001789
rel_aoa * rel_orthographic_density               0.061550
rel_aoa * rel_synonyms_count                     0.022030
rel_clustering * rel_frequency                   0.009137
rel_clustering * rel_letters_count              -0.081243
rel_clustering * rel_orthographic_density        0.011525
rel_clustering * rel_synonyms_count              0.027092
rel_frequency * rel_letters_count                0.015364
rel_frequency * rel_orthographic_density         0.046477
rel_frequency * rel_synonyms_count               0.085822
rel_letters_count * rel_orthographic_density    -0.057130
rel_letters_count * rel_synonyms_count          -0.041630
rel_orthographic_density * rel_synonyms_count   -0.116221
dtype: float64

Regressing rel orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08087860817193204

intercept                  -0.536905
rel_aoa                     0.021936
rel_clustering              0.005973
rel_frequency               0.040522
rel_letters_count          -0.053026
rel_orthographic_density    0.196525
rel_synonyms_count          0.043557
dtype: float64

Regressing rel orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10567025372589889

intercept                                       -0.489481
rel_aoa                                          0.072181
rel_clustering                                   0.125574
rel_frequency                                    0.059459
rel_letters_count                               -0.053964
rel_orthographic_density                         0.359514
rel_synonyms_count                               0.132495
rel_aoa * rel_clustering                         0.054662
rel_aoa * rel_frequency                          0.007502
rel_aoa * rel_letters_count                      0.013061
rel_aoa * rel_orthographic_density               0.084859
rel_aoa * rel_synonyms_count                     0.037891
rel_clustering * rel_frequency                  -0.004100
rel_clustering * rel_letters_count              -0.085514
rel_clustering * rel_orthographic_density       -0.007534
rel_clustering * rel_synonyms_count             -0.015123
rel_frequency * rel_letters_count                0.007284
rel_frequency * rel_orthographic_density         0.050270
rel_frequency * rel_synonyms_count               0.053866
rel_letters_count * rel_orthographic_density    -0.041993
rel_letters_count * rel_synonyms_count          -0.042021
rel_orthographic_density * rel_synonyms_count   -0.144912
dtype: float64

Regressing global orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06876915673653672

intercept                      2.879508
global_aoa                    -0.077098
global_clustering              0.061148
global_frequency              -0.066085
global_letters_count          -0.075947
global_orthographic_density    0.252385
global_synonyms_count         -0.020472
rel_aoa                        0.072190
rel_clustering                -0.045218
rel_frequency                  0.066677
rel_letters_count              0.010221
rel_orthographic_density      -0.183216
rel_synonyms_count             0.079020
dtype: float64

Regressing global orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17481056478171408

intercept                                                -1.234056
global_aoa                                               -0.442375
global_clustering                                         0.242537
global_frequency                                         -0.650590
global_letters_count                                      0.310463
global_orthographic_density                               6.812519
global_synonyms_count                                     5.225010
rel_aoa                                                  -0.246164
rel_clustering                                            1.847984
rel_frequency                                             1.016248
rel_letters_count                                         0.436679
rel_orthographic_density                                 -6.281536
rel_synonyms_count                                        3.342462
global_aoa * global_clustering                           -0.074583
global_aoa * global_frequency                            -0.005293
global_aoa * global_letters_count                         0.011354
global_aoa * global_orthographic_density                 -0.038032
global_aoa * global_synonyms_count                        0.090528
global_aoa * rel_aoa                                     -0.005415
global_aoa * rel_clustering                              -0.010436
global_aoa * rel_frequency                               -0.006849
global_aoa * rel_letters_count                            0.025063
global_aoa * rel_orthographic_density                     0.127953
global_aoa * rel_synonyms_count                          -0.195105
global_clustering * global_frequency                     -0.148675
global_clustering * global_letters_count                 -0.082235
global_clustering * global_orthographic_density           0.969493
global_clustering * global_synonyms_count                 0.296159
global_clustering * rel_aoa                              -0.002154
global_clustering * rel_clustering                       -0.026164
global_clustering * rel_frequency                         0.091552
global_clustering * rel_letters_count                     0.187206
global_clustering * rel_orthographic_density             -0.703315
global_clustering * rel_synonyms_count                    0.066692
global_frequency * global_letters_count                  -0.060966
global_frequency * global_orthographic_density           -0.015873
global_frequency * global_synonyms_count                 -0.089747
global_frequency * rel_aoa                                0.031132
global_frequency * rel_clustering                         0.056591
global_frequency * rel_frequency                         -0.011813
global_frequency * rel_letters_count                      0.075827
global_frequency * rel_orthographic_density               0.093385
global_frequency * rel_synonyms_count                    -0.244001
global_letters_count * global_orthographic_density       -0.021470
global_letters_count * global_synonyms_count             -0.591736
global_letters_count * rel_aoa                            0.020823
global_letters_count * rel_clustering                    -0.068117
global_letters_count * rel_frequency                     -0.025415
global_letters_count * rel_letters_count                 -0.057070
global_letters_count * rel_orthographic_density           0.058047
global_letters_count * rel_synonyms_count                 0.341159
global_orthographic_density * global_synonyms_count      -0.224391
global_orthographic_density * rel_aoa                    -0.030787
global_orthographic_density * rel_clustering             -0.964571
global_orthographic_density * rel_frequency              -0.094050
global_orthographic_density * rel_letters_count          -0.097505
global_orthographic_density * rel_orthographic_density   -0.116792
global_orthographic_density * rel_synonyms_count         -0.095789
global_synonyms_count * rel_aoa                          -0.054131
global_synonyms_count * rel_clustering                   -0.278661
global_synonyms_count * rel_frequency                     0.117855
global_synonyms_count * rel_letters_count                 0.543763
global_synonyms_count * rel_orthographic_density          0.331083
global_synonyms_count * rel_synonyms_count               -0.090417
rel_aoa * rel_clustering                                  0.112398
rel_aoa * rel_frequency                                  -0.008083
rel_aoa * rel_letters_count                              -0.020425
rel_aoa * rel_orthographic_density                        0.059199
rel_aoa * rel_synonyms_count                              0.128796
rel_clustering * rel_frequency                           -0.029569
rel_clustering * rel_letters_count                       -0.164275
rel_clustering * rel_orthographic_density                 0.629337
rel_clustering * rel_synonyms_count                       0.005754
rel_frequency * rel_letters_count                        -0.001168
rel_frequency * rel_orthographic_density                  0.037671
rel_frequency * rel_synonyms_count                        0.214076
rel_letters_count * rel_orthographic_density             -0.131357
rel_letters_count * rel_synonyms_count                   -0.383863
rel_orthographic_density * rel_synonyms_count            -0.193363
dtype: float64

Regressing rel orthographic_density with 537 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11353228682215344

intercept                      1.830816
global_aoa                    -0.058650
global_clustering              0.060314
global_frequency              -0.039901
global_letters_count          -0.036696
global_orthographic_density   -0.488844
global_synonyms_count          0.006727
rel_aoa                        0.053886
rel_clustering                -0.023977
rel_frequency                  0.060560
rel_letters_count             -0.028155
rel_orthographic_density       0.633224
rel_synonyms_count             0.028467
dtype: float64

Regressing rel orthographic_density with 537 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2176692582646258

intercept                                                 0.202599
global_aoa                                                0.021127
global_clustering                                         1.428194
global_frequency                                         -0.514519
global_letters_count                                     -0.155635
global_orthographic_density                               6.606694
global_synonyms_count                                     3.682612
rel_aoa                                                  -0.530904
rel_clustering                                            0.594901
rel_frequency                                             0.755186
rel_letters_count                                         0.518573
rel_orthographic_density                                 -5.831426
rel_synonyms_count                                        3.726978
global_aoa * global_clustering                           -0.051867
global_aoa * global_frequency                            -0.019887
global_aoa * global_letters_count                         0.003558
global_aoa * global_orthographic_density                 -0.078822
global_aoa * global_synonyms_count                        0.044655
global_aoa * rel_aoa                                     -0.000404
global_aoa * rel_clustering                              -0.042805
global_aoa * rel_frequency                                0.009018
global_aoa * rel_letters_count                            0.040663
global_aoa * rel_orthographic_density                     0.170711
global_aoa * rel_synonyms_count                          -0.119874
global_clustering * global_frequency                     -0.190037
global_clustering * global_letters_count                 -0.157501
global_clustering * global_orthographic_density           0.780053
global_clustering * global_synonyms_count                 0.135631
global_clustering * rel_aoa                              -0.011964
global_clustering * rel_clustering                       -0.010117
global_clustering * rel_frequency                         0.133628
global_clustering * rel_letters_count                     0.235698
global_clustering * rel_orthographic_density             -0.486354
global_clustering * rel_synonyms_count                    0.220106
global_frequency * global_letters_count                  -0.055637
global_frequency * global_orthographic_density           -0.147859
global_frequency * global_synonyms_count                 -0.065540
global_frequency * rel_aoa                                0.044910
global_frequency * rel_clustering                         0.086764
global_frequency * rel_frequency                         -0.008094
global_frequency * rel_letters_count                      0.073301
global_frequency * rel_orthographic_density               0.219357
global_frequency * rel_synonyms_count                    -0.225174
global_letters_count * global_orthographic_density       -0.007098
global_letters_count * global_synonyms_count             -0.454478
global_letters_count * rel_aoa                            0.026651
global_letters_count * rel_clustering                     0.058811
global_letters_count * rel_frequency                     -0.002212
global_letters_count * rel_letters_count                 -0.047413
global_letters_count * rel_orthographic_density           0.062487
global_letters_count * rel_synonyms_count                 0.253088
global_orthographic_density * global_synonyms_count      -0.233356
global_orthographic_density * rel_aoa                    -0.037714
global_orthographic_density * rel_clustering             -0.709937
global_orthographic_density * rel_frequency               0.053824
global_orthographic_density * rel_letters_count          -0.073488
global_orthographic_density * rel_orthographic_density   -0.081248
global_orthographic_density * rel_synonyms_count         -0.011073
global_synonyms_count * rel_aoa                          -0.058410
global_synonyms_count * rel_clustering                   -0.074817
global_synonyms_count * rel_frequency                     0.087564
global_synonyms_count * rel_letters_count                 0.408864
global_synonyms_count * rel_orthographic_density          0.262855
global_synonyms_count * rel_synonyms_count               -0.078899
rel_aoa * rel_clustering                                  0.114092
rel_aoa * rel_frequency                                  -0.026669
rel_aoa * rel_letters_count                              -0.036462
rel_aoa * rel_orthographic_density                        0.052108
rel_aoa * rel_synonyms_count                              0.111146
rel_clustering * rel_frequency                           -0.066568
rel_clustering * rel_letters_count                       -0.262844
rel_clustering * rel_orthographic_density                 0.350923
rel_clustering * rel_synonyms_count                      -0.212040
rel_frequency * rel_letters_count                        -0.026462
rel_frequency * rel_orthographic_density                 -0.089922
rel_frequency * rel_synonyms_count                        0.193847
rel_letters_count * rel_orthographic_density             -0.150087
rel_letters_count * rel_synonyms_count                   -0.274306
rel_orthographic_density * rel_synonyms_count            -0.177933
dtype: float64