Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also write each figure to disk.
SAVE_FIGURES = False

# Features shown in the "paper" variants of the figures.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not referenced in the part of the notebook reviewed here.
N_COMPONENTS = 3

# Number of bins first attempted when binning source feature values; the
# plotting helpers fall back to fewer bins if binning fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose mined substitutions are analysed below.
model = Model(time=Time.continuous, source=Source.all, past=Past.all,
              durl=Durl.exclude_past, max_distance=2)
data = []

# First pass: collect only the ids of the matching substitutions.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# Second pass: load each substitution in its own session scope and record
# one row per (substitution, feature).
median_relative = {'sentence_relative': 'median'}
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, **median_relative)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                feature=feature,
                source=source,
                source_rel=source_rel,
                destination=destination,
                destination_rel=destination_rel,
                h0=substitution.feature_average(feature),
                h0_rel=substitution.feature_average(
                    feature, **median_relative),
                h0n=substitution.feature_average(
                    feature, source_synonyms=True),
                h0n_rel=substitution.feature_average(
                    feature, source_synonyms=True, **median_relative),
            ))

original_variations = pd.DataFrame(data)
# The list of dicts is no longer needed once the DataFrame is built.
del data


Got 4941 substitutions for model Model(time=Time.continuous, source=Source.all, past=Past.all, durl=Durl.exclude_past, max_distance=2)
100% (4941 of 4941) |######################| Elapsed Time: 0:01:39 Time: 0:01:39

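Compute cluster averages (so that the many substitutions coming from a single cluster are not counted as independent observations, which would make the confidence intervals artificially narrow) and crop the data so that we have acceptable CIs.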


In [4]:
# Columns holding feature values: observed ones and those expected under
# the two null hypotheses, both absolute and sentence-relative.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average in two stages: first over rows sharing the same destination,
# occurrence, position and feature, then over each cluster.
# The GroupBy is indexed with a *list* of columns (tuple-style indexing is
# rejected by recent pandas), and the grouping key 'feature' is left out of
# the selection: it cannot be averaged and is restored by `as_index=False`.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# Note that the 'variation' column computed above is not blanked here.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
def _feature_doc(feature):
    """Return the docstring of the transformed feature, used as sort key."""
    return Substitution._transformed_feature(feature).__doc__


# Features sorted by the docstring of their transformed feature function.
ordered_features = sorted(Substitution.__features__, key=_feature_doc)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted first with absolute (global) feature values, then with sentence-relative feature values; in each case we use fixed-width bins first, then quantile bins.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against H_0 and H_00.

    For each bin, the mean of `values` is compared to the mean of the null
    values in that bin. The difference is flagged when the bin mean falls
    outside `null mean +/- CI`, where the CI is the Student-t confidence
    half-width of the mean of `values` at levels .05, .01 and .001
    (printed as `*`, `**`, `***`; `ns.` when not significant at .05).

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like
        0-based bin index of each observation. May be float-typed and
        contain NaN for observations that were not assigned to a bin.
    h0, h0n : array-like
        Null-hypothesis values (H_0 and H_00) for each observation.
    values : array-like
        Observed values for each observation.

    Returns
    -------
    None
        The table is written to standard output.
    """
    # NaNs make `bins` float-typed; ignore them and cast so the count can be
    # used with `range()` and as an array size.
    bin_count = int(np.nanmax(bins)) + 1
    alphas = [.05, .01, .001]

    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_values = np.zeros(bin_count)
        bin_nulls = np.zeros(bin_count)
        cis = np.zeros((bin_count, len(alphas)))

        for i in range(bin_count):
            indices = bins == i
            n = indices.sum()
            # Standard error of the mean: the ddof=1 standard deviation is
            # divided by sqrt(n) (dividing by sqrt(n - 1) would apply the
            # small-sample correction twice).
            sem = values[indices].std(ddof=1) / np.sqrt(n)

            bin_values[i] = values[indices].mean()
            bin_nulls[i] = nulls[indices].mean()
            for j, alpha in enumerate(alphas):
                cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

        print(null_name + ' |', end='')
        # differences[i, j] is True when bin i differs from the null at
        # significance level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level reached decides the number of stars.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the binned destination feature against the source feature.

    Draws, on the current matplotlib axes, the mean destination value per
    bin of source values with its 95% confidence band, the per-bin H_0 and
    H_00 means, and the `y = x` line. It then prints the per-bin
    significance table via `print_significance`.

    Keyword arguments
    -----------------
    data : DataFrame
        Rows for a single feature, with columns `source`, `destination`,
        `h0`, `h0n` (and their `_rel` counterparts when `relative` is set)
        plus the `feature_field` column.
    color : optional
        Line colour (default `'blue'`).
    relative : bool, optional
        Use the sentence-relative (`_rel`) columns (default False).
    quantiles : bool, optional
        Bin with `pd.qcut` instead of fixed-width `pd.cut` (default False).
    feature_field : str, optional
        Column holding the feature name (default `'feature'`).

    Raises
    ------
    ValueError
        If the source values cannot be binned with any bin count.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and use fewer bins each
    # time the cut function rejects the request with a ValueError.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail explicitly rather than with a NameError on `bins` below.
        raise ValueError("Could not bin source values with any bin count "
                         "from {} down to 1".format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Plot one facet per feature with `plot_function`, optionally saving it.

    Parameters
    ----------
    data : DataFrame
        Data to plot; only rows whose `feature_field` is in `features`
        are kept.
    features : list of str
        Features to plot, in facet order.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Function passed to `FacetGrid.map_dataframe`.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, optional
        Column holding the feature name (default `'feature'`).
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid a shared mutable default argument.
    plot_kws = {} if plot_kws is None else plot_kws

    grid_kws = dict(data=data[data[feature_field].isin(features)],
                    sharex=False, sharey=False,
                    col=feature_field, hue=feature_field,
                    col_order=features, hue_order=features,
                    col_wrap=3, aspect=1.5,
                    # Passed explicitly because the default differs between
                    # seaborn releases (True in older ones, False in newer).
                    dropna=True)
    try:
        # Newer seaborn names the facet height `height`.
        g = sb.FacetGrid(height=3, **grid_kws)
    except TypeError:
        # Older seaborn only knows the former name, `size`.
        g = sb.FacetGrid(size=3, **grid_kws)

    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title set above is the raw feature name; replace it with the
        # docstring of the corresponding transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the binned bias `destination - H_00` for one feature on `ax`.

    The source values are binned; for each bin the mean destination value
    minus the mean H_00 value is plotted, divided by the absolute mean of
    the per-bin H_0 values. The x axis is `np.linspace(0, 1, bin_count)`,
    so all features share the same horizontal range.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : DataFrame
        Rows for a single feature, with columns `feature`, `source`,
        `destination`, `h0`, `h0n` (and `_rel` counterparts when `relative`
        is set).
    color :
        Line colour.
    ci : bool, optional
        Also draw the 95% confidence band (default True).
    relative : bool, optional
        Use the sentence-relative (`_rel`) columns (default False).
    quantiles : bool, optional
        Bin with `pd.qcut` instead of fixed-width `pd.cut` (default False).

    Raises
    ------
    ValueError
        If the source values cannot be binned with any bin count.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning: start from BIN_COUNT bins and use fewer bins each
    # time the cut function rejects the request with a ValueError.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail explicitly rather than with a NameError on `bins` below.
        raise ValueError("Could not bin source values with any bin count "
                         "from {} down to 1".format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    scale = abs(h0s.mean())
    ax.plot(np.linspace(0, 1, bin_count),
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(np.linspace(0, 1, bin_count),
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single axes and return the axes.

    Parameters
    ----------
    data : DataFrame
        Data with a `feature` column; rows containing any NaN are dropped
        before being handed to `plot_function`.
    features : list of str
        Features to plot, one colour each.
    filename : str
        Name inserted into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Seaborn palette name used to pick one colour per feature.
    plot_function : callable
        Called as `plot_function(ax, feature_data, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Axes title and labels.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    matplotlib axes
        The axes the curves were drawn on.
    """
    # Avoid a shared mutable default argument.
    plot_kws = {} if plot_kws is None else plot_kws

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature_color, feature in zip(palette, features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=feature_color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, global values, fixed-width bins.
plot_grid(data=variations, features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | ns. |
H_00 | *** | ns. | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *   | *** | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# All features, global values, fixed-width bins, without confidence bands.
plot_overlay(data=variations, features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl', plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False});



In [13]:
# Paper features, global values, fixed-width bins.
plot_grid(data=variations, features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *   | *** | *   |

In [14]:
# Paper features, global values, fixed-width bins, with confidence bands.
bias_ax = plot_overlay(
    data=variations, features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep', plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'))
# Fix the vertical range of the overlay.
bias_ax.set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, global values, quantile bins.
plot_grid(data=variations, features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |
------------------
H_0  | *** | ns. |
H_00 | *** | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | ns. | *** |

In [16]:
# All features, global values, quantile bins, without confidence bands.
plot_overlay(data=variations, features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl', plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws={'ci': False, 'quantiles': True});



In [17]:
# Paper features, global values, quantile bins.
plot_grid(data=variations, features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws={'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | ns. | *** |

In [18]:
# Paper features, global values, quantile bins, with confidence bands.
bias_ax = plot_overlay(
    data=variations, features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep', plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'quantiles': True})
# Fix the vertical range of the overlay.
bias_ax.set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative values, fixed-width bins.
plot_grid(data=variations, features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | **  | ns. | ns. | **  |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | **  | ns. |
H_00 | ns. | *** | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | ns. | *   |

In [20]:
# All features, sentence-relative values, fixed-width bins, no confidence
# bands.
plot_overlay(data=variations, features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl', plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True});



In [21]:
# Paper features, sentence-relative values, fixed-width bins.
plot_grid(data=variations, features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | *** | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | ns. |

In [22]:
# Paper features, sentence-relative values, fixed-width bins, with
# confidence bands.
bias_ax = plot_overlay(
    data=variations, features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep', plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True})
# Fix the vertical range of the overlay.
bias_ax.set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative values, quantile bins.
plot_grid(data=variations, features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | **  | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | ns. | *** |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | ns. | *   |

In [24]:
# All features, sentence-relative values, quantile bins, no confidence
# bands.
plot_overlay(data=variations, features=ordered_features,
             filename='all-variations_bias-quantilebins_sentencerel',
             palette_name='husl', plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features, sentence-relative values, quantile bins.
plot_grid(data=variations, features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
          ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
          plot_kws={'relative': True, 'quantiles': True})


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | ns. | *** |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | ns. | **  |

In [26]:
# Paper features, sentence-relative values, quantile bins, with confidence
# bands. Uses the 'deep' palette, like the other PAPER_FEATURES overlays
# in this notebook, so that each feature keeps the same colour across the
# paper figures.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see how absolute and sentence-relative feature values interact, i.e. how their effects combine. In particular, we want to know which effect dominates: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that looks huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Streamplot of substitutions in (absolute, relative) feature space.

    Meant to be used through `FacetGrid.map_dataframe`, which passes the
    facet's rows as the `data` keyword argument.

    Keyword arguments:
    data -- DataFrame with columns `source`, `source_rel`, `destination`,
            `destination_rel` and `h0`.
    color -- base colour of the two reference lines (default 'blue').
    bin_count -- number of equal-width bins along each axis (default 4).

    Source words are binned on their absolute feature value (x axis) and on
    their sentence-relative feature value (y axis). In each grid cell, the
    arrow is the mean (destination - source) displacement and the colour is
    the mean euclidean length of the individual displacements. A horizontal
    line is drawn at y = 0 and a vertical dashed line at the first `h0`
    value of the data.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Arrays are indexed [y, x], as plt.streamplot
    # expects rows to run along the y axis.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask once and reuse it for all three statistics.
            in_cell = (x_bins == x) & (y_bins == y)
            delta = dest[in_cell] - source[in_cell]
            delta_rel = dest_rel[in_cell] - source_rel[in_cell]
            # An empty cell yields NaN here (mean of an empty selection).
            u_values[y, x] = delta.mean()
            v_values[y, x] = delta_rel.mean()
            strength[y, x] = np.sqrt(delta ** 2 + delta_rel ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One facet per feature (three per row), each with its own axis ranges.
# NOTE(review): FacetGrid's `size` argument was renamed `height` in later
# seaborn releases -- confirm against the installed version before upgrading.
g = sb.FacetGrid(data=variations,
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=ordered_features, hue_order=ordered_features)
g.map_dataframe(plot_stream)
# Titles are first set to the raw feature name; they are replaced below.
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    # Swap the raw feature name (read back from the title) for the docstring
    # of the corresponding transformed-feature function.
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots as for all features, restricted to PAPER_FEATURES.
# `isin` is the vectorised equivalent of mapping a membership lambda over
# the column.
g = sb.FacetGrid(data=variations[variations['feature']
                                 .isin(PAPER_FEATURES)],
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES)
g.map_dataframe(plot_stream)
# Titles are first set to the raw feature name; they are replaced below.
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
    # Swap the raw feature name (read back from the title) for the docstring
    # of the corresponding transformed-feature function.
    ax.set_title(Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA.
pcafeatures = tuple(sorted(Substitution.__features__))
# One row per cluster, one column per feature, holding the feature variation.
pcavariations = variations.pivot(index='cluster_id',
                                 columns='feature', values='variation')
# Keep only the clusters for which every feature variation is defined.
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Displayed as the cell's output: the loadings of the first components.
# NOTE(review): labelling the columns with `pcafeatures` assumes the pivot's
# columns come out in the same (alphabetical) order -- confirm.
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)])


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.54452287  0.16544799  0.07839186  0.07265262  0.03478348  0.03030432
  0.0186419   0.01809321  0.01630228  0.00879147]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.455705 0.273060 -0.084683 0.238851 0.221072 -0.442019 0.217481 0.275208 -0.425249 0.279935 -0.164213 0.009609
Component-1 0.311235 -0.409973 0.156801 -0.307205 -0.267830 -0.423266 0.168247 -0.309101 -0.415777 0.206097 -0.161128 0.006865
Component-2 0.809499 0.452996 -0.138917 0.180863 0.180592 -0.096544 0.024420 0.132951 -0.067832 0.131008 -0.005658 -0.074125

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# Build one record per (substitution, component) for the first N_COMPONENTS
# components of the PCA fitted above.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                # 'h0' / 'h0n': component averages without / with
                # source_synonyms=True (presumably the H_0 and H_00 null
                # values discussed in the text -- confirm).
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

original_component_variations = pd.DataFrame(data)
# Drop the temporary list of records now that the DataFrame is built.
del data


100% (4941 of 4941) |######################| Elapsed Time: 0:01:22 Time: 0:01:22

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows sharing the same
# (destination_id, occurrence, position, component), then over each
# (cluster_id, component) group.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated in later pandas releases.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One facet per PCA component, each with its own axis ranges.
# NOTE(review): FacetGrid's `size` argument was renamed `height` in later
# seaborn releases -- confirm against the installed version before upgrading.
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
# plot_variation is defined earlier in the notebook; feature_field tells it
# which column identifies the facet (here 'component' rather than a feature).
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | ns. | ns. | **  | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | ns. | ns. | ns. |

4.2 On a subset of relevant features


In [34]:
# Features on which the restricted PCA below is computed.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA on the variations of the relevant features only.
pcafeatures = tuple(sorted(relevant_features))
# `isin` is the vectorised equivalent of mapping a membership lambda over
# the column. The pivot gives one row per cluster, one column per feature.
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')
# Keep only the clusters for which every feature variation is defined.
pcavariations = pcavariations.dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show the number of components found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Displayed as the cell's output: the loadings of every component.
# NOTE(review): labelling the columns with `pcafeatures` assumes the pivot's
# columns come out in the same (alphabetical) order -- confirm.
pd.DataFrame(pca.components_,
             columns=pcafeatures,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)])


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.6886474   0.18007583]

Out[35]:
aoa frequency letters_count
Component-0 -0.738795 0.383435 -0.554220
Component-1 0.405921 -0.403283 -0.820116

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# Build one record per (substitution, component) for every component of the
# PCA fitted on the relevant features.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                # 'h0' / 'h0n': component averages without / with
                # source_synonyms=True (presumably the H_0 and H_00 null
                # values discussed in the text -- confirm).
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

# NOTE(review): this rebinds original_component_variations, replacing the
# frame built for the all-features PCA.
original_component_variations = pd.DataFrame(data)
# Drop the temporary list of records now that the DataFrame is built.
del data


100% (4941 of 4941) |######################| Elapsed Time: 0:00:34 Time: 0:00:34

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows sharing the same
# (destination_id, occurrence, position, component), then over each
# (cluster_id, component) group.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated in later pandas releases.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One facet per PCA component, each with its own axis ranges.
# NOTE(review): FacetGrid's `size` argument was renamed `height` in later
# seaborn releases -- confirm against the installed version before upgrading.
g = sb.FacetGrid(data=component_variations, col='component', col_wrap=3,
                 sharex=False, sharey=False, hue='component',
                 aspect=1.5, size=3)
# plot_variation is defined earlier in the notebook; feature_field tells it
# which column identifies the facet (here 'component' rather than a feature).
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if not legend:
        # Skip if nothing was plotted on these axes.
        continue
    frame = legend.get_frame()
    frame.set_facecolor('#f2f2f2')
    frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Report how many words each relevant feature is defined on. Calling a
# transformed feature with no argument returns a sized collection
# (presumably the words it covers -- confirm against Substitution).
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(Substitution
                               ._transformed_feature(feature)())))

# Compute the number of words that have all relevant_features defined.
# First gather the union of the words known to any of the features.
words = set()
for tfeature in [Substitution._transformed_feature(feature)
                 for feature in relevant_features]:
    words.update(tfeature())

# Then build one row per word with its value for each relevant feature.
data = dict((feature, []) for feature in relevant_features)
words_list = []
for word in words:
    words_list.append(word)
    for feature in relevant_features:
        data[feature].append(Substitution
                             ._transformed_feature(feature)(word))
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

# Rows surviving dropna() are the words for which no feature value is
# missing.
print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

# Compare the number of distinct clusters in `variations` with the number of
# rows the PCA was actually fitted on.
print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 2334 (cluster-unique) substitutions, but the PCA is in fact computed on 1874 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies as (label, function) pairs; only fixed-width bins are
# active, quantile binning (pd.qcut) is disabled.
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
# (label, column suffix) pairs; the suffix is appended to the 'source' and
# 'destination' column names to select global or sentence-relative values.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the significance marker for p-value `p`.

    '***' below .001, ' **' below .01, '  *' below .05, and 'ns.' for
    anything else. Markers are padded to three characters.
    """
    # Thresholds are checked from the strictest to the loosest.
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for upper_bound, marker in thresholds:
        if p < upper_bound:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width/quantiles). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# One-way Anova of feature2's destination value across bins of feature1's
# source value, for every pair of PAPER_FEATURES and every combination of
# global / sentence-relative values.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            # The source values and their binning do not depend on rel2,
            # so they are computed once here instead of once per rel2.
            source = variations.pivot(
                index='cluster_id', columns='feature',
                values='source' + rel1)[feature1]

            # Compute binning, retrying with fewer bins whenever the cut
            # function rejects the requested count.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Without this, a stale `source_bins` from a previous
                # combination would silently be reused.
                raise ValueError(
                    "Could not bin {} source values of feature '{}'"
                    .format(rel1_label, feature1))

            for (rel2_label, rel2) in rels:
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]

                # One group of destination values per source bin.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way Anova of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of PAPER_FEATURES and every
# combination of global / sentence-relative values.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            # The source values and their binning do not depend on rel2,
            # so they are computed once here instead of once per rel2.
            source = variations.pivot(
                index='cluster_id', columns='feature',
                values='source' + rel1)[feature1]

            # Compute binning, retrying with fewer bins whenever the cut
            # function rejects the requested count.
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass
            else:
                # Without this, a stale `source_bins` from a previous
                # combination would silently be reused.
                raise ValueError(
                    "Could not bin {} source values of feature '{}'"
                    .format(rel1_label, feature1))

            for (rel2_label, rel2) in rels:
                # Despite its name, `destination` holds feature2's
                # variation: destination value minus source value.
                destination = variations.pivot(
                    index='cluster_id', columns='feature',
                    values='destination' + rel2)[feature2]\
                    - variations.pivot(
                    index='cluster_id', columns='feature',
                    values='source' + rel2)[feature2]

                # One group of variation values per source bin.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Maps a "relative?" flag to a (name, column suffix) pair.
# NOTE(review): this rebinds `rels`, which an earlier cell defines as a list
# of (label, suffix) pairs that the Anova cells iterate over; once this cell
# has run, those cells cannot be re-run without first re-running the cell
# that defines the list.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Linearly regress a destination feature on source features and print
    the fit.

    One observation is built per `cluster_id`. Source values are pivoted
    out of `data` with `pd.pivot_table` (so several rows for the same
    cluster/feature are aggregated with its default aggregation), and the
    destination values of `target` are pivoted out of the same `data`.
    Clusters with a missing source or destination value are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame. The columns read here are `cluster_id`,
        `feature`, `source` / `source_rel` and
        `destination` / `destination_rel`.
    features : iterable of str
        Feature names used as regressors (they are sorted before use).
    target : str
        Feature whose destination value is regressed.
    source_rel : bool or 'both'
        `False` uses the `source` column, `True` uses `source_rel`,
        `'both'` uses the two sets of regressors together.
    dest_rel : bool
        `False` regresses `destination`, `True` regresses
        `destination_rel`.
    interactions : bool
        If true, add the pairwise products of the regressors (degree-2,
        interaction-only polynomial features).

    Returns
    -------
    None
        The number of measures, R^2 and coefficients are printed.

    Raises
    ------
    ValueError
        If `source_rel` is not a bool or `'both'`, or if `dest_rel` is
        not a bool.
    """

    # Validate with isinstance rather than `in [True, False, 'both']`:
    # since `1 == True`, the membership test would let integers through
    # and they would then silently be treated like 'both'.
    is_both = isinstance(source_rel, str) and source_rel == 'both'
    if not (isinstance(source_rel, bool) or is_both):
        raise ValueError("source_rel must be True, False or 'both', "
                         "got {!r}".format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}"
                         .format(dest_rel))

    # Process source/destination relativeness arguments.
    source_rels = [False, True] if is_both else [source_rel]
    dest_rel_name, dest_suffix = rels[dest_rel]

    features = tuple(sorted(features))
    # (column, feature) keys into the pivoted source table, and the
    # matching human-readable regressor names, in the same order.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rels
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rels
                     for feature in features]

    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rels],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the target from `data` (not from a notebook-level global), and
    # align it on the source clusters; clusters without a target value
    # become NaN and are dropped.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_suffix)\
        .reindex(source.index)[target].dropna()
    source_values = source.loc[destination.index].values
    destination_values = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source_values = poly.fit_transform(source_values)
        # Name each generated column after the regressors it multiplies;
        # the all-zero powers row is the constant (bias) column.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress. With interactions the polynomial features already contain
    # a constant column, so the model must not fit a second intercept.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source_values, destination_values)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source_values),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source_values, destination_values)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # `Series.append` was removed in pandas 2.0; `pd.concat` is the
        # equivalent that works across versions.
        intercept = pd.Series(index=['intercept'],
                              data=[linreg.intercept_])
        coeffs = pd.concat([intercept, coeffs])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# For each target feature, run every combination of source relativeness
# (global, sentence-relative, both) and destination relativeness, first
# without and then with pairwise interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel, interactions in product(
            [False, True, 'both'], [False, True], [False, True]):
        regress(variations, PAPER_FEATURES, target,
                source_rel=source_rel, dest_rel=dest_rel,
                interactions=interactions)
        print()


----------------------------------------------------------------------
Regressing global frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07097457733544288

intercept                      5.700915
global_aoa                     0.022391
global_clustering              0.064560
global_frequency               0.364387
global_letters_count          -0.011806
global_orthographic_density   -0.009881
global_synonyms_count          0.037491
dtype: float64

Regressing global frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08162082829467765

intercept                                              9.127081
global_aoa                                            -0.063240
global_clustering                                      0.661007
global_frequency                                       0.658612
global_letters_count                                  -0.844134
global_orthographic_density                           -0.389673
global_synonyms_count                                  0.013164
global_aoa * global_clustering                         0.011439
global_aoa * global_frequency                          0.003813
global_aoa * global_letters_count                      0.017022
global_aoa * global_orthographic_density               0.004137
global_aoa * global_synonyms_count                     0.006244
global_clustering * global_frequency                   0.027098
global_clustering * global_letters_count              -0.137757
global_clustering * global_orthographic_density       -0.116381
global_clustering * global_synonyms_count              0.205381
global_frequency * global_letters_count               -0.015368
global_frequency * global_orthographic_density        -0.068646
global_frequency * global_synonyms_count               0.070999
global_letters_count * global_orthographic_density     0.052235
global_letters_count * global_synonyms_count           0.052002
global_orthographic_density * global_synonyms_count    0.167800
dtype: float64

Regressing rel frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.03642724815872367

intercept                     -6.328078
global_aoa                     0.046679
global_clustering              0.047828
global_frequency               0.308175
global_letters_count           0.045049
global_orthographic_density   -0.021539
global_synonyms_count          0.110623
dtype: float64

Regressing rel frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.04448761798858769

intercept                                             -7.688549
global_aoa                                             0.121629
global_clustering                                     -0.227419
global_frequency                                       0.619394
global_letters_count                                  -0.344112
global_orthographic_density                            0.176638
global_synonyms_count                                 -0.218569
global_aoa * global_clustering                         0.038821
global_aoa * global_frequency                          0.005322
global_aoa * global_letters_count                      0.015864
global_aoa * global_orthographic_density              -0.010150
global_aoa * global_synonyms_count                     0.055749
global_clustering * global_frequency                   0.037084
global_clustering * global_letters_count              -0.068336
global_clustering * global_orthographic_density        0.013365
global_clustering * global_synonyms_count              0.242419
global_frequency * global_letters_count               -0.019248
global_frequency * global_orthographic_density        -0.044080
global_frequency * global_synonyms_count               0.109912
global_letters_count * global_orthographic_density     0.058798
global_letters_count * global_synonyms_count           0.027300
global_orthographic_density * global_synonyms_count    0.166200
dtype: float64

Regressing global frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.043506525830809606

intercept                   9.499595
rel_aoa                     0.074820
rel_clustering             -0.063627
rel_frequency               0.234878
rel_letters_count          -0.027694
rel_orthographic_density    0.018987
rel_synonyms_count          0.019825
dtype: float64

Regressing global frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.054369282743936154

intercept                                        9.419941
rel_aoa                                          0.154328
rel_clustering                                  -0.065480
rel_frequency                                    0.191620
rel_letters_count                               -0.006225
rel_orthographic_density                        -0.041581
rel_synonyms_count                               0.376120
rel_aoa * rel_clustering                        -0.002482
rel_aoa * rel_frequency                          0.048225
rel_aoa * rel_letters_count                      0.016696
rel_aoa * rel_orthographic_density              -0.008003
rel_aoa * rel_synonyms_count                     0.031605
rel_clustering * rel_frequency                  -0.039979
rel_clustering * rel_letters_count              -0.057534
rel_clustering * rel_orthographic_density       -0.024848
rel_clustering * rel_synonyms_count              0.175847
rel_frequency * rel_letters_count                0.001448
rel_frequency * rel_orthographic_density        -0.024456
rel_frequency * rel_synonyms_count               0.118009
rel_letters_count * rel_orthographic_density     0.009649
rel_letters_count * rel_synonyms_count           0.017921
rel_orthographic_density * rel_synonyms_count    0.133221
dtype: float64

Regressing rel frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.22640994767984693

intercept                  -1.560561
rel_aoa                     0.058057
rel_clustering              0.120217
rel_frequency               0.609929
rel_letters_count          -0.110176
rel_orthographic_density   -0.181838
rel_synonyms_count          0.065049
dtype: float64

Regressing rel frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2355477013375833

intercept                                       -1.658934
rel_aoa                                          0.055654
rel_clustering                                   0.117009
rel_frequency                                    0.595464
rel_letters_count                               -0.070858
rel_orthographic_density                        -0.351358
rel_synonyms_count                               0.276167
rel_aoa * rel_clustering                        -0.058366
rel_aoa * rel_frequency                         -0.003392
rel_aoa * rel_letters_count                      0.021092
rel_aoa * rel_orthographic_density               0.035976
rel_aoa * rel_synonyms_count                     0.135255
rel_clustering * rel_frequency                  -0.046306
rel_clustering * rel_letters_count              -0.103999
rel_clustering * rel_orthographic_density       -0.174722
rel_clustering * rel_synonyms_count              0.073990
rel_frequency * rel_letters_count               -0.006938
rel_frequency * rel_orthographic_density        -0.065110
rel_frequency * rel_synonyms_count               0.063335
rel_letters_count * rel_orthographic_density     0.020747
rel_letters_count * rel_synonyms_count           0.016508
rel_orthographic_density * rel_synonyms_count    0.208917
dtype: float64

Regressing global frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07892431707423586

intercept                      5.724163
global_aoa                    -0.054181
global_clustering              0.231850
global_frequency               0.404538
global_letters_count           0.169864
global_orthographic_density    0.138095
global_synonyms_count         -0.071289
rel_aoa                        0.111488
rel_clustering                -0.208922
rel_frequency                 -0.042693
rel_letters_count             -0.192979
rel_orthographic_density      -0.161593
rel_synonyms_count             0.111024
dtype: float64

Regressing global frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.12213658845131871

intercept                                                -17.148737
global_aoa                                                -0.734995
global_clustering                                         -7.200345
global_frequency                                           1.412142
global_letters_count                                      -0.889288
global_orthographic_density                                0.775350
global_synonyms_count                                     -3.997101
rel_aoa                                                    1.257763
rel_clustering                                             9.281933
rel_frequency                                              0.659204
rel_letters_count                                         -0.232529
rel_orthographic_density                                  -2.770092
rel_synonyms_count                                         5.324484
global_aoa * global_clustering                             0.157774
global_aoa * global_frequency                              0.099224
global_aoa * global_letters_count                          0.115501
global_aoa * global_orthographic_density                   0.047146
global_aoa * global_synonyms_count                         0.027009
global_aoa * rel_aoa                                      -0.005406
global_aoa * rel_clustering                               -0.148326
global_aoa * rel_frequency                                -0.061540
global_aoa * rel_letters_count                            -0.102117
global_aoa * rel_orthographic_density                     -0.053085
global_aoa * rel_synonyms_count                           -0.111932
global_clustering * global_frequency                       0.355019
global_clustering * global_letters_count                   0.165499
global_clustering * global_orthographic_density            0.917083
global_clustering * global_synonyms_count                  0.643391
global_clustering * rel_aoa                               -0.154099
global_clustering * rel_clustering                         0.096698
global_clustering * rel_frequency                         -0.183143
global_clustering * rel_letters_count                     -0.291345
global_clustering * rel_orthographic_density              -0.942635
global_clustering * rel_synonyms_count                    -0.363184
global_frequency * global_letters_count                    0.030293
global_frequency * global_orthographic_density             0.210438
global_frequency * global_synonyms_count                   0.195295
global_frequency * rel_aoa                                -0.165622
global_frequency * rel_clustering                         -0.353844
global_frequency * rel_frequency                          -0.005046
global_frequency * rel_letters_count                      -0.027235
global_frequency * rel_orthographic_density               -0.142914
global_frequency * rel_synonyms_count                     -0.135690
global_letters_count * global_orthographic_density         0.361700
global_letters_count * global_synonyms_count               0.698184
global_letters_count * rel_aoa                            -0.064637
global_letters_count * rel_clustering                     -0.419502
global_letters_count * rel_frequency                      -0.127639
global_letters_count * rel_letters_count                   0.014860
global_letters_count * rel_orthographic_density           -0.161437
global_letters_count * rel_synonyms_count                 -0.664481
global_orthographic_density * global_synonyms_count        0.986609
global_orthographic_density * rel_aoa                      0.027083
global_orthographic_density * rel_clustering              -1.157096
global_orthographic_density * rel_frequency               -0.398834
global_orthographic_density * rel_letters_count           -0.338961
global_orthographic_density * rel_orthographic_density    -0.048597
global_orthographic_density * rel_synonyms_count          -0.841586
global_synonyms_count * rel_aoa                           -0.186265
global_synonyms_count * rel_clustering                    -0.617303
global_synonyms_count * rel_frequency                     -0.379798
global_synonyms_count * rel_letters_count                 -0.464837
global_synonyms_count * rel_orthographic_density          -0.651094
global_synonyms_count * rel_synonyms_count                 0.108943
rel_aoa * rel_clustering                                   0.092181
rel_aoa * rel_frequency                                    0.118197
rel_aoa * rel_letters_count                                0.073597
rel_aoa * rel_orthographic_density                        -0.030987
rel_aoa * rel_synonyms_count                               0.274180
rel_clustering * rel_frequency                             0.174123
rel_clustering * rel_letters_count                         0.397048
rel_clustering * rel_orthographic_density                  0.981930
rel_clustering * rel_synonyms_count                        0.590431
rel_frequency * rel_letters_count                          0.103489
rel_frequency * rel_orthographic_density                   0.264982
rel_frequency * rel_synonyms_count                         0.453351
rel_letters_count * rel_orthographic_density               0.200546
rel_letters_count * rel_synonyms_count                     0.511153
rel_orthographic_density * rel_synonyms_count              0.692867
dtype: float64

Regressing rel frequency with 1487 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.29985789618236525

intercept                      4.564213
global_aoa                    -0.051356
global_clustering              0.250925
global_frequency              -0.514687
global_letters_count           0.228687
global_orthographic_density    0.227107
global_synonyms_count         -0.078034
rel_aoa                        0.087157
rel_clustering                -0.193230
rel_frequency                  0.911721
rel_letters_count             -0.243609
rel_orthographic_density      -0.237293
rel_synonyms_count             0.104632
dtype: float64

Regressing rel frequency with 1487 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3316977749152135

intercept                                                -26.188726
global_aoa                                                -0.480228
global_clustering                                         -7.705665
global_frequency                                           1.038854
global_letters_count                                      -0.575815
global_orthographic_density                                2.047716
global_synonyms_count                                     -3.162579
rel_aoa                                                    1.048635
rel_clustering                                             9.811787
rel_frequency                                              0.905369
rel_letters_count                                         -0.268654
rel_orthographic_density                                  -3.460919
rel_synonyms_count                                         4.785659
global_aoa * global_clustering                             0.147395
global_aoa * global_frequency                              0.076332
global_aoa * global_letters_count                          0.109154
global_aoa * global_orthographic_density                   0.041061
global_aoa * global_synonyms_count                         0.027981
global_aoa * rel_aoa                                      -0.004519
global_aoa * rel_clustering                               -0.139494
global_aoa * rel_frequency                                -0.037562
global_aoa * rel_letters_count                            -0.096553
global_aoa * rel_orthographic_density                     -0.044491
global_aoa * rel_synonyms_count                           -0.107037
global_clustering * global_frequency                       0.392579
global_clustering * global_letters_count                   0.188688
global_clustering * global_orthographic_density            0.931933
global_clustering * global_synonyms_count                  0.597323
global_clustering * rel_aoa                               -0.143878
global_clustering * rel_clustering                         0.078795
global_clustering * rel_frequency                         -0.212432
global_clustering * rel_letters_count                     -0.237792
global_clustering * rel_orthographic_density              -0.861466
global_clustering * rel_synonyms_count                    -0.241969
global_frequency * global_letters_count                    0.037757
global_frequency * global_orthographic_density             0.141725
global_frequency * global_synonyms_count                   0.120833
global_frequency * rel_aoa                                -0.141392
global_frequency * rel_clustering                         -0.403397
global_frequency * rel_frequency                           0.015424
global_frequency * rel_letters_count                      -0.014214
global_frequency * rel_orthographic_density               -0.074959
global_frequency * rel_synonyms_count                     -0.029382
global_letters_count * global_orthographic_density         0.314775
global_letters_count * global_synonyms_count               0.670713
global_letters_count * rel_aoa                            -0.070132
global_letters_count * rel_clustering                     -0.412922
global_letters_count * rel_frequency                      -0.110325
global_letters_count * rel_letters_count                   0.018156
global_letters_count * rel_orthographic_density           -0.108431
global_letters_count * rel_synonyms_count                 -0.638140
global_orthographic_density * global_synonyms_count        0.904651
global_orthographic_density * rel_aoa                      0.014442
global_orthographic_density * rel_clustering              -1.173869
global_orthographic_density * rel_frequency               -0.331157
global_orthographic_density * rel_letters_count           -0.298241
global_orthographic_density * rel_orthographic_density    -0.031290
global_orthographic_density * rel_synonyms_count          -0.866003
global_synonyms_count * rel_aoa                           -0.167510
global_synonyms_count * rel_clustering                    -0.658246
global_synonyms_count * rel_frequency                     -0.345969
global_synonyms_count * rel_letters_count                 -0.479738
global_synonyms_count * rel_orthographic_density          -0.589967
global_synonyms_count * rel_synonyms_count                 0.113684
rel_aoa * rel_clustering                                   0.095142
rel_aoa * rel_frequency                                    0.091432
rel_aoa * rel_letters_count                                0.067833
rel_aoa * rel_orthographic_density                        -0.037985
rel_aoa * rel_synonyms_count                               0.276548
rel_clustering * rel_frequency                             0.221987
rel_clustering * rel_letters_count                         0.324334
rel_clustering * rel_orthographic_density                  0.923168
rel_clustering * rel_synonyms_count                        0.549297
rel_frequency * rel_letters_count                          0.072380
rel_frequency * rel_orthographic_density                   0.201496
rel_frequency * rel_synonyms_count                         0.396299
rel_letters_count * rel_orthographic_density               0.161534
rel_letters_count * rel_synonyms_count                     0.527129
rel_orthographic_density * rel_synonyms_count              0.748567
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 1362 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.08764505200927653

intercept                      5.672575
global_aoa                     0.280188
global_clustering             -0.113690
global_frequency              -0.117206
global_letters_count           0.034208
global_orthographic_density   -0.083280
global_synonyms_count         -0.143899
dtype: float64

Regressing global aoa with 1362 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10371379256384616

intercept                                             -2.279672
global_aoa                                             0.092380
global_clustering                                     -2.042975
global_frequency                                       0.214735
global_letters_count                                   0.789114
global_orthographic_density                           -0.802472
global_synonyms_count                                 -1.232770
global_aoa * global_clustering                         0.067063
global_aoa * global_frequency                          0.022036
global_aoa * global_letters_count                      0.048343
global_aoa * global_orthographic_density               0.057492
global_aoa * global_synonyms_count                     0.043531
global_clustering * global_frequency                   0.062242
global_clustering * global_letters_count               0.163612
global_clustering * global_orthographic_density        0.022361
global_clustering * global_synonyms_count             -0.268584
global_frequency * global_letters_count               -0.021902
global_frequency * global_orthographic_density         0.031321
global_frequency * global_synonyms_count              -0.099550
global_letters_count * global_orthographic_density     0.017197
global_letters_count * global_synonyms_count           0.024932
global_orthographic_density * global_synonyms_count    0.005233
dtype: float64

Regressing rel aoa with 1362 measures, no interactions
           ^^^^^^^
R^2 = 0.019358149460054097

intercept                      0.550267
global_aoa                     0.109832
global_clustering             -0.086592
global_frequency              -0.088913
global_letters_count           0.020548
global_orthographic_density    0.036799
global_synonyms_count         -0.079097
dtype: float64

Regressing rel aoa with 1362 measures, with interactions
           ^^^^^^^
R^2 = 0.034130357896801744

intercept                                             -5.092516
global_aoa                                             0.283783
global_clustering                                     -1.020887
global_frequency                                       0.440634
global_letters_count                                   0.289485
global_orthographic_density                           -0.970563
global_synonyms_count                                 -0.806657
global_aoa * global_clustering                         0.038254
global_aoa * global_frequency                         -0.012702
global_aoa * global_letters_count                      0.014468
global_aoa * global_orthographic_density               0.047462
global_aoa * global_synonyms_count                     0.036980
global_clustering * global_frequency                   0.064785
global_clustering * global_letters_count               0.068715
global_clustering * global_orthographic_density       -0.154523
global_clustering * global_synonyms_count             -0.306194
global_frequency * global_letters_count                0.003058
global_frequency * global_orthographic_density        -0.015321
global_frequency * global_synonyms_count              -0.171882
global_letters_count * global_orthographic_density    -0.024997
global_letters_count * global_synonyms_count           0.032897
global_orthographic_density * global_synonyms_count    0.065719
dtype: float64

Regressing global aoa with 1362 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.02890808611875117

intercept                   6.585246
rel_aoa                     0.023864
rel_clustering              0.170990
rel_frequency              -0.025601
rel_letters_count           0.004833
rel_orthographic_density   -0.360525
rel_synonyms_count         -0.263096
dtype: float64

Regressing global aoa with 1362 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.048071808859931564

intercept                                        6.538783
rel_aoa                                         -0.134960
rel_clustering                                   0.083569
rel_frequency                                   -0.032227
rel_letters_count                               -0.021471
rel_orthographic_density                        -0.655658
rel_synonyms_count                              -0.711189
rel_aoa * rel_clustering                        -0.013405
rel_aoa * rel_frequency                         -0.046722
rel_aoa * rel_letters_count                      0.045494
rel_aoa * rel_orthographic_density               0.059723
rel_aoa * rel_synonyms_count                    -0.066769
rel_clustering * rel_frequency                   0.082994
rel_clustering * rel_letters_count               0.150318
rel_clustering * rel_orthographic_density       -0.008664
rel_clustering * rel_synonyms_count             -0.361479
rel_frequency * rel_letters_count               -0.012881
rel_frequency * rel_orthographic_density        -0.033384
rel_frequency * rel_synonyms_count              -0.215131
rel_letters_count * rel_orthographic_density     0.086845
rel_letters_count * rel_synonyms_count           0.003792
rel_orthographic_density * rel_synonyms_count    0.013665
dtype: float64

Regressing rel aoa with 1362 measures, no interactions
           ^^^^^^^
R^2 = 0.1406596798660794

intercept                   0.749156
rel_aoa                     0.431996
rel_clustering             -0.130638
rel_frequency              -0.099890
rel_letters_count          -0.018774
rel_orthographic_density    0.095548
rel_synonyms_count         -0.179414
dtype: float64

Regressing rel aoa with 1362 measures, with interactions
           ^^^^^^^
R^2 = 0.1534592351450621

intercept                                        0.908839
rel_aoa                                          0.492356
rel_clustering                                  -0.349309
rel_frequency                                   -0.089187
rel_letters_count                               -0.035325
rel_orthographic_density                         0.258640
rel_synonyms_count                              -0.503601
rel_aoa * rel_clustering                         0.004721
rel_aoa * rel_frequency                          0.024592
rel_aoa * rel_letters_count                      0.033051
rel_aoa * rel_orthographic_density               0.065531
rel_aoa * rel_synonyms_count                    -0.073655
rel_clustering * rel_frequency                   0.033929
rel_clustering * rel_letters_count               0.151699
rel_clustering * rel_orthographic_density        0.050654
rel_clustering * rel_synonyms_count             -0.157866
rel_frequency * rel_letters_count                0.025009
rel_frequency * rel_orthographic_density         0.106879
rel_frequency * rel_synonyms_count              -0.129591
rel_letters_count * rel_orthographic_density     0.019601
rel_letters_count * rel_synonyms_count           0.038703
rel_orthographic_density * rel_synonyms_count    0.027572
dtype: float64

Regressing global aoa with 1362 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.11652857304721509

intercept                      1.851231
global_aoa                     0.473409
global_clustering             -0.340431
global_frequency              -0.058317
global_letters_count           0.183798
global_orthographic_density   -0.078031
global_synonyms_count          0.250969
rel_aoa                       -0.303463
rel_clustering                 0.278404
rel_frequency                 -0.064241
rel_letters_count             -0.153735
rel_orthographic_density       0.064421
rel_synonyms_count            -0.459957
dtype: float64

Regressing global aoa with 1362 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1667574983268414

intercept                                                 27.336601
global_aoa                                                 2.095191
global_clustering                                          6.900913
global_frequency                                           1.126850
global_letters_count                                      -1.962132
global_orthographic_density                               -8.814720
global_synonyms_count                                     -3.554317
rel_aoa                                                   -1.253043
rel_clustering                                            -7.418713
rel_frequency                                             -0.326605
rel_letters_count                                          2.708655
rel_orthographic_density                                   9.479412
rel_synonyms_count                                        -3.522456
global_aoa * global_clustering                             0.009271
global_aoa * global_frequency                             -0.126483
global_aoa * global_letters_count                         -0.066180
global_aoa * global_orthographic_density                   0.052791
global_aoa * global_synonyms_count                        -0.245372
global_aoa * rel_aoa                                       0.033307
global_aoa * rel_clustering                                0.059883
global_aoa * rel_frequency                                 0.097502
global_aoa * rel_letters_count                             0.091494
global_aoa * rel_orthographic_density                     -0.005736
global_aoa * rel_synonyms_count                            0.514826
global_clustering * global_frequency                      -0.033234
global_clustering * global_letters_count                  -0.345267
global_clustering * global_orthographic_density           -2.401445
global_clustering * global_synonyms_count                 -0.533149
global_clustering * rel_aoa                                0.096138
global_clustering * rel_clustering                         0.052861
global_clustering * rel_frequency                          0.128090
global_clustering * rel_letters_count                      0.363217
global_clustering * rel_orthographic_density               2.221701
global_clustering * rel_synonyms_count                     0.336937
global_frequency * global_letters_count                    0.076743
global_frequency * global_orthographic_density            -0.434293
global_frequency * global_synonyms_count                  -0.016665
global_frequency * rel_aoa                                 0.146101
global_frequency * rel_clustering                          0.099929
global_frequency * rel_frequency                          -0.016159
global_frequency * rel_letters_count                      -0.109566
global_frequency * rel_orthographic_density                0.349203
global_frequency * rel_synonyms_count                      0.214103
global_letters_count * global_orthographic_density        -0.286706
global_letters_count * global_synonyms_count               0.346229
global_letters_count * rel_aoa                            -0.032908
global_letters_count * rel_clustering                      0.417235
global_letters_count * rel_frequency                      -0.003834
global_letters_count * rel_letters_count                  -0.005640
global_letters_count * rel_orthographic_density            0.013011
global_letters_count * rel_synonyms_count                 -0.041331
global_orthographic_density * global_synonyms_count        0.316360
global_orthographic_density * rel_aoa                     -0.073776
global_orthographic_density * rel_clustering               2.074458
global_orthographic_density * rel_frequency                0.311072
global_orthographic_density * rel_letters_count            0.098845
global_orthographic_density * rel_orthographic_density     0.063407
global_orthographic_density * rel_synonyms_count          -0.152868
global_synonyms_count * rel_aoa                            0.353145
global_synonyms_count * rel_clustering                     0.466752
global_synonyms_count * rel_frequency                      0.011485
global_synonyms_count * rel_letters_count                 -0.500443
global_synonyms_count * rel_orthographic_density          -0.832883
global_synonyms_count * rel_synonyms_count                -0.008972
rel_aoa * rel_clustering                                  -0.102758
rel_aoa * rel_frequency                                   -0.072619
rel_aoa * rel_letters_count                                0.034808
rel_aoa * rel_orthographic_density                         0.106337
rel_aoa * rel_synonyms_count                              -0.580842
rel_clustering * rel_frequency                            -0.100583
rel_clustering * rel_letters_count                        -0.242666
rel_clustering * rel_orthographic_density                 -1.745991
rel_clustering * rel_synonyms_count                       -0.673962
rel_frequency * rel_letters_count                          0.002668
rel_frequency * rel_orthographic_density                  -0.169453
rel_frequency * rel_synonyms_count                        -0.386696
rel_letters_count * rel_orthographic_density               0.210640
rel_letters_count * rel_synonyms_count                     0.223145
rel_orthographic_density * rel_synonyms_count              0.857340
dtype: float64

Regressing rel aoa with 1362 measures, no interactions
           ^^^^^^^
R^2 = 0.175783614850897

intercept                      0.650095
global_aoa                    -0.311194
global_clustering             -0.266649
global_frequency              -0.007967
global_letters_count           0.111699
global_orthographic_density   -0.144718
global_synonyms_count          0.320022
rel_aoa                        0.647129
rel_clustering                 0.204462
rel_frequency                 -0.096564
rel_letters_count             -0.103915
rel_orthographic_density       0.087663
rel_synonyms_count            -0.528724
dtype: float64

Regressing rel aoa with 1362 measures, with interactions
           ^^^^^^^
R^2 = 0.2180599729702435

intercept                                                 13.123466
global_aoa                                                 0.612363
global_clustering                                          3.472838
global_frequency                                           1.152240
global_letters_count                                      -0.515290
global_orthographic_density                               -8.101285
global_synonyms_count                                     -2.565211
rel_aoa                                                    0.526470
rel_clustering                                            -3.311975
rel_frequency                                             -0.370731
rel_letters_count                                          1.950771
rel_orthographic_density                                   9.572118
rel_synonyms_count                                        -2.175863
global_aoa * global_clustering                             0.008800
global_aoa * global_frequency                             -0.099787
global_aoa * global_letters_count                         -0.042977
global_aoa * global_orthographic_density                   0.165538
global_aoa * global_synonyms_count                        -0.156677
global_aoa * rel_aoa                                       0.003547
global_aoa * rel_clustering                                0.006299
global_aoa * rel_frequency                                 0.048045
global_aoa * rel_letters_count                             0.027754
global_aoa * rel_orthographic_density                     -0.202320
global_aoa * rel_synonyms_count                            0.341161
global_clustering * global_frequency                       0.041405
global_clustering * global_letters_count                  -0.104870
global_clustering * global_orthographic_density           -1.672695
global_clustering * global_synonyms_count                 -0.676257
global_clustering * rel_aoa                                0.106557
global_clustering * rel_clustering                         0.089281
global_clustering * rel_frequency                          0.014572
global_clustering * rel_letters_count                      0.097929
global_clustering * rel_orthographic_density               1.455055
global_clustering * rel_synonyms_count                     0.564091
global_frequency * global_letters_count                    0.051702
global_frequency * global_orthographic_density            -0.212251
global_frequency * global_synonyms_count                  -0.131329
global_frequency * rel_aoa                                 0.115058
global_frequency * rel_clustering                         -0.016946
global_frequency * rel_frequency                          -0.012413
global_frequency * rel_letters_count                      -0.131282
global_frequency * rel_orthographic_density                0.042787
global_frequency * rel_synonyms_count                      0.249797
global_letters_count * global_orthographic_density        -0.189612
global_letters_count * global_synonyms_count               0.189240
global_letters_count * rel_aoa                            -0.022128
global_letters_count * rel_clustering                      0.236642
global_letters_count * rel_frequency                      -0.002125
global_letters_count * rel_letters_count                  -0.011341
global_letters_count * rel_orthographic_density            0.042454
global_letters_count * rel_synonyms_count                  0.004988
global_orthographic_density * global_synonyms_count        0.186456
global_orthographic_density * rel_aoa                     -0.164280
global_orthographic_density * rel_clustering               1.392354
global_orthographic_density * rel_frequency                0.069228
global_orthographic_density * rel_letters_count           -0.013507
global_orthographic_density * rel_orthographic_density    -0.003700
global_orthographic_density * rel_synonyms_count          -0.022850
global_synonyms_count * rel_aoa                            0.196052
global_synonyms_count * rel_clustering                     0.550136
global_synonyms_count * rel_frequency                      0.119629
global_synonyms_count * rel_letters_count                 -0.288241
global_synonyms_count * rel_orthographic_density          -0.532144
global_synonyms_count * rel_synonyms_count                 0.049658
rel_aoa * rel_clustering                                  -0.070278
rel_aoa * rel_frequency                                   -0.039011
rel_aoa * rel_letters_count                                0.055826
rel_aoa * rel_orthographic_density                         0.256947
rel_aoa * rel_synonyms_count                              -0.383770
rel_clustering * rel_frequency                             0.057105
rel_clustering * rel_letters_count                        -0.053646
rel_clustering * rel_orthographic_density                 -1.084939
rel_clustering * rel_synonyms_count                       -0.771274
rel_frequency * rel_letters_count                          0.060554
rel_frequency * rel_orthographic_density                   0.126740
rel_frequency * rel_synonyms_count                        -0.403139
rel_letters_count * rel_orthographic_density               0.147087
rel_letters_count * rel_synonyms_count                     0.125724
rel_orthographic_density * rel_synonyms_count              0.501532
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.021537495116610406

intercept                     -4.875561
global_aoa                    -0.003121
global_clustering              0.121103
global_frequency              -0.023648
global_letters_count           0.004582
global_orthographic_density    0.001888
global_synonyms_count          0.020200
dtype: float64

Regressing global clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.04554616200053563

intercept                                             -3.497981
global_aoa                                             0.104778
global_clustering                                      0.379925
global_frequency                                      -0.402657
global_letters_count                                   0.198563
global_orthographic_density                            0.265952
global_synonyms_count                                 -0.010847
global_aoa * global_clustering                         0.020737
global_aoa * global_frequency                          0.004005
global_aoa * global_letters_count                     -0.001522
global_aoa * global_orthographic_density              -0.003400
global_aoa * global_synonyms_count                    -0.009814
global_clustering * global_frequency                  -0.059472
global_clustering * global_letters_count               0.020201
global_clustering * global_orthographic_density        0.056596
global_clustering * global_synonyms_count             -0.097750
global_frequency * global_letters_count               -0.004299
global_frequency * global_orthographic_density         0.019083
global_frequency * global_synonyms_count               0.010348
global_letters_count * global_orthographic_density    -0.005788
global_letters_count * global_synonyms_count          -0.060238
global_orthographic_density * global_synonyms_count   -0.164408
dtype: float64

Regressing rel clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.01235926478122018

intercept                      0.943672
global_aoa                     0.003433
global_clustering              0.090744
global_frequency              -0.013749
global_letters_count           0.007800
global_orthographic_density    0.017214
global_synonyms_count          0.010291
dtype: float64

Regressing rel clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.02458120019619059

intercept                                              3.324577
global_aoa                                             0.019059
global_clustering                                      0.405485
global_frequency                                      -0.359028
global_letters_count                                  -0.016573
global_orthographic_density                            0.155865
global_synonyms_count                                  0.192643
global_aoa * global_clustering                         0.012878
global_aoa * global_frequency                          0.006238
global_aoa * global_letters_count                      0.002699
global_aoa * global_orthographic_density              -0.004350
global_aoa * global_synonyms_count                    -0.012950
global_clustering * global_frequency                  -0.046481
global_clustering * global_letters_count               0.001053
global_clustering * global_orthographic_density        0.038015
global_clustering * global_synonyms_count             -0.067150
global_frequency * global_letters_count                0.002520
global_frequency * global_orthographic_density         0.017168
global_frequency * global_synonyms_count              -0.010728
global_letters_count * global_orthographic_density     0.000059
global_letters_count * global_synonyms_count          -0.035272
global_orthographic_density * global_synonyms_count   -0.138057
dtype: float64

Regressing global clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.009478476555686899

intercept                  -5.810828
rel_aoa                    -0.001444
rel_clustering              0.104981
rel_frequency               0.006538
rel_letters_count           0.003894
rel_orthographic_density    0.009880
rel_synonyms_count         -0.005032
dtype: float64

Regressing global clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.017092798230092243

intercept                                       -5.802940
rel_aoa                                         -0.008626
rel_clustering                                   0.111414
rel_frequency                                    0.014932
rel_letters_count                               -0.003750
rel_orthographic_density                         0.010865
rel_synonyms_count                              -0.098197
rel_aoa * rel_clustering                         0.031063
rel_aoa * rel_frequency                         -0.007465
rel_aoa * rel_letters_count                     -0.008764
rel_aoa * rel_orthographic_density               0.001821
rel_aoa * rel_synonyms_count                    -0.018576
rel_clustering * rel_frequency                   0.004822
rel_clustering * rel_letters_count              -0.002321
rel_clustering * rel_orthographic_density        0.007198
rel_clustering * rel_synonyms_count             -0.026141
rel_frequency * rel_letters_count               -0.003792
rel_frequency * rel_orthographic_density        -0.005101
rel_frequency * rel_synonyms_count              -0.027902
rel_letters_count * rel_orthographic_density    -0.007257
rel_letters_count * rel_synonyms_count          -0.032195
rel_orthographic_density * rel_synonyms_count   -0.110716
dtype: float64

Regressing rel clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10078438986624738

intercept                   0.298998
rel_aoa                    -0.019162
rel_clustering              0.329989
rel_frequency               0.012168
rel_letters_count           0.014897
rel_orthographic_density    0.018153
rel_synonyms_count          0.035644
dtype: float64

Regressing rel clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10957067594070424

intercept                                        0.301065
rel_aoa                                         -0.039251
rel_clustering                                   0.328672
rel_frequency                                    0.017845
rel_letters_count                                0.001120
rel_orthographic_density                         0.001551
rel_synonyms_count                              -0.080088
rel_aoa * rel_clustering                         0.030411
rel_aoa * rel_frequency                         -0.007072
rel_aoa * rel_letters_count                     -0.010984
rel_aoa * rel_orthographic_density              -0.017546
rel_aoa * rel_synonyms_count                    -0.011411
rel_clustering * rel_frequency                  -0.000852
rel_clustering * rel_letters_count               0.001088
rel_clustering * rel_orthographic_density        0.021904
rel_clustering * rel_synonyms_count             -0.039922
rel_frequency * rel_letters_count               -0.006062
rel_frequency * rel_orthographic_density        -0.014629
rel_frequency * rel_synonyms_count              -0.034486
rel_letters_count * rel_orthographic_density    -0.006753
rel_letters_count * rel_synonyms_count          -0.026808
rel_orthographic_density * rel_synonyms_count   -0.102949
dtype: float64

Regressing global clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03282780055183432

intercept                     -3.710006
global_aoa                    -0.001190
global_clustering              0.153445
global_frequency              -0.074555
global_letters_count          -0.030843
global_orthographic_density   -0.096457
global_synonyms_count          0.083154
rel_aoa                       -0.005533
rel_clustering                -0.031814
rel_frequency                  0.056609
rel_letters_count              0.037033
rel_orthographic_density       0.113833
rel_synonyms_count            -0.082638
dtype: float64

Regressing global clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11400871432464321

intercept                                                 4.840181
global_aoa                                                0.065270
global_clustering                                         2.305650
global_frequency                                         -0.956332
global_letters_count                                      1.434975
global_orthographic_density                              -0.827210
global_synonyms_count                                    -2.219298
rel_aoa                                                  -0.489868
rel_clustering                                           -2.386524
rel_frequency                                            -0.123644
rel_letters_count                                        -1.047876
rel_orthographic_density                                  1.193483
rel_synonyms_count                                        0.044108
global_aoa * global_clustering                           -0.079147
global_aoa * global_frequency                            -0.006788
global_aoa * global_letters_count                        -0.062168
global_aoa * global_orthographic_density                 -0.124240
global_aoa * global_synonyms_count                        0.049521
global_aoa * rel_aoa                                      0.011474
global_aoa * rel_clustering                               0.149526
global_aoa * rel_frequency                                0.021641
global_aoa * rel_letters_count                            0.068487
global_aoa * rel_orthographic_density                     0.100852
global_aoa * rel_synonyms_count                          -0.021758
global_clustering * global_frequency                     -0.179550
global_clustering * global_letters_count                  0.120718
global_clustering * global_orthographic_density          -0.142052
global_clustering * global_synonyms_count                -0.217577
global_clustering * rel_aoa                              -0.016513
global_clustering * rel_clustering                       -0.112685
global_clustering * rel_frequency                         0.031435
global_clustering * rel_letters_count                    -0.051545
global_clustering * rel_orthographic_density              0.198974
global_clustering * rel_synonyms_count                    0.051593
global_frequency * global_letters_count                  -0.051505
global_frequency * global_orthographic_density            0.014217
global_frequency * global_synonyms_count                  0.164666
global_frequency * rel_aoa                               -0.004032
global_frequency * rel_clustering                         0.112126
global_frequency * rel_frequency                          0.009634
global_frequency * rel_letters_count                      0.054760
global_frequency * rel_orthographic_density               0.011745
global_frequency * rel_synonyms_count                    -0.081726
global_letters_count * global_orthographic_density        0.140259
global_letters_count * global_synonyms_count             -0.123902
global_letters_count * rel_aoa                            0.027656
global_letters_count * rel_clustering                    -0.124799
global_letters_count * rel_frequency                      0.048609
global_letters_count * rel_letters_count                 -0.000325
global_letters_count * rel_orthographic_density          -0.139941
global_letters_count * rel_synonyms_count                 0.167265
global_orthographic_density * global_synonyms_count      -0.273771
global_orthographic_density * rel_aoa                     0.127146
global_orthographic_density * rel_clustering              0.127323
global_orthographic_density * rel_frequency               0.004992
global_orthographic_density * rel_letters_count          -0.127961
global_orthographic_density * rel_orthographic_density   -0.004754
global_orthographic_density * rel_synonyms_count          0.199231
global_synonyms_count * rel_aoa                           0.020839
global_synonyms_count * rel_clustering                    0.011218
global_synonyms_count * rel_frequency                    -0.139989
global_synonyms_count * rel_letters_count                -0.078774
global_synonyms_count * rel_orthographic_density         -0.086404
global_synonyms_count * rel_synonyms_count               -0.039943
rel_aoa * rel_clustering                                 -0.005816
rel_aoa * rel_frequency                                  -0.003350
rel_aoa * rel_letters_count                              -0.045894
rel_aoa * rel_orthographic_density                       -0.109536
rel_aoa * rel_synonyms_count                             -0.054601
rel_clustering * rel_frequency                           -0.017926
rel_clustering * rel_letters_count                        0.055188
rel_clustering * rel_orthographic_density                -0.124645
rel_clustering * rel_synonyms_count                       0.083640
rel_frequency * rel_letters_count                        -0.055284
rel_frequency * rel_orthographic_density                 -0.016019
rel_frequency * rel_synonyms_count                        0.049700
rel_letters_count * rel_orthographic_density              0.114805
rel_letters_count * rel_synonyms_count                   -0.009198
rel_orthographic_density * rel_synonyms_count             0.032030
dtype: float64

Regressing rel clustering with 1222 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1968046461730797

intercept                     -2.497768
global_aoa                     0.008617
global_clustering             -0.608960
global_frequency              -0.064430
global_letters_count          -0.040757
global_orthographic_density   -0.055978
global_synonyms_count          0.023821
rel_aoa                       -0.017548
rel_clustering                 0.849788
rel_frequency                  0.055154
rel_letters_count              0.047440
rel_orthographic_density       0.063705
rel_synonyms_count            -0.008409
dtype: float64

Regressing rel clustering with 1222 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.24906089122626462

intercept                                                 3.313449
global_aoa                                                0.102356
global_clustering                                         0.887688
global_frequency                                         -0.473298
global_letters_count                                      0.850852
global_orthographic_density                              -1.001142
global_synonyms_count                                    -2.400268
rel_aoa                                                  -0.412070
rel_clustering                                           -1.143589
rel_frequency                                            -0.189927
rel_letters_count                                        -0.793293
rel_orthographic_density                                  1.142171
rel_synonyms_count                                        1.067072
global_aoa * global_clustering                           -0.052242
global_aoa * global_frequency                            -0.004501
global_aoa * global_letters_count                        -0.047346
global_aoa * global_orthographic_density                 -0.100680
global_aoa * global_synonyms_count                        0.020522
global_aoa * rel_aoa                                      0.009338
global_aoa * rel_clustering                               0.109432
global_aoa * rel_frequency                                0.017779
global_aoa * rel_letters_count                            0.058203
global_aoa * rel_orthographic_density                     0.079011
global_aoa * rel_synonyms_count                          -0.006564
global_clustering * global_frequency                     -0.098027
global_clustering * global_letters_count                  0.038939
global_clustering * global_orthographic_density          -0.130497
global_clustering * global_synonyms_count                -0.180717
global_clustering * rel_aoa                              -0.020616
global_clustering * rel_clustering                       -0.119381
global_clustering * rel_frequency                         0.013301
global_clustering * rel_letters_count                    -0.020801
global_clustering * rel_orthographic_density              0.129965
global_clustering * rel_synonyms_count                    0.066093
global_frequency * global_letters_count                  -0.056049
global_frequency * global_orthographic_density            0.025308
global_frequency * global_synonyms_count                  0.128993
global_frequency * rel_aoa                               -0.007673
global_frequency * rel_clustering                         0.066497
global_frequency * rel_frequency                          0.011621
global_frequency * rel_letters_count                      0.055307
global_frequency * rel_orthographic_density              -0.019984
global_frequency * rel_synonyms_count                    -0.081359
global_letters_count * global_orthographic_density        0.128084
global_letters_count * global_synonyms_count             -0.009149
global_letters_count * rel_aoa                            0.020830
global_letters_count * rel_clustering                    -0.038956
global_letters_count * rel_frequency                      0.044990
global_letters_count * rel_letters_count                  0.000322
global_letters_count * rel_orthographic_density          -0.125859
global_letters_count * rel_synonyms_count                 0.042864
global_orthographic_density * global_synonyms_count      -0.105928
global_orthographic_density * rel_aoa                     0.105196
global_orthographic_density * rel_clustering              0.081138
global_orthographic_density * rel_frequency              -0.006790
global_orthographic_density * rel_letters_count          -0.106179
global_orthographic_density * rel_orthographic_density    0.005163
global_orthographic_density * rel_synonyms_count          0.028319
global_synonyms_count * rel_aoa                           0.032878
global_synonyms_count * rel_clustering                    0.077172
global_synonyms_count * rel_frequency                    -0.119848
global_synonyms_count * rel_letters_count                -0.122428
global_synonyms_count * rel_orthographic_density         -0.149903
global_synonyms_count * rel_synonyms_count               -0.035407
rel_aoa * rel_clustering                                  0.003409
rel_aoa * rel_frequency                                  -0.001541
rel_aoa * rel_letters_count                              -0.036854
rel_aoa * rel_orthographic_density                       -0.090581
rel_aoa * rel_synonyms_count                             -0.050364
rel_clustering * rel_frequency                           -0.010558
rel_clustering * rel_letters_count                        0.012647
rel_clustering * rel_orthographic_density                -0.046715
rel_clustering * rel_synonyms_count                      -0.021845
rel_frequency * rel_letters_count                        -0.047711
rel_frequency * rel_orthographic_density                  0.008551
rel_frequency * rel_synonyms_count                        0.054901
rel_letters_count * rel_orthographic_density              0.107223
rel_letters_count * rel_synonyms_count                    0.046327
rel_orthographic_density * rel_synonyms_count             0.104684
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07657973529824014

intercept                      4.451055
global_aoa                     0.048154
global_clustering             -0.164599
global_frequency              -0.039239
global_letters_count           0.221691
global_orthographic_density   -0.142223
global_synonyms_count         -0.287914
dtype: float64

Regressing global letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08634649243421122

intercept                                             -3.506807
global_aoa                                             0.338170
global_clustering                                     -2.120391
global_frequency                                       0.097560
global_letters_count                                   0.431430
global_orthographic_density                           -0.579316
global_synonyms_count                                  2.060482
global_aoa * global_clustering                         0.121113
global_aoa * global_frequency                          0.034813
global_aoa * global_letters_count                      0.012376
global_aoa * global_orthographic_density               0.038285
global_aoa * global_synonyms_count                    -0.065298
global_clustering * global_frequency                   0.077304
global_clustering * global_letters_count               0.034897
global_clustering * global_orthographic_density        0.087261
global_clustering * global_synonyms_count              0.314511
global_frequency * global_letters_count               -0.009681
global_frequency * global_orthographic_density         0.084077
global_frequency * global_synonyms_count               0.039487
global_letters_count * global_orthographic_density    -0.004437
global_letters_count * global_synonyms_count          -0.026307
global_orthographic_density * global_synonyms_count   -0.221160
dtype: float64

Regressing rel letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.03969525194559809

intercept                      1.033252
global_aoa                     0.005671
global_clustering             -0.178236
global_frequency              -0.041914
global_letters_count           0.185304
global_orthographic_density   -0.055019
global_synonyms_count         -0.317146
dtype: float64

Regressing rel letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.047629295984369935

intercept                                             -1.810427
global_aoa                                             0.134173
global_clustering                                     -1.245065
global_frequency                                      -0.051148
global_letters_count                                   0.126691
global_orthographic_density                           -1.225793
global_synonyms_count                                  1.597683
global_aoa * global_clustering                         0.084515
global_aoa * global_frequency                          0.037927
global_aoa * global_letters_count                      0.002354
global_aoa * global_orthographic_density               0.019205
global_aoa * global_synonyms_count                    -0.067166
global_clustering * global_frequency                   0.059004
global_clustering * global_letters_count              -0.009127
global_clustering * global_orthographic_density       -0.057799
global_clustering * global_synonyms_count              0.244662
global_frequency * global_letters_count               -0.001687
global_frequency * global_orthographic_density         0.082045
global_frequency * global_synonyms_count              -0.000225
global_letters_count * global_orthographic_density    -0.007387
global_letters_count * global_synonyms_count           0.013676
global_orthographic_density * global_synonyms_count   -0.094853
dtype: float64

Regressing global letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05895265916865111

intercept                   5.722564
rel_aoa                    -0.090649
rel_clustering              0.043670
rel_frequency               0.003145
rel_letters_count           0.205685
rel_orthographic_density   -0.263655
rel_synonyms_count         -0.291742
dtype: float64

Regressing global letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06828235030376495

intercept                                        5.572653
rel_aoa                                         -0.175736
rel_clustering                                   0.043909
rel_frequency                                    0.004026
rel_letters_count                                0.350073
rel_orthographic_density                        -0.358037
rel_synonyms_count                              -0.638076
rel_aoa * rel_clustering                         0.094003
rel_aoa * rel_frequency                         -0.017521
rel_aoa * rel_letters_count                      0.008359
rel_aoa * rel_orthographic_density               0.014802
rel_aoa * rel_synonyms_count                    -0.083957
rel_clustering * rel_frequency                  -0.007322
rel_clustering * rel_letters_count              -0.054287
rel_clustering * rel_orthographic_density       -0.060096
rel_clustering * rel_synonyms_count              0.007992
rel_frequency * rel_letters_count                0.011551
rel_frequency * rel_orthographic_density         0.010374
rel_frequency * rel_synonyms_count              -0.059552
rel_letters_count * rel_orthographic_density     0.071296
rel_letters_count * rel_synonyms_count           0.073336
rel_orthographic_density * rel_synonyms_count   -0.060389
dtype: float64

Regressing rel letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1178817656128528

intercept                   1.454894
rel_aoa                    -0.065167
rel_clustering             -0.114104
rel_frequency              -0.173132
rel_letters_count           0.392521
rel_orthographic_density    0.061620
rel_synonyms_count         -0.272620
dtype: float64

Regressing rel letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.1304326393238436

intercept                                        1.355942
rel_aoa                                         -0.055349
rel_clustering                                  -0.168632
rel_frequency                                   -0.170525
rel_letters_count                                0.575207
rel_orthographic_density                         0.108051
rel_synonyms_count                              -0.549995
rel_aoa * rel_clustering                         0.117383
rel_aoa * rel_frequency                          0.017421
rel_aoa * rel_letters_count                     -0.032072
rel_aoa * rel_orthographic_density              -0.066524
rel_aoa * rel_synonyms_count                    -0.093655
rel_clustering * rel_frequency                  -0.011211
rel_clustering * rel_letters_count              -0.017836
rel_clustering * rel_orthographic_density        0.008916
rel_clustering * rel_synonyms_count              0.005115
rel_frequency * rel_letters_count                0.023049
rel_frequency * rel_orthographic_density         0.060002
rel_frequency * rel_synonyms_count              -0.045979
rel_letters_count * rel_orthographic_density     0.072702
rel_letters_count * rel_synonyms_count           0.091208
rel_orthographic_density * rel_synonyms_count   -0.012538
dtype: float64

Regressing global letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09156090300984177

intercept                      1.589257
global_aoa                     0.192757
global_clustering             -0.416165
global_frequency               0.028790
global_letters_count           0.190205
global_orthographic_density   -0.262193
global_synonyms_count         -0.120258
rel_aoa                       -0.221133
rel_clustering                 0.300512
rel_frequency                 -0.080062
rel_letters_count              0.037815
rel_orthographic_density       0.158640
rel_synonyms_count            -0.168366
dtype: float64

Regressing global letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1386236683759785

intercept                                                 20.995569
global_aoa                                                 0.132816
global_clustering                                          5.979143
global_frequency                                           0.573208
global_letters_count                                      -0.861615
global_orthographic_density                               -2.007909
global_synonyms_count                                      7.044925
rel_aoa                                                   -1.105004
rel_clustering                                           -10.424157
rel_frequency                                              0.605666
rel_letters_count                                          2.960047
rel_orthographic_density                                   1.183404
rel_synonyms_count                                        -6.685140
global_aoa * global_clustering                            -0.016374
global_aoa * global_frequency                             -0.049670
global_aoa * global_letters_count                         -0.004381
global_aoa * global_orthographic_density                   0.252459
global_aoa * global_synonyms_count                        -0.147173
global_aoa * rel_aoa                                       0.032986
global_aoa * rel_clustering                                0.088069
global_aoa * rel_frequency                                 0.032404
global_aoa * rel_letters_count                            -0.021570
global_aoa * rel_orthographic_density                     -0.243555
global_aoa * rel_synonyms_count                            0.196749
global_clustering * global_frequency                      -0.144078
global_clustering * global_letters_count                  -0.423178
global_clustering * global_orthographic_density           -1.261747
global_clustering * global_synonyms_count                 -0.096796
global_clustering * rel_aoa                                0.196696
global_clustering * rel_clustering                        -0.029980
global_clustering * rel_frequency                          0.272525
global_clustering * rel_letters_count                      0.570607
global_clustering * rel_orthographic_density               1.411986
global_clustering * rel_synonyms_count                     0.708591
global_frequency * global_letters_count                   -0.041680
global_frequency * global_orthographic_density            -0.400299
global_frequency * global_synonyms_count                  -0.238315
global_frequency * rel_aoa                                 0.173651
global_frequency * rel_clustering                          0.419325
global_frequency * rel_frequency                          -0.011629
global_frequency * rel_letters_count                       0.028745
global_frequency * rel_orthographic_density                0.555562
global_frequency * rel_synonyms_count                      0.438562
global_letters_count * global_orthographic_density        -0.495087
global_letters_count * global_synonyms_count              -0.289017
global_letters_count * rel_aoa                             0.034451
global_letters_count * rel_clustering                      0.616150
global_letters_count * rel_frequency                       0.017798
global_letters_count * rel_letters_count                   0.020037
global_letters_count * rel_orthographic_density            0.473836
global_letters_count * rel_synonyms_count                  0.355785
global_orthographic_density * global_synonyms_count       -1.185127
global_orthographic_density * rel_aoa                     -0.073736
global_orthographic_density * rel_clustering               1.172399
global_orthographic_density * rel_frequency                0.321909
global_orthographic_density * rel_letters_count            0.210744
global_orthographic_density * rel_orthographic_density     0.037890
global_orthographic_density * rel_synonyms_count           1.246664
global_synonyms_count * rel_aoa                            0.212544
global_synonyms_count * rel_clustering                     0.440403
global_synonyms_count * rel_frequency                      0.300876
global_synonyms_count * rel_letters_count                 -0.046833
global_synonyms_count * rel_orthographic_density           0.710130
global_synonyms_count * rel_synonyms_count                -0.008233
rel_aoa * rel_clustering                                  -0.101944
rel_aoa * rel_frequency                                   -0.103488
rel_aoa * rel_letters_count                               -0.047086
rel_aoa * rel_orthographic_density                         0.091057
rel_aoa * rel_synonyms_count                              -0.296535
rel_clustering * rel_frequency                            -0.464505
rel_clustering * rel_letters_count                        -0.734477
rel_clustering * rel_orthographic_density                 -1.196173
rel_clustering * rel_synonyms_count                       -0.917155
rel_frequency * rel_letters_count                         -0.000878
rel_frequency * rel_orthographic_density                  -0.372066
rel_frequency * rel_synonyms_count                        -0.479605
rel_letters_count * rel_orthographic_density              -0.138561
rel_letters_count * rel_synonyms_count                    -0.000749
rel_orthographic_density * rel_synonyms_count             -0.868643
dtype: float64

Regressing rel letters_count with 1487 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16338816401455014

intercept                      0.571573
global_aoa                     0.145749
global_clustering             -0.395492
global_frequency               0.075070
global_letters_count          -0.619892
global_orthographic_density   -0.243928
global_synonyms_count         -0.126335
rel_aoa                       -0.154885
rel_clustering                 0.270253
rel_frequency                 -0.132734
rel_letters_count              0.871580
rel_orthographic_density       0.130856
rel_synonyms_count            -0.136342
dtype: float64

Regressing rel letters_count with 1487 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.20227968467466584

intercept                                                 19.821719
global_aoa                                                -0.596744
global_clustering                                          4.065272
global_frequency                                           0.122773
global_letters_count                                      -1.486234
global_orthographic_density                               -3.582712
global_synonyms_count                                      7.495764
rel_aoa                                                   -0.208550
rel_clustering                                            -8.245585
rel_frequency                                              0.859003
rel_letters_count                                          3.387604
rel_orthographic_density                                   1.528008
rel_synonyms_count                                        -6.718615
global_aoa * global_clustering                            -0.020933
global_aoa * global_frequency                              0.005729
global_aoa * global_letters_count                          0.030024
global_aoa * global_orthographic_density                   0.191237
global_aoa * global_synonyms_count                        -0.158473
global_aoa * rel_aoa                                       0.027751
global_aoa * rel_clustering                                0.088707
global_aoa * rel_frequency                                -0.014247
global_aoa * rel_letters_count                            -0.043514
global_aoa * rel_orthographic_density                     -0.184288
global_aoa * rel_synonyms_count                            0.184680
global_clustering * global_frequency                      -0.078588
global_clustering * global_letters_count                  -0.263067
global_clustering * global_orthographic_density           -1.039548
global_clustering * global_synonyms_count                  0.066715
global_clustering * rel_aoa                                0.199414
global_clustering * rel_clustering                        -0.050864
global_clustering * rel_frequency                          0.205930
global_clustering * rel_letters_count                      0.362514
global_clustering * rel_orthographic_density               1.071114
global_clustering * rel_synonyms_count                     0.377487
global_frequency * global_letters_count                   -0.019555
global_frequency * global_orthographic_density            -0.176051
global_frequency * global_synonyms_count                  -0.189167
global_frequency * rel_aoa                                 0.112201
global_frequency * rel_clustering                          0.323444
global_frequency * rel_frequency                          -0.015983
global_frequency * rel_letters_count                      -0.001151
global_frequency * rel_orthographic_density                0.362073
global_frequency * rel_synonyms_count                      0.295242
global_letters_count * global_orthographic_density        -0.301474
global_letters_count * global_synonyms_count              -0.277102
global_letters_count * rel_aoa                            -0.005959
global_letters_count * rel_clustering                      0.450260
global_letters_count * rel_frequency                       0.010616
global_letters_count * rel_letters_count                   0.005201
global_letters_count * rel_orthographic_density            0.314225
global_letters_count * rel_synonyms_count                  0.311997
global_orthographic_density * global_synonyms_count       -1.181731
global_orthographic_density * rel_aoa                     -0.027700
global_orthographic_density * rel_clustering               0.912242
global_orthographic_density * rel_frequency                0.147258
global_orthographic_density * rel_letters_count            0.047457
global_orthographic_density * rel_orthographic_density     0.042925
global_orthographic_density * rel_synonyms_count           1.217521
global_synonyms_count * rel_aoa                            0.213723
global_synonyms_count * rel_clustering                     0.338847
global_synonyms_count * rel_frequency                      0.277023
global_synonyms_count * rel_letters_count                 -0.015168
global_synonyms_count * rel_orthographic_density           0.764122
global_synonyms_count * rel_synonyms_count                -0.056010
rel_aoa * rel_clustering                                  -0.122971
rel_aoa * rel_frequency                                   -0.053158
rel_aoa * rel_letters_count                               -0.012699
rel_aoa * rel_orthographic_density                         0.051997
rel_aoa * rel_synonyms_count                              -0.264917
rel_clustering * rel_frequency                            -0.398748
rel_clustering * rel_letters_count                        -0.536202
rel_clustering * rel_orthographic_density                 -0.874956
rel_clustering * rel_synonyms_count                       -0.700183
rel_frequency * rel_letters_count                          0.013079
rel_frequency * rel_orthographic_density                  -0.232915
rel_frequency * rel_synonyms_count                        -0.392615
rel_letters_count * rel_orthographic_density              -0.011066
rel_letters_count * rel_synonyms_count                    -0.006248
rel_orthographic_density * rel_synonyms_count             -0.854432
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.018744944018728726

intercept                      0.725610
global_aoa                    -0.006197
global_clustering              0.026299
global_frequency              -0.007651
global_letters_count          -0.019429
global_orthographic_density   -0.019685
global_synonyms_count          0.123778
dtype: float64

Regressing global synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02974005873312857

intercept                                              2.498266
global_aoa                                             0.031407
global_clustering                                      0.384337
global_frequency                                      -0.091099
global_letters_count                                  -0.108451
global_orthographic_density                           -0.327923
global_synonyms_count                                  0.020330
global_aoa * global_clustering                        -0.005244
global_aoa * global_frequency                         -0.005111
global_aoa * global_letters_count                     -0.003899
global_aoa * global_orthographic_density              -0.004972
global_aoa * global_synonyms_count                     0.030150
global_clustering * global_frequency                  -0.018046
global_clustering * global_letters_count              -0.013239
global_clustering * global_orthographic_density       -0.057520
global_clustering * global_synonyms_count              0.033796
global_frequency * global_letters_count                0.004109
global_frequency * global_orthographic_density        -0.004138
global_frequency * global_synonyms_count              -0.001242
global_letters_count * global_orthographic_density     0.003545
global_letters_count * global_synonyms_count           0.001024
global_orthographic_density * global_synonyms_count    0.085432
dtype: float64

Regressing rel synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.010828824310878282

intercept                      0.325227
global_aoa                    -0.004361
global_clustering              0.021544
global_frequency              -0.008815
global_letters_count          -0.008983
global_orthographic_density   -0.014417
global_synonyms_count          0.088606
dtype: float64

Regressing rel synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.021310135082451498

intercept                                              2.284819
global_aoa                                             0.011913
global_clustering                                      0.392921
global_frequency                                      -0.119455
global_letters_count                                  -0.083944
global_orthographic_density                           -0.308461
global_synonyms_count                                 -0.124865
global_aoa * global_clustering                        -0.004064
global_aoa * global_frequency                         -0.000507
global_aoa * global_letters_count                     -0.005715
global_aoa * global_orthographic_density              -0.007290
global_aoa * global_synonyms_count                     0.031701
global_clustering * global_frequency                  -0.018870
global_clustering * global_letters_count              -0.016200
global_clustering * global_orthographic_density       -0.053540
global_clustering * global_synonyms_count              0.019208
global_frequency * global_letters_count                0.001986
global_frequency * global_orthographic_density         0.000077
global_frequency * global_synonyms_count              -0.000173
global_letters_count * global_orthographic_density     0.002154
global_letters_count * global_synonyms_count           0.007181
global_orthographic_density * global_synonyms_count    0.062684
dtype: float64

Regressing global synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.013143153386864337

intercept                   0.371138
rel_aoa                    -0.007266
rel_clustering             -0.009338
rel_frequency              -0.006099
rel_letters_count          -0.011483
rel_orthographic_density   -0.000848
rel_synonyms_count          0.114151
dtype: float64

Regressing global synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02326483814614544

intercept                                        0.361673
rel_aoa                                         -0.038320
rel_clustering                                  -0.075756
rel_frequency                                   -0.013966
rel_letters_count                                0.001741
rel_orthographic_density                         0.023640
rel_synonyms_count                               0.109434
rel_aoa * rel_clustering                        -0.007192
rel_aoa * rel_frequency                         -0.008834
rel_aoa * rel_letters_count                      0.005772
rel_aoa * rel_orthographic_density               0.004124
rel_aoa * rel_synonyms_count                     0.019526
rel_clustering * rel_frequency                  -0.002053
rel_clustering * rel_letters_count               0.010445
rel_clustering * rel_orthographic_density       -0.050427
rel_clustering * rel_synonyms_count              0.015975
rel_frequency * rel_letters_count                0.008777
rel_frequency * rel_orthographic_density         0.005663
rel_frequency * rel_synonyms_count              -0.003302
rel_letters_count * rel_orthographic_density    -0.000384
rel_letters_count * rel_synonyms_count           0.004220
rel_orthographic_density * rel_synonyms_count    0.038944
dtype: float64

Regressing rel synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07008676817860948

intercept                   0.035267
rel_aoa                    -0.017063
rel_clustering              0.034955
rel_frequency               0.001197
rel_letters_count          -0.004458
rel_orthographic_density   -0.016736
rel_synonyms_count          0.260612
dtype: float64

Regressing rel synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0782452444315771

intercept                                        0.038242
rel_aoa                                         -0.032278
rel_clustering                                  -0.036432
rel_frequency                                    0.000522
rel_letters_count                                0.006175
rel_orthographic_density                         0.002650
rel_synonyms_count                               0.309538
rel_aoa * rel_clustering                         0.003774
rel_aoa * rel_frequency                         -0.002830
rel_aoa * rel_letters_count                      0.003364
rel_aoa * rel_orthographic_density               0.001396
rel_aoa * rel_synonyms_count                     0.011349
rel_clustering * rel_frequency                  -0.008130
rel_clustering * rel_letters_count               0.007225
rel_clustering * rel_orthographic_density       -0.037949
rel_clustering * rel_synonyms_count             -0.001301
rel_frequency * rel_letters_count                0.005024
rel_frequency * rel_orthographic_density         0.006282
rel_frequency * rel_synonyms_count               0.011100
rel_letters_count * rel_orthographic_density     0.003030
rel_letters_count * rel_synonyms_count           0.014641
rel_orthographic_density * rel_synonyms_count    0.061660
dtype: float64

Regressing global synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.02365548236331083

intercept                      1.414982
global_aoa                     0.002252
global_clustering              0.098726
global_frequency              -0.013972
global_letters_count          -0.042614
global_orthographic_density   -0.069642
global_synonyms_count          0.115281
rel_aoa                       -0.011825
rel_clustering                -0.085711
rel_frequency                  0.008087
rel_letters_count              0.025582
rel_orthographic_density       0.058074
rel_synonyms_count             0.007816
dtype: float64

Regressing global synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07193127345880179

intercept                                                 6.779176
global_aoa                                               -0.128724
global_clustering                                         1.354384
global_frequency                                         -0.288295
global_letters_count                                     -0.035184
global_orthographic_density                               0.223463
global_synonyms_count                                    -0.257307
rel_aoa                                                  -0.253713
rel_clustering                                           -0.283061
rel_frequency                                             0.020452
rel_letters_count                                        -0.171400
rel_orthographic_density                                 -0.228475
rel_synonyms_count                                       -1.047309
global_aoa * global_clustering                            0.013701
global_aoa * global_frequency                             0.030457
global_aoa * global_letters_count                         0.001630
global_aoa * global_orthographic_density                 -0.065611
global_aoa * global_synonyms_count                        0.014004
global_aoa * rel_aoa                                     -0.002017
global_aoa * rel_clustering                              -0.021292
global_aoa * rel_frequency                               -0.038183
global_aoa * rel_letters_count                           -0.012767
global_aoa * rel_orthographic_density                     0.059923
global_aoa * rel_synonyms_count                           0.042916
global_clustering * global_frequency                     -0.071017
global_clustering * global_letters_count                 -0.069296
global_clustering * global_orthographic_density          -0.088762
global_clustering * global_synonyms_count                 0.026627
global_clustering * rel_aoa                              -0.034009
global_clustering * rel_clustering                        0.025720
global_clustering * rel_frequency                         0.028642
global_clustering * rel_letters_count                     0.007180
global_clustering * rel_orthographic_density              0.080011
global_clustering * rel_synonyms_count                   -0.025600
global_frequency * global_letters_count                  -0.041615
global_frequency * global_orthographic_density           -0.056119
global_frequency * global_synonyms_count                  0.001093
global_frequency * rel_aoa                               -0.007028
global_frequency * rel_clustering                         0.019238
global_frequency * rel_frequency                          0.006283
global_frequency * rel_letters_count                      0.024712
global_frequency * rel_orthographic_density               0.054339
global_frequency * rel_synonyms_count                     0.053111
global_letters_count * global_orthographic_density        0.023666
global_letters_count * global_synonyms_count              0.076548
global_letters_count * rel_aoa                            0.008265
global_letters_count * rel_clustering                     0.013030
global_letters_count * rel_frequency                      0.039955
global_letters_count * rel_letters_count                 -0.001670
global_letters_count * rel_orthographic_density          -0.006728
global_letters_count * rel_synonyms_count                -0.015407
global_orthographic_density * global_synonyms_count       0.128142
global_orthographic_density * rel_aoa                     0.031995
global_orthographic_density * rel_clustering              0.029488
global_orthographic_density * rel_frequency               0.051902
global_orthographic_density * rel_letters_count           0.029290
global_orthographic_density * rel_orthographic_density   -0.023059
global_orthographic_density * rel_synonyms_count         -0.007063
global_synonyms_count * rel_aoa                          -0.018614
global_synonyms_count * rel_clustering                    0.113271
global_synonyms_count * rel_frequency                     0.023693
global_synonyms_count * rel_letters_count                -0.109634
global_synonyms_count * rel_orthographic_density         -0.112485
global_synonyms_count * rel_synonyms_count                0.079851
rel_aoa * rel_clustering                                  0.033728
rel_aoa * rel_frequency                                   0.007732
rel_aoa * rel_letters_count                               0.005268
rel_aoa * rel_orthographic_density                       -0.030432
rel_aoa * rel_synonyms_count                             -0.012118
rel_clustering * rel_frequency                            0.020326
rel_clustering * rel_letters_count                        0.047234
rel_clustering * rel_orthographic_density                -0.072480
rel_clustering * rel_synonyms_count                      -0.091353
rel_frequency * rel_letters_count                        -0.011797
rel_frequency * rel_orthographic_density                 -0.052919
rel_frequency * rel_synonyms_count                       -0.070611
rel_letters_count * rel_orthographic_density             -0.059865
rel_letters_count * rel_synonyms_count                    0.062774
rel_orthographic_density * rel_synonyms_count             0.081271
dtype: float64

Regressing rel synonyms_count with 1441 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.14269169668792747

intercept                      0.948152
global_aoa                     0.004920
global_clustering              0.054565
global_frequency              -0.022495
global_letters_count          -0.030179
global_orthographic_density   -0.016694
global_synonyms_count         -0.558244
rel_aoa                       -0.014410
rel_clustering                -0.044512
rel_frequency                  0.019038
rel_letters_count              0.018244
rel_orthographic_density       0.003709
rel_synonyms_count             0.788354
dtype: float64

Regressing rel synonyms_count with 1441 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.18915731558150548

intercept                                                 5.078153
global_aoa                                               -0.215325
global_clustering                                         1.037767
global_frequency                                         -0.278011
global_letters_count                                      0.131220
global_orthographic_density                               0.425567
global_synonyms_count                                    -0.359121
rel_aoa                                                   0.074779
rel_clustering                                           -0.140666
rel_frequency                                             0.019477
rel_letters_count                                        -0.352344
rel_orthographic_density                                 -0.398774
rel_synonyms_count                                       -0.852613
global_aoa * global_clustering                            0.000764
global_aoa * global_frequency                             0.029738
global_aoa * global_letters_count                         0.001585
global_aoa * global_orthographic_density                 -0.053870
global_aoa * global_synonyms_count                        0.014906
global_aoa * rel_aoa                                     -0.001212
global_aoa * rel_clustering                              -0.002289
global_aoa * rel_frequency                               -0.030847
global_aoa * rel_letters_count                           -0.009884
global_aoa * rel_orthographic_density                     0.047338
global_aoa * rel_synonyms_count                           0.052811
global_clustering * global_frequency                     -0.058269
global_clustering * global_letters_count                 -0.038775
global_clustering * global_orthographic_density          -0.069269
global_clustering * global_synonyms_count                 0.016565
global_clustering * rel_aoa                              -0.011508
global_clustering * rel_clustering                        0.019680
global_clustering * rel_frequency                         0.006931
global_clustering * rel_letters_count                    -0.007372
global_clustering * rel_orthographic_density              0.078940
global_clustering * rel_synonyms_count                    0.011041
global_frequency * global_letters_count                  -0.034771
global_frequency * global_orthographic_density           -0.050223
global_frequency * global_synonyms_count                  0.011051
global_frequency * rel_aoa                               -0.015745
global_frequency * rel_clustering                         0.029582
global_frequency * rel_frequency                          0.003885
global_frequency * rel_letters_count                      0.028500
global_frequency * rel_orthographic_density               0.058826
global_frequency * rel_synonyms_count                     0.059967
global_letters_count * global_orthographic_density        0.003080
global_letters_count * global_synonyms_count              0.005484
global_letters_count * rel_aoa                            0.000426
global_letters_count * rel_clustering                    -0.023613
global_letters_count * rel_frequency                      0.025136
global_letters_count * rel_letters_count                 -0.000538
global_letters_count * rel_orthographic_density           0.008184
global_letters_count * rel_synonyms_count                 0.057996
global_orthographic_density * global_synonyms_count      -0.095135
global_orthographic_density * rel_aoa                     0.006567
global_orthographic_density * rel_clustering             -0.056739
global_orthographic_density * rel_frequency               0.027399
global_orthographic_density * rel_letters_count           0.034951
global_orthographic_density * rel_orthographic_density   -0.030923
global_orthographic_density * rel_synonyms_count          0.189473
global_synonyms_count * rel_aoa                          -0.042606
global_synonyms_count * rel_clustering                    0.125368
global_synonyms_count * rel_frequency                     0.010989
global_synonyms_count * rel_letters_count                -0.044498
global_synonyms_count * rel_orthographic_density          0.070951
global_synonyms_count * rel_synonyms_count                0.084414
rel_aoa * rel_clustering                                  0.007363
rel_aoa * rel_frequency                                   0.012363
rel_aoa * rel_letters_count                               0.009835
rel_aoa * rel_orthographic_density                       -0.003949
rel_aoa * rel_synonyms_count                             -0.005135
rel_clustering * rel_frequency                            0.013536
rel_clustering * rel_letters_count                        0.058120
rel_clustering * rel_orthographic_density                -0.012721
rel_clustering * rel_synonyms_count                      -0.142208
rel_frequency * rel_letters_count                        -0.010787
rel_frequency * rel_orthographic_density                 -0.033124
rel_frequency * rel_synonyms_count                       -0.064760
rel_letters_count * rel_orthographic_density             -0.054924
rel_letters_count * rel_synonyms_count                    0.001478
rel_orthographic_density * rel_synonyms_count            -0.064922
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07879548507974721

intercept                      2.040351
global_aoa                    -0.048189
global_clustering              0.081833
global_frequency               0.013336
global_letters_count          -0.042351
global_orthographic_density    0.150808
global_synonyms_count         -0.014094
dtype: float64

Regressing global orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09320348323462624

intercept                                              5.998343
global_aoa                                            -0.305824
global_clustering                                      0.666341
global_frequency                                      -0.083385
global_letters_count                                  -0.332188
global_orthographic_density                            0.076109
global_synonyms_count                                 -0.491549
global_aoa * global_clustering                        -0.035504
global_aoa * global_frequency                         -0.000961
global_aoa * global_letters_count                      0.004229
global_aoa * global_orthographic_density               0.019404
global_aoa * global_synonyms_count                     0.032294
global_clustering * global_frequency                  -0.013766
global_clustering * global_letters_count              -0.031716
global_clustering * global_orthographic_density       -0.031945
global_clustering * global_synonyms_count              0.026467
global_frequency * global_letters_count                0.009642
global_frequency * global_orthographic_density        -0.024481
global_frequency * global_synonyms_count               0.021908
global_letters_count * global_orthographic_density    -0.008644
global_letters_count * global_synonyms_count          -0.001020
global_orthographic_density * global_synonyms_count    0.177297
dtype: float64

Regressing rel orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.058925493478120254

intercept                     -0.217003
global_aoa                    -0.033737
global_clustering              0.080488
global_frequency               0.012591
global_letters_count          -0.050422
global_orthographic_density    0.103273
global_synonyms_count         -0.009426
dtype: float64

Regressing rel orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0708908160043028

intercept                                              3.986034
global_aoa                                            -0.176732
global_clustering                                      0.834303
global_frequency                                      -0.008714
global_letters_count                                  -0.439671
global_orthographic_density                           -0.201825
global_synonyms_count                                 -0.757336
global_aoa * global_clustering                        -0.017841
global_aoa * global_frequency                         -0.002358
global_aoa * global_letters_count                      0.003930
global_aoa * global_orthographic_density               0.023444
global_aoa * global_synonyms_count                     0.025330
global_clustering * global_frequency                  -0.015728
global_clustering * global_letters_count              -0.068357
global_clustering * global_orthographic_density       -0.055403
global_clustering * global_synonyms_count             -0.044345
global_frequency * global_letters_count               -0.003800
global_frequency * global_orthographic_density        -0.023089
global_frequency * global_synonyms_count               0.014153
global_letters_count * global_orthographic_density     0.003978
global_letters_count * global_synonyms_count           0.003773
global_orthographic_density * global_synonyms_count    0.133933
dtype: float64

Regressing global orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04948885714036155

intercept                   1.588672
rel_aoa                     0.021927
rel_clustering             -0.017901
rel_frequency              -0.001778
rel_letters_count          -0.035792
rel_orthographic_density    0.202692
rel_synonyms_count          0.030955
dtype: float64

Regressing global orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06847199944058091

intercept                                        1.607357
rel_aoa                                          0.103435
rel_clustering                                  -0.016419
rel_frequency                                   -0.019720
rel_letters_count                               -0.032074
rel_orthographic_density                         0.304568
rel_synonyms_count                               0.349074
rel_aoa * rel_clustering                         0.007493
rel_aoa * rel_frequency                          0.027474
rel_aoa * rel_letters_count                      0.001469
rel_aoa * rel_orthographic_density               0.021675
rel_aoa * rel_synonyms_count                     0.078787
rel_clustering * rel_frequency                   0.001433
rel_clustering * rel_letters_count               0.009093
rel_clustering * rel_orthographic_density        0.034148
rel_clustering * rel_synonyms_count              0.064712
rel_frequency * rel_letters_count                0.012736
rel_frequency * rel_orthographic_density         0.035654
rel_frequency * rel_synonyms_count               0.070097
rel_letters_count * rel_orthographic_density    -0.019901
rel_letters_count * rel_synonyms_count          -0.026626
rel_orthographic_density * rel_synonyms_count    0.132644
dtype: float64

Regressing rel orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08794423684157382

intercept                  -0.563000
rel_aoa                     0.020918
rel_clustering              0.010677
rel_frequency               0.036560
rel_letters_count          -0.036219
rel_orthographic_density    0.247681
rel_synonyms_count          0.014354
dtype: float64

Regressing rel orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10161653503444401

intercept                                       -0.522349
rel_aoa                                          0.088513
rel_clustering                                   0.026214
rel_frequency                                    0.047587
rel_letters_count                               -0.023874
rel_orthographic_density                         0.329509
rel_synonyms_count                               0.256770
rel_aoa * rel_clustering                        -0.001156
rel_aoa * rel_frequency                          0.015020
rel_aoa * rel_letters_count                      0.000014
rel_aoa * rel_orthographic_density               0.031797
rel_aoa * rel_synonyms_count                     0.078189
rel_clustering * rel_frequency                  -0.004288
rel_clustering * rel_letters_count              -0.016843
rel_clustering * rel_orthographic_density       -0.003305
rel_clustering * rel_synonyms_count              0.022812
rel_frequency * rel_letters_count                0.005238
rel_frequency * rel_orthographic_density         0.034506
rel_frequency * rel_synonyms_count               0.048456
rel_letters_count * rel_orthographic_density    -0.007768
rel_letters_count * rel_synonyms_count          -0.030529
rel_orthographic_density * rel_synonyms_count    0.098457
dtype: float64

Regressing global orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09988870733728772

intercept                      3.837455
global_aoa                    -0.110166
global_clustering              0.225591
global_frequency              -0.031106
global_letters_count          -0.087858
global_orthographic_density    0.232238
global_synonyms_count         -0.170286
rel_aoa                        0.099181
rel_clustering                -0.172520
rel_frequency                  0.051450
rel_letters_count              0.043886
rel_orthographic_density      -0.109467
rel_synonyms_count             0.177815
dtype: float64

Regressing global orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14034798586912223

intercept                                                 2.568606
global_aoa                                               -0.386856
global_clustering                                         0.408828
global_frequency                                         -0.176501
global_letters_count                                     -0.094960
global_orthographic_density                               3.576305
global_synonyms_count                                    -0.767296
rel_aoa                                                   0.291042
rel_clustering                                            1.090724
rel_frequency                                             0.235820
rel_letters_count                                        -0.412369
rel_orthographic_density                                 -3.498106
rel_synonyms_count                                        1.142614
global_aoa * global_clustering                           -0.089070
global_aoa * global_frequency                            -0.016520
global_aoa * global_letters_count                         0.001148
global_aoa * global_orthographic_density                 -0.033349
global_aoa * global_synonyms_count                        0.035804
global_aoa * rel_aoa                                     -0.005606
global_aoa * rel_clustering                               0.056409
global_aoa * rel_frequency                                0.011671
global_aoa * rel_letters_count                            0.020935
global_aoa * rel_orthographic_density                     0.082061
global_aoa * rel_synonyms_count                          -0.094102
global_clustering * global_frequency                     -0.055308
global_clustering * global_letters_count                 -0.007622
global_clustering * global_orthographic_density           0.440123
global_clustering * global_synonyms_count                 0.051899
global_clustering * rel_aoa                               0.015457
global_clustering * rel_clustering                        0.010288
global_clustering * rel_frequency                         0.025271
global_clustering * rel_letters_count                    -0.009851
global_clustering * rel_orthographic_density             -0.462085
global_clustering * rel_synonyms_count                   -0.097206
global_frequency * global_letters_count                   0.001417
global_frequency * global_orthographic_density           -0.047925
global_frequency * global_synonyms_count                 -0.061762
global_frequency * rel_aoa                                0.006121
global_frequency * rel_clustering                        -0.012697
global_frequency * rel_frequency                          0.011386
global_frequency * rel_letters_count                      0.018216
global_frequency * rel_orthographic_density              -0.013687
global_frequency * rel_synonyms_count                     0.018085
global_letters_count * global_orthographic_density       -0.028863
global_letters_count * global_synonyms_count              0.031170
global_letters_count * rel_aoa                           -0.014176
global_letters_count * rel_clustering                    -0.094575
global_letters_count * rel_frequency                     -0.035836
global_letters_count * rel_letters_count                  0.000030
global_letters_count * rel_orthographic_density           0.075000
global_letters_count * rel_synonyms_count                -0.022722
global_orthographic_density * global_synonyms_count       0.534003
global_orthographic_density * rel_aoa                    -0.010277
global_orthographic_density * rel_clustering             -0.431132
global_orthographic_density * rel_frequency              -0.017462
global_orthographic_density * rel_letters_count           0.041390
global_orthographic_density * rel_orthographic_density    0.076675
global_orthographic_density * rel_synonyms_count         -0.358742
global_synonyms_count * rel_aoa                          -0.051030
global_synonyms_count * rel_clustering                   -0.082939
global_synonyms_count * rel_frequency                    -0.034407
global_synonyms_count * rel_letters_count                 0.011280
global_synonyms_count * rel_orthographic_density         -0.315836
global_synonyms_count * rel_synonyms_count               -0.004893
rel_aoa * rel_clustering                                 -0.002432
rel_aoa * rel_frequency                                   0.015297
rel_aoa * rel_letters_count                               0.004180
rel_aoa * rel_orthographic_density                       -0.007240
rel_aoa * rel_synonyms_count                              0.175065
rel_clustering * rel_frequency                            0.037499
rel_clustering * rel_letters_count                        0.068321
rel_clustering * rel_orthographic_density                 0.375608
rel_clustering * rel_synonyms_count                       0.185346
rel_frequency * rel_letters_count                         0.022218
rel_frequency * rel_orthographic_density                  0.058011
rel_frequency * rel_synonyms_count                        0.118524
rel_letters_count * rel_orthographic_density             -0.059342
rel_letters_count * rel_synonyms_count                   -0.038361
rel_orthographic_density * rel_synonyms_count             0.278705
dtype: float64

Regressing rel orthographic_density with 1234 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1214514607102547

intercept                      2.623015
global_aoa                    -0.087354
global_clustering              0.193181
global_frequency              -0.018751
global_letters_count          -0.056854
global_orthographic_density   -0.466155
global_synonyms_count         -0.143756
rel_aoa                        0.076513
rel_clustering                -0.133720
rel_frequency                  0.046188
rel_letters_count              0.013001
rel_orthographic_density       0.647467
rel_synonyms_count             0.150858
dtype: float64

Regressing rel orthographic_density with 1234 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16384438131540946

intercept                                                 2.282270
global_aoa                                               -0.062240
global_clustering                                         0.596882
global_frequency                                         -0.328795
global_letters_count                                     -0.403861
global_orthographic_density                               3.125262
global_synonyms_count                                    -0.665634
rel_aoa                                                   0.044226
rel_clustering                                            0.678275
rel_frequency                                             0.148766
rel_letters_count                                        -0.156802
rel_orthographic_density                                 -2.953747
rel_synonyms_count                                        0.851059
global_aoa * global_clustering                           -0.044103
global_aoa * global_frequency                            -0.009239
global_aoa * global_letters_count                         0.003231
global_aoa * global_orthographic_density                 -0.076123
global_aoa * global_synonyms_count                        0.030309
global_aoa * rel_aoa                                     -0.003536
global_aoa * rel_clustering                               0.026507
global_aoa * rel_frequency                                0.013040
global_aoa * rel_letters_count                            0.023449
global_aoa * rel_orthographic_density                     0.138569
global_aoa * rel_synonyms_count                          -0.074298
global_clustering * global_frequency                     -0.084739
global_clustering * global_letters_count                 -0.037773
global_clustering * global_orthographic_density           0.447214
global_clustering * global_synonyms_count                -0.054145
global_clustering * rel_aoa                              -0.014751
global_clustering * rel_clustering                        0.000993
global_clustering * rel_frequency                         0.036451
global_clustering * rel_letters_count                     0.026338
global_clustering * rel_orthographic_density             -0.398786
global_clustering * rel_synonyms_count                    0.033832
global_frequency * global_letters_count                   0.004657
global_frequency * global_orthographic_density           -0.066155
global_frequency * global_synonyms_count                 -0.074038
global_frequency * rel_aoa                                0.004471
global_frequency * rel_clustering                         0.006943
global_frequency * rel_frequency                          0.012121
global_frequency * rel_letters_count                      0.014710
global_frequency * rel_orthographic_density               0.027043
global_frequency * rel_synonyms_count                     0.052420
global_letters_count * global_orthographic_density        0.043945
global_letters_count * global_synonyms_count             -0.029180
global_letters_count * rel_aoa                           -0.009663
global_letters_count * rel_clustering                    -0.054270
global_letters_count * rel_frequency                     -0.031242
global_letters_count * rel_letters_count                 -0.002018
global_letters_count * rel_orthographic_density           0.024388
global_letters_count * rel_synonyms_count                 0.029049
global_orthographic_density * global_synonyms_count       0.412974
global_orthographic_density * rel_aoa                    -0.003654
global_orthographic_density * rel_clustering             -0.361155
global_orthographic_density * rel_frequency               0.030742
global_orthographic_density * rel_letters_count           0.000856
global_orthographic_density * rel_orthographic_density    0.111265
global_orthographic_density * rel_synonyms_count         -0.230077
global_synonyms_count * rel_aoa                          -0.050707
global_synonyms_count * rel_clustering                    0.016030
global_synonyms_count * rel_frequency                     0.002761
global_synonyms_count * rel_letters_count                 0.060957
global_synonyms_count * rel_orthographic_density         -0.230644
global_synonyms_count * rel_synonyms_count               -0.004172
rel_aoa * rel_clustering                                  0.011465
rel_aoa * rel_frequency                                   0.008090
rel_aoa * rel_letters_count                              -0.007827
rel_aoa * rel_orthographic_density                       -0.026937
rel_aoa * rel_synonyms_count                              0.151060
rel_clustering * rel_frequency                            0.032522
rel_clustering * rel_letters_count                        0.019330
rel_clustering * rel_orthographic_density                 0.242362
rel_clustering * rel_synonyms_count                       0.044720
rel_frequency * rel_letters_count                         0.011117
rel_frequency * rel_orthographic_density                 -0.003775
rel_frequency * rel_synonyms_count                        0.055306
rel_letters_count * rel_orthographic_density             -0.043310
rel_letters_count * rel_synonyms_count                   -0.084918
rel_orthographic_density * rel_synonyms_count             0.156949
dtype: float64