Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers below also write each figure to disk.
SAVE_FIGURES = False

# Subset of features shown in the "paper-*" figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# Presumably the number of PCA components used later on -- TODO confirm
# (not referenced in the cells immediately below).
N_COMPONENTS = 3

# Maximum number of bins for source feature values; the plotting helpers
# fall back to fewer bins when binning with this many fails.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
# Initialise the database once for the whole notebook. Presumably returns the
# engine that `session_scope()` relies on -- TODO confirm against
# brainscopypaste.utils.
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Model selecting which mined substitutions are analysed.
model = Model(time=Time.continuous, source=Source.majority,
              past=Past.last_bin, durl=Durl.all, max_distance=2)

# Collect the matching substitution ids in one short-lived session; each
# substitution is then loaded in a session of its own in the loop below.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# One record per (substitution, feature) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Feature values of the source and destination words, first as
            # is, then relative to the sentence ('median' variant).
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')
            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Null-hypothesis values: overall feature average (h0) and
                # average over synonyms of the source word (h0n).
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                    feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                    feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                    feature, source_synonyms=True,
                    sentence_relative='median'),
            }
            data.append(record)

original_variations = pd.DataFrame(data)
# The list of records is no longer needed once the frame is built.
del data


Got 11223 substitutions for model Model(time=Time.continuous, source=Source.majority, past=Past.last_bin, durl=Durl.all, max_distance=2)
100% (11223 of 11223) |####################| Elapsed Time: 0:02:55 Time: 0:02:55

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values (measured and null-hypothesis ones).
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average in two steps: first over the substitutions sharing a destination,
# occurrence, position and feature, then over each cluster, so that a cluster
# counts once per feature (this avoids overestimating confidence intervals).
#
# The column selection is a *list*: indexing a groupby with a bare tuple of
# names is rejected by recent pandas. 'feature' is not selected since it is a
# grouping key and comes back as a column anyway through `as_index=False`.
variations = (
    original_variations
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False)
    .mean()
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]
    .mean()
)
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above this threshold.
# This crops the graphs to acceptable CIs.
AOA_SOURCE_MAX = 15
variations.loc[
    (variations.feature == 'aoa') & (variations.source > AOA_SOURCE_MAX),
    value_columns
] = np.nan

Prepare feature ordering.


In [5]:
# Order the features by the docstring of their transformed version, i.e. by
# the text used as plot title / legend label further down.
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of each null
    (`h0`, then `h0n`). The difference is flagged with one, two or three
    stars when the null mean falls outside the Student-t confidence interval
    around the bin mean at the .05, .01 and .001 levels respectively, and
    with 'ns.' otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like of bin labels (0, 1, ...), aligned with `values`.
        NaN labels are ignored.
    h0, h0n : array-like, aligned with `values`
        Null-hypothesis values (printed as 'H_0' and 'H_00').
    values : array-like
        Measured values.

    Returns
    -------
    None. The table is written to standard output.
    """
    # Bin labels are floats when some of them are NaN; make sure the bin
    # count is a plain int so it can be used with range().
    bin_count = int(np.nanmax(np.asarray(bins, dtype=float))) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and confidence half-widths only depend on `values`, so they
    # are computed once and shared by both null hypotheses.
    alphas = [.05, .01, .001]
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, indices in enumerate(masks):
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        # Standard error of the mean: sample standard deviation (ddof=1)
        # over sqrt(n). Dividing by sqrt(n - 1) would apply the small-sample
        # correction twice.
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[indices].mean() for indices in masks])

        print(null_name + ' |', end='')
        # differences[i, j]: bin i differs from the null at level alphas[j].
        # Comparisons involving NaN (empty or single-element bins) are False.
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level reached decides the number of stars.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot binned destination feature values against source feature values.

    Written to be called through `FacetGrid.map_dataframe` (see `plot_grid`),
    hence the keyword-only interface. Draws on the current matplotlib axes
    and prints the significance table of the bins (`print_significance`).

    Keyword arguments read here
    ---------------------------
    data : DataFrame
        Must hold the `source`, `destination`, `h0` and `h0n` columns (or
        their `_rel` variants when `relative` is set) and the
        `feature_field` column.
    color : matplotlib color, default 'blue'
    relative : bool, default False
        Use the sentence-relative (`_rel`) columns.
    quantiles : bool, default False
        Bin source values by quantiles (`pd.qcut`) instead of fixed-width
        bins (`pd.cut`).
    feature_field : str, default 'feature'
        Column whose first value titles the significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and use fewer if binning
    # fails (e.g. quantile bins whose edges are not unique).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a NameError further down.
        raise ValueError('Could not bin source values into {} bins or fewer'
                         .format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample standard deviation (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    faded = sb.desaturate(color, 0.2)
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=faded, alpha=0.2)
    plt.plot(middles, h0s, '--', color=faded,
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.', color=faded,
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted', color=faded,
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both bounds are checked
    # independently, so that the null lines stay visible when they exceed
    # the bin middles on both sides.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one facet per feature with `plot_function` and style the grid.

    Rows of `data` whose `feature_field` value is not in `features` are left
    out. Facets are laid out three per row, in the order of `features`, and
    `plot_function` is called on each through `FacetGrid.map_dataframe` with
    `plot_kws` as extra keyword arguments. Each facet that got a legend is
    titled with the docstring of its transformed feature. The figure is
    saved under `filename` when SAVE_FIGURES is set.

    `plot_kws` is only read, never modified.
    """
    is_selected = data[feature_field].map(lambda name: name in features)
    # NOTE(review): `size` was renamed `height` in seaborn 0.9 -- update this
    # call if the notebook is run with a recent seaborn.
    grid = sb.FacetGrid(data=data[is_selected],
                        col=feature_field, hue=feature_field,
                        col_order=features, hue_order=features,
                        col_wrap=3, sharex=False, sharey=False,
                        aspect=1.5, size=3)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        # No legend means nothing was plotted on these axes: leave them be.
        if legend:
            legend_frame = legend.get_frame()
            legend_frame.set_facecolor('#f2f2f2')
            legend_frame.set_edgecolor('#000000')
            # The facet title is the feature name at this point; swap it
            # for the docstring of the transformed feature.
            transformed = Substitution._transformed_feature(ax.get_title())
            ax.set_title(transformed.__doc__)

    if not SAVE_FIGURES:
        return
    grid.fig.savefig(settings.FIGURE.format(filename),
                     bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the normalised bias of one feature on `ax`.

    Source values are binned, and for each bin the difference between the
    mean destination value and the mean H_00 null value (`h0n`) is plotted,
    divided by the absolute mean of the binned H_0 values (`h0`). Bins are
    spread evenly over [0, 1] on the x axis.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : DataFrame
        Rows of a single feature, with the `feature`, `source`,
        `destination`, `h0` and `h0n` columns (or their `_rel` variants when
        `relative` is set). Must not be empty.
    color : matplotlib color of the line.
    ci : bool, default True
        Also draw the 95% confidence band.
    relative : bool, default False
        Use the sentence-relative (`_rel`) columns.
    quantiles : bool, default False
        Bin source values by quantiles (`pd.qcut`) instead of fixed-width
        bins (`pd.cut`).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning. Start from BIN_COUNT bins and use fewer if binning
    # fails (e.g. quantile bins whose edges are not unique). Only the bin
    # labels are needed here, not the bin edges.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins = cut(x, bin_count, labels=False, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with an explicit message instead of a NameError further down.
        raise ValueError('Could not bin source values into {} bins or fewer'
                         .format(BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the sample standard deviation (ddof=1) over sqrt(n).
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): the scale is the mean of the H_0 bin values; it can get
    # close to zero with sentence-relative features, which inflates the
    # normalised bias -- confirm this is acceptable.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions, (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one `plot_function` drawing per feature on a single axes.

    Each feature gets its own color from the `palette_name` palette, and
    `plot_function` receives the rows of that feature with incomplete rows
    dropped, plus `plot_kws` as extra keyword arguments. The figure is saved
    under `filename` when SAVE_FIGURES is set.

    Returns the axes, so that callers can adjust them further.
    `plot_kws` is only read, never modified.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = sb.color_palette(palette_name, len(features))
    for feature, feature_color in zip(features, colors):
        feature_data = data[data.feature == feature].dropna()
        plot_function(ax, feature_data, color=feature_color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        fig.savefig(target, bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above.


In [11]:
# All features, global feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$'
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | ns. | **  | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | **  |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *** | **  | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | **  | **  |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *   | *** | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | *** | ns. | ns. | *** |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare.


In [12]:
# Bias overlay: all features, global values, fixed-width bins, no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False}
);



In [13]:
# Paper features, global feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$'
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | **  |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | ns. | **  | **  |

In [14]:
# Bias overlay: paper features, global values, fixed-width bins, with a
# hand-picked y range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-fixedbins_global',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all features',
        xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
        ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                '\n(normalised to feature average)')
    )
    .set_ylim(-2, .7)
);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, global feature values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True}
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | **  | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | *** | *** | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | *** | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | *** |
H_00 | *** | ns. | ns. | *** |

In [16]:
# Bias overlay: all features, global values, quantile bins, no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True}
);



In [17]:
# Paper features, global feature values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True}
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | *** | ns. | *** |

In [18]:
# Bias overlay: paper features, global values, quantile bins, with a
# hand-picked y range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-quantilebins_global',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all features',
        xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
        ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                '\n(normalised to feature average)'),
        plot_kws={'quantiles': True}
    )
    .set_ylim(-1.2, .6)
);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, sentence-relative feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True}
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | *** | *   | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | ns. | *** |
H_00 | ns. | *** | ns. | **  |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | *   | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | *   | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | **  | ns. | ns. |

In [20]:
# Bias overlay: all features, sentence-relative values, fixed-width bins,
# no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True}
);



In [21]:
# Paper features, sentence-relative feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True}
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | **  | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | ns. | *** | *   | *   |

In [22]:
# Bias overlay: paper features, sentence-relative values, fixed-width bins,
# with a hand-picked y range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-fixedbins_sentencerel',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all sentence-relative features',
        xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                r' (normalised to $[0, 1]$)'),
        ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                '\n(normalised to sentence-relative feature average)'),
        plot_kws={'relative': True}
    )
    .set_ylim(-2, .7)
);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, sentence-relative feature values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True}
)


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | **  | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | **  | ns. | *** |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | *   | ns. | ns. |

In [24]:
# Bias overlay: all features, sentence-relative values, quantile bins,
# no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True}
);



In [25]:
# Paper features, sentence-relative feature values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True}
)


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | **  | ns. | *** |

In [26]:
# Bias overlay: paper features, sentence-relative values, quantile bins.
# NOTE(review): the three other paper-feature overlays use the 'deep'
# palette and set explicit y limits; 'husl' and the default limits here may
# be a copy-paste leftover -- confirm.
plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'relative': True, 'quantiles': True}
);


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but that gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitution moves in (absolute, relative) space.

    Designed to be passed to `FacetGrid.map_dataframe`, which provides the
    `data` and `color` keyword arguments.

    Source words are binned on a regular grid by their absolute feature value
    (x axis) and their sentence-relative feature value (y axis). In each grid
    cell, the arrow direction is the mean (destination - source) displacement
    along both axes, and the colour is the mean norm of the individual
    displacements.

    Keyword arguments
    -----------------
    data : DataFrame
        Must have the columns 'source', 'source_rel', 'destination',
        'destination_rel' and 'h0'.
    color : matplotlib colour, optional
        Base colour of the two reference lines (desaturated before use).
        Defaults to 'blue'.
    bin_count : int, optional
        Number of bins along each axis. Defaults to 4.

    Cells without any observation get NaN values, which matplotlib masks
    (hence the "converting a masked element to nan" warning).
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values.
    # Only the first 'h0' value is used: it is drawn as a single vertical line.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask and the displacements once per cell.
            in_cell = (x_bins == x) & (y_bins == y)
            move = dest[in_cell] - source[in_cell]
            move_rel = dest_rel[in_cell] - source_rel[in_cell]
            # Arrays are indexed [row (y), column (x)], as streamplot expects.
            u_values[y, x] = move.mean()
            v_values[y, x] = move_rel.mean()
            strength[y, x] = np.sqrt(move ** 2 + move_rel ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: sentence-relative value of zero.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the 'h0' value.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One streamplot per feature, three per row, each with its own axes ranges.
stream_grid_kws = dict(col='feature', col_wrap=3,
                       sharex=False, sharey=False, hue='feature',
                       aspect=1, size=4.5,
                       col_order=ordered_features,
                       hue_order=ordered_features)
g = sb.FacetGrid(data=variations, **stream_grid_kws)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend, then swap the
        # raw feature name in the title for the docstring of the feature's
        # transformed function.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same streamplots as above, restricted to the features shown in the paper.
paper_variations = variations[variations['feature'].isin(PAPER_FEATURES)]
g = sb.FacetGrid(data=paper_variations,
                 col='feature', col_wrap=3,
                 sharex=False, sharey=False, hue='feature',
                 aspect=1, size=4.5,
                 col_order=PAPER_FEATURES, hue_order=PAPER_FEATURES)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend, then swap the
        # raw feature name in the title for the docstring of the feature's
        # transformed function.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Fit the PCA on per-cluster feature variations, keeping only the clusters
# for which every feature variation is defined (rows with a NaN are dropped).
# The feature names are sorted, presumably so that they line up with the
# pivoted columns -- verify if the column order ever matters.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = (variations
                 .pivot(index='cluster_id', columns='feature',
                        values='variation')
                 .dropna())
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
component_labels = ['Component-{}'.format(i) for i in range(N_COMPONENTS)]
# Last expression of the cell: the loadings table is displayed as output.
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures, index=component_labels)


MLE estimates there are 10 components.

Those explain the following variance:
[ 0.54701695  0.18174093  0.06961053  0.06644243  0.03297527  0.0300175
  0.01854228  0.01722637  0.01564068  0.00893736]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 0.506041 -0.253363 0.082770 -0.231941 -0.220781 0.437806 -0.206897 -0.265651 0.407437 -0.271662 0.159619 -0.001656
Component-1 -0.375107 0.365650 -0.133811 0.280493 0.261468 0.427307 -0.170193 0.277580 0.432662 -0.240732 0.163313 -0.014839
Component-2 0.447259 0.594408 -0.060312 0.227197 -0.544861 -0.080640 0.027687 0.287470 -0.001478 0.047139 -0.047293 -0.011835

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# One row per (substitution, component), for the first N_COMPONENTS
# components: source/destination component values plus the two
# `component_average` values (plain, and with source_synonyms=True).
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=h0,
                h0n=h0n,
            ))

original_component_variations = pd.DataFrame(data)
del data


100% (11223 of 11223) |####################| Elapsed Time: 0:02:13 Time: 0:02:13

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Average in two stages: first over the rows sharing the same
# (destination, occurrence, position, component), then over each
# (cluster, component), so that every cluster weighs the same.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas and rejected by recent versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One panel per principal component, source value against destination value.
component_grid_kws = dict(col='component', col_wrap=3,
                          sharex=False, sharey=False, hue='component',
                          aspect=1.5, size=3)
g = sb.FacetGrid(data=component_variations, **component_grid_kws)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Something was plotted on these axes: box the legend.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *** | *** | ns. | **  |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | *** | ns. |

4.2 On a subset of relevant features


In [34]:
# Features kept for the reduced PCA of this section.
relevant_features = [
    'frequency',
    'aoa',
    'letters_count',
]

Compute the actual PCA


In [35]:
# Fit the PCA on per-cluster variations of the relevant features only,
# keeping the clusters for which all of these variations are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = (variations[variations['feature'].isin(pcafeatures)]
                 .pivot(index='cluster_id', columns='feature',
                        values='variation')
                 .dropna())
pca = PCA(n_components='mle').fit(pcavariations)

# Report the number of components and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

component_labels = ['Component-{}'.format(i)
                    for i in range(pca.n_components_)]
# Last expression of the cell: the loadings table is displayed as output.
pd.DataFrame(pca.components_, columns=pcafeatures, index=component_labels)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.67536509  0.20403139]

Out[35]:
aoa frequency letters_count
Component-0 0.757138 -0.388089 0.525479
Component-1 -0.362778 0.419166 0.832281

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# One row per (substitution, component), for every component of the fitted
# PCA: source/destination component values plus the two `component_average`
# values (plain, and with source_synonyms=True).
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A fresh session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(dict(
                cluster_id=substitution.source.cluster.sid,
                destination_id=substitution.destination.sid,
                occurrence=substitution.occurrence,
                position=substitution.position,
                source_id=substitution.source.sid,
                component=component,
                source=source,
                destination=destination,
                h0=h0,
                h0n=h0n,
            ))

original_component_variations = pd.DataFrame(data)
del data


100% (11223 of 11223) |####################| Elapsed Time: 0:01:13 Time: 0:01:13

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Average in two stages: first over the rows sharing the same
# (destination, occurrence, position, component), then over each
# (cluster, component), so that every cluster weighs the same.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is deprecated in pandas and rejected by recent versions.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One panel per principal component, source value against destination value.
component_grid_kws = dict(col='component', col_wrap=3,
                          sharex=False, sharey=False, hue='component',
                          aspect=1.5, size=3)
g = sb.FacetGrid(data=component_variations, **component_grid_kws)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.flat:
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: box the legend.
        legend_frame = legend.get_frame()
        legend_frame.set_facecolor('#f2f2f2')
        legend_frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  bbox_inches='tight', dpi=300)


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | **  | ns. |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Resolve each feature's transformed function once, instead of looking it up
# again for every (word, feature) pair below.
tfeatures = dict((feature, Substitution._transformed_feature(feature))
                 for feature in relevant_features)

for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeatures[feature]())))

# Compute the number of words that have all relevant_features defined.
# Start from the union of the words known to at least one of the features.
words = set()
for feature in relevant_features:
    words.update(tfeatures[feature]())

# One row per word, one column per feature; a word with a missing value for
# any feature is removed by dropna() below.
words_list = list(words)
wordsdf = pd.DataFrame(dict(
    (feature, [tfeatures[feature](word) for word in words_list])
    for feature in relevant_features))
wordsdf['words'] = words_list
del words_list

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 1287 (cluster-unique) substitutions, but the PCA is in fact computed on 988 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# Binning strategies to run, as (label, binning function) pairs. Only
# fixed-width bins are enabled; quantile bins -- ('quantiles', pd.qcut) --
# are currently left out.
cuts = [
    ('fixed bins', pd.cut),
]
# Value kinds, as (label, column-name suffix) pairs.
rels = [
    ('global', ''),
    ('sentence-relative', '_rel'),
]

def star_level(p):
    """Return the three-character significance marker for p-value `p`.

    '***' if p < .001, ' **' if p < .01, '  *' if p < .05, and 'ns.'
    otherwise (including when `p` is NaN, since every comparison is then
    false).
    """
    thresholds = ((.001, '***'), (.01, ' **'), (.05, '  *'))
    for threshold, marker in thresholds:
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and each enabled type of binning (currently fixed-width only; quantile binning is commented out in the cell above). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means non-significant.


In [41]:
# Pivot source and destination values once per relativeness suffix. These
# tables depend neither on the feature pair nor on the binning, so they are
# not rebuilt inside the nested loops.
source_pivots = dict(
    (rel, variations.pivot(index='cluster_id', columns='feature',
                           values='source' + rel))
    for _, rel in rels)
destination_pivots = dict(
    (rel, variations.pivot(index='cluster_id', columns='feature',
                           values='destination' + rel))
    for _, rel in rels)

# For each (feature1, feature2) pair and each global/sentence-relative
# combination, bin clusters by feature1's source value and run a one-way
# ANOVA of feature2's destination value across those bins.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = source_pivots[rel1][feature1]
            for (rel2_label, rel2) in rels:
                destination = destination_pivots[rel2][feature2]

                # Compute binning, falling back to fewer bins when the
                # binning function rejects the requested count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # Never silently reuse bins left over from a previous
                    # iteration.
                    raise ValueError(
                        "Could not bin '{}' ({}) with {}, "
                        "even with a single bin"
                        .format(feature1, rel1_label, cut_label))

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# Pivot source and destination values once per relativeness suffix. These
# tables depend neither on the feature pair nor on the binning, so they are
# not rebuilt inside the nested loops.
source_pivots = dict(
    (rel, variations.pivot(index='cluster_id', columns='feature',
                           values='source' + rel))
    for _, rel in rels)
destination_pivots = dict(
    (rel, variations.pivot(index='cluster_id', columns='feature',
                           values='destination' + rel))
    for _, rel in rels)

# For each (feature1, feature2) pair and each global/sentence-relative
# combination, bin clusters by feature1's source value and run a one-way
# ANOVA of feature2's variation (destination - source) across those bins.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = source_pivots[rel1][feature1]
            for (rel2_label, rel2) in rels:
                # Despite its name, `destination` holds feature2's variation.
                destination = destination_pivots[rel2][feature2]\
                    - source_pivots[rel2][feature2]

                # Compute binning, falling back to fewer bins when the
                # binning function rejects the requested count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # Never silently reuse bins left over from a previous
                    # iteration.
                    raise ValueError(
                        "Could not bin '{}' ({}) with {}, "
                        "even with a single bin"
                        .format(feature1, rel1_label, cut_label))

                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [44]:
# Map a "relative?" boolean to a (label, column-name suffix) pair.
# NOTE: this rebinds `rels`, which the Anova section defined as a list of
# (label, suffix) pairs. Once this cell has run, the Anova cells can no longer
# be re-run as written (iterating this dict yields booleans, not pairs)
# without re-running the cell that defines the list first.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [45]:
# Every combination of source relativeness (global, sentence-relative, both)
# and destination relativeness (global, sentence-relative).
relativeness_combinations = list(product([False, True, 'both'],
                                         [False, True]))

for target in PAPER_FEATURES:
    print('-' * 70)
    for src_is_rel, dst_is_rel in relativeness_combinations:
        # First the plain regression, then the one with pairwise
        # interaction terms, each followed by a blank line.
        for with_interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=src_is_rel, dest_rel=dst_is_rel,
                    interactions=with_interactions)
            print()


----------------------------------------------------------------------
Regressing global frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1481113553980814

intercept                      4.425644
global_aoa                     0.088085
global_clustering              0.087866
global_frequency               0.512785
global_letters_count          -0.048184
global_orthographic_density   -0.010171
global_synonyms_count         -0.005553
dtype: float64

Regressing global frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.16952757065026947

intercept                                             -1.223568
global_aoa                                            -0.005025
global_clustering                                      0.389805
global_frequency                                       1.625890
global_letters_count                                   0.227516
global_orthographic_density                            1.689159
global_synonyms_count                                  0.378633
global_aoa * global_clustering                        -0.014245
global_aoa * global_frequency                         -0.012096
global_aoa * global_letters_count                      0.010694
global_aoa * global_orthographic_density               0.020287
global_aoa * global_synonyms_count                     0.024264
global_clustering * global_frequency                   0.032591
global_clustering * global_letters_count              -0.085701
global_clustering * global_orthographic_density        0.028921
global_clustering * global_synonyms_count              0.044882
global_frequency * global_letters_count               -0.095216
global_frequency * global_orthographic_density        -0.209259
global_frequency * global_synonyms_count               0.000476
global_letters_count * global_orthographic_density     0.046341
global_letters_count * global_synonyms_count          -0.050994
global_orthographic_density * global_synonyms_count    0.005617
dtype: float64

Regressing rel frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.08784965673735202

intercept                     -7.106104
global_aoa                     0.119017
global_clustering              0.088775
global_frequency               0.441263
global_letters_count           0.020617
global_orthographic_density   -0.020897
global_synonyms_count          0.106085
dtype: float64

Regressing rel frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.10381719413540247

intercept                                             -11.929953
global_aoa                                              0.021545
global_clustering                                       0.136026
global_frequency                                        1.416922
global_letters_count                                    0.140021
global_orthographic_density                             1.201888
global_synonyms_count                                  -0.473875
global_aoa * global_clustering                         -0.023235
global_aoa * global_frequency                          -0.028077
global_aoa * global_letters_count                       0.023236
global_aoa * global_orthographic_density                0.037229
global_aoa * global_synonyms_count                      0.037330
global_clustering * global_frequency                    0.034126
global_clustering * global_letters_count               -0.045950
global_clustering * global_orthographic_density         0.075148
global_clustering * global_synonyms_count               0.037592
global_frequency * global_letters_count                -0.066823
global_frequency * global_orthographic_density         -0.151013
global_frequency * global_synonyms_count                0.020610
global_letters_count * global_orthographic_density      0.056376
global_letters_count * global_synonyms_count            0.036919
global_orthographic_density * global_synonyms_count     0.094793
dtype: float64

Regressing global frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.05855565649963845

intercept                   9.583552
rel_aoa                     0.093402
rel_clustering             -0.082943
rel_frequency               0.271612
rel_letters_count          -0.003410
rel_orthographic_density    0.034388
rel_synonyms_count         -0.023594
dtype: float64

Regressing global frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08446729561419208

intercept                                        9.486809
rel_aoa                                          0.192946
rel_clustering                                  -0.007696
rel_frequency                                    0.270902
rel_letters_count                                0.019197
rel_orthographic_density                        -0.330269
rel_synonyms_count                               0.353064
rel_aoa * rel_clustering                         0.018891
rel_aoa * rel_frequency                          0.038749
rel_aoa * rel_letters_count                     -0.006609
rel_aoa * rel_orthographic_density              -0.011554
rel_aoa * rel_synonyms_count                    -0.021166
rel_clustering * rel_frequency                  -0.037379
rel_clustering * rel_letters_count              -0.039997
rel_clustering * rel_orthographic_density        0.117818
rel_clustering * rel_synonyms_count              0.126219
rel_frequency * rel_letters_count               -0.043032
rel_frequency * rel_orthographic_density        -0.096902
rel_frequency * rel_synonyms_count               0.085015
rel_letters_count * rel_orthographic_density     0.075450
rel_letters_count * rel_synonyms_count          -0.138207
rel_orthographic_density * rel_synonyms_count   -0.174571
dtype: float64

Regressing rel frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.27713312809675195

intercept                  -1.191482
rel_aoa                     0.063959
rel_clustering              0.160655
rel_frequency               0.645187
rel_letters_count          -0.062644
rel_orthographic_density   -0.141234
rel_synonyms_count          0.063578
dtype: float64

Regressing rel frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2990447461119662

intercept                                       -1.306339
rel_aoa                                          0.051839
rel_clustering                                   0.077506
rel_frequency                                    0.655671
rel_letters_count                               -0.064851
rel_orthographic_density                        -0.550125
rel_synonyms_count                               0.330714
rel_aoa * rel_clustering                        -0.051017
rel_aoa * rel_frequency                         -0.038883
rel_aoa * rel_letters_count                     -0.001706
rel_aoa * rel_orthographic_density               0.060274
rel_aoa * rel_synonyms_count                     0.035042
rel_clustering * rel_frequency                  -0.070296
rel_clustering * rel_letters_count              -0.033156
rel_clustering * rel_orthographic_density        0.001246
rel_clustering * rel_synonyms_count              0.203660
rel_frequency * rel_letters_count               -0.033335
rel_frequency * rel_orthographic_density        -0.141452
rel_frequency * rel_synonyms_count               0.099400
rel_letters_count * rel_orthographic_density     0.028310
rel_letters_count * rel_synonyms_count          -0.055109
rel_orthographic_density * rel_synonyms_count   -0.014005
dtype: float64

Regressing global frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1570749169476553

intercept                      3.848278
global_aoa                     0.065390
global_clustering              0.210967
global_frequency               0.620603
global_letters_count          -0.034196
global_orthographic_density    0.056803
global_synonyms_count         -0.129454
rel_aoa                        0.039073
rel_clustering                -0.148548
rel_frequency                 -0.119518
rel_letters_count             -0.006536
rel_orthographic_density      -0.085746
rel_synonyms_count             0.157201
dtype: float64

Regressing global frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.24768184876110488

intercept                                                  1.948057
global_aoa                                                -0.785632
global_clustering                                         -2.344044
global_frequency                                           1.156320
global_letters_count                                      -1.150314
global_orthographic_density                                0.855965
global_synonyms_count                                    -12.168273
rel_aoa                                                    1.885570
rel_clustering                                             1.298139
rel_frequency                                              0.233184
rel_letters_count                                          2.208473
rel_orthographic_density                                   4.922324
rel_synonyms_count                                        13.870468
global_aoa * global_clustering                             0.108759
global_aoa * global_frequency                              0.078025
global_aoa * global_letters_count                          0.112880
global_aoa * global_orthographic_density                   0.116464
global_aoa * global_synonyms_count                        -0.109791
global_aoa * rel_aoa                                      -0.030048
global_aoa * rel_clustering                               -0.013230
global_aoa * rel_frequency                                -0.003131
global_aoa * rel_letters_count                            -0.107845
global_aoa * rel_orthographic_density                     -0.202367
global_aoa * rel_synonyms_count                            0.099900
global_clustering * global_frequency                       0.168989
global_clustering * global_letters_count                  -0.108023
global_clustering * global_orthographic_density            0.454851
global_clustering * global_synonyms_count                 -0.842481
global_clustering * rel_aoa                               -0.175294
global_clustering * rel_clustering                         0.052420
global_clustering * rel_frequency                         -0.106906
global_clustering * rel_letters_count                      0.259913
global_clustering * rel_orthographic_density               0.077039
global_clustering * rel_synonyms_count                     0.902704
global_frequency * global_letters_count                   -0.093826
global_frequency * global_orthographic_density             0.039817
global_frequency * global_synonyms_count                   0.220775
global_frequency * rel_aoa                                -0.218833
global_frequency * rel_clustering                          0.080989
global_frequency * rel_frequency                          -0.036129
global_frequency * rel_letters_count                       0.071184
global_frequency * rel_orthographic_density               -0.220058
global_frequency * rel_synonyms_count                     -0.354440
global_letters_count * global_orthographic_density         0.122021
global_letters_count * global_synonyms_count               0.678917
global_letters_count * rel_aoa                            -0.088852
global_letters_count * rel_clustering                     -0.117717
global_letters_count * rel_frequency                      -0.058456
global_letters_count * rel_letters_count                  -0.001469
global_letters_count * rel_orthographic_density           -0.156508
global_letters_count * rel_synonyms_count                 -0.793313
global_orthographic_density * global_synonyms_count        0.719155
global_orthographic_density * rel_aoa                     -0.019593
global_orthographic_density * rel_clustering              -0.892976
global_orthographic_density * rel_frequency               -0.062299
global_orthographic_density * rel_letters_count           -0.163963
global_orthographic_density * rel_orthographic_density    -0.248020
global_orthographic_density * rel_synonyms_count          -0.278378
global_synonyms_count * rel_aoa                           -0.050522
global_synonyms_count * rel_clustering                     0.746063
global_synonyms_count * rel_frequency                     -0.501348
global_synonyms_count * rel_letters_count                 -0.716569
global_synonyms_count * rel_orthographic_density          -1.079686
global_synonyms_count * rel_synonyms_count                 0.023083
rel_aoa * rel_clustering                                   0.016848
rel_aoa * rel_frequency                                    0.082911
rel_aoa * rel_letters_count                                0.092652
rel_aoa * rel_orthographic_density                         0.081571
rel_aoa * rel_synonyms_count                               0.057546
rel_clustering * rel_frequency                            -0.144408
rel_clustering * rel_letters_count                        -0.129621
rel_clustering * rel_orthographic_density                  0.316546
rel_clustering * rel_synonyms_count                       -0.514365
rel_frequency * rel_letters_count                          0.005103
rel_frequency * rel_orthographic_density                   0.047775
rel_frequency * rel_synonyms_count                         0.739695
rel_letters_count * rel_orthographic_density               0.180622
rel_letters_count * rel_synonyms_count                     0.788149
rel_orthographic_density * rel_synonyms_count              0.617799
dtype: float64

Regressing rel frequency with 777 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3127415596166254

intercept                      2.787848
global_aoa                     0.073691
global_clustering              0.295497
global_frequency              -0.293398
global_letters_count           0.057076
global_orthographic_density    0.160385
global_synonyms_count         -0.131628
rel_aoa                        0.012240
rel_clustering                -0.196032
rel_frequency                  0.814698
rel_letters_count             -0.085430
rel_orthographic_density      -0.167891
rel_synonyms_count             0.174801
dtype: float64

Regressing rel frequency with 777 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.38093461673274787

intercept                                                -10.189475
global_aoa                                                -0.213709
global_clustering                                         -3.653567
global_frequency                                           0.628120
global_letters_count                                      -0.565299
global_orthographic_density                                2.665291
global_synonyms_count                                    -10.304478
rel_aoa                                                    1.701487
rel_clustering                                             3.469996
rel_frequency                                              1.026611
rel_letters_count                                          1.506033
rel_orthographic_density                                   3.436509
rel_synonyms_count                                        11.995644
global_aoa * global_clustering                             0.128579
global_aoa * global_frequency                              0.050064
global_aoa * global_letters_count                          0.097007
global_aoa * global_orthographic_density                   0.092979
global_aoa * global_synonyms_count                        -0.113952
global_aoa * rel_aoa                                      -0.029218
global_aoa * rel_clustering                               -0.063042
global_aoa * rel_frequency                                 0.024125
global_aoa * rel_letters_count                            -0.082677
global_aoa * rel_orthographic_density                     -0.159106
global_aoa * rel_synonyms_count                            0.082900
global_clustering * global_frequency                       0.222032
global_clustering * global_letters_count                  -0.005130
global_clustering * global_orthographic_density            0.648128
global_clustering * global_synonyms_count                 -0.747065
global_clustering * rel_aoa                               -0.166049
global_clustering * rel_clustering                         0.062424
global_clustering * rel_frequency                         -0.094594
global_clustering * rel_letters_count                      0.176928
global_clustering * rel_orthographic_density              -0.015870
global_clustering * rel_synonyms_count                     0.801349
global_frequency * global_letters_count                   -0.060307
global_frequency * global_orthographic_density             0.027623
global_frequency * global_synonyms_count                   0.165495
global_frequency * rel_aoa                                -0.201366
global_frequency * rel_clustering                         -0.023420
global_frequency * rel_frequency                          -0.032759
global_frequency * rel_letters_count                       0.064194
global_frequency * rel_orthographic_density               -0.203261
global_frequency * rel_synonyms_count                     -0.323525
global_letters_count * global_orthographic_density         0.112517
global_letters_count * global_synonyms_count               0.680023
global_letters_count * rel_aoa                            -0.087397
global_letters_count * rel_clustering                     -0.238629
global_letters_count * rel_frequency                      -0.065510
global_letters_count * rel_letters_count                  -0.007527
global_letters_count * rel_orthographic_density           -0.111991
global_letters_count * rel_synonyms_count                 -0.712413
global_orthographic_density * global_synonyms_count        0.492087
global_orthographic_density * rel_aoa                     -0.037539
global_orthographic_density * rel_clustering              -1.080736
global_orthographic_density * rel_frequency               -0.030664
global_orthographic_density * rel_letters_count           -0.155774
global_orthographic_density * rel_orthographic_density    -0.216213
global_orthographic_density * rel_synonyms_count          -0.059053
global_synonyms_count * rel_aoa                           -0.022557
global_synonyms_count * rel_clustering                     0.750182
global_synonyms_count * rel_frequency                     -0.451060
global_synonyms_count * rel_letters_count                 -0.743676
global_synonyms_count * rel_orthographic_density          -0.722171
global_synonyms_count * rel_synonyms_count                 0.034848
rel_aoa * rel_clustering                                   0.022574
rel_aoa * rel_frequency                                    0.054022
rel_aoa * rel_letters_count                                0.077798
rel_aoa * rel_orthographic_density                         0.064917
rel_aoa * rel_synonyms_count                               0.029166
rel_clustering * rel_frequency                            -0.119095
rel_clustering * rel_letters_count                        -0.030089
rel_clustering * rel_orthographic_density                  0.393193
rel_clustering * rel_synonyms_count                       -0.471948
rel_frequency * rel_letters_count                         -0.002229
rel_frequency * rel_orthographic_density                   0.023812
rel_frequency * rel_synonyms_count                         0.693240
rel_letters_count * rel_orthographic_density               0.140964
rel_letters_count * rel_synonyms_count                     0.749126
rel_orthographic_density * rel_synonyms_count              0.267455
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.1470987775178001

intercept                      3.630868
global_aoa                     0.344899
global_clustering             -0.166814
global_frequency              -0.073954
global_letters_count           0.134762
global_orthographic_density   -0.017720
global_synonyms_count         -0.085798
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.18215972117691392

intercept                                             -1.718055
global_aoa                                             1.479527
global_clustering                                     -1.927339
global_frequency                                      -0.475065
global_letters_count                                   0.116938
global_orthographic_density                           -2.503702
global_synonyms_count                                 -1.455291
global_aoa * global_clustering                         0.100925
global_aoa * global_frequency                         -0.047899
global_aoa * global_letters_count                     -0.019384
global_aoa * global_orthographic_density               0.003691
global_aoa * global_synonyms_count                    -0.039117
global_clustering * global_frequency                   0.040963
global_clustering * global_letters_count               0.153124
global_clustering * global_orthographic_density       -0.171550
global_clustering * global_synonyms_count             -0.095408
global_frequency * global_letters_count                0.117268
global_frequency * global_orthographic_density         0.168003
global_frequency * global_synonyms_count               0.074401
global_letters_count * global_orthographic_density    -0.028323
global_letters_count * global_synonyms_count           0.041536
global_orthographic_density * global_synonyms_count    0.102031
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.05955830343485202

intercept                     -0.349362
global_aoa                     0.195755
global_clustering             -0.069126
global_frequency              -0.127202
global_letters_count           0.061602
global_orthographic_density    0.134897
global_synonyms_count         -0.056623
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.10013167092344999

intercept                                             -2.320634
global_aoa                                             1.563185
global_clustering                                     -0.756093
global_frequency                                      -0.496366
global_letters_count                                  -0.422653
global_orthographic_density                           -2.249470
global_synonyms_count                                 -1.098154
global_aoa * global_clustering                         0.116312
global_aoa * global_frequency                         -0.054396
global_aoa * global_letters_count                     -0.037562
global_aoa * global_orthographic_density               0.004312
global_aoa * global_synonyms_count                    -0.003607
global_clustering * global_frequency                   0.014064
global_clustering * global_letters_count               0.018683
global_clustering * global_orthographic_density       -0.254163
global_clustering * global_synonyms_count             -0.158932
global_frequency * global_letters_count                0.106638
global_frequency * global_orthographic_density         0.137814
global_frequency * global_synonyms_count              -0.000265
global_letters_count * global_orthographic_density    -0.084610
global_letters_count * global_synonyms_count           0.001680
global_orthographic_density * global_synonyms_count    0.095428
dtype: float64

Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.049980066948837054

intercept                   6.746309
rel_aoa                     0.105682
rel_clustering              0.163663
rel_frequency               0.127495
rel_letters_count           0.050927
rel_orthographic_density   -0.371842
rel_synonyms_count         -0.162093
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.10294255424025778

intercept                                        6.567071
rel_aoa                                         -0.230740
rel_clustering                                   0.146387
rel_frequency                                    0.106897
rel_letters_count                                0.101561
rel_orthographic_density                        -0.235977
rel_synonyms_count                               0.119171
rel_aoa * rel_clustering                         0.104902
rel_aoa * rel_frequency                         -0.138645
rel_aoa * rel_letters_count                     -0.020977
rel_aoa * rel_orthographic_density               0.029672
rel_aoa * rel_synonyms_count                     0.038639
rel_clustering * rel_frequency                   0.143109
rel_clustering * rel_letters_count               0.114055
rel_clustering * rel_orthographic_density       -0.092899
rel_clustering * rel_synonyms_count              0.058458
rel_frequency * rel_letters_count                0.032463
rel_frequency * rel_orthographic_density         0.046036
rel_frequency * rel_synonyms_count               0.009763
rel_letters_count * rel_orthographic_density    -0.022040
rel_letters_count * rel_synonyms_count           0.084265
rel_orthographic_density * rel_synonyms_count    0.507364
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.20961184551414203

intercept                   0.449015
rel_aoa                     0.480186
rel_clustering             -0.071056
rel_frequency              -0.052183
rel_letters_count           0.042498
rel_orthographic_density    0.166541
rel_synonyms_count         -0.127305
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.23985553741102328

intercept                                        0.523504
rel_aoa                                          0.350374
rel_clustering                                  -0.057754
rel_frequency                                   -0.002756
rel_letters_count                                0.102585
rel_orthographic_density                         0.596380
rel_synonyms_count                              -0.031701
rel_aoa * rel_clustering                         0.083581
rel_aoa * rel_frequency                         -0.052938
rel_aoa * rel_letters_count                     -0.039685
rel_aoa * rel_orthographic_density              -0.042626
rel_aoa * rel_synonyms_count                    -0.035232
rel_clustering * rel_frequency                   0.080921
rel_clustering * rel_letters_count               0.064152
rel_clustering * rel_orthographic_density       -0.008198
rel_clustering * rel_synonyms_count             -0.140165
rel_frequency * rel_letters_count                0.031087
rel_frequency * rel_orthographic_density         0.149473
rel_frequency * rel_synonyms_count              -0.023836
rel_letters_count * rel_orthographic_density    -0.023566
rel_letters_count * rel_synonyms_count           0.026082
rel_orthographic_density * rel_synonyms_count    0.138538
dtype: float64

Regressing global aoa with 698 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.19099038833538196

intercept                      2.739472
global_aoa                     0.499090
global_clustering             -0.308706
global_frequency              -0.205043
global_letters_count           0.276539
global_orthographic_density    0.025727
global_synonyms_count          0.174104
rel_aoa                       -0.233394
rel_clustering                 0.169545
rel_frequency                  0.127750
rel_letters_count             -0.179196
rel_orthographic_density       0.011308
rel_synonyms_count            -0.341225
dtype: float64

Regressing global aoa with 698 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.3039737772018468

intercept                                                 31.899667
global_aoa                                                 2.062463
global_clustering                                          3.109623
global_frequency                                          -1.343913
global_letters_count                                      -2.521063
global_orthographic_density                              -12.898990
global_synonyms_count                                      3.930012
rel_aoa                                                   -1.187696
rel_clustering                                            -0.348744
rel_frequency                                              1.925842
rel_letters_count                                          1.383927
rel_orthographic_density                                   6.825453
rel_synonyms_count                                       -15.008734
global_aoa * global_clustering                             0.149582
global_aoa * global_frequency                             -0.061700
global_aoa * global_letters_count                         -0.048190
global_aoa * global_orthographic_density                   0.073191
global_aoa * global_synonyms_count                        -0.161380
global_aoa * rel_aoa                                       0.033595
global_aoa * rel_clustering                               -0.300650
global_aoa * rel_frequency                                 0.003979
global_aoa * rel_letters_count                             0.056584
global_aoa * rel_orthographic_density                     -0.063682
global_aoa * rel_synonyms_count                            0.230156
global_clustering * global_frequency                      -0.039354
global_clustering * global_letters_count                  -0.138767
global_clustering * global_orthographic_density           -1.637556
global_clustering * global_synonyms_count                 -0.632815
global_clustering * rel_aoa                               -0.031893
global_clustering * rel_clustering                         0.081284
global_clustering * rel_frequency                          0.231581
global_clustering * rel_letters_count                      0.051489
global_clustering * rel_orthographic_density               0.693084
global_clustering * rel_synonyms_count                    -0.050831
global_frequency * global_letters_count                    0.222653
global_frequency * global_orthographic_density             0.177139
global_frequency * global_synonyms_count                  -0.243161
global_frequency * rel_aoa                                 0.063972
global_frequency * rel_clustering                         -0.173375
global_frequency * rel_frequency                          -0.027214
global_frequency * rel_letters_count                      -0.159816
global_frequency * rel_orthographic_density               -0.156434
global_frequency * rel_synonyms_count                      0.812394
global_letters_count * global_orthographic_density         0.203215
global_letters_count * global_synonyms_count              -0.517152
global_letters_count * rel_aoa                            -0.014717
global_letters_count * rel_clustering                      0.463803
global_letters_count * rel_frequency                       0.068004
global_letters_count * rel_letters_count                   0.030008
global_letters_count * rel_orthographic_density           -0.172433
global_letters_count * rel_synonyms_count                  0.893118
global_orthographic_density * global_synonyms_count       -0.676073
global_orthographic_density * rel_aoa                     -0.062937
global_orthographic_density * rel_clustering               1.308241
global_orthographic_density * rel_frequency               -0.180199
global_orthographic_density * rel_letters_count           -0.226468
global_orthographic_density * rel_orthographic_density     0.192611
global_orthographic_density * rel_synonyms_count           0.022503
global_synonyms_count * rel_aoa                            0.180665
global_synonyms_count * rel_clustering                     0.675111
global_synonyms_count * rel_frequency                      0.170034
global_synonyms_count * rel_letters_count                  0.463315
global_synonyms_count * rel_orthographic_density           1.069641
global_synonyms_count * rel_synonyms_count                 0.148829
rel_aoa * rel_clustering                                   0.296477
rel_aoa * rel_frequency                                   -0.069883
rel_aoa * rel_letters_count                               -0.059411
rel_aoa * rel_orthographic_density                         0.041399
rel_aoa * rel_synonyms_count                              -0.237517
rel_clustering * rel_frequency                             0.010421
rel_clustering * rel_letters_count                        -0.181766
rel_clustering * rel_orthographic_density                 -0.322735
rel_clustering * rel_synonyms_count                       -0.145984
rel_frequency * rel_letters_count                         -0.045257
rel_frequency * rel_orthographic_density                   0.366801
rel_frequency * rel_synonyms_count                        -0.659699
rel_letters_count * rel_orthographic_density               0.267981
rel_letters_count * rel_synonyms_count                    -0.812911
rel_orthographic_density * rel_synonyms_count             -0.170797
dtype: float64

Regressing rel aoa with 698 measures, no interactions
           ^^^^^^^
R^2 = 0.23820441530983705

intercept                      0.933259
global_aoa                    -0.296412
global_clustering             -0.259712
global_frequency              -0.110140
global_letters_count           0.190965
global_orthographic_density   -0.024925
global_synonyms_count          0.324113
rel_aoa                        0.688252
rel_clustering                 0.206410
rel_frequency                  0.016079
rel_letters_count             -0.114808
rel_orthographic_density       0.076222
rel_synonyms_count            -0.462318
dtype: float64

Regressing rel aoa with 698 measures, with interactions
           ^^^^^^^
R^2 = 0.3432552979428021

intercept                                                 39.681420
global_aoa                                                -0.465130
global_clustering                                          3.719910
global_frequency                                          -1.948280
global_letters_count                                      -2.519902
global_orthographic_density                              -12.265451
global_synonyms_count                                      1.807438
rel_aoa                                                    0.150346
rel_clustering                                            -1.738761
rel_frequency                                              1.670255
rel_letters_count                                          2.475255
rel_orthographic_density                                   7.189322
rel_synonyms_count                                        -8.922138
global_aoa * global_clustering                            -0.025355
global_aoa * global_frequency                             -0.064931
global_aoa * global_letters_count                          0.034720
global_aoa * global_orthographic_density                   0.172524
global_aoa * global_synonyms_count                         0.151288
global_aoa * rel_aoa                                       0.018253
global_aoa * rel_clustering                               -0.023644
global_aoa * rel_frequency                                -0.026786
global_aoa * rel_letters_count                            -0.065410
global_aoa * rel_orthographic_density                     -0.187722
global_aoa * rel_synonyms_count                           -0.082276
global_clustering * global_frequency                      -0.121801
global_clustering * global_letters_count                  -0.030934
global_clustering * global_orthographic_density           -1.325905
global_clustering * global_synonyms_count                 -0.356203
global_clustering * rel_aoa                                0.003543
global_clustering * rel_clustering                         0.014962
global_clustering * rel_frequency                          0.185574
global_clustering * rel_letters_count                      0.031826
global_clustering * rel_orthographic_density               0.478850
global_clustering * rel_synonyms_count                    -0.171119
global_frequency * global_letters_count                    0.234735
global_frequency * global_orthographic_density             0.262295
global_frequency * global_synonyms_count                  -0.194767
global_frequency * rel_aoa                                 0.089967
global_frequency * rel_clustering                         -0.105698
global_frequency * rel_frequency                          -0.007722
global_frequency * rel_letters_count                      -0.201565
global_frequency * rel_orthographic_density               -0.310971
global_frequency * rel_synonyms_count                      0.570665
global_letters_count * global_orthographic_density         0.127466
global_letters_count * global_synonyms_count              -0.450046
global_letters_count * rel_aoa                            -0.054625
global_letters_count * rel_clustering                      0.284036
global_letters_count * rel_frequency                       0.030415
global_letters_count * rel_letters_count                   0.046805
global_letters_count * rel_orthographic_density           -0.023110
global_letters_count * rel_synonyms_count                  0.595291
global_orthographic_density * global_synonyms_count       -0.157885
global_orthographic_density * rel_aoa                     -0.117670
global_orthographic_density * rel_clustering               1.146278
global_orthographic_density * rel_frequency               -0.186467
global_orthographic_density * rel_letters_count           -0.197183
global_orthographic_density * rel_orthographic_density     0.261367
global_orthographic_density * rel_synonyms_count          -0.422619
global_synonyms_count * rel_aoa                            0.007108
global_synonyms_count * rel_clustering                     0.479538
global_synonyms_count * rel_frequency                      0.170758
global_synonyms_count * rel_letters_count                  0.441058
global_synonyms_count * rel_orthographic_density           0.601921
global_synonyms_count * rel_synonyms_count                 0.088839
rel_aoa * rel_clustering                                   0.171495
rel_aoa * rel_frequency                                   -0.052179
rel_aoa * rel_letters_count                                0.010448
rel_aoa * rel_orthographic_density                         0.091701
rel_aoa * rel_synonyms_count                              -0.076603
rel_clustering * rel_frequency                             0.056611
rel_clustering * rel_letters_count                        -0.113207
rel_clustering * rel_orthographic_density                 -0.247663
rel_clustering * rel_synonyms_count                       -0.198586
rel_frequency * rel_letters_count                          0.015534
rel_frequency * rel_orthographic_density                   0.390214
rel_frequency * rel_synonyms_count                        -0.477993
rel_letters_count * rel_orthographic_density               0.219159
rel_letters_count * rel_synonyms_count                    -0.593787
rel_orthographic_density * rel_synonyms_count              0.071740
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09613316283586348

intercept                     -3.759352
global_aoa                    -0.039522
global_clustering              0.257936
global_frequency              -0.049593
global_letters_count           0.030678
global_orthographic_density   -0.004634
global_synonyms_count         -0.027813
dtype: float64

Regressing global clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.14328025303004244

intercept                                             -1.224152
global_aoa                                             0.273709
global_clustering                                      0.774641
global_frequency                                      -0.511715
global_letters_count                                   0.049305
global_orthographic_density                           -0.088840
global_synonyms_count                                 -0.864908
global_aoa * global_clustering                         0.019850
global_aoa * global_frequency                         -0.006075
global_aoa * global_letters_count                     -0.013735
global_aoa * global_orthographic_density              -0.042306
global_aoa * global_synonyms_count                    -0.002176
global_clustering * global_frequency                  -0.065813
global_clustering * global_letters_count              -0.001427
global_clustering * global_orthographic_density       -0.020940
global_clustering * global_synonyms_count             -0.101406
global_frequency * global_letters_count                0.009564
global_frequency * global_orthographic_density         0.038590
global_frequency * global_synonyms_count               0.038592
global_letters_count * global_orthographic_density    -0.015967
global_letters_count * global_synonyms_count           0.001885
global_orthographic_density * global_synonyms_count   -0.082092
dtype: float64

Regressing rel clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.07149207912250999

intercept                      2.147469
global_aoa                    -0.041511
global_clustering              0.207156
global_frequency              -0.046639
global_letters_count           0.022808
global_orthographic_density   -0.024819
global_synonyms_count         -0.052401
dtype: float64

Regressing rel clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10319727334216355

intercept                                              5.444315
global_aoa                                             0.179325
global_clustering                                      0.810932
global_frequency                                      -0.499711
global_letters_count                                  -0.079829
global_orthographic_density                           -0.060726
global_synonyms_count                                 -0.369735
global_aoa * global_clustering                         0.012039
global_aoa * global_frequency                         -0.000018
global_aoa * global_letters_count                     -0.012143
global_aoa * global_orthographic_density              -0.050935
global_aoa * global_synonyms_count                    -0.027810
global_clustering * global_frequency                  -0.061218
global_clustering * global_letters_count              -0.016556
global_clustering * global_orthographic_density       -0.027446
global_clustering * global_synonyms_count             -0.024192
global_frequency * global_letters_count                0.009575
global_frequency * global_orthographic_density         0.025493
global_frequency * global_synonyms_count               0.022914
global_letters_count * global_orthographic_density     0.001136
global_letters_count * global_synonyms_count           0.034165
global_orthographic_density * global_synonyms_count   -0.039051
dtype: float64

Regressing global clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05284313108316685

intercept                  -5.899130
rel_aoa                    -0.018852
rel_clustering              0.226878
rel_frequency              -0.002888
rel_letters_count           0.018622
rel_orthographic_density    0.019320
rel_synonyms_count         -0.012020
dtype: float64

Regressing global clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07900026480363198

intercept                                       -5.827151
rel_aoa                                         -0.040002
rel_clustering                                   0.149719
rel_frequency                                    0.041906
rel_letters_count                               -0.001871
rel_orthographic_density                         0.071826
rel_synonyms_count                              -0.081324
rel_aoa * rel_clustering                         0.027668
rel_aoa * rel_frequency                         -0.012188
rel_aoa * rel_letters_count                     -0.023488
rel_aoa * rel_orthographic_density              -0.029626
rel_aoa * rel_synonyms_count                     0.009762
rel_clustering * rel_frequency                  -0.017689
rel_clustering * rel_letters_count              -0.014332
rel_clustering * rel_orthographic_density       -0.044323
rel_clustering * rel_synonyms_count             -0.030973
rel_frequency * rel_letters_count               -0.014614
rel_frequency * rel_orthographic_density        -0.003766
rel_frequency * rel_synonyms_count              -0.003929
rel_letters_count * rel_orthographic_density    -0.013685
rel_letters_count * rel_synonyms_count           0.033922
rel_orthographic_density * rel_synonyms_count    0.028491
dtype: float64

Regressing rel clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.16862868340199066

intercept                   0.162586
rel_aoa                    -0.035673
rel_clustering              0.394720
rel_frequency              -0.006836
rel_letters_count           0.019920
rel_orthographic_density    0.008773
rel_synonyms_count         -0.005288
dtype: float64

Regressing rel clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1896195553945469

intercept                                        0.220628
rel_aoa                                         -0.047781
rel_clustering                                   0.310123
rel_frequency                                    0.031417
rel_letters_count                                0.002891
rel_orthographic_density                         0.010290
rel_synonyms_count                              -0.107636
rel_aoa * rel_clustering                         0.011672
rel_aoa * rel_frequency                         -0.005170
rel_aoa * rel_letters_count                     -0.024957
rel_aoa * rel_orthographic_density              -0.051684
rel_aoa * rel_synonyms_count                    -0.005970
rel_clustering * rel_frequency                  -0.024255
rel_clustering * rel_letters_count              -0.005109
rel_clustering * rel_orthographic_density       -0.024128
rel_clustering * rel_synonyms_count             -0.031401
rel_frequency * rel_letters_count               -0.017798
rel_frequency * rel_orthographic_density        -0.018302
rel_frequency * rel_synonyms_count              -0.019264
rel_letters_count * rel_orthographic_density     0.000693
rel_letters_count * rel_synonyms_count           0.045851
rel_orthographic_density * rel_synonyms_count    0.051132
dtype: float64

Regressing global clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11397081452815849

intercept                     -2.491273
global_aoa                    -0.043411
global_clustering              0.322871
global_frequency              -0.092209
global_letters_count           0.021327
global_orthographic_density   -0.122063
global_synonyms_count         -0.065508
rel_aoa                        0.000965
rel_clustering                -0.066118
rel_frequency                  0.050669
rel_letters_count              0.007421
rel_orthographic_density       0.137002
rel_synonyms_count             0.027438
dtype: float64

Regressing global clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2583324169558432

intercept                                                 19.518180
global_aoa                                                 0.388167
global_clustering                                          5.007373
global_frequency                                          -1.619001
global_letters_count                                      -0.478579
global_orthographic_density                               -1.259332
global_synonyms_count                                      0.601646
rel_aoa                                                   -0.384203
rel_clustering                                            -4.968183
rel_frequency                                              0.387506
rel_letters_count                                          0.345096
rel_orthographic_density                                   0.107719
rel_synonyms_count                                        -2.565614
global_aoa * global_clustering                            -0.057915
global_aoa * global_frequency                             -0.012363
global_aoa * global_letters_count                         -0.076064
global_aoa * global_orthographic_density                  -0.170867
global_aoa * global_synonyms_count                         0.060306
global_aoa * rel_aoa                                       0.016177
global_aoa * rel_clustering                                0.092821
global_aoa * rel_frequency                                 0.015225
global_aoa * rel_letters_count                             0.071115
global_aoa * rel_orthographic_density                      0.135239
global_aoa * rel_synonyms_count                           -0.018764
global_clustering * global_frequency                      -0.289284
global_clustering * global_letters_count                  -0.194069
global_clustering * global_orthographic_density           -0.255390
global_clustering * global_synonyms_count                 -0.553824
global_clustering * rel_aoa                               -0.007266
global_clustering * rel_clustering                        -0.124003
global_clustering * rel_frequency                          0.126776
global_clustering * rel_letters_count                      0.182372
global_clustering * rel_orthographic_density               0.018298
global_clustering * rel_synonyms_count                     0.586742
global_frequency * global_letters_count                   -0.030853
global_frequency * global_orthographic_density             0.035830
global_frequency * global_synonyms_count                  -0.072314
global_frequency * rel_aoa                                -0.004155
global_frequency * rel_clustering                          0.205459
global_frequency * rel_frequency                           0.021591
global_frequency * rel_letters_count                       0.052449
global_frequency * rel_orthographic_density                0.008659
global_frequency * rel_synonyms_count                      0.234270
global_letters_count * global_orthographic_density         0.115742
global_letters_count * global_synonyms_count              -0.375444
global_letters_count * rel_aoa                             0.019219
global_letters_count * rel_clustering                      0.255540
global_letters_count * rel_frequency                       0.050964
global_letters_count * rel_letters_count                  -0.004275
global_letters_count * rel_orthographic_density           -0.194260
global_letters_count * rel_synonyms_count                  0.494744
global_orthographic_density * global_synonyms_count       -0.800360
global_orthographic_density * rel_aoa                      0.133843
global_orthographic_density * rel_clustering               0.219301
global_orthographic_density * rel_frequency               -0.036027
global_orthographic_density * rel_letters_count           -0.138478
global_orthographic_density * rel_orthographic_density    -0.029040
global_orthographic_density * rel_synonyms_count           0.618879
global_synonyms_count * rel_aoa                           -0.019215
global_synonyms_count * rel_clustering                     0.325227
global_synonyms_count * rel_frequency                      0.129466
global_synonyms_count * rel_letters_count                  0.152128
global_synonyms_count * rel_orthographic_density           0.503778
global_synonyms_count * rel_synonyms_count                -0.091325
rel_aoa * rel_clustering                                   0.023573
rel_aoa * rel_frequency                                    0.023181
rel_aoa * rel_letters_count                               -0.039207
rel_aoa * rel_orthographic_density                        -0.128128
rel_aoa * rel_synonyms_count                              -0.003463
rel_clustering * rel_frequency                            -0.113276
rel_clustering * rel_letters_count                        -0.238572
rel_clustering * rel_orthographic_density                  0.017467
rel_clustering * rel_synonyms_count                       -0.427006
rel_frequency * rel_letters_count                         -0.069384
rel_frequency * rel_orthographic_density                   0.029310
rel_frequency * rel_synonyms_count                        -0.265369
rel_letters_count * rel_orthographic_density               0.175364
rel_letters_count * rel_synonyms_count                    -0.242139
rel_orthographic_density * rel_synonyms_count             -0.266498
dtype: float64

Regressing rel clustering with 641 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2455147432151885

intercept                     -1.783752
global_aoa                    -0.040585
global_clustering             -0.528850
global_frequency              -0.075891
global_letters_count           0.013436
global_orthographic_density   -0.118494
global_synonyms_count         -0.082030
rel_aoa                        0.002731
rel_clustering                 0.844425
rel_frequency                  0.036690
rel_letters_count              0.006662
rel_orthographic_density       0.107118
rel_synonyms_count             0.057653
dtype: float64

Regressing rel clustering with 641 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.36855737758846707

intercept                                                 19.376562
global_aoa                                                 0.187626
global_clustering                                          3.385277
global_frequency                                          -1.553601
global_letters_count                                      -0.940865
global_orthographic_density                               -1.227043
global_synonyms_count                                      0.212666
rel_aoa                                                   -0.180354
rel_clustering                                            -3.572159
rel_frequency                                              0.430241
rel_letters_count                                          0.580840
rel_orthographic_density                                  -0.339617
rel_synonyms_count                                        -1.849963
global_aoa * global_clustering                            -0.042007
global_aoa * global_frequency                             -0.001793
global_aoa * global_letters_count                         -0.052308
global_aoa * global_orthographic_density                  -0.127230
global_aoa * global_synonyms_count                         0.040337
global_aoa * rel_aoa                                       0.014253
global_aoa * rel_clustering                                0.060289
global_aoa * rel_frequency                                 0.006665
global_aoa * rel_letters_count                             0.053528
global_aoa * rel_orthographic_density                      0.098967
global_aoa * rel_synonyms_count                           -0.020099
global_clustering * global_frequency                      -0.241773
global_clustering * global_letters_count                  -0.203524
global_clustering * global_orthographic_density           -0.157792
global_clustering * global_synonyms_count                 -0.449119
global_clustering * rel_aoa                               -0.015827
global_clustering * rel_clustering                        -0.135837
global_clustering * rel_frequency                          0.106043
global_clustering * rel_letters_count                      0.161352
global_clustering * rel_orthographic_density              -0.113439
global_clustering * rel_synonyms_count                     0.509222
global_frequency * global_letters_count                   -0.005656
global_frequency * global_orthographic_density             0.064455
global_frequency * global_synonyms_count                  -0.082600
global_frequency * rel_aoa                                -0.017053
global_frequency * rel_clustering                          0.173521
global_frequency * rel_frequency                           0.016899
global_frequency * rel_letters_count                       0.028502
global_frequency * rel_orthographic_density               -0.016562
global_frequency * rel_synonyms_count                      0.243530
global_letters_count * global_orthographic_density         0.094128
global_letters_count * global_synonyms_count              -0.266095
global_letters_count * rel_aoa                             0.004172
global_letters_count * rel_clustering                      0.271626
global_letters_count * rel_frequency                       0.033575
global_letters_count * rel_letters_count                  -0.005164
global_letters_count * rel_orthographic_density           -0.151568
global_letters_count * rel_synonyms_count                  0.369691
global_orthographic_density * global_synonyms_count       -0.464863
global_orthographic_density * rel_aoa                      0.109550
global_orthographic_density * rel_clustering               0.162008
global_orthographic_density * rel_frequency               -0.044821
global_orthographic_density * rel_letters_count           -0.127392
global_orthographic_density * rel_orthographic_density    -0.019236
global_orthographic_density * rel_synonyms_count           0.287364
global_synonyms_count * rel_aoa                           -0.012404
global_synonyms_count * rel_clustering                     0.314114
global_synonyms_count * rel_frequency                      0.133750
global_synonyms_count * rel_letters_count                  0.073580
global_synonyms_count * rel_orthographic_density           0.156343
global_synonyms_count * rel_synonyms_count                -0.062138
rel_aoa * rel_clustering                                   0.031327
rel_aoa * rel_frequency                                    0.026628
rel_aoa * rel_letters_count                               -0.030472
rel_aoa * rel_orthographic_density                        -0.111008
rel_aoa * rel_synonyms_count                               0.001662
rel_clustering * rel_frequency                            -0.101451
rel_clustering * rel_letters_count                        -0.216025
rel_clustering * rel_orthographic_density                  0.091381
rel_clustering * rel_synonyms_count                       -0.414773
rel_frequency * rel_letters_count                         -0.053228
rel_frequency * rel_orthographic_density                   0.028286
rel_frequency * rel_synonyms_count                        -0.273668
rel_letters_count * rel_orthographic_density               0.145711
rel_letters_count * rel_synonyms_count                    -0.154280
rel_orthographic_density * rel_synonyms_count              0.045384
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1194994189258407

intercept                      3.726597
global_aoa                    -0.042913
global_clustering             -0.179683
global_frequency              -0.037378
global_letters_count           0.368593
global_orthographic_density   -0.070728
global_synonyms_count         -0.280485
dtype: float64

Regressing global letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13555541794810388

intercept                                             -3.188099
global_aoa                                             0.187365
global_clustering                                     -2.822317
global_frequency                                      -0.150714
global_letters_count                                   0.191463
global_orthographic_density                           -1.061239
global_synonyms_count                                  1.357420
global_aoa * global_clustering                         0.082758
global_aoa * global_frequency                          0.029703
global_aoa * global_letters_count                     -0.005017
global_aoa * global_orthographic_density               0.001313
global_aoa * global_synonyms_count                     0.019549
global_clustering * global_frequency                   0.150264
global_clustering * global_letters_count               0.073000
global_clustering * global_orthographic_density        0.101395
global_clustering * global_synonyms_count              0.326989
global_frequency * global_letters_count                0.077009
global_frequency * global_orthographic_density         0.202720
global_frequency * global_synonyms_count               0.067327
global_letters_count * global_orthographic_density    -0.051741
global_letters_count * global_synonyms_count          -0.068796
global_orthographic_density * global_synonyms_count   -0.025267
dtype: float64

Regressing rel letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07066843952581969

intercept                      0.889717
global_aoa                    -0.089415
global_clustering             -0.164608
global_frequency              -0.067461
global_letters_count           0.284708
global_orthographic_density   -0.040059
global_synonyms_count         -0.326560
dtype: float64

Regressing rel letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09303924663984098

intercept                                             -7.744489
global_aoa                                             0.435879
global_clustering                                     -2.912653
global_frequency                                      -0.000740
global_letters_count                                   0.042853
global_orthographic_density                           -1.171830
global_synonyms_count                                  0.742336
global_aoa * global_clustering                         0.126243
global_aoa * global_frequency                          0.036807
global_aoa * global_letters_count                     -0.022440
global_aoa * global_orthographic_density              -0.019250
global_aoa * global_synonyms_count                     0.053067
global_clustering * global_frequency                   0.180294
global_clustering * global_letters_count               0.021450
global_clustering * global_orthographic_density        0.024280
global_clustering * global_synonyms_count              0.158378
global_frequency * global_letters_count                0.069955
global_frequency * global_orthographic_density         0.195941
global_frequency * global_synonyms_count               0.042668
global_letters_count * global_orthographic_density    -0.069464
global_letters_count * global_synonyms_count          -0.123554
global_orthographic_density * global_synonyms_count   -0.075618
dtype: float64

Regressing global letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08996737663631926

intercept                   5.660893
rel_aoa                    -0.110494
rel_clustering              0.033018
rel_frequency               0.086606
rel_letters_count           0.281408
rel_orthographic_density   -0.219768
rel_synonyms_count         -0.309697
dtype: float64

Regressing global letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10388686084238374

intercept                                        5.583312
rel_aoa                                         -0.238057
rel_clustering                                   0.013846
rel_frequency                                    0.088005
rel_letters_count                                0.402625
rel_orthographic_density                        -0.114521
rel_synonyms_count                              -0.203700
rel_aoa * rel_clustering                         0.019121
rel_aoa * rel_frequency                         -0.038783
rel_aoa * rel_letters_count                      0.004169
rel_aoa * rel_orthographic_density              -0.007634
rel_aoa * rel_synonyms_count                     0.047357
rel_clustering * rel_frequency                  -0.001332
rel_clustering * rel_letters_count               0.044927
rel_clustering * rel_orthographic_density        0.121818
rel_clustering * rel_synonyms_count              0.396440
rel_frequency * rel_letters_count                0.044798
rel_frequency * rel_orthographic_density         0.075976
rel_frequency * rel_synonyms_count               0.015171
rel_letters_count * rel_orthographic_density     0.018710
rel_letters_count * rel_synonyms_count           0.026285
rel_orthographic_density * rel_synonyms_count    0.327446
dtype: float64

Regressing rel letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16026597715458613

intercept                   1.137927
rel_aoa                    -0.079964
rel_clustering             -0.113306
rel_frequency              -0.122179
rel_letters_count           0.436786
rel_orthographic_density    0.052490
rel_synonyms_count         -0.332178
dtype: float64

Regressing rel letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17551091264562402

intercept                                        1.029290
rel_aoa                                         -0.150171
rel_clustering                                   0.014074
rel_frequency                                   -0.148134
rel_letters_count                                0.630662
rel_orthographic_density                         0.250595
rel_synonyms_count                              -0.221084
rel_aoa * rel_clustering                         0.064538
rel_aoa * rel_frequency                         -0.001553
rel_aoa * rel_letters_count                     -0.023110
rel_aoa * rel_orthographic_density              -0.081544
rel_aoa * rel_synonyms_count                     0.070363
rel_clustering * rel_frequency                   0.060417
rel_clustering * rel_letters_count               0.040032
rel_clustering * rel_orthographic_density        0.126536
rel_clustering * rel_synonyms_count              0.243830
rel_frequency * rel_letters_count                0.051636
rel_frequency * rel_orthographic_density         0.120020
rel_frequency * rel_synonyms_count               0.010119
rel_letters_count * rel_orthographic_density     0.050421
rel_letters_count * rel_synonyms_count          -0.009691
rel_orthographic_density * rel_synonyms_count    0.187275
dtype: float64

Regressing global letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13243208828082775

intercept                      1.633452
global_aoa                     0.037736
global_clustering             -0.483868
global_frequency              -0.091681
global_letters_count           0.416518
global_orthographic_density   -0.023909
global_synonyms_count          0.052793
rel_aoa                       -0.122794
rel_clustering                 0.348173
rel_frequency                  0.048174
rel_letters_count             -0.064928
rel_orthographic_density      -0.034697
rel_synonyms_count            -0.381244
dtype: float64

Regressing global letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.22296678591665697

intercept                                                -19.685278
global_aoa                                                 2.822687
global_clustering                                         -1.294533
global_frequency                                           1.828535
global_letters_count                                      -0.766842
global_orthographic_density                                0.498901
global_synonyms_count                                      9.758727
rel_aoa                                                   -5.071393
rel_clustering                                            -4.355590
rel_frequency                                             -0.351191
rel_letters_count                                          1.199971
rel_orthographic_density                                  -5.398272
rel_synonyms_count                                       -10.759563
global_aoa * global_clustering                             0.293908
global_aoa * global_frequency                             -0.041361
global_aoa * global_letters_count                         -0.143460
global_aoa * global_orthographic_density                   0.010890
global_aoa * global_synonyms_count                         0.087016
global_aoa * rel_aoa                                       0.038460
global_aoa * rel_clustering                               -0.272491
global_aoa * rel_frequency                                -0.002591
global_aoa * rel_letters_count                             0.111847
global_aoa * rel_orthographic_density                      0.076171
global_aoa * rel_synonyms_count                           -0.163058
global_clustering * global_frequency                       0.220821
global_clustering * global_letters_count                  -0.517383
global_clustering * global_orthographic_density           -0.355697
global_clustering * global_synonyms_count                 -0.508969
global_clustering * rel_aoa                               -0.180900
global_clustering * rel_clustering                        -0.077535
global_clustering * rel_frequency                          0.123855
global_clustering * rel_letters_count                      0.360079
global_clustering * rel_orthographic_density              -0.200851
global_clustering * rel_synonyms_count                     0.199187
global_frequency * global_letters_count                    0.014313
global_frequency * global_orthographic_density            -0.059215
global_frequency * global_synonyms_count                  -0.574517
global_frequency * rel_aoa                                 0.224483
global_frequency * rel_clustering                          0.026425
global_frequency * rel_frequency                          -0.014202
global_frequency * rel_letters_count                      -0.062596
global_frequency * rel_orthographic_density                0.229646
global_frequency * rel_synonyms_count                      0.681163
global_letters_count * global_orthographic_density        -0.364534
global_letters_count * global_synonyms_count              -1.112159
global_letters_count * rel_aoa                             0.206947
global_letters_count * rel_clustering                      0.921956
global_letters_count * rel_frequency                       0.197401
global_letters_count * rel_letters_count                   0.006812
global_letters_count * rel_orthographic_density            0.146493
global_letters_count * rel_synonyms_count                  0.944759
global_orthographic_density * global_synonyms_count       -0.908149
global_orthographic_density * rel_aoa                      0.095049
global_orthographic_density * rel_clustering               0.619577
global_orthographic_density * rel_frequency                0.127079
global_orthographic_density * rel_letters_count            0.268022
global_orthographic_density * rel_orthographic_density     0.124745
global_orthographic_density * rel_synonyms_count           0.357023
global_synonyms_count * rel_aoa                            0.324283
global_synonyms_count * rel_clustering                     0.884466
global_synonyms_count * rel_frequency                      0.608813
global_synonyms_count * rel_letters_count                  1.027495
global_synonyms_count * rel_orthographic_density           1.396777
global_synonyms_count * rel_synonyms_count                 0.133672
rel_aoa * rel_clustering                                   0.281936
rel_aoa * rel_frequency                                   -0.153354
rel_aoa * rel_letters_count                               -0.202918
rel_aoa * rel_orthographic_density                        -0.126647
rel_aoa * rel_synonyms_count                              -0.151840
rel_clustering * rel_frequency                            -0.272991
rel_clustering * rel_letters_count                        -0.652694
rel_clustering * rel_orthographic_density                  0.186064
rel_clustering * rel_synonyms_count                       -0.234264
rel_frequency * rel_letters_count                         -0.074864
rel_frequency * rel_orthographic_density                  -0.054614
rel_frequency * rel_synonyms_count                        -0.645040
rel_letters_count * rel_orthographic_density              -0.065468
rel_letters_count * rel_synonyms_count                    -0.907792
rel_orthographic_density * rel_synonyms_count             -0.803696
dtype: float64

Regressing rel letters_count with 777 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.19308491305369258

intercept                      0.204074
global_aoa                     0.012530
global_clustering             -0.463718
global_frequency              -0.018208
global_letters_count          -0.392701
global_orthographic_density   -0.021398
global_synonyms_count          0.093490
rel_aoa                       -0.084676
rel_clustering                 0.347465
rel_frequency                 -0.031658
rel_letters_count              0.758317
rel_orthographic_density      -0.062824
rel_synonyms_count            -0.407819
dtype: float64

Regressing rel letters_count with 777 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.2755355492552616

intercept                                                -18.652384
global_aoa                                                 2.145122
global_clustering                                         -2.885357
global_frequency                                           1.104170
global_letters_count                                      -1.577175
global_orthographic_density                               -0.788624
global_synonyms_count                                      7.797718
rel_aoa                                                   -4.066294
rel_clustering                                            -3.136101
rel_frequency                                             -0.336552
rel_letters_count                                          1.714333
rel_orthographic_density                                  -5.235854
rel_synonyms_count                                        -9.235517
global_aoa * global_clustering                             0.271775
global_aoa * global_frequency                             -0.001921
global_aoa * global_letters_count                         -0.109355
global_aoa * global_orthographic_density                  -0.014262
global_aoa * global_synonyms_count                         0.036381
global_aoa * rel_aoa                                       0.039033
global_aoa * rel_clustering                               -0.237433
global_aoa * rel_frequency                                -0.030970
global_aoa * rel_letters_count                             0.082062
global_aoa * rel_orthographic_density                      0.085044
global_aoa * rel_synonyms_count                           -0.119775
global_clustering * global_frequency                       0.253063
global_clustering * global_letters_count                  -0.336712
global_clustering * global_orthographic_density           -0.146372
global_clustering * global_synonyms_count                 -0.383878
global_clustering * rel_aoa                               -0.140410
global_clustering * rel_clustering                        -0.109775
global_clustering * rel_frequency                          0.031092
global_clustering * rel_letters_count                      0.119150
global_clustering * rel_orthographic_density              -0.540492
global_clustering * rel_synonyms_count                    -0.124939
global_frequency * global_letters_count                    0.078350
global_frequency * global_orthographic_density             0.148759
global_frequency * global_synonyms_count                  -0.347011
global_frequency * rel_aoa                                 0.174335
global_frequency * rel_clustering                         -0.031041
global_frequency * rel_frequency                          -0.014754
global_frequency * rel_letters_count                      -0.125615
global_frequency * rel_orthographic_density                0.062357
global_frequency * rel_synonyms_count                      0.380636
global_letters_count * global_orthographic_density        -0.267229
global_letters_count * global_synonyms_count              -0.941131
global_letters_count * rel_aoa                             0.165378
global_letters_count * rel_clustering                      0.781370
global_letters_count * rel_frequency                       0.165468
global_letters_count * rel_letters_count                  -0.012167
global_letters_count * rel_orthographic_density            0.061866
global_letters_count * rel_synonyms_count                  0.761801
global_orthographic_density * global_synonyms_count       -0.949178
global_orthographic_density * rel_aoa                      0.114194
global_orthographic_density * rel_clustering               0.503498
global_orthographic_density * rel_frequency               -0.020983
global_orthographic_density * rel_letters_count            0.197841
global_orthographic_density * rel_orthographic_density     0.097051
global_orthographic_density * rel_synonyms_count           0.502315
global_synonyms_count * rel_aoa                            0.342002
global_synonyms_count * rel_clustering                     0.658513
global_synonyms_count * rel_frequency                      0.464374
global_synonyms_count * rel_letters_count                  0.876405
global_synonyms_count * rel_orthographic_density           1.354996
global_synonyms_count * rel_synonyms_count                 0.101606
rel_aoa * rel_clustering                                   0.217244
rel_aoa * rel_frequency                                   -0.108144
rel_aoa * rel_letters_count                               -0.163181
rel_aoa * rel_orthographic_density                        -0.112630
rel_aoa * rel_synonyms_count                              -0.127819
rel_clustering * rel_frequency                            -0.193603
rel_clustering * rel_letters_count                        -0.468965
rel_clustering * rel_orthographic_density                  0.398206
rel_clustering * rel_synonyms_count                        0.125382
rel_frequency * rel_letters_count                         -0.044261
rel_frequency * rel_orthographic_density                   0.056108
rel_frequency * rel_synonyms_count                        -0.433497
rel_letters_count * rel_orthographic_density              -0.034899
rel_letters_count * rel_synonyms_count                    -0.761044
rel_orthographic_density * rel_synonyms_count             -0.826483
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0706778714962959

intercept                      0.893958
global_aoa                    -0.011064
global_clustering              0.049462
global_frequency              -0.017008
global_letters_count          -0.014197
global_orthographic_density    0.015365
global_synonyms_count          0.234687
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09557746410777046

intercept                                              2.303006
global_aoa                                             0.036525
global_clustering                                      0.397732
global_frequency                                       0.012588
global_letters_count                                  -0.141331
global_orthographic_density                           -0.606815
global_synonyms_count                                  0.426724
global_aoa * global_clustering                        -0.001531
global_aoa * global_frequency                         -0.010252
global_aoa * global_letters_count                     -0.000056
global_aoa * global_orthographic_density               0.026419
global_aoa * global_synonyms_count                     0.020608
global_clustering * global_frequency                  -0.010652
global_clustering * global_letters_count              -0.023272
global_clustering * global_orthographic_density       -0.098006
global_clustering * global_synonyms_count              0.077435
global_frequency * global_letters_count                0.000384
global_frequency * global_orthographic_density        -0.004978
global_frequency * global_synonyms_count              -0.009882
global_letters_count * global_orthographic_density    -0.020096
global_letters_count * global_synonyms_count           0.018774
global_orthographic_density * global_synonyms_count    0.057387
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.04475268953369993

intercept                      0.545039
global_aoa                    -0.013068
global_clustering              0.041461
global_frequency              -0.011040
global_letters_count          -0.013398
global_orthographic_density   -0.003571
global_synonyms_count          0.179029
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.07940268191623268

intercept                                              3.099295
global_aoa                                             0.073464
global_clustering                                      0.563138
global_frequency                                      -0.128638
global_letters_count                                  -0.195901
global_orthographic_density                           -0.664540
global_synonyms_count                                  0.517755
global_aoa * global_clustering                        -0.001868
global_aoa * global_frequency                         -0.008269
global_aoa * global_letters_count                     -0.006721
global_aoa * global_orthographic_density               0.014541
global_aoa * global_synonyms_count                     0.014284
global_clustering * global_frequency                  -0.024782
global_clustering * global_letters_count              -0.033306
global_clustering * global_orthographic_density       -0.093753
global_clustering * global_synonyms_count              0.091844
global_frequency * global_letters_count                0.005782
global_frequency * global_orthographic_density         0.011580
global_frequency * global_synonyms_count              -0.014904
global_letters_count * global_orthographic_density    -0.019571
global_letters_count * global_synonyms_count           0.022192
global_orthographic_density * global_synonyms_count    0.063151
dtype: float64

Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.05793465756174043

intercept                   0.400066
rel_aoa                     0.017817
rel_clustering             -0.004126
rel_frequency              -0.016181
rel_letters_count          -0.015116
rel_orthographic_density    0.037062
rel_synonyms_count          0.228632
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09232648887878925

intercept                                        0.461848
rel_aoa                                          0.019849
rel_clustering                                  -0.057714
rel_frequency                                   -0.007334
rel_letters_count                               -0.076341
rel_orthographic_density                         0.080584
rel_synonyms_count                               0.135500
rel_aoa * rel_clustering                        -0.000877
rel_aoa * rel_frequency                         -0.001523
rel_aoa * rel_letters_count                      0.018333
rel_aoa * rel_orthographic_density               0.043638
rel_aoa * rel_synonyms_count                     0.023898
rel_clustering * rel_frequency                  -0.000023
rel_clustering * rel_letters_count              -0.013230
rel_clustering * rel_orthographic_density       -0.083993
rel_clustering * rel_synonyms_count              0.099333
rel_frequency * rel_letters_count               -0.003220
rel_frequency * rel_orthographic_density        -0.001238
rel_frequency * rel_synonyms_count              -0.005086
rel_letters_count * rel_orthographic_density    -0.030732
rel_letters_count * rel_synonyms_count           0.000248
rel_orthographic_density * rel_synonyms_count   -0.009441
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.12762247678578864

intercept                   0.080048
rel_aoa                     0.002784
rel_clustering              0.027823
rel_frequency              -0.009039
rel_letters_count          -0.017785
rel_orthographic_density   -0.000807
rel_synonyms_count          0.353900
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.16062980272807137

intercept                                        0.159017
rel_aoa                                          0.011960
rel_clustering                                  -0.053319
rel_frequency                                    0.014914
rel_letters_count                               -0.076285
rel_orthographic_density                         0.055773
rel_synonyms_count                               0.359333
rel_aoa * rel_clustering                         0.005248
rel_aoa * rel_frequency                          0.000422
rel_aoa * rel_letters_count                      0.010840
rel_aoa * rel_orthographic_density               0.031306
rel_aoa * rel_synonyms_count                     0.015061
rel_clustering * rel_frequency                  -0.016359
rel_clustering * rel_letters_count              -0.020688
rel_clustering * rel_orthographic_density       -0.080365
rel_clustering * rel_synonyms_count              0.108688
rel_frequency * rel_letters_count               -0.006895
rel_frequency * rel_orthographic_density         0.001153
rel_frequency * rel_synonyms_count               0.012821
rel_letters_count * rel_orthographic_density    -0.029632
rel_letters_count * rel_synonyms_count           0.006598
rel_orthographic_density * rel_synonyms_count    0.048574
dtype: float64

Regressing global synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08213318342680065

intercept                      1.654289
global_aoa                    -0.041470
global_clustering              0.145046
global_frequency              -0.014759
global_letters_count           0.000830
global_orthographic_density   -0.008643
global_synonyms_count          0.183593
rel_aoa                        0.042151
rel_clustering                -0.108634
rel_frequency                  0.000601
rel_letters_count             -0.014148
rel_orthographic_density       0.023673
rel_synonyms_count             0.050340
dtype: float64

Regressing global synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18792030959014017

intercept                                                 0.800453
global_aoa                                                0.300458
global_clustering                                         1.340753
global_frequency                                          0.039332
global_letters_count                                      0.624220
global_orthographic_density                               0.699638
global_synonyms_count                                     3.926453
rel_aoa                                                   0.209546
rel_clustering                                           -0.892355
rel_frequency                                            -0.366583
rel_letters_count                                        -0.421212
rel_orthographic_density                                  0.487906
rel_synonyms_count                                       -4.913800
global_aoa * global_clustering                           -0.041208
global_aoa * global_frequency                            -0.021345
global_aoa * global_letters_count                        -0.047089
global_aoa * global_orthographic_density                 -0.070338
global_aoa * global_synonyms_count                        0.063665
global_aoa * rel_aoa                                      0.000409
global_aoa * rel_clustering                               0.062916
global_aoa * rel_frequency                                0.018641
global_aoa * rel_letters_count                            0.031046
global_aoa * rel_orthographic_density                     0.065910
global_aoa * rel_synonyms_count                           0.003538
global_clustering * global_frequency                     -0.070358
global_clustering * global_letters_count                 -0.035681
global_clustering * global_orthographic_density          -0.093705
global_clustering * global_synonyms_count                 0.470369
global_clustering * rel_aoa                               0.011658
global_clustering * rel_clustering                        0.024535
global_clustering * rel_frequency                         0.032447
global_clustering * rel_letters_count                     0.068774
global_clustering * rel_orthographic_density              0.233505
global_clustering * rel_synonyms_count                   -0.453163
global_frequency * global_letters_count                  -0.045545
global_frequency * global_orthographic_density           -0.070620
global_frequency * global_synonyms_count                  0.015557
global_frequency * rel_aoa                               -0.006478
global_frequency * rel_clustering                         0.082446
global_frequency * rel_frequency                          0.000733
global_frequency * rel_letters_count                      0.036076
global_frequency * rel_orthographic_density               0.053977
global_frequency * rel_synonyms_count                     0.035202
global_letters_count * global_orthographic_density        0.019791
global_letters_count * global_synonyms_count             -0.155260
global_letters_count * rel_aoa                           -0.005638
global_letters_count * rel_clustering                    -0.021902
global_letters_count * rel_frequency                      0.045913
global_letters_count * rel_letters_count                  0.005722
global_letters_count * rel_orthographic_density          -0.024767
global_letters_count * rel_synonyms_count                 0.207637
global_orthographic_density * global_synonyms_count      -0.253153
global_orthographic_density * rel_aoa                     0.002489
global_orthographic_density * rel_clustering             -0.082386
global_orthographic_density * rel_frequency               0.101870
global_orthographic_density * rel_letters_count           0.050461
global_orthographic_density * rel_orthographic_density    0.023601
global_orthographic_density * rel_synonyms_count          0.282692
global_synonyms_count * rel_aoa                          -0.092571
global_synonyms_count * rel_clustering                   -0.299701
global_synonyms_count * rel_frequency                     0.106213
global_synonyms_count * rel_letters_count                 0.130207
global_synonyms_count * rel_orthographic_density          0.126966
global_synonyms_count * rel_synonyms_count                0.173995
rel_aoa * rel_clustering                                 -0.037864
rel_aoa * rel_frequency                                   0.004055
rel_aoa * rel_letters_count                               0.029016
rel_aoa * rel_orthographic_density                        0.030767
rel_aoa * rel_synonyms_count                              0.041661
rel_clustering * rel_frequency                           -0.036640
rel_clustering * rel_letters_count                       -0.049922
rel_clustering * rel_orthographic_density                -0.166689
rel_clustering * rel_synonyms_count                       0.310847
rel_frequency * rel_letters_count                        -0.035621
rel_frequency * rel_orthographic_density                 -0.083469
rel_frequency * rel_synonyms_count                       -0.149956
rel_letters_count * rel_orthographic_density             -0.051733
rel_letters_count * rel_synonyms_count                   -0.141567
rel_orthographic_density * rel_synonyms_count            -0.096754
dtype: float64

Regressing rel synonyms_count with 752 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.20072837197711857

intercept                      1.172306
global_aoa                    -0.035094
global_clustering              0.108087
global_frequency              -0.005279
global_letters_count           0.005296
global_orthographic_density   -0.002662
global_synonyms_count         -0.582077
rel_aoa                        0.030915
rel_clustering                -0.079823
rel_frequency                 -0.003437
rel_letters_count             -0.018493
rel_orthographic_density      -0.002027
rel_synonyms_count             0.898665
dtype: float64

Regressing rel synonyms_count with 752 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.3159527026496939

intercept                                                -3.884631
global_aoa                                                0.384906
global_clustering                                         0.635473
global_frequency                                          0.288405
global_letters_count                                      0.721027
global_orthographic_density                               0.969368
global_synonyms_count                                     4.272558
rel_aoa                                                   0.297351
rel_clustering                                           -0.534745
rel_frequency                                            -0.618750
rel_letters_count                                        -0.633542
rel_orthographic_density                                  0.132518
rel_synonyms_count                                       -5.155658
global_aoa * global_clustering                           -0.032106
global_aoa * global_frequency                            -0.021065
global_aoa * global_letters_count                        -0.049467
global_aoa * global_orthographic_density                 -0.080191
global_aoa * global_synonyms_count                        0.048150
global_aoa * rel_aoa                                      0.001165
global_aoa * rel_clustering                               0.059643
global_aoa * rel_frequency                                0.017965
global_aoa * rel_letters_count                            0.034252
global_aoa * rel_orthographic_density                     0.071007
global_aoa * rel_synonyms_count                           0.017575
global_clustering * global_frequency                     -0.028505
global_clustering * global_letters_count                 -0.021481
global_clustering * global_orthographic_density          -0.060701
global_clustering * global_synonyms_count                 0.511040
global_clustering * rel_aoa                               0.013276
global_clustering * rel_clustering                        0.022483
global_clustering * rel_frequency                        -0.010996
global_clustering * rel_letters_count                     0.051083
global_clustering * rel_orthographic_density              0.210424
global_clustering * rel_synonyms_count                   -0.486044
global_frequency * global_letters_count                  -0.041568
global_frequency * global_orthographic_density           -0.065364
global_frequency * global_synonyms_count                 -0.067247
global_frequency * rel_aoa                               -0.014253
global_frequency * rel_clustering                         0.044389
global_frequency * rel_frequency                         -0.000553
global_frequency * rel_letters_count                      0.042038
global_frequency * rel_orthographic_density               0.061513
global_frequency * rel_synonyms_count                     0.130113
global_letters_count * global_orthographic_density        0.001844
global_letters_count * global_synonyms_count             -0.134726
global_letters_count * rel_aoa                           -0.008633
global_letters_count * rel_clustering                    -0.015195
global_letters_count * rel_frequency                      0.043639
global_letters_count * rel_letters_count                  0.005203
global_letters_count * rel_orthographic_density          -0.007983
global_letters_count * rel_synonyms_count                 0.184542
global_orthographic_density * global_synonyms_count      -0.210988
global_orthographic_density * rel_aoa                     0.002682
global_orthographic_density * rel_clustering             -0.042807
global_orthographic_density * rel_frequency               0.101075
global_orthographic_density * rel_letters_count           0.061736
global_orthographic_density * rel_orthographic_density    0.020673
global_orthographic_density * rel_synonyms_count          0.216131
global_synonyms_count * rel_aoa                          -0.069524
global_synonyms_count * rel_clustering                   -0.328382
global_synonyms_count * rel_frequency                     0.187558
global_synonyms_count * rel_letters_count                 0.142860
global_synonyms_count * rel_orthographic_density          0.167355
global_synonyms_count * rel_synonyms_count                0.185197
rel_aoa * rel_clustering                                 -0.042149
rel_aoa * rel_frequency                                   0.009653
rel_aoa * rel_letters_count                               0.025561
rel_aoa * rel_orthographic_density                        0.022912
rel_aoa * rel_synonyms_count                              0.015597
rel_clustering * rel_frequency                           -0.005183
rel_clustering * rel_letters_count                       -0.057713
rel_clustering * rel_orthographic_density                -0.197658
rel_clustering * rel_synonyms_count                       0.318436
rel_frequency * rel_letters_count                        -0.043198
rel_frequency * rel_orthographic_density                 -0.090461
rel_frequency * rel_synonyms_count                       -0.234723
rel_letters_count * rel_orthographic_density             -0.058647
rel_letters_count * rel_synonyms_count                   -0.153563
rel_orthographic_density * rel_synonyms_count            -0.112555
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12650398597811408

intercept                      1.714090
global_aoa                    -0.029781
global_clustering              0.053023
global_frequency               0.004443
global_letters_count          -0.046812
global_orthographic_density    0.243148
global_synonyms_count          0.041705
dtype: float64

Regressing global orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1609930030961677

intercept                                              0.327738
global_aoa                                            -0.076030
global_clustering                                      0.649090
global_frequency                                       0.554511
global_letters_count                                   0.004734
global_orthographic_density                            1.218028
global_synonyms_count                                  0.242224
global_aoa * global_clustering                        -0.012204
global_aoa * global_frequency                         -0.019741
global_aoa * global_letters_count                      0.014687
global_aoa * global_orthographic_density               0.049852
global_aoa * global_synonyms_count                    -0.012858
global_clustering * global_frequency                  -0.016386
global_clustering * global_letters_count              -0.056772
global_clustering * global_orthographic_density        0.009320
global_clustering * global_synonyms_count             -0.068251
global_frequency * global_letters_count               -0.052256
global_frequency * global_orthographic_density        -0.131973
global_frequency * global_synonyms_count              -0.039375
global_letters_count * global_orthographic_density    -0.007273
global_letters_count * global_synonyms_count          -0.014993
global_orthographic_density * global_synonyms_count   -0.063112
dtype: float64

Regressing rel orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09700072394975146

intercept                     -0.476194
global_aoa                    -0.013304
global_clustering              0.043035
global_frequency              -0.001953
global_letters_count          -0.057787
global_orthographic_density    0.188442
global_synonyms_count          0.076328
dtype: float64

Regressing rel orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1321104864572401

intercept                                              0.828519
global_aoa                                            -0.166744
global_clustering                                      0.824468
global_frequency                                       0.326983
global_letters_count                                  -0.198558
global_orthographic_density                            0.886705
global_synonyms_count                                 -0.135829
global_aoa * global_clustering                        -0.014294
global_aoa * global_frequency                         -0.016521
global_aoa * global_letters_count                      0.025089
global_aoa * global_orthographic_density               0.058536
global_aoa * global_synonyms_count                    -0.021319
global_clustering * global_frequency                  -0.035969
global_clustering * global_letters_count              -0.058111
global_clustering * global_orthographic_density        0.022317
global_clustering * global_synonyms_count             -0.093171
global_frequency * global_letters_count               -0.043005
global_frequency * global_orthographic_density        -0.105735
global_frequency * global_synonyms_count              -0.045561
global_letters_count * global_orthographic_density     0.000186
global_letters_count * global_synonyms_count           0.039044
global_orthographic_density * global_synonyms_count   -0.021628
dtype: float64

Regressing global orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09179881870024154

intercept                   1.515288
rel_aoa                    -0.012427
rel_clustering             -0.024332
rel_frequency              -0.031375
rel_letters_count          -0.001687
rel_orthographic_density    0.303008
rel_synonyms_count          0.086742
dtype: float64

Regressing global orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10973795047995762

intercept                                        1.540439
rel_aoa                                          0.068228
rel_clustering                                   0.041200
rel_frequency                                   -0.036209
rel_letters_count                               -0.083900
rel_orthographic_density                         0.199728
rel_synonyms_count                               0.090679
rel_aoa * rel_clustering                         0.015577
rel_aoa * rel_frequency                          0.025223
rel_aoa * rel_letters_count                      0.001771
rel_aoa * rel_orthographic_density               0.025420
rel_aoa * rel_synonyms_count                    -0.022097
rel_clustering * rel_frequency                  -0.003890
rel_clustering * rel_letters_count              -0.020676
rel_clustering * rel_orthographic_density        0.058216
rel_clustering * rel_synonyms_count             -0.085983
rel_frequency * rel_letters_count               -0.024469
rel_frequency * rel_orthographic_density        -0.049788
rel_frequency * rel_synonyms_count              -0.014800
rel_letters_count * rel_orthographic_density    -0.025298
rel_letters_count * rel_synonyms_count          -0.009112
rel_orthographic_density * rel_synonyms_count   -0.039887
dtype: float64

Regressing rel orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15943839835903517

intercept                  -0.524110
rel_aoa                     0.002283
rel_clustering              0.001073
rel_frequency               0.011696
rel_letters_count          -0.007558
rel_orthographic_density    0.385261
rel_synonyms_count          0.084320
dtype: float64

Regressing rel orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17895858650157304

intercept                                       -0.423215
rel_aoa                                          0.092337
rel_clustering                                  -0.016433
rel_frequency                                    0.042544
rel_letters_count                               -0.113748
rel_orthographic_density                         0.286772
rel_synonyms_count                               0.024106
rel_aoa * rel_clustering                         0.003134
rel_aoa * rel_frequency                          0.020952
rel_aoa * rel_letters_count                      0.007674
rel_aoa * rel_orthographic_density               0.049258
rel_aoa * rel_synonyms_count                    -0.028341
rel_clustering * rel_frequency                  -0.028580
rel_clustering * rel_letters_count              -0.006034
rel_clustering * rel_orthographic_density        0.061152
rel_clustering * rel_synonyms_count             -0.057067
rel_frequency * rel_letters_count               -0.030772
rel_frequency * rel_orthographic_density        -0.039147
rel_frequency * rel_synonyms_count              -0.017122
rel_letters_count * rel_orthographic_density    -0.021877
rel_letters_count * rel_synonyms_count           0.012713
rel_orthographic_density * rel_synonyms_count   -0.026806
dtype: float64

Regressing global orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14323847490172448

intercept                      3.320064
global_aoa                    -0.028279
global_clustering              0.186264
global_frequency              -0.006140
global_letters_count          -0.176576
global_orthographic_density    0.190428
global_synonyms_count         -0.165995
rel_aoa                        0.007255
rel_clustering                -0.151757
rel_frequency                  0.018748
rel_letters_count              0.149091
rel_orthographic_density       0.056955
rel_synonyms_count             0.245767
dtype: float64

Regressing global orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.25507032723733636

intercept                                                 6.456941
global_aoa                                               -0.624236
global_clustering                                         0.953830
global_frequency                                          0.397255
global_letters_count                                     -0.494328
global_orthographic_density                               0.474789
global_synonyms_count                                    -4.258561
rel_aoa                                                   0.322632
rel_clustering                                           -0.238168
rel_frequency                                            -0.655763
rel_letters_count                                         0.253398
rel_orthographic_density                                  0.743395
rel_synonyms_count                                        9.325326
global_aoa * global_clustering                           -0.097874
global_aoa * global_frequency                            -0.054579
global_aoa * global_letters_count                         0.048802
global_aoa * global_orthographic_density                  0.140649
global_aoa * global_synonyms_count                        0.025795
global_aoa * rel_aoa                                      0.003934
global_aoa * rel_clustering                               0.120545
global_aoa * rel_frequency                                0.016243
global_aoa * rel_letters_count                           -0.018065
global_aoa * rel_orthographic_density                    -0.092158
global_aoa * rel_synonyms_count                           0.015429
global_clustering * global_frequency                     -0.020071
global_clustering * global_letters_count                 -0.008406
global_clustering * global_orthographic_density           0.001266
global_clustering * global_synonyms_count                -0.076041
global_clustering * rel_aoa                              -0.032829
global_clustering * rel_clustering                       -0.008072
global_clustering * rel_frequency                        -0.189228
global_clustering * rel_letters_count                     0.037237
global_clustering * rel_orthographic_density              0.294144
global_clustering * rel_synonyms_count                    0.472070
global_frequency * global_letters_count                   0.005594
global_frequency * global_orthographic_density           -0.098368
global_frequency * global_synonyms_count                  0.047950
global_frequency * rel_aoa                                0.005291
global_frequency * rel_clustering                         0.039774
global_frequency * rel_frequency                          0.019779
global_frequency * rel_letters_count                      0.023311
global_frequency * rel_orthographic_density               0.077139
global_frequency * rel_synonyms_count                    -0.209203
global_letters_count * global_orthographic_density       -0.107782
global_letters_count * global_synonyms_count              0.358903
global_letters_count * rel_aoa                           -0.055857
global_letters_count * rel_clustering                    -0.129553
global_letters_count * rel_frequency                     -0.103298
global_letters_count * rel_letters_count                 -0.009410
global_letters_count * rel_orthographic_density           0.226489
global_letters_count * rel_synonyms_count                -0.607903
global_orthographic_density * global_synonyms_count       0.593218
global_orthographic_density * rel_aoa                    -0.109951
global_orthographic_density * rel_clustering             -0.148865
global_orthographic_density * rel_frequency              -0.147734
global_orthographic_density * rel_letters_count           0.020885
global_orthographic_density * rel_orthographic_density    0.051120
global_orthographic_density * rel_synonyms_count         -0.631738
global_synonyms_count * rel_aoa                          -0.064374
global_synonyms_count * rel_clustering                   -0.420375
global_synonyms_count * rel_frequency                    -0.045079
global_synonyms_count * rel_letters_count                -0.258980
global_synonyms_count * rel_orthographic_density         -0.880051
global_synonyms_count * rel_synonyms_count               -0.134118
rel_aoa * rel_clustering                                 -0.015983
rel_aoa * rel_frequency                                   0.041011
rel_aoa * rel_letters_count                               0.045959
rel_aoa * rel_orthographic_density                        0.113481
rel_aoa * rel_synonyms_count                             -0.017440
rel_clustering * rel_frequency                            0.148925
rel_clustering * rel_letters_count                        0.069000
rel_clustering * rel_orthographic_density                -0.106802
rel_clustering * rel_synonyms_count                      -0.058391
rel_frequency * rel_letters_count                         0.025761
rel_frequency * rel_orthographic_density                  0.031613
rel_frequency * rel_synonyms_count                        0.134613
rel_letters_count * rel_orthographic_density             -0.129780
rel_letters_count * rel_synonyms_count                    0.466907
rel_orthographic_density * rel_synonyms_count             0.843395
dtype: float64

Regressing rel orthographic_density with 656 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19672108293339297

intercept                      2.273878
global_aoa                    -0.018191
global_clustering              0.185982
global_frequency               0.013391
global_letters_count          -0.124420
global_orthographic_density   -0.531372
global_synonyms_count         -0.121980
rel_aoa                        0.005669
rel_clustering                -0.147576
rel_frequency                  0.007778
rel_letters_count              0.090624
rel_orthographic_density       0.836140
rel_synonyms_count             0.192286
dtype: float64

Regressing rel orthographic_density with 656 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3090754396321932

intercept                                                 7.980897
global_aoa                                               -0.603748
global_clustering                                         1.520874
global_frequency                                          0.160475
global_letters_count                                     -0.261707
global_orthographic_density                              -0.233273
global_synonyms_count                                    -3.450669
rel_aoa                                                   0.402505
rel_clustering                                           -0.719787
rel_frequency                                            -0.503686
rel_letters_count                                         0.125307
rel_orthographic_density                                  1.732978
rel_synonyms_count                                        7.857789
global_aoa * global_clustering                           -0.094845
global_aoa * global_frequency                            -0.039731
global_aoa * global_letters_count                         0.046839
global_aoa * global_orthographic_density                  0.076100
global_aoa * global_synonyms_count                        0.036053
global_aoa * rel_aoa                                      0.002379
global_aoa * rel_clustering                               0.129940
global_aoa * rel_frequency                                0.010458
global_aoa * rel_letters_count                           -0.024860
global_aoa * rel_orthographic_density                    -0.047614
global_aoa * rel_synonyms_count                          -0.001599
global_clustering * global_frequency                     -0.063177
global_clustering * global_letters_count                  0.015833
global_clustering * global_orthographic_density          -0.072832
global_clustering * global_synonyms_count                -0.069672
global_clustering * rel_aoa                              -0.009019
global_clustering * rel_clustering                       -0.007376
global_clustering * rel_frequency                        -0.143162
global_clustering * rel_letters_count                     0.030611
global_clustering * rel_orthographic_density              0.441588
global_clustering * rel_synonyms_count                    0.373401
global_frequency * global_letters_count                  -0.003719
global_frequency * global_orthographic_density           -0.122136
global_frequency * global_synonyms_count                  0.040910
global_frequency * rel_aoa                               -0.006451
global_frequency * rel_clustering                         0.061994
global_frequency * rel_frequency                          0.020185
global_frequency * rel_letters_count                      0.038870
global_frequency * rel_orthographic_density               0.117904
global_frequency * rel_synonyms_count                    -0.207718
global_letters_count * global_orthographic_density       -0.038677
global_letters_count * global_synonyms_count              0.281073
global_letters_count * rel_aoa                           -0.042243
global_letters_count * rel_clustering                    -0.175815
global_letters_count * rel_frequency                     -0.091437
global_letters_count * rel_letters_count                 -0.006949
global_letters_count * rel_orthographic_density           0.201626
global_letters_count * rel_synonyms_count                -0.482603
global_orthographic_density * global_synonyms_count       0.455102
global_orthographic_density * rel_aoa                    -0.044742
global_orthographic_density * rel_clustering             -0.010669
global_orthographic_density * rel_frequency              -0.108887
global_orthographic_density * rel_letters_count          -0.066486
global_orthographic_density * rel_orthographic_density    0.075123
global_orthographic_density * rel_synonyms_count         -0.514284
global_synonyms_count * rel_aoa                          -0.077906
global_synonyms_count * rel_clustering                   -0.301221
global_synonyms_count * rel_frequency                    -0.003988
global_synonyms_count * rel_letters_count                -0.228291
global_synonyms_count * rel_orthographic_density         -0.782200
global_synonyms_count * rel_synonyms_count               -0.142996
rel_aoa * rel_clustering                                 -0.046251
rel_aoa * rel_frequency                                   0.042951
rel_aoa * rel_letters_count                               0.040941
rel_aoa * rel_orthographic_density                        0.064884
rel_aoa * rel_synonyms_count                             -0.011952
rel_clustering * rel_frequency                            0.130510
rel_clustering * rel_letters_count                        0.105104
rel_clustering * rel_orthographic_density                -0.308319
rel_clustering * rel_synonyms_count                      -0.090189
rel_frequency * rel_letters_count                         0.009927
rel_frequency * rel_orthographic_density                 -0.014313
rel_frequency * rel_synonyms_count                        0.089233
rel_letters_count * rel_orthographic_density             -0.091768
rel_letters_count * rel_synonyms_count                    0.405078
rel_orthographic_density * rel_synonyms_count             0.748033
dtype: float64