Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.



In [1]:

    
# When True, plot_grid and plot_overlay also write each figure to disk at
# settings.FIGURE.format(filename).
SAVE_FIGURES = False
# Subset of features shown in the 'paper-*' figures.
PAPER_FEATURES = ['frequency', 'aoa', 'clustering', 'letters_count',
                  'synonyms_count', 'orthographic_density']
# NOTE(review): not referenced in the part of the notebook visible here;
# presumably the number of PCA components used further down -- confirm.
N_COMPONENTS = 3
# Maximum number of bins for source feature values; the plotting helpers
# retry with fewer bins when binning raises a ValueError.
BIN_COUNT = 4

Imports and database setup.



In [2]:

    
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.



In [3]:

    
# Parameters selecting which set of mined substitutions to load: only
# substitutions stored with exactly this model are queried below.
model = Model(time=Time.discrete, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=1)
# One dict per (substitution, feature); turned into a DataFrame at the end.
data = []

# First fetch only the substitution ids, in a single session.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

# Then load each substitution in a session of its own and record its
# values for every feature.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            # (source, destination) feature values, first raw, then computed
            # with sentence_relative='median'.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                # Identifiers used later to average rows before binning.
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # Reference values plotted as H_0 ('h0*') and H_00 ('h0n*',
                # computed with source_synonyms=True), raw and
                # sentence-relative.
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

original_variations = pd.DataFrame(data)
# The list of dicts is no longer needed once the DataFrame exists.
del data









    



Got 5600 substitutions for model Model(time=Time.discrete, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=1)






    



100% (5600 of 5600) |######################| Elapsed Time: 0:01:40 Time: 0:01:40

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.



In [4]:

    
# Columns averaged per cluster. They are selected with a *list*: indexing a
# GroupBy with a bare tuple of names is rejected by recent pandas. 'feature'
# is not listed because it is a grouping key, which as_index=False already
# returns as a column (and which cannot be averaged anyway).
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-stage average: first over rows sharing the same
# (destination, occurrence, position, feature), then over each cluster, so
# that every cluster contributes a single point per feature.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
# (The 'variation' column is left untouched, as before.)
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.



In [5]:

    
def _feature_title(feature):
    """Return the docstring of the transformed feature (used as sort key)."""
    return Substitution._transformed_feature(feature).__doc__


# Features sorted by that docstring, i.e. by the text plot_grid later uses
# as the title of each panel.
ordered_features = sorted(Substitution.__features__, key=_feature_title)

What we plot about features

For a feature $\phi$, plot:

$\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
$\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
$\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
$y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

$\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
$\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
$\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
$y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.



In [6]:

    
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin table of significance of `values` against two nulls.

    In each bin, the mean of `values` is compared with the mean of the null
    values falling in the same bin. The bin is starred when the observed mean
    lies outside `null mean +/- CI`, where CI is the half-width of the
    two-sided Student-t confidence interval of the observed mean, at the
    .05 (`*`), .01 (`**`) and .001 (`***`) levels; otherwise `ns.` is shown.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : pandas.Series
        0-based bin index of each observation; may be float-valued (e.g. when
        it contains NaNs, which then belong to no bin).
    h0, h0n : pandas.Series
        Null values aligned with `values`; they produce the `H_0` and `H_00`
        rows respectively.
    values : pandas.Series
        Observed values.
    """
    alphas = [.05, .01, .001]
    # Cast explicitly: a float bin index cannot be used as a range bound.
    bin_count = int(bins.max()) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Observed means and CI half-widths do not depend on the null hypothesis,
    # so compute them once.
    masks = [bins == i for i in range(bin_count)]
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i, mask in enumerate(masks):
        sample = values[mask]
        # Count only actual observations: mean() and std() skip NaNs too.
        n = int(pd.notnull(sample).sum())
        bin_values[i] = sample.mean()
        # Standard error of the mean: the ddof=1 standard deviation is
        # divided by sqrt(n), not sqrt(n - 1) (that would correct twice).
        sem = sample.std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.array([nulls[mask].mean() for mask in masks],
                             dtype=float)

        print(null_name + ' |', end='')
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the strictest level still significant.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()



In [7]:

    
def plot_variation(**kwargs):
    """Plot destination vs. source feature values, binned by source value.

    Meant to be handed to `FacetGrid.map_dataframe` (see `plot_grid`), hence
    the keyword-only calling convention and the drawing through `plt`.
    Also prints the significance table for the plotted bins.

    Keyword arguments
    -----------------
    data : pandas.DataFrame
        Rows for one feature, with columns `source`, `destination`, `h0`,
        `h0n` (or their `_rel` variants) and the `feature_field` column.
    color : optional, default 'blue'
        Colour of the main curve; the other curves use a desaturated version.
    relative : bool, default False
        Use the `_rel` (sentence-relative) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.
    feature_field : str, default 'feature'
        Column whose first value titles the printed significance table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        # Fail with an explicit message instead of an unbound-variable error
        # further down.
        raise ValueError("Could not bin 'source{}' values into {} or fewer "
                         "bins".format(rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        sample = y[indices]
        # Number of actual observations: mean() and std() skip NaNs too.
        n = sample.count()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = sample.mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation over sqrt(n) (dividing by
        # sqrt(n - 1) would apply the small-sample correction twice).
        cis[i] = (stats.t.ppf(.975, n - 1) * sample.std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00.
    # NOTE(review): because of the `elif`, only one of the two bounds is
    # widened per call; confirm this is intended.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    elif h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)



In [8]:

    
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws={}):
    """Draw one panel per feature, three panels per row.

    Rows of `data` whose `feature_field` value is in `features` are faceted
    (and coloured) by that field, in the order given by `features`;
    `plot_function` is mapped over each facet with `plot_kws` as extra
    keyword arguments. The figure is saved to
    `settings.FIGURE.format(filename)` when `SAVE_FIGURES` is set.
    """
    selected = data[data[feature_field].isin(features)]
    # NOTE(review): `size` is the panel height in the seaborn version this
    # notebook targets; later seaborn releases call it `height`.
    grid = sb.FacetGrid(data=selected,
                        col=feature_field, hue=feature_field,
                        col_order=features, hue_order=features,
                        col_wrap=3, aspect=1.5, size=3,
                        sharex=False, sharey=False)
    grid.map_dataframe(plot_function, **plot_kws)
    grid.set_titles('{col_name}')
    grid.set_xlabels(xlabel)
    grid.set_ylabels(ylabel)

    for ax in grid.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if legend:
            # Only decorate axes on which something was plotted.
            frame = legend.get_frame()
            frame.set_facecolor('#f2f2f2')
            frame.set_edgecolor('#000000')
            # Replace the raw feature name by its human-readable docstring.
            title = Substitution._transformed_feature(ax.get_title()).__doc__
            ax.set_title(title)

    if not SAVE_FIGURES:
        return
    grid.fig.savefig(settings.FIGURE.format(filename),
                     bbox_inches='tight', dpi=300)



In [9]:

    
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias of one feature on `ax`.

    The source values are binned; for each bin the curve shows the mean
    destination value minus the mean `h0n` value, divided by the absolute
    value of the mean of the per-bin `h0` means. Bins are spread evenly over
    [0, 1] on the x axis so that several features can share one axes.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : pandas.DataFrame
        Rows for a single feature, with columns `feature`, `source`,
        `destination`, `h0`, `h0n` (or their `_rel` variants).
    color : colour of the curve (the CI band uses a desaturated version).
    ci : bool, default True
        Also draw the 95% confidence band.
    relative : bool, default False
        Use the `_rel` (sentence-relative) columns.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut`.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, retrying with fewer bins whenever the cut fails.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            pass
    else:
        # Fail with an explicit message instead of an unbound-variable error
        # further down.
        raise ValueError("Could not bin 'source{}' values into {} or fewer "
                         "bins".format(rel, BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        sample = y[indices]
        # Number of actual observations: mean() and std() skip NaNs too.
        n = sample.count()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = sample.mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. the ddof=1 standard deviation over sqrt(n) (dividing by
        # sqrt(n - 1) would apply the small-sample correction twice).
        cis[i] = (stats.t.ppf(.975, n - 1) * sample.std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): `scale` is zero when the H0 means average to zero, which
    # makes the normalised curve infinite/NaN; confirm this cannot happen.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)



In [10]:

    
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws={}):
    """Overlay one curve per feature on a single axes and return the axes.

    For each entry of `features`, the rows of `data` for that feature (with
    rows containing NaNs dropped) are passed to
    `plot_function(ax, rows, color=..., **plot_kws)`, each feature getting
    its own colour from the `palette_name` palette. The figure is saved to
    `settings.FIGURE.format(filename)` when `SAVE_FIGURES` is set.
    """
    colors = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))

    for feature, feature_color in zip(features, colors):
        feature_rows = data[data.feature == feature].dropna()
        plot_function(ax, feature_rows, color=feature_color, **plot_kws)

    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if SAVE_FIGURES:
        target = settings.FIGURE.format(filename)
        fig.savefig(target, bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above



In [11]:

    
# All features, global feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *   | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | **  | **  | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | **  | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | **  | *** | ns. |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | **  |
H_00 | **  | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | **  | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | **  | ns. | ns. | *   |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare



In [12]:

    
# Bias overlay: all features, global values, fixed-width bins, no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False},
);



In [13]:

    
# Paper features, global feature values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *   | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | **  | **  | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | **  |
H_00 | **  | ns. | ns. | ns. |



In [14]:

    
# Bias overlay: paper features, global values, fixed-width bins, with the
# y axis cropped to a hand-picked range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-fixedbins_global',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all features',
        xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
        ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                '\n(normalised to feature average)'),
    )
    .set_ylim(-2, .7)
);

2.1.2 Quantiles of distribution of appeared global feature values



In [15]:

    
# All features, global feature values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *   |
H_00 | *   | *   | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *   |
H_00 | *   | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *   | *   | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | ns. | **  | *** |
H_00 | **  | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | **  | ns. | ns. | *   |



In [16]:

    
# Bias overlay: all features, global values, quantile bins, no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws={'ci': False, 'quantiles': True},
);



In [17]:

    
# Paper features, global feature values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws={'quantiles': True},
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *   | *   | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | ns. | **  | *** |
H_00 | **  | ns. | ns. |



In [18]:

    
# Bias overlay: paper features, global values, quantile bins, with the
# y axis cropped to a hand-picked range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-quantilebins_global',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all features',
        xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
        ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                '\n(normalised to feature average)'),
        plot_kws={'quantiles': True},
    )
    .set_ylim(-1.2, .6)
);

2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values



In [19]:

    
# All features, sentence-relative values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | **  | **  |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | **  |
H_00 | ns. | ns. | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *   | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | **  | ns. | *** | **  |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |



In [20]:

    
# Bias overlay: all features, sentence-relative values, fixed-width bins,
# no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True},
);



In [21]:

    
# Paper features, sentence-relative values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True},
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *   | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *   | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |



In [22]:

    
# Bias overlay: paper features, sentence-relative values, fixed-width bins,
# with the y axis cropped to a hand-picked range.
(
    plot_overlay(
        data=variations,
        features=PAPER_FEATURES,
        filename='paper-variations_bias-fixedbins_sentencerel',
        palette_name='deep',
        plot_function=plot_bias,
        title='Measured bias for all sentence-relative features',
        xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
                r' (normalised to $[0, 1]$)'),
        ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                '\n(normalised to sentence-relative feature average)'),
        plot_kws={'relative': True},
    )
    .set_ylim(-2, .7)
);

2.2.2 Quantiles of distribution of appeared sentence-relative values



In [23]:

    
# All features, sentence-relative values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | *   |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | ns. | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | **  |
H_00 | ns. | ns. | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | **  | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | **  |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | **  | *** |
H_00 | *** | ns. | ns. | ns. |



In [24]:

    
# Bias overlay: all features, sentence-relative values, quantile bins,
# no CI bands.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws={'ci': False, 'relative': True, 'quantiles': True},
);



In [25]:

    
# Paper features, sentence-relative values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws={'relative': True, 'quantiles': True},
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *** |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | *   | ns. | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |



In [26]:

    
# Bias overlay: paper features, sentence-relative values, quantile bins.
# Uses the 'deep' palette, like every other 'paper-*' bias overlay in this
# notebook ('husl' is the palette of the all-features overlays).
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});

3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which effect wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of the move from source words to destination words, for each (absolute, relative) source feature pair. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

its slope
its shape (e.g. several slope regimes?)
its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to a value that looks huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)



In [27]:

    
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) space.

    Written to be used through `FacetGrid.map_dataframe`, so everything
    arrives as keyword arguments:

    data -- DataFrame with `source`, `source_rel`, `destination`,
        `destination_rel` and `h0` columns (required).
    color -- base colour of the two reference lines (default 'blue').
    bin_count -- number of equal-width bins along each axis (default 4,
        the value that used to be hard-coded).

    The (source, source_rel) plane is cut into `bin_count` x `bin_count`
    cells. For each cell the arrows follow the mean displacement
    (destination - source, destination_rel - source_rel) of the
    substitutions whose source falls in that cell, and the colour encodes
    the mean length of those displacements. Cells containing no
    substitution yield NaN.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', 4)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Displacement of each substitution along both axes, and its length.
    # Computed once here instead of being re-derived for every grid cell.
    delta = dest - source
    delta_rel = dest_rel - source_rel
    length = np.sqrt(delta ** 2 + delta_rel ** 2)

    # Compute bin values.
    # Only the first h0 value is used -- presumably h0 is constant within
    # the data handed to one facet; confirm against the caller.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_column = x_bins == x
        for y in range(bin_count):
            # Rows are indexed [y, x] as expected by streamplot.
            in_cell = in_column & (y_bins == y)
            u_values[y, x] = delta[in_cell].mean()
            v_values[y, x] = delta_rel[in_cell].mean()
            strength[y, x] = length[in_cell].mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: relative value 0, i.e. the sentence average.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: the h0 value.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features



In [28]:

    
# One stream plot per feature, in `ordered_features` order.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features, col_wrap=3,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: style the legend box, then
        # swap the raw feature name used as title for the docstring of the
        # corresponding transformed feature.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  bbox_inches='tight', dpi=300)









    



/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper



In [29]:

    
# Same stream plots, restricted to the features exposed in the paper.
g = sb.FacetGrid(data=variations[variations['feature']
                                 .isin(PAPER_FEATURES)],
                 col='feature', col_order=PAPER_FEATURES, col_wrap=3,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Something was plotted on these axes: style the legend box, then
        # swap the raw feature name used as title for the docstring of the
        # corresponding transformed feature.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  bbox_inches='tight', dpi=300)









    



/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA



In [30]:

    
# Compute the PCA on per-cluster feature variations (one column per
# feature), keeping only the clusters where every variation is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Show how many components were kept and how much variance each explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Last expression of the cell: feature loadings of the plotted components.
pd.DataFrame(data=pca.components_[:N_COMPONENTS],
             index=list(map('Component-{}'.format, range(N_COMPONENTS))),
             columns=pcafeatures)









    



MLE estimates there are 10 components.

Those explain the following variance:
[ 0.53352043  0.16495268  0.08614343  0.07743131  0.03606997  0.02582389
  0.02239324  0.01909724  0.0128689   0.01082262]

We're plotting variation for the first 3 components:






    Out[30]:






  
    
      
      aoa
      betweenness
      clustering
      degree
      frequency
      letters_count
      orthographic_density
      pagerank
      phonemes_count
      phonological_density
      syllables_count
      synonyms_count
    
  
  
    
      Component-0
      -0.489840
      0.332932
      -0.098602
      0.248148
      0.258502
      -0.410251
      0.211155
      0.276602
      -0.363773
      0.258779
      -0.149618
      -0.002552
    
    
      Component-1
      -0.286201
      0.319530
      -0.093017
      0.254264
      0.321245
      0.448380
      -0.162363
      0.291978
      0.499387
      -0.229951
      0.155661
      -0.018348
    
    
      Component-2
      -0.746200
      -0.287752
      0.022277
      -0.058305
      -0.565024
      0.145110
      0.029877
      -0.077570
      0.066310
      -0.027218
      0.006588
      0.063939

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.



In [31]:

    
# One row per (substitution, component). Each substitution is loaded in its
# own session scope, and all its attributes are read inside that scope.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Identify the substitution first, then attach its component
            # values and the `h0` / `h0n` averages.
            row = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            row['h0'] = substitution.component_average(
                component, pca, pcafeatures)
            row['h0n'] = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data









    



100% (5600 of 5600) |######################| Elapsed Time: 0:01:27 Time: 0:01:27

Compute cluster averages (so as not to overestimate confidence intervals).



In [32]:

    
# Average in two stages: first over the rows sharing a (destination,
# occurrence, position, component), then over each (cluster, component), so
# that every cluster contributes a single point per component.
#
# The column subset is passed as a list: subsetting a GroupBy with a bare
# tuple of keys (`gb['a', 'b']`) is deprecated and rejected by recent pandas.
# NOTE(review): 'component' is both a grouping key and a selected column; the
# facet headers printed further down (0.0, 1.0, ...) suggest it comes out as
# a float mean. Confirm before removing it from the selection.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()
)

Plot the actual variations of components (see the caveat section below)



In [33]:

    
# One facet per PCA component, each showing the variation of that component.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    if legend:
        # Only axes on which something was plotted get a styled legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(
        settings.FIGURE.format('all-pca_variations-absolute'),
        bbox_inches='tight', dpi=300)









    



---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | *   |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | **  |
H_00 | ns. | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *   | ns. | ns. | ns. |

4.2 On a subset of relevant features



In [34]:

    
# Subset of features used for the PCA in this section (the cells below build
# `pcafeatures` and the word counts from this list).
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA



In [35]:

    
# Compute the PCA on the variations of the relevant features only, keeping
# the clusters where all of those variations are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle').fit(pcavariations)

# Show how many components were kept and how much variance each explains.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Last expression of the cell: feature loadings of every kept component.
pd.DataFrame(data=pca.components_,
             index=list(map('Component-{}'.format,
                            range(pca.n_components_))),
             columns=pcafeatures)









    



MLE estimates there are 2 components.

Those explain the following variance:
[ 0.64234754  0.21640136]







    Out[35]:






  
    
      
      aoa
      frequency
      letters_count
    
  
  
    
      Component-0
      -0.757650
      0.349028
      -0.551493
    
    
      Component-1
      0.390963
      -0.433896
      -0.811715

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.



In [36]:

    
# One row per (substitution, component). Each substitution is loaded in its
# own session scope, and all its attributes are read inside that scope.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Identify the substitution first, then attach its component
            # values and the `h0` / `h0n` averages.
            row = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
            }
            row['h0'] = substitution.component_average(
                component, pca, pcafeatures)
            row['h0n'] = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data









    



100% (5600 of 5600) |######################| Elapsed Time: 0:00:38 Time: 0:00:38

Compute cluster averages (so as not to overestimate confidence intervals).



In [37]:

    
# Average in two stages: first over the rows sharing a (destination,
# occurrence, position, component), then over each (cluster, component), so
# that every cluster contributes a single point per component.
#
# The column subset is passed as a list: subsetting a GroupBy with a bare
# tuple of keys (`gb['a', 'b']`) is deprecated and rejected by recent pandas.
# NOTE(review): 'component' is both a grouping key and a selected column; the
# facet headers printed further down (0.0, 1.0, ...) suggest it comes out as
# a float mean. Confirm before removing it from the selection.
component_variations = (
    original_component_variations
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()
    .groupby(['cluster_id', 'component'], as_index=False)
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()
)

Plot the actual variations of components



In [38]:

    
# One facet per PCA component, each showing the variation of that component.
g = sb.FacetGrid(data=component_variations,
                 col='component', hue='component', col_wrap=3,
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    if legend:
        # Only axes on which something was plotted get a styled legend box.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
if SAVE_FIGURES:
    g.fig.savefig(
        settings.FIGURE.format('paper-pca_variations-absolute'),
        bbox_inches='tight', dpi=300)









    



---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:



In [39]:

    
# Resolve the transformed function of each relevant feature once, instead of
# looking it up again for every (word, feature) pair below.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

# Called without argument, a transformed feature gives the words it covers.
for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined.
# Start from the union of the words covered by at least one feature...
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# ...then tabulate every feature value for every word; `dropna()` below keeps
# the words with no missing value (presumably a feature yields NaN for a
# word it does not cover -- confirm against `_transformed_feature`).
words_list = list(words)
data = dict((feature, [tfeature(word) for word in words_list])
            for feature, tfeature in tfeatures)
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))









    



Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 526 (cluster-unique) substitutions, but the PCA is in fact computed on 403 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.



In [40]:

    
# Binning strategies to try, as (label, binning function) pairs. Only
# fixed-width bins are enabled; the quantile variant is commented out.
cuts = [('fixed bins', pd.cut)]#, ('quantiles', pd.qcut)]
# Value kinds, as (label, column-name suffix) pairs: the suffix is appended
# to 'source' / 'destination' to pick the column to analyse.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return a 3-character significance marker for p-value `p`.

    '***' when p < .001, ' **' when p < .01, '  *' when p < .05, and
    'ns.' otherwise (which includes a NaN `p`, since every comparison with
    NaN is false).
    """
    # Thresholds are checked from the strictest to the loosest; the first
    # one that `p` falls under decides the marker.
    for threshold, stars in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return stars
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width only here: the quantile binning is commented out in `cuts` above). So it's a lot of answers.

Three stars mean $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.



In [41]:

    
# Pivot every value column once up front: these tables do not depend on the
# loop variables, so there is no need to rebuild them for each combination.
pivots = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

# For each (feature1, feature2) pair and each global/sentence-relative
# combination, run a one-way Anova of feature2's destination values grouped
# by bins of feature1's source values, and print its significance level.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins when the binning
            # function raises ValueError (e.g. quantile binning can raise
            # on duplicate bin edges).
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: say so instead of testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {}'
                          .format(rel1_label, rel2_label))
                    continue

                destination = pivots['destination' + rel2][feature2]
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()









    



---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
   ** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
   ** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.



In [42]:

    
# Pivot every value column once up front: these tables do not depend on the
# loop variables, so there is no need to rebuild them for each combination.
pivots = {
    prefix + rel: variations.pivot(index='cluster_id', columns='feature',
                                   values=prefix + rel)
    for prefix in ('source', 'destination')
    for (_, rel) in rels
}

# For each (feature1, feature2) pair and each global/sentence-relative
# combination, run a one-way Anova of feature2's variation (destination -
# source) grouped by bins of feature1's source values, and print its
# significance level.
for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins when the binning
            # function raises ValueError (e.g. quantile binning can raise
            # on duplicate bin edges).
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    pass

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: say so instead of testing
                    # against bins left over from a previous combination.
                    print('  n/a {} -> {}'
                          .format(rel1_label, rel2_label))
                    continue

                # Despite its name, this holds feature2's variation.
                destination = pivots['destination' + rel2][feature2]\
                    - pivots['source' + rel2][feature2]
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()









    



---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression



In [43]:

    
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures



In [44]:

    
# Maps a "relative?" flag to a (label, column-name suffix) pair.
# NOTE: this rebinds `rels`, which the Anova section defines as a list of
# (label, suffix) tuples and iterates over directly; re-running those cells
# after this one would iterate over the dict's keys (False/True) instead.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Linearly regress a destination feature on source features and print
    the fit.

    One observation is built per `cluster_id`. Source values come from a
    pivot table of `data` (pandas' default aggregation is used when a
    cluster has several rows for the same feature); destination values come
    from a plain pivot of the rows of `data` whose `feature` equals `target`.
    Clusters with a missing value in any regressor or in the destination are
    dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns `cluster_id`, `feature`, and whichever of
        `source`, `source_rel`, `destination`, `destination_rel` are
        selected by `source_rel` / `dest_rel`.
    features : iterable of str
        Feature names used as regressors (they are sorted before use).
    target : str
        Feature whose destination value is regressed.
    source_rel : bool or 'both'
        `False` uses the `source` column, `True` uses `source_rel`,
        `'both'` uses the two sets of regressors together.
    dest_rel : bool
        `False` regresses `destination`, `True` regresses `destination_rel`.
    interactions : bool
        If true, regressors are expanded with degree-2 interaction terms
        (`PolynomialFeatures(degree=2, interaction_only=True)`).

    Returns
    -------
    None
        The R^2 (computed on the very data used for the fit) and the
        coefficients are printed.

    Raises
    ------
    ValueError
        If `source_rel` or `dest_rel` has an unsupported value, or if no
        cluster has complete data for the requested regression.
    """
    # Strict validation: a plain `in [True, False, 'both']` test would let
    # 1 / 0 through (they compare equal to True / False) and then silently
    # treat them as 'both'.
    if not (isinstance(source_rel, bool)
            or (isinstance(source_rel, str) and source_rel == 'both')):
        raise ValueError("source_rel must be True, False or 'both', "
                         "got {!r}".format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError("dest_rel must be a bool, got {!r}"
                         .format(dest_rel))

    # Process source/destination relativeness arguments.
    source_rels = [source_rel] if isinstance(source_rel, bool) \
        else [False, True]
    dest_rel_name, dest_suffix = rels[dest_rel]

    features = tuple(sorted(features))
    # Column keys into the pivot table, and matching printable names; both
    # lists are built in the same order so they stay aligned.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rels
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rels
                     for feature in features]

    # Get source and destination values.
    source_table = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rels],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination from `data` (not from a notebook global), and
    # align with `reindex` so that clusters absent from the destination
    # pivot become NaN and are dropped instead of raising a KeyError.
    destination_values = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_suffix)\
        .reindex(source_table.index)[target].dropna()
    if len(destination_values) == 0:
        raise ValueError("no cluster has complete data to regress {!r}"
                         .format(target))
    X = source_table.loc[destination_values.index].values
    y = destination_values.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        X = poly.fit_transform(X)
        # Name each generated column after the input features it multiplies;
        # the all-zero powers row is the constant (bias) column.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress. With interactions the design matrix already holds a bias
    # column, so the model must not fit a second intercept.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(X, y)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(X),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(X, y)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # `Series.append` no longer exists in pandas >= 2.0; `pd.concat`
        # yields the same series with the intercept listed first.
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)



In [45]:

    
# For every paper feature taken as regression target, run the regression for
# each (source relativeness, destination relativeness) pair, first without
# and then with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel, interactions in product(
            [False, True, 'both'], [False, True], [False, True]):
        regress(variations, PAPER_FEATURES, target, source_rel=source_rel,
                dest_rel=dest_rel, interactions=interactions)
        print()









    



----------------------------------------------------------------------
Regressing global frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.04643795738756806

intercept                      7.668845
global_aoa                    -0.003175
global_clustering              0.170670
global_frequency               0.274203
global_letters_count          -0.030239
global_orthographic_density   -0.156032
global_synonyms_count         -0.081453
dtype: float64

Regressing global frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.1357244332073516

intercept                                              14.614830
global_aoa                                             -1.159254
global_clustering                                       0.823215
global_frequency                                       -0.146602
global_letters_count                                   -0.439594
global_orthographic_density                             3.710492
global_synonyms_count                                   1.056758
global_aoa * global_clustering                         -0.008737
global_aoa * global_frequency                           0.071707
global_aoa * global_letters_count                       0.056802
global_aoa * global_orthographic_density                0.006711
global_aoa * global_synonyms_count                      0.106202
global_clustering * global_frequency                   -0.071978
global_clustering * global_letters_count               -0.063187
global_clustering * global_orthographic_density         0.501845
global_clustering * global_synonyms_count              -0.100510
global_frequency * global_letters_count                -0.038623
global_frequency * global_orthographic_density         -0.113236
global_frequency * global_synonyms_count               -0.188966
global_letters_count * global_orthographic_density      0.008998
global_letters_count * global_synonyms_count           -0.092174
global_orthographic_density * global_synonyms_count    -0.099966
dtype: float64

Regressing rel frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.04169772810137906

intercept                     -2.527203
global_aoa                    -0.037858
global_clustering              0.249925
global_frequency               0.235345
global_letters_count          -0.040991
global_orthographic_density   -0.383308
global_synonyms_count          0.024120
dtype: float64

Regressing rel frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.09467987360616414

intercept                                              0.739326
global_aoa                                            -1.026680
global_clustering                                      0.650824
global_frequency                                       0.062829
global_letters_count                                  -0.037699
global_orthographic_density                            2.961762
global_synonyms_count                                  1.463880
global_aoa * global_clustering                        -0.006937
global_aoa * global_frequency                          0.082205
global_aoa * global_letters_count                      0.025554
global_aoa * global_orthographic_density              -0.021455
global_aoa * global_synonyms_count                     0.110345
global_clustering * global_frequency                  -0.037035
global_clustering * global_letters_count              -0.073513
global_clustering * global_orthographic_density        0.413661
global_clustering * global_synonyms_count              0.093725
global_frequency * global_letters_count               -0.065558
global_frequency * global_orthographic_density        -0.098003
global_frequency * global_synonyms_count              -0.106929
global_letters_count * global_orthographic_density     0.026398
global_letters_count * global_synonyms_count          -0.097550
global_orthographic_density * global_synonyms_count   -0.013232
dtype: float64

Regressing global frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.030679815503245278

intercept                   9.105244
rel_aoa                     0.093141
rel_clustering              0.132850
rel_frequency               0.180797
rel_letters_count          -0.036991
rel_orthographic_density   -0.100061
rel_synonyms_count         -0.118684
dtype: float64

Regressing global frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09250611072877701

intercept                                        9.218483
rel_aoa                                          0.342976
rel_clustering                                   0.194752
rel_frequency                                    0.198375
rel_letters_count                               -0.104695
rel_orthographic_density                        -0.224266
rel_synonyms_count                              -0.140715
rel_aoa * rel_clustering                        -0.074151
rel_aoa * rel_frequency                          0.048594
rel_aoa * rel_letters_count                     -0.036882
rel_aoa * rel_orthographic_density              -0.018871
rel_aoa * rel_synonyms_count                     0.044112
rel_clustering * rel_frequency                  -0.096134
rel_clustering * rel_letters_count               0.046482
rel_clustering * rel_orthographic_density        0.388684
rel_clustering * rel_synonyms_count              0.255208
rel_frequency * rel_letters_count               -0.019281
rel_frequency * rel_orthographic_density        -0.056566
rel_frequency * rel_synonyms_count               0.017773
rel_letters_count * rel_orthographic_density    -0.029175
rel_letters_count * rel_synonyms_count          -0.256252
rel_orthographic_density * rel_synonyms_count   -0.561960
dtype: float64

Regressing rel frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.2554226578922375

intercept                  -1.604239
rel_aoa                     0.028527
rel_clustering              0.366787
rel_frequency               0.570731
rel_letters_count          -0.150014
rel_orthographic_density   -0.395491
rel_synonyms_count         -0.073575
dtype: float64

Regressing rel frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2958347772574702

intercept                                       -1.480477
rel_aoa                                          0.122545
rel_clustering                                   0.282511
rel_frequency                                    0.649987
rel_letters_count                               -0.236775
rel_orthographic_density                        -0.652025
rel_synonyms_count                              -0.201449
rel_aoa * rel_clustering                        -0.094595
rel_aoa * rel_frequency                         -0.015257
rel_aoa * rel_letters_count                      0.013860
rel_aoa * rel_orthographic_density               0.119573
rel_aoa * rel_synonyms_count                     0.164447
rel_clustering * rel_frequency                  -0.129440
rel_clustering * rel_letters_count               0.034241
rel_clustering * rel_orthographic_density        0.312136
rel_clustering * rel_synonyms_count              0.122533
rel_frequency * rel_letters_count               -0.035212
rel_frequency * rel_orthographic_density        -0.080811
rel_frequency * rel_synonyms_count              -0.039283
rel_letters_count * rel_orthographic_density    -0.009409
rel_letters_count * rel_synonyms_count          -0.185251
rel_orthographic_density * rel_synonyms_count   -0.226421
dtype: float64

Regressing global frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.06413057671297762

intercept                      6.055601
global_aoa                    -0.117268
global_clustering             -0.061342
global_frequency               0.266154
global_letters_count           0.077240
global_orthographic_density    0.053967
global_synonyms_count          0.146877
rel_aoa                        0.155864
rel_clustering                 0.291670
rel_frequency                  0.019669
rel_letters_count             -0.127663
rel_orthographic_density      -0.291300
rel_synonyms_count            -0.241346
dtype: float64

Regressing global frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.33731604625308487

intercept                                                -87.946378
global_aoa                                                 1.060184
global_clustering                                        -22.306092
global_frequency                                           2.825732
global_letters_count                                      -1.907550
global_orthographic_density                                7.882270
global_synonyms_count                                     27.269524
rel_aoa                                                   -1.317515
rel_clustering                                            18.653148
rel_frequency                                             -5.983503
rel_letters_count                                          0.672075
rel_orthographic_density                                  -1.207170
rel_synonyms_count                                       -11.025201
global_aoa * global_clustering                             0.882779
global_aoa * global_frequency                              0.269323
global_aoa * global_letters_count                          0.260526
global_aoa * global_orthographic_density                   0.287475
global_aoa * global_synonyms_count                        -0.836716
global_aoa * rel_aoa                                       0.003597
global_aoa * rel_clustering                               -0.800222
global_aoa * rel_frequency                                -0.005830
global_aoa * rel_letters_count                            -0.101164
global_aoa * rel_orthographic_density                     -0.196654
global_aoa * rel_synonyms_count                            0.751558
global_clustering * global_frequency                       0.821482
global_clustering * global_letters_count                  -0.011527
global_clustering * global_orthographic_density            3.006645
global_clustering * global_synonyms_count                  1.976489
global_clustering * rel_aoa                               -0.771095
global_clustering * rel_clustering                        -0.130203
global_clustering * rel_frequency                         -1.122509
global_clustering * rel_letters_count                      0.053254
global_clustering * rel_orthographic_density              -2.190249
global_clustering * rel_synonyms_count                    -1.006198
global_frequency * global_letters_count                   -0.061968
global_frequency * global_orthographic_density             0.585749
global_frequency * global_synonyms_count                  -1.090808
global_frequency * rel_aoa                                -0.225970
global_frequency * rel_clustering                         -0.735168
global_frequency * rel_frequency                           0.003602
global_frequency * rel_letters_count                       0.071173
global_frequency * rel_orthographic_density               -0.695182
global_frequency * rel_synonyms_count                      0.551449
global_letters_count * global_orthographic_density         0.274229
global_letters_count * global_synonyms_count               0.259560
global_letters_count * rel_aoa                            -0.166877
global_letters_count * rel_clustering                      0.001932
global_letters_count * rel_frequency                       0.066350
global_letters_count * rel_letters_count                  -0.024961
global_letters_count * rel_orthographic_density           -0.527470
global_letters_count * rel_synonyms_count                 -1.021751
global_orthographic_density * global_synonyms_count        0.439630
global_orthographic_density * rel_aoa                     -0.131669
global_orthographic_density * rel_clustering              -2.207729
global_orthographic_density * rel_frequency               -0.573451
global_orthographic_density * rel_letters_count           -0.283604
global_orthographic_density * rel_orthographic_density    -0.378379
global_orthographic_density * rel_synonyms_count          -0.665867
global_synonyms_count * rel_aoa                            0.671548
global_synonyms_count * rel_clustering                    -1.509212
global_synonyms_count * rel_frequency                      1.092563
global_synonyms_count * rel_letters_count                  0.638910
global_synonyms_count * rel_orthographic_density           0.575706
global_synonyms_count * rel_synonyms_count                -0.114494
rel_aoa * rel_clustering                                   0.596033
rel_aoa * rel_frequency                                    0.068003
rel_aoa * rel_letters_count                                0.048841
rel_aoa * rel_orthographic_density                         0.080746
rel_aoa * rel_synonyms_count                              -0.502451
rel_clustering * rel_frequency                             0.889617
rel_clustering * rel_letters_count                        -0.066404
rel_clustering * rel_orthographic_density                  1.733327
rel_clustering * rel_synonyms_count                        0.683760
rel_frequency * rel_letters_count                         -0.108051
rel_frequency * rel_orthographic_density                   0.657665
rel_frequency * rel_synonyms_count                        -0.629616
rel_letters_count * rel_orthographic_density               0.326657
rel_letters_count * rel_synonyms_count                    -0.084123
rel_orthographic_density * rel_synonyms_count             -0.686856
dtype: float64

Regressing rel frequency with 297 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.34774975603855374

intercept                      5.955182
global_aoa                    -0.112044
global_clustering              0.068817
global_frequency              -0.650956
global_letters_count           0.098134
global_orthographic_density    0.050007
global_synonyms_count          0.018145
rel_aoa                        0.130688
rel_clustering                 0.214408
rel_frequency                  0.977123
rel_letters_count             -0.141699
rel_orthographic_density      -0.243203
rel_synonyms_count            -0.092301
dtype: float64

Regressing rel frequency with 297 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.5288549937930357

intercept                                                -91.084659
global_aoa                                                 1.792726
global_clustering                                        -21.520350
global_frequency                                           1.679353
global_letters_count                                      -0.821466
global_orthographic_density                                9.682612
global_synonyms_count                                     26.935452
rel_aoa                                                   -1.643750
rel_clustering                                            18.166089
rel_frequency                                             -4.678926
rel_letters_count                                         -0.300093
rel_orthographic_density                                  -2.635693
rel_synonyms_count                                       -11.983696
global_aoa * global_clustering                             0.866375
global_aoa * global_frequency                              0.241682
global_aoa * global_letters_count                          0.185550
global_aoa * global_orthographic_density                   0.191252
global_aoa * global_synonyms_count                        -0.807927
global_aoa * rel_aoa                                       0.007309
global_aoa * rel_clustering                               -0.777712
global_aoa * rel_frequency                                 0.018971
global_aoa * rel_letters_count                            -0.032272
global_aoa * rel_orthographic_density                     -0.105478
global_aoa * rel_synonyms_count                            0.724930
global_clustering * global_frequency                       0.766417
global_clustering * global_letters_count                   0.155912
global_clustering * global_orthographic_density            2.878489
global_clustering * global_synonyms_count                  1.687027
global_clustering * rel_aoa                               -0.767003
global_clustering * rel_clustering                        -0.221039
global_clustering * rel_frequency                         -1.014935
global_clustering * rel_letters_count                     -0.047789
global_clustering * rel_orthographic_density              -1.930300
global_clustering * rel_synonyms_count                    -0.810635
global_frequency * global_letters_count                    0.021607
global_frequency * global_orthographic_density             0.485884
global_frequency * global_synonyms_count                  -1.061259
global_frequency * rel_aoa                                -0.200752
global_frequency * rel_clustering                         -0.746668
global_frequency * rel_frequency                           0.023469
global_frequency * rel_letters_count                       0.011161
global_frequency * rel_orthographic_density               -0.563976
global_frequency * rel_synonyms_count                      0.590114
global_letters_count * global_orthographic_density         0.137413
global_letters_count * global_synonyms_count              -0.099176
global_letters_count * rel_aoa                            -0.163560
global_letters_count * rel_clustering                     -0.153834
global_letters_count * rel_frequency                       0.004099
global_letters_count * rel_letters_count                  -0.011296
global_letters_count * rel_orthographic_density           -0.349322
global_letters_count * rel_synonyms_count                 -0.667028
global_orthographic_density * global_synonyms_count        0.124369
global_orthographic_density * rel_aoa                     -0.123044
global_orthographic_density * rel_clustering              -2.120541
global_orthographic_density * rel_frequency               -0.451708
global_orthographic_density * rel_letters_count           -0.088567
global_orthographic_density * rel_orthographic_density    -0.259071
global_orthographic_density * rel_synonyms_count          -0.395531
global_synonyms_count * rel_aoa                            0.658226
global_synonyms_count * rel_clustering                    -1.259883
global_synonyms_count * rel_frequency                      1.011323
global_synonyms_count * rel_letters_count                  0.840068
global_synonyms_count * rel_orthographic_density           0.627307
global_synonyms_count * rel_synonyms_count                -0.082189
rel_aoa * rel_clustering                                   0.591679
rel_aoa * rel_frequency                                    0.036084
rel_aoa * rel_letters_count                                0.031019
rel_aoa * rel_orthographic_density                         0.083112
rel_aoa * rel_synonyms_count                              -0.492757
rel_clustering * rel_frequency                             0.846164
rel_clustering * rel_letters_count                         0.060156
rel_clustering * rel_orthographic_density                  1.570458
rel_clustering * rel_synonyms_count                        0.539195
rel_frequency * rel_letters_count                         -0.053722
rel_frequency * rel_orthographic_density                   0.548337
rel_frequency * rel_synonyms_count                        -0.607058
rel_letters_count * rel_orthographic_density               0.161385
rel_letters_count * rel_synonyms_count                    -0.312535
rel_orthographic_density * rel_synonyms_count             -0.759660
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 274 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.11046036737411213

intercept                      6.467545
global_aoa                     0.291062
global_clustering              0.063970
global_frequency              -0.111782
global_letters_count           0.044614
global_orthographic_density   -0.112280
global_synonyms_count          0.354700
dtype: float64

Regressing global aoa with 274 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.2095010949387751

intercept                                              10.303467
global_aoa                                              1.262570
global_clustering                                       2.030847
global_frequency                                        0.638955
global_letters_count                                   -0.051460
global_orthographic_density                            -7.424886
global_synonyms_count                                  -2.361640
global_aoa * global_clustering                          0.090918
global_aoa * global_frequency                          -0.038250
global_aoa * global_letters_count                      -0.040580
global_aoa * global_orthographic_density                0.126198
global_aoa * global_synonyms_count                      0.007449
global_clustering * global_frequency                   -0.010477
global_clustering * global_letters_count               -0.203965
global_clustering * global_orthographic_density        -1.016384
global_clustering * global_synonyms_count              -0.220420
global_frequency * global_letters_count                -0.089652
global_frequency * global_orthographic_density          0.015966
global_frequency * global_synonyms_count                0.018728
global_letters_count * global_orthographic_density      0.010673
global_letters_count * global_synonyms_count            0.039999
global_orthographic_density * global_synonyms_count     0.818580
dtype: float64

Regressing rel aoa with 274 measures, no interactions
           ^^^^^^^
R^2 = 0.061698878327682394

intercept                      0.284267
global_aoa                     0.130832
global_clustering             -0.005598
global_frequency              -0.185811
global_letters_count           0.159174
global_orthographic_density    0.203182
global_synonyms_count          0.309985
dtype: float64

Regressing rel aoa with 274 measures, with interactions
           ^^^^^^^
R^2 = 0.17886900974547126

intercept                                              21.239085
global_aoa                                              0.969521
global_clustering                                       2.971098
global_frequency                                       -0.877230
global_letters_count                                   -2.063151
global_orthographic_density                            -8.135680
global_synonyms_count                                  -0.310996
global_aoa * global_clustering                          0.048805
global_aoa * global_frequency                          -0.123649
global_aoa * global_letters_count                       0.036321
global_aoa * global_orthographic_density                0.267696
global_aoa * global_synonyms_count                     -0.022627
global_clustering * global_frequency                   -0.134758
global_clustering * global_letters_count               -0.188273
global_clustering * global_orthographic_density        -0.743454
global_clustering * global_synonyms_count              -0.232020
global_frequency * global_letters_count                 0.093811
global_frequency * global_orthographic_density          0.200427
global_frequency * global_synonyms_count               -0.157871
global_letters_count * global_orthographic_density      0.004588
global_letters_count * global_synonyms_count           -0.001830
global_orthographic_density * global_synonyms_count     0.598631
dtype: float64

Regressing global aoa with 274 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.04635745977007388

intercept                   6.874225
rel_aoa                     0.066811
rel_clustering              0.266506
rel_frequency               0.063409
rel_letters_count          -0.024544
rel_orthographic_density   -0.502535
rel_synonyms_count          0.368382
dtype: float64

Regressing global aoa with 274 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.17629846151476358

intercept                                        6.937952
rel_aoa                                         -0.057720
rel_clustering                                   0.620218
rel_frequency                                    0.146345
rel_letters_count                               -0.139997
rel_orthographic_density                        -0.234330
rel_synonyms_count                               1.320755
rel_aoa * rel_clustering                         0.199074
rel_aoa * rel_frequency                         -0.088039
rel_aoa * rel_letters_count                      0.038531
rel_aoa * rel_orthographic_density               0.317568
rel_aoa * rel_synonyms_count                     0.062210
rel_clustering * rel_frequency                   0.271641
rel_clustering * rel_letters_count              -0.145115
rel_clustering * rel_orthographic_density       -0.516256
rel_clustering * rel_synonyms_count             -0.688996
rel_frequency * rel_letters_count               -0.049044
rel_frequency * rel_orthographic_density         0.096268
rel_frequency * rel_synonyms_count              -0.148585
rel_letters_count * rel_orthographic_density    -0.042333
rel_letters_count * rel_synonyms_count          -0.084688
rel_orthographic_density * rel_synonyms_count    1.015226
dtype: float64

Regressing rel aoa with 274 measures, no interactions
           ^^^^^^^
R^2 = 0.22433204778016524

intercept                   0.460461
rel_aoa                     0.491969
rel_clustering             -0.038329
rel_frequency              -0.084727
rel_letters_count           0.061961
rel_orthographic_density    0.167706
rel_synonyms_count          0.234421
dtype: float64

Regressing rel aoa with 274 measures, with interactions
           ^^^^^^^
R^2 = 0.29529080093321447

intercept                                        0.777245
rel_aoa                                          0.593245
rel_clustering                                   0.159595
rel_frequency                                    0.069260
rel_letters_count                                0.081277
rel_orthographic_density                         0.770568
rel_synonyms_count                               0.873001
rel_aoa * rel_clustering                         0.066110
rel_aoa * rel_frequency                         -0.023965
rel_aoa * rel_letters_count                     -0.015979
rel_aoa * rel_orthographic_density               0.175898
rel_aoa * rel_synonyms_count                    -0.047376
rel_clustering * rel_frequency                   0.129204
rel_clustering * rel_letters_count              -0.117809
rel_clustering * rel_orthographic_density       -0.318416
rel_clustering * rel_synonyms_count             -0.216965
rel_frequency * rel_letters_count               -0.002027
rel_frequency * rel_orthographic_density         0.238883
rel_frequency * rel_synonyms_count              -0.105883
rel_letters_count * rel_orthographic_density    -0.031789
rel_letters_count * rel_synonyms_count          -0.027548
rel_orthographic_density * rel_synonyms_count    0.730041
dtype: float64

Regressing global aoa with 274 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.1403523249613121

intercept                      5.544723
global_aoa                     0.433659
global_clustering             -0.015707
global_frequency              -0.247056
global_letters_count           0.172254
global_orthographic_density    0.193019
global_synonyms_count          0.320804
rel_aoa                       -0.200095
rel_clustering                 0.044932
rel_frequency                  0.131474
rel_letters_count             -0.111777
rel_orthographic_density      -0.220202
rel_synonyms_count             0.033088
dtype: float64

Regressing global aoa with 274 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.4893736065187544

intercept                                                 202.785791
global_aoa                                                 -3.861413
global_clustering                                          37.674487
global_frequency                                           -4.800752
global_letters_count                                       -6.282717
global_orthographic_density                               -26.018724
global_synonyms_count                                     -45.850424
rel_aoa                                                     2.969750
rel_clustering                                            -18.443893
rel_frequency                                               8.159945
rel_letters_count                                           7.337487
rel_orthographic_density                                   13.752931
rel_synonyms_count                                         23.459392
global_aoa * global_clustering                             -0.865141
global_aoa * global_frequency                              -0.099714
global_aoa * global_letters_count                          -0.048568
global_aoa * global_orthographic_density                    0.045911
global_aoa * global_synonyms_count                          1.524615
global_aoa * rel_aoa                                        0.063963
global_aoa * rel_clustering                                 0.571562
global_aoa * rel_frequency                                  0.039992
global_aoa * rel_letters_count                             -0.160878
global_aoa * rel_orthographic_density                      -0.245487
global_aoa * rel_synonyms_count                            -1.551827
global_clustering * global_frequency                       -1.281642
global_clustering * global_letters_count                   -1.105821
global_clustering * global_orthographic_density            -6.002156
global_clustering * global_synonyms_count                  -2.324252
global_clustering * rel_aoa                                 0.740897
global_clustering * rel_clustering                          0.177523
global_clustering * rel_frequency                           1.654357
global_clustering * rel_letters_count                       0.610802
global_clustering * rel_orthographic_density                3.322283
global_clustering * rel_synonyms_count                      0.473319
global_frequency * global_letters_count                    -0.054870
global_frequency * global_orthographic_density             -1.152773
global_frequency * global_synonyms_count                    1.294273
global_frequency * rel_aoa                                  0.022009
global_frequency * rel_clustering                           0.186287
global_frequency * rel_frequency                           -0.022842
global_frequency * rel_letters_count                       -0.167412
global_frequency * rel_orthographic_density                 0.678058
global_frequency * rel_synonyms_count                      -0.987368
global_letters_count * global_orthographic_density          0.477675
global_letters_count * global_synonyms_count                0.661576
global_letters_count * rel_aoa                              0.032742
global_letters_count * rel_clustering                       0.810150
global_letters_count * rel_frequency                        0.044039
global_letters_count * rel_letters_count                    0.081633
global_letters_count * rel_orthographic_density             0.143131
global_letters_count * rel_synonyms_count                   0.852177
global_orthographic_density * global_synonyms_count         3.075224
global_orthographic_density * rel_aoa                       0.369637
global_orthographic_density * rel_clustering                4.731563
global_orthographic_density * rel_frequency                 0.986191
global_orthographic_density * rel_letters_count            -0.510775
global_orthographic_density * rel_orthographic_density      0.271165
global_orthographic_density * rel_synonyms_count           -2.260441
global_synonyms_count * rel_aoa                            -1.084535
global_synonyms_count * rel_clustering                      2.137643
global_synonyms_count * rel_frequency                      -1.294941
global_synonyms_count * rel_letters_count                  -1.849476
global_synonyms_count * rel_orthographic_density           -3.442097
global_synonyms_count * rel_synonyms_count                  0.586534
rel_aoa * rel_clustering                                   -0.331573
rel_aoa * rel_frequency                                    -0.022687
rel_aoa * rel_letters_count                                 0.105351
rel_aoa * rel_orthographic_density                          0.119528
rel_aoa * rel_synonyms_count                                1.043684
rel_clustering * rel_frequency                             -0.473241
rel_clustering * rel_letters_count                         -0.422574
rel_clustering * rel_orthographic_density                  -2.528205
rel_clustering * rel_synonyms_count                        -0.570174
rel_frequency * rel_letters_count                           0.190272
rel_frequency * rel_orthographic_density                   -0.312863
rel_frequency * rel_synonyms_count                          0.766006
rel_letters_count * rel_orthographic_density                0.135408
rel_letters_count * rel_synonyms_count                      0.585630
rel_orthographic_density * rel_synonyms_count               3.734540
dtype: float64

Regressing rel aoa with 274 measures, no interactions
           ^^^^^^^
R^2 = 0.2743489299272971

intercept                      5.624618
global_aoa                    -0.428007
global_clustering              0.047851
global_frequency              -0.262138
global_letters_count           0.087246
global_orthographic_density    0.027100
global_synonyms_count          0.377430
rel_aoa                        0.775840
rel_clustering                 0.054748
rel_frequency                  0.130905
rel_letters_count              0.018495
rel_orthographic_density      -0.037577
rel_synonyms_count            -0.064939
dtype: float64

Regressing rel aoa with 274 measures, with interactions
           ^^^^^^^
R^2 = 0.5665088863515516

intercept                                                 149.635375
global_aoa                                                 -3.702861
global_clustering                                          26.013060
global_frequency                                           -4.018265
global_letters_count                                       -4.106881
global_orthographic_density                               -23.014864
global_synonyms_count                                     -38.073954
rel_aoa                                                     3.013880
rel_clustering                                             -8.220720
rel_frequency                                               6.001134
rel_letters_count                                           6.322699
rel_orthographic_density                                   14.777210
rel_synonyms_count                                         20.418915
global_aoa * global_clustering                             -0.613663
global_aoa * global_frequency                              -0.088522
global_aoa * global_letters_count                          -0.039500
global_aoa * global_orthographic_density                    0.155472
global_aoa * global_synonyms_count                          1.538414
global_aoa * rel_aoa                                        0.022915
global_aoa * rel_clustering                                 0.368079
global_aoa * rel_frequency                                 -0.003400
global_aoa * rel_letters_count                             -0.166793
global_aoa * rel_orthographic_density                      -0.378082
global_aoa * rel_synonyms_count                            -1.541331
global_clustering * global_frequency                       -0.919428
global_clustering * global_letters_count                   -0.639582
global_clustering * global_orthographic_density            -4.221400
global_clustering * global_synonyms_count                  -2.877980
global_clustering * rel_aoa                                 0.354182
global_clustering * rel_clustering                          0.100566
global_clustering * rel_frequency                           1.066453
global_clustering * rel_letters_count                       0.518635
global_clustering * rel_orthographic_density                2.534184
global_clustering * rel_synonyms_count                      1.278337
global_frequency * global_letters_count                    -0.004387
global_frequency * global_orthographic_density             -0.526244
global_frequency * global_synonyms_count                    0.594276
global_frequency * rel_aoa                                  0.014534
global_frequency * rel_clustering                          -0.206177
global_frequency * rel_frequency                           -0.000398
global_frequency * rel_letters_count                       -0.137647
global_frequency * rel_orthographic_density                 0.240524
global_frequency * rel_synonyms_count                      -0.461835
global_letters_count * global_orthographic_density          0.464392
global_letters_count * global_synonyms_count                0.266942
global_letters_count * rel_aoa                             -0.052662
global_letters_count * rel_clustering                       0.383444
global_letters_count * rel_frequency                       -0.008796
global_letters_count * rel_letters_count                    0.092381
global_letters_count * rel_orthographic_density             0.135164
global_letters_count * rel_synonyms_count                   0.875768
global_orthographic_density * global_synonyms_count         2.094582
global_orthographic_density * rel_aoa                       0.061667
global_orthographic_density * rel_clustering                3.420023
global_orthographic_density * rel_frequency                 0.378218
global_orthographic_density * rel_letters_count            -0.413555
global_orthographic_density * rel_orthographic_density      0.168499
global_orthographic_density * rel_synonyms_count           -1.347758
global_synonyms_count * rel_aoa                            -0.989334
global_synonyms_count * rel_clustering                      1.905269
global_synonyms_count * rel_frequency                      -0.848994
global_synonyms_count * rel_letters_count                  -1.312415
global_synonyms_count * rel_orthographic_density           -2.380365
global_synonyms_count * rel_synonyms_count                  0.425229
rel_aoa * rel_clustering                                   -0.104072
rel_aoa * rel_frequency                                     0.005551
rel_aoa * rel_letters_count                                 0.155032
rel_aoa * rel_orthographic_density                          0.269129
rel_aoa * rel_synonyms_count                                0.824968
rel_clustering * rel_frequency                              0.052776
rel_clustering * rel_letters_count                         -0.177514
rel_clustering * rel_orthographic_density                  -1.875833
rel_clustering * rel_synonyms_count                        -0.725320
rel_frequency * rel_letters_count                           0.190273
rel_frequency * rel_orthographic_density                    0.084981
rel_frequency * rel_synonyms_count                          0.395669
rel_letters_count * rel_orthographic_density                0.124049
rel_letters_count * rel_synonyms_count                      0.357172
rel_orthographic_density * rel_synonyms_count               2.444441
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12317986765165312

intercept                     -2.624864
global_aoa                    -0.015280
global_clustering              0.295997
global_frequency              -0.083936
global_letters_count          -0.069460
global_orthographic_density   -0.054399
global_synonyms_count         -0.107271
dtype: float64

Regressing global clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.26992549095203333

intercept                                             -2.188313
global_aoa                                             0.455039
global_clustering                                      0.142086
global_frequency                                      -0.076605
global_letters_count                                  -0.540275
global_orthographic_density                           -0.653851
global_synonyms_count                                 -2.250162
global_aoa * global_clustering                         0.051716
global_aoa * global_frequency                         -0.057327
global_aoa * global_letters_count                      0.039826
global_aoa * global_orthographic_density               0.049334
global_aoa * global_synonyms_count                     0.057028
global_clustering * global_frequency                  -0.023004
global_clustering * global_letters_count               0.039707
global_clustering * global_orthographic_density       -0.068776
global_clustering * global_synonyms_count             -0.319285
global_frequency * global_letters_count                0.044633
global_frequency * global_orthographic_density        -0.003509
global_frequency * global_synonyms_count              -0.052872
global_letters_count * global_orthographic_density    -0.050707
global_letters_count * global_synonyms_count           0.041228
global_orthographic_density * global_synonyms_count    0.062683
dtype: float64

Regressing rel clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1037906719075885

intercept                      2.953116
global_aoa                    -0.008355
global_clustering              0.212550
global_frequency              -0.068991
global_letters_count          -0.079958
global_orthographic_density   -0.050514
global_synonyms_count         -0.175212
dtype: float64

Regressing rel clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.19738166911662147

intercept                                              4.815569
global_aoa                                             0.297835
global_clustering                                      0.352081
global_frequency                                      -0.102238
global_letters_count                                  -0.579309
global_orthographic_density                           -0.268671
global_synonyms_count                                 -2.610366
global_aoa * global_clustering                         0.041213
global_aoa * global_frequency                         -0.043084
global_aoa * global_letters_count                      0.035983
global_aoa * global_orthographic_density               0.060551
global_aoa * global_synonyms_count                     0.044556
global_clustering * global_frequency                  -0.032714
global_clustering * global_letters_count               0.002161
global_clustering * global_orthographic_density        0.008884
global_clustering * global_synonyms_count             -0.333650
global_frequency * global_letters_count                0.025696
global_frequency * global_orthographic_density        -0.007398
global_frequency * global_synonyms_count              -0.024637
global_letters_count * global_orthographic_density    -0.034683
global_letters_count * global_synonyms_count           0.052350
global_orthographic_density * global_synonyms_count    0.036248
dtype: float64

Regressing global clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.07054113195760492

intercept                  -5.753990
rel_aoa                     0.024157
rel_clustering              0.250427
rel_frequency              -0.011178
rel_letters_count          -0.070239
rel_orthographic_density   -0.035368
rel_synonyms_count         -0.100753
dtype: float64

Regressing global clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17162044787322284

intercept                                       -5.746713
rel_aoa                                         -0.045530
rel_clustering                                   0.235181
rel_frequency                                   -0.023618
rel_letters_count                               -0.062815
rel_orthographic_density                         0.063353
rel_synonyms_count                              -0.202292
rel_aoa * rel_clustering                         0.055301
rel_aoa * rel_frequency                         -0.025655
rel_aoa * rel_letters_count                      0.014215
rel_aoa * rel_orthographic_density               0.069416
rel_aoa * rel_synonyms_count                     0.059147
rel_clustering * rel_frequency                   0.028762
rel_clustering * rel_letters_count              -0.034632
rel_clustering * rel_orthographic_density       -0.147273
rel_clustering * rel_synonyms_count             -0.408260
rel_frequency * rel_letters_count                0.013425
rel_frequency * rel_orthographic_density         0.013031
rel_frequency * rel_synonyms_count              -0.075451
rel_letters_count * rel_orthographic_density    -0.030042
rel_letters_count * rel_synonyms_count           0.018910
rel_orthographic_density * rel_synonyms_count    0.099948
dtype: float64

Regressing rel clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.17050222388147174

intercept                   0.341313
rel_aoa                    -0.004276
rel_clustering              0.451062
rel_frequency              -0.006622
rel_letters_count          -0.057324
rel_orthographic_density   -0.037638
rel_synonyms_count         -0.095723
dtype: float64

Regressing rel clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2313139949439127

intercept                                        0.375718
rel_aoa                                         -0.018176
rel_clustering                                   0.417734
rel_frequency                                   -0.004742
rel_letters_count                               -0.046110
rel_orthographic_density                         0.035879
rel_synonyms_count                              -0.172327
rel_aoa * rel_clustering                         0.037381
rel_aoa * rel_frequency                         -0.015875
rel_aoa * rel_letters_count                      0.002259
rel_aoa * rel_orthographic_density               0.052588
rel_aoa * rel_synonyms_count                     0.052282
rel_clustering * rel_frequency                   0.002249
rel_clustering * rel_letters_count              -0.020524
rel_clustering * rel_orthographic_density       -0.060389
rel_clustering * rel_synonyms_count             -0.312187
rel_frequency * rel_letters_count                0.009489
rel_frequency * rel_orthographic_density         0.009789
rel_frequency * rel_synonyms_count              -0.048992
rel_letters_count * rel_orthographic_density    -0.023805
rel_letters_count * rel_synonyms_count           0.044693
rel_orthographic_density * rel_synonyms_count    0.131330
dtype: float64

Regressing global clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.17517926132894057

intercept                      0.248380
global_aoa                    -0.059206
global_clustering              0.385331
global_frequency              -0.245281
global_letters_count          -0.137831
global_orthographic_density    0.013613
global_synonyms_count         -0.139341
rel_aoa                        0.062979
rel_clustering                -0.083554
rel_frequency                  0.181759
rel_letters_count              0.068377
rel_orthographic_density      -0.068680
rel_synonyms_count             0.009158
dtype: float64

Regressing global clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.48358170651449595

intercept                                                 10.509134
global_aoa                                                 0.001539
global_clustering                                          2.962396
global_frequency                                          -2.196761
global_letters_count                                       0.359428
global_orthographic_density                                5.591001
global_synonyms_count                                      4.009927
rel_aoa                                                   -0.635731
rel_clustering                                            -5.001013
rel_frequency                                              1.579294
rel_letters_count                                          0.060905
rel_orthographic_density                                  -5.853431
rel_synonyms_count                                        -7.024093
global_aoa * global_clustering                            -0.116156
global_aoa * global_frequency                             -0.044227
global_aoa * global_letters_count                          0.006475
global_aoa * global_orthographic_density                  -0.248497
global_aoa * global_synonyms_count                        -0.147067
global_aoa * rel_aoa                                       0.034372
global_aoa * rel_clustering                                0.287125
global_aoa * rel_frequency                                -0.019744
global_aoa * rel_letters_count                             0.022600
global_aoa * rel_orthographic_density                      0.238289
global_aoa * rel_synonyms_count                            0.333375
global_clustering * global_frequency                      -0.336704
global_clustering * global_letters_count                   0.190783
global_clustering * global_orthographic_density            0.406741
global_clustering * global_synonyms_count                 -0.099871
global_clustering * rel_aoa                                0.141664
global_clustering * rel_clustering                        -0.235348
global_clustering * rel_frequency                          0.280344
global_clustering * rel_letters_count                      0.008781
global_clustering * rel_orthographic_density              -0.291274
global_clustering * rel_synonyms_count                     0.335694
global_frequency * global_letters_count                    0.098045
global_frequency * global_orthographic_density            -0.059879
global_frequency * global_synonyms_count                  -0.023084
global_frequency * rel_aoa                                 0.068323
global_frequency * rel_clustering                          0.198426
global_frequency * rel_frequency                           0.003586
global_frequency * rel_letters_count                      -0.066100
global_frequency * rel_orthographic_density                0.167240
global_frequency * rel_synonyms_count                      0.200044
global_letters_count * global_orthographic_density        -0.154970
global_letters_count * global_synonyms_count              -0.406208
global_letters_count * rel_aoa                             0.063543
global_letters_count * rel_clustering                     -0.075350
global_letters_count * rel_frequency                      -0.000253
global_letters_count * rel_letters_count                   0.018142
global_letters_count * rel_orthographic_density            0.130747
global_letters_count * rel_synonyms_count                  0.594920
global_orthographic_density * global_synonyms_count       -1.029103
global_orthographic_density * rel_aoa                      0.214572
global_orthographic_density * rel_clustering               0.106937
global_orthographic_density * rel_frequency                0.160947
global_orthographic_density * rel_letters_count            0.216312
global_orthographic_density * rel_orthographic_density    -0.032572
global_orthographic_density * rel_synonyms_count           1.255459
global_synonyms_count * rel_aoa                            0.050913
global_synonyms_count * rel_clustering                    -0.279514
global_synonyms_count * rel_frequency                     -0.059268
global_synonyms_count * rel_letters_count                  0.062107
global_synonyms_count * rel_orthographic_density           0.115809
global_synonyms_count * rel_synonyms_count                 0.057424
rel_aoa * rel_clustering                                  -0.189990
rel_aoa * rel_frequency                                   -0.038232
rel_aoa * rel_letters_count                               -0.064886
rel_aoa * rel_orthographic_density                        -0.079723
rel_aoa * rel_synonyms_count                              -0.155180
rel_clustering * rel_frequency                            -0.176095
rel_clustering * rel_letters_count                        -0.115096
rel_clustering * rel_orthographic_density                 -0.237791
rel_clustering * rel_synonyms_count                       -0.397711
rel_frequency * rel_letters_count                         -0.000364
rel_frequency * rel_orthographic_density                  -0.237342
rel_frequency * rel_synonyms_count                        -0.176623
rel_letters_count * rel_orthographic_density              -0.201891
rel_letters_count * rel_synonyms_count                    -0.213927
rel_orthographic_density * rel_synonyms_count             -0.118936
dtype: float64

Regressing rel clustering with 232 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.2865074362586241

intercept                      0.712318
global_aoa                    -0.045505
global_clustering             -0.475532
global_frequency              -0.220485
global_letters_count          -0.144094
global_orthographic_density    0.015116
global_synonyms_count         -0.110041
rel_aoa                        0.045829
rel_clustering                 0.860892
rel_frequency                  0.163771
rel_letters_count              0.076018
rel_orthographic_density      -0.084605
rel_synonyms_count            -0.020099
dtype: float64

Regressing rel clustering with 232 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.5422905324481866

intercept                                                 19.009584
global_aoa                                                -0.524229
global_clustering                                          2.777498
global_frequency                                          -2.307953
global_letters_count                                      -0.675629
global_orthographic_density                                3.993541
global_synonyms_count                                      3.223833
rel_aoa                                                   -0.433369
rel_clustering                                            -4.412729
rel_frequency                                              1.194300
rel_letters_count                                          0.537952
rel_orthographic_density                                  -4.776099
rel_synonyms_count                                        -6.309050
global_aoa * global_clustering                            -0.149877
global_aoa * global_frequency                             -0.036720
global_aoa * global_letters_count                          0.046458
global_aoa * global_orthographic_density                  -0.200768
global_aoa * global_synonyms_count                        -0.088475
global_aoa * rel_aoa                                       0.041122
global_aoa * rel_clustering                                0.235901
global_aoa * rel_frequency                                -0.031860
global_aoa * rel_letters_count                            -0.005573
global_aoa * rel_orthographic_density                      0.240791
global_aoa * rel_synonyms_count                            0.215879
global_clustering * global_frequency                      -0.324050
global_clustering * global_letters_count                   0.102467
global_clustering * global_orthographic_density            0.236952
global_clustering * global_synonyms_count                 -0.055118
global_clustering * rel_aoa                                0.134748
global_clustering * rel_clustering                        -0.251319
global_clustering * rel_frequency                          0.202699
global_clustering * rel_letters_count                      0.045856
global_clustering * rel_orthographic_density              -0.139983
global_clustering * rel_synonyms_count                     0.252444
global_frequency * global_letters_count                    0.118703
global_frequency * global_orthographic_density            -0.039944
global_frequency * global_synonyms_count                  -0.039607
global_frequency * rel_aoa                                 0.063350
global_frequency * rel_clustering                          0.193057
global_frequency * rel_frequency                           0.003941
global_frequency * rel_letters_count                      -0.061780
global_frequency * rel_orthographic_density                0.160399
global_frequency * rel_synonyms_count                      0.215935
global_letters_count * global_orthographic_density        -0.132447
global_letters_count * global_synonyms_count              -0.296173
global_letters_count * rel_aoa                             0.027192
global_letters_count * rel_clustering                      0.026834
global_letters_count * rel_frequency                       0.001924
global_letters_count * rel_letters_count                   0.011987
global_letters_count * rel_orthographic_density            0.113033
global_letters_count * rel_synonyms_count                  0.544510
global_orthographic_density * global_synonyms_count       -0.802268
global_orthographic_density * rel_aoa                      0.195465
global_orthographic_density * rel_clustering               0.225725
global_orthographic_density * rel_frequency                0.152031
global_orthographic_density * rel_letters_count            0.173177
global_orthographic_density * rel_orthographic_density    -0.029390
global_orthographic_density * rel_synonyms_count           0.981092
global_synonyms_count * rel_aoa                           -0.020881
global_synonyms_count * rel_clustering                    -0.195115
global_synonyms_count * rel_frequency                     -0.038500
global_synonyms_count * rel_letters_count                 -0.012936
global_synonyms_count * rel_orthographic_density          -0.011264
global_synonyms_count * rel_synonyms_count                 0.039562
rel_aoa * rel_clustering                                  -0.168572
rel_aoa * rel_frequency                                   -0.022054
rel_aoa * rel_letters_count                               -0.041244
rel_aoa * rel_orthographic_density                        -0.082257
rel_aoa * rel_synonyms_count                              -0.082158
rel_clustering * rel_frequency                            -0.123699
rel_clustering * rel_letters_count                        -0.158242
rel_clustering * rel_orthographic_density                 -0.367244
rel_clustering * rel_synonyms_count                       -0.363357
rel_frequency * rel_letters_count                         -0.010348
rel_frequency * rel_orthographic_density                  -0.215837
rel_frequency * rel_synonyms_count                        -0.188053
rel_letters_count * rel_orthographic_density              -0.177188
rel_letters_count * rel_synonyms_count                    -0.191015
rel_orthographic_density * rel_synonyms_count             -0.035694
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08897975348261489

intercept                      3.009594
global_aoa                     0.059561
global_clustering             -0.286452
global_frequency              -0.020924
global_letters_count           0.250577
global_orthographic_density   -0.068925
global_synonyms_count         -0.082024
dtype: float64

Regressing global letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15102070382926347

intercept                                             -16.990730
global_aoa                                              1.694596
global_clustering                                      -1.684721
global_frequency                                        2.195931
global_letters_count                                    0.814175
global_orthographic_density                            -2.809159
global_synonyms_count                                   1.085121
global_aoa * global_clustering                          0.165377
global_aoa * global_frequency                           0.013493
global_aoa * global_letters_count                      -0.115205
global_aoa * global_orthographic_density               -0.070440
global_aoa * global_synonyms_count                      0.016919
global_clustering * global_frequency                    0.273385
global_clustering * global_letters_count               -0.257313
global_clustering * global_orthographic_density        -0.573810
global_clustering * global_synonyms_count               0.131117
global_frequency * global_letters_count                -0.125707
global_frequency * global_orthographic_density          0.015033
global_frequency * global_synonyms_count                0.046809
global_letters_count * global_orthographic_density     -0.039272
global_letters_count * global_synonyms_count           -0.130903
global_orthographic_density * global_synonyms_count    -0.138247
dtype: float64

Regressing rel letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.06602140882695329

intercept                     -1.359860
global_aoa                     0.053700
global_clustering             -0.382560
global_frequency              -0.065124
global_letters_count           0.260148
global_orthographic_density    0.113049
global_synonyms_count         -0.215543
dtype: float64

Regressing rel letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11478060479218331

intercept                                             -17.398193
global_aoa                                              1.550736
global_clustering                                      -2.316515
global_frequency                                        1.946045
global_letters_count                                   -0.190567
global_orthographic_density                            -3.319239
global_synonyms_count                                  -0.132899
global_aoa * global_clustering                          0.195840
global_aoa * global_frequency                          -0.002587
global_aoa * global_letters_count                      -0.062585
global_aoa * global_orthographic_density                0.006766
global_aoa * global_synonyms_count                      0.029243
global_clustering * global_frequency                    0.302872
global_clustering * global_letters_count               -0.249438
global_clustering * global_orthographic_density        -0.480299
global_clustering * global_synonyms_count              -0.063276
global_frequency * global_letters_count                -0.054743
global_frequency * global_orthographic_density          0.086408
global_frequency * global_synonyms_count               -0.055944
global_letters_count * global_orthographic_density     -0.054384
global_letters_count * global_synonyms_count           -0.031674
global_orthographic_density * global_synonyms_count    -0.005612
dtype: float64

Regressing global letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.06257398925224733

intercept                   5.814625
rel_aoa                    -0.108533
rel_clustering             -0.086205
rel_frequency               0.039883
rel_letters_count           0.165006
rel_orthographic_density   -0.331490
rel_synonyms_count         -0.050772
dtype: float64

Regressing global letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09340318599725293

intercept                                        5.893698
rel_aoa                                         -0.188301
rel_clustering                                   0.313510
rel_frequency                                    0.139431
rel_letters_count                                0.215281
rel_orthographic_density                        -0.158462
rel_synonyms_count                              -0.131968
rel_aoa * rel_clustering                         0.036788
rel_aoa * rel_frequency                         -0.036480
rel_aoa * rel_letters_count                     -0.009061
rel_aoa * rel_orthographic_density               0.032647
rel_aoa * rel_synonyms_count                    -0.061555
rel_clustering * rel_frequency                   0.118735
rel_clustering * rel_letters_count              -0.020658
rel_clustering * rel_orthographic_density        0.114283
rel_clustering * rel_synonyms_count             -0.118805
rel_frequency * rel_letters_count               -0.013662
rel_frequency * rel_orthographic_density         0.118259
rel_frequency * rel_synonyms_count              -0.139905
rel_letters_count * rel_orthographic_density     0.037763
rel_letters_count * rel_synonyms_count          -0.050799
rel_orthographic_density * rel_synonyms_count    0.129526
dtype: float64

Regressing rel letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.16896276060935667

intercept                   1.150888
rel_aoa                    -0.077051
rel_clustering             -0.210975
rel_frequency              -0.167506
rel_letters_count           0.428528
rel_orthographic_density    0.090417
rel_synonyms_count         -0.031936
dtype: float64

Regressing rel letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.20373367600151626

intercept                                        1.180364
rel_aoa                                         -0.026800
rel_clustering                                   0.364912
rel_frequency                                   -0.140065
rel_letters_count                                0.587702
rel_orthographic_density                         0.372450
rel_synonyms_count                              -0.155978
rel_aoa * rel_clustering                         0.082181
rel_aoa * rel_frequency                         -0.005087
rel_aoa * rel_letters_count                     -0.078032
rel_aoa * rel_orthographic_density              -0.074786
rel_aoa * rel_synonyms_count                    -0.028786
rel_clustering * rel_frequency                   0.209064
rel_clustering * rel_letters_count               0.011695
rel_clustering * rel_orthographic_density        0.155891
rel_clustering * rel_synonyms_count              0.041667
rel_frequency * rel_letters_count                0.016982
rel_frequency * rel_orthographic_density         0.141938
rel_frequency * rel_synonyms_count              -0.094668
rel_letters_count * rel_orthographic_density     0.034326
rel_letters_count * rel_synonyms_count          -0.033753
rel_orthographic_density * rel_synonyms_count    0.098302
dtype: float64

Regressing global letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11858412674446273

intercept                     -0.924908
global_aoa                     0.237077
global_clustering             -0.461909
global_frequency               0.053769
global_letters_count           0.397191
global_orthographic_density    0.060939
global_synonyms_count         -0.095296
rel_aoa                       -0.253579
rel_clustering                 0.153855
rel_frequency                 -0.107545
rel_letters_count             -0.134488
rel_orthographic_density      -0.071717
rel_synonyms_count             0.027212
dtype: float64

Regressing global letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.37146443644114113

intercept                                                 39.493901
global_aoa                                                 3.108444
global_clustering                                         13.648600
global_frequency                                           1.680674
global_letters_count                                       0.196901
global_orthographic_density                               -9.716198
global_synonyms_count                                    -19.942579
rel_aoa                                                   -2.617994
rel_clustering                                           -12.112945
rel_frequency                                              2.835657
rel_letters_count                                         -1.896141
rel_orthographic_density                                  -1.519648
rel_synonyms_count                                        10.683748
global_aoa * global_clustering                            -0.127439
global_aoa * global_frequency                             -0.223741
global_aoa * global_letters_count                         -0.331506
global_aoa * global_orthographic_density                   0.071641
global_aoa * global_synonyms_count                         0.470620
global_aoa * rel_aoa                                       0.031588
global_aoa * rel_clustering                                0.277614
global_aoa * rel_frequency                                 0.180100
global_aoa * rel_letters_count                             0.089165
global_aoa * rel_orthographic_density                     -0.336367
global_aoa * rel_synonyms_count                           -0.326992
global_clustering * global_frequency                      -0.224730
global_clustering * global_letters_count                  -0.612348
global_clustering * global_orthographic_density           -3.166732
global_clustering * global_synonyms_count                 -1.193915
global_clustering * rel_aoa                                0.447807
global_clustering * rel_clustering                         0.222587
global_clustering * rel_frequency                          0.785704
global_clustering * rel_letters_count                     -0.058725
global_clustering * rel_orthographic_density               1.230637
global_clustering * rel_synonyms_count                     0.326151
global_frequency * global_letters_count                   -0.054230
global_frequency * global_orthographic_density            -0.674536
global_frequency * global_synonyms_count                   0.604935
global_frequency * rel_aoa                                 0.314255
global_frequency * rel_clustering                          0.539560
global_frequency * rel_frequency                           0.016497
global_frequency * rel_letters_count                      -0.026450
global_frequency * rel_orthographic_density                0.616747
global_frequency * rel_synonyms_count                     -0.429884
global_letters_count * global_orthographic_density        -0.403203
global_letters_count * global_synonyms_count               0.407657
global_letters_count * rel_aoa                             0.439258
global_letters_count * rel_clustering                      0.413320
global_letters_count * rel_frequency                      -0.056890
global_letters_count * rel_letters_count                   0.133964
global_letters_count * rel_orthographic_density            0.665418
global_letters_count * rel_synonyms_count                  0.067006
global_orthographic_density * global_synonyms_count        0.319830
global_orthographic_density * rel_aoa                     -0.046005
global_orthographic_density * rel_clustering               1.581330
global_orthographic_density * rel_frequency                0.502482
global_orthographic_density * rel_letters_count            0.315961
global_orthographic_density * rel_orthographic_density     0.441854
global_orthographic_density * rel_synonyms_count          -0.632880
global_synonyms_count * rel_aoa                           -0.466823
global_synonyms_count * rel_clustering                     1.515333
global_synonyms_count * rel_frequency                     -0.801936
global_synonyms_count * rel_letters_count                 -1.070702
global_synonyms_count * rel_orthographic_density          -0.878182
global_synonyms_count * rel_synonyms_count                -0.311250
rel_aoa * rel_clustering                                  -0.366520
rel_aoa * rel_frequency                                   -0.253420
rel_aoa * rel_letters_count                               -0.373709
rel_aoa * rel_orthographic_density                         0.269217
rel_aoa * rel_synonyms_count                               0.425364
rel_clustering * rel_frequency                            -0.685510
rel_clustering * rel_letters_count                         0.182832
rel_clustering * rel_orthographic_density                  0.415431
rel_clustering * rel_synonyms_count                       -0.597390
rel_frequency * rel_letters_count                          0.097103
rel_frequency * rel_orthographic_density                  -0.287316
rel_frequency * rel_synonyms_count                         0.604561
rel_letters_count * rel_orthographic_density              -0.161153
rel_letters_count * rel_synonyms_count                     0.468774
rel_orthographic_density * rel_synonyms_count              1.350615
dtype: float64

Regressing rel letters_count with 297 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.22287409380636758

intercept                     -1.569818
global_aoa                     0.172597
global_clustering             -0.501206
global_frequency               0.071039
global_letters_count          -0.475079
global_orthographic_density    0.017258
global_synonyms_count         -0.169353
rel_aoa                       -0.189591
rel_clustering                 0.213896
rel_frequency                 -0.144771
rel_letters_count              0.765103
rel_orthographic_density      -0.098066
rel_synonyms_count             0.083164
dtype: float64

Regressing rel letters_count with 297 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.433013086849089

intercept                                                 27.405500
global_aoa                                                 1.705907
global_clustering                                         10.647799
global_frequency                                           2.168373
global_letters_count                                       0.294775
global_orthographic_density                               -8.820264
global_synonyms_count                                    -17.823757
rel_aoa                                                   -1.433896
rel_clustering                                           -10.594934
rel_frequency                                              2.174269
rel_letters_count                                         -2.327373
rel_orthographic_density                                  -4.932278
rel_synonyms_count                                         8.710548
global_aoa * global_clustering                            -0.232940
global_aoa * global_frequency                             -0.170843
global_aoa * global_letters_count                         -0.265466
global_aoa * global_orthographic_density                  -0.025769
global_aoa * global_synonyms_count                         0.538079
global_aoa * rel_aoa                                       0.034547
global_aoa * rel_clustering                                0.453029
global_aoa * rel_frequency                                 0.144963
global_aoa * rel_letters_count                             0.038584
global_aoa * rel_orthographic_density                     -0.232796
global_aoa * rel_synonyms_count                           -0.462590
global_clustering * global_frequency                      -0.091993
global_clustering * global_letters_count                  -0.350144
global_clustering * global_orthographic_density           -2.846416
global_clustering * global_synonyms_count                 -0.772193
global_clustering * rel_aoa                                0.465592
global_clustering * rel_clustering                         0.173832
global_clustering * rel_frequency                          0.602741
global_clustering * rel_letters_count                     -0.329959
global_clustering * rel_orthographic_density               0.791961
global_clustering * rel_synonyms_count                    -0.173145
global_frequency * global_letters_count                   -0.072847
global_frequency * global_orthographic_density            -0.611182
global_frequency * global_synonyms_count                   0.615993
global_frequency * rel_aoa                                 0.268041
global_frequency * rel_clustering                          0.388592
global_frequency * rel_frequency                           0.015733
global_frequency * rel_letters_count                       0.003943
global_frequency * rel_orthographic_density                0.662332
global_frequency * rel_synonyms_count                     -0.451305
global_letters_count * global_orthographic_density        -0.243483
global_letters_count * global_synonyms_count               0.424113
global_letters_count * rel_aoa                             0.330429
global_letters_count * rel_clustering                      0.157198
global_letters_count * rel_frequency                      -0.089856
global_letters_count * rel_letters_count                   0.128832
global_letters_count * rel_orthographic_density            0.588204
global_letters_count * rel_synonyms_count                  0.031493
global_orthographic_density * global_synonyms_count        0.319649
global_orthographic_density * rel_aoa                     -0.045286
global_orthographic_density * rel_clustering               1.601830
global_orthographic_density * rel_frequency                0.402433
global_orthographic_density * rel_letters_count            0.203182
global_orthographic_density * rel_orthographic_density     0.381104
global_orthographic_density * rel_synonyms_count          -0.600223
global_synonyms_count * rel_aoa                           -0.445667
global_synonyms_count * rel_clustering                     1.422566
global_synonyms_count * rel_frequency                     -0.776114
global_synonyms_count * rel_letters_count                 -1.079251
global_synonyms_count * rel_orthographic_density          -0.642958
global_synonyms_count * rel_synonyms_count                -0.317989
rel_aoa * rel_clustering                                  -0.509385
rel_aoa * rel_frequency                                   -0.216807
rel_aoa * rel_letters_count                               -0.303278
rel_aoa * rel_orthographic_density                         0.222026
rel_aoa * rel_synonyms_count                               0.496246
rel_clustering * rel_frequency                            -0.510727
rel_clustering * rel_letters_count                         0.466999
rel_clustering * rel_orthographic_density                  0.487073
rel_clustering * rel_synonyms_count                       -0.310694
rel_frequency * rel_letters_count                          0.110710
rel_frequency * rel_orthographic_density                  -0.339326
rel_frequency * rel_synonyms_count                         0.615084
rel_letters_count * rel_orthographic_density              -0.135138
rel_letters_count * rel_synonyms_count                     0.464092
rel_orthographic_density * rel_synonyms_count              1.076034
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10833017520573651

intercept                      0.379190
global_aoa                    -0.003587
global_clustering             -0.001628
global_frequency              -0.005202
global_letters_count          -0.015590
global_orthographic_density    0.081524
global_synonyms_count          0.235023
dtype: float64

Regressing global synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13767315928893598

intercept                                             -3.948404
global_aoa                                             0.400996
global_clustering                                     -0.190050
global_frequency                                       0.161289
global_letters_count                                   0.352548
global_orthographic_density                            0.829578
global_synonyms_count                                 -0.296390
global_aoa * global_clustering                         0.025747
global_aoa * global_frequency                         -0.009283
global_aoa * global_letters_count                     -0.022738
global_aoa * global_orthographic_density              -0.040064
global_aoa * global_synonyms_count                     0.058088
global_clustering * global_frequency                  -0.006336
global_clustering * global_letters_count               0.001068
global_clustering * global_orthographic_density        0.046172
global_clustering * global_synonyms_count              0.007170
global_frequency * global_letters_count               -0.019580
global_frequency * global_orthographic_density        -0.024341
global_frequency * global_synonyms_count               0.037154
global_letters_count * global_orthographic_density     0.006084
global_letters_count * global_synonyms_count          -0.029950
global_orthographic_density * global_synonyms_count    0.019967
dtype: float64

Regressing rel synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0708561462763998

intercept                      0.126907
global_aoa                    -0.006757
global_clustering             -0.013192
global_frequency               0.001132
global_letters_count          -0.026499
global_orthographic_density    0.037898
global_synonyms_count          0.178118
dtype: float64

Regressing rel synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.09732987979440122

intercept                                             -3.846174
global_aoa                                             0.316827
global_clustering                                     -0.237732
global_frequency                                       0.146640
global_letters_count                                   0.330069
global_orthographic_density                            0.626553
global_synonyms_count                                 -0.104645
global_aoa * global_clustering                         0.019440
global_aoa * global_frequency                         -0.007099
global_aoa * global_letters_count                     -0.020852
global_aoa * global_orthographic_density              -0.025081
global_aoa * global_synonyms_count                     0.050793
global_clustering * global_frequency                  -0.002708
global_clustering * global_letters_count               0.006802
global_clustering * global_orthographic_density        0.050892
global_clustering * global_synonyms_count              0.012027
global_frequency * global_letters_count               -0.016257
global_frequency * global_orthographic_density        -0.013797
global_frequency * global_synonyms_count               0.023323
global_letters_count * global_orthographic_density     0.007541
global_letters_count * global_synonyms_count          -0.031077
global_orthographic_density * global_synonyms_count   -0.018580
dtype: float64

Regressing global synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11255325805788552

intercept                   0.536355
rel_aoa                     0.024763
rel_clustering             -0.055005
rel_frequency              -0.004947
rel_letters_count          -0.038968
rel_orthographic_density    0.068345
rel_synonyms_count          0.222463
dtype: float64

Regressing global synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16735371812468214

intercept                                        0.605100
rel_aoa                                         -0.017506
rel_clustering                                  -0.129048
rel_frequency                                    0.040196
rel_letters_count                               -0.082842
rel_orthographic_density                         0.061665
rel_synonyms_count                               0.165724
rel_aoa * rel_clustering                         0.018727
rel_aoa * rel_frequency                         -0.007592
rel_aoa * rel_letters_count                      0.026772
rel_aoa * rel_orthographic_density               0.054000
rel_aoa * rel_synonyms_count                     0.051613
rel_clustering * rel_frequency                  -0.040891
rel_clustering * rel_letters_count              -0.021201
rel_clustering * rel_orthographic_density        0.008920
rel_clustering * rel_synonyms_count             -0.029933
rel_frequency * rel_letters_count               -0.011338
rel_frequency * rel_orthographic_density         0.006854
rel_frequency * rel_synonyms_count               0.000921
rel_letters_count * rel_orthographic_density    -0.005745
rel_letters_count * rel_synonyms_count           0.016497
rel_orthographic_density * rel_synonyms_count    0.025240
dtype: float64

Regressing rel synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.17446655640671338

intercept                   0.194004
rel_aoa                     0.006999
rel_clustering              0.008159
rel_frequency               0.000948
rel_letters_count          -0.038062
rel_orthographic_density    0.019898
rel_synonyms_count          0.364514
dtype: float64

Regressing rel synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21331997602637695

intercept                                        0.266042
rel_aoa                                         -0.026364
rel_clustering                                  -0.092225
rel_frequency                                    0.048998
rel_letters_count                               -0.078734
rel_orthographic_density                         0.007583
rel_synonyms_count                               0.384984
rel_aoa * rel_clustering                         0.018238
rel_aoa * rel_frequency                         -0.004194
rel_aoa * rel_letters_count                      0.022348
rel_aoa * rel_orthographic_density               0.041277
rel_aoa * rel_synonyms_count                     0.028110
rel_clustering * rel_frequency                  -0.039288
rel_clustering * rel_letters_count              -0.011019
rel_clustering * rel_orthographic_density       -0.003024
rel_clustering * rel_synonyms_count             -0.003785
rel_frequency * rel_letters_count               -0.013672
rel_frequency * rel_orthographic_density         0.007975
rel_frequency * rel_synonyms_count               0.008424
rel_letters_count * rel_orthographic_density     0.003152
rel_letters_count * rel_synonyms_count           0.002461
rel_orthographic_density * rel_synonyms_count    0.023608
dtype: float64

Regressing global synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13636687406034587

intercept                      0.974786
global_aoa                    -0.032570
global_clustering              0.132248
global_frequency              -0.005989
global_letters_count           0.057395
global_orthographic_density    0.168470
global_synonyms_count          0.096502
rel_aoa                        0.045136
rel_clustering                -0.156316
rel_frequency                  0.003978
rel_letters_count             -0.075892
rel_orthographic_density      -0.078583
rel_synonyms_count             0.141463
dtype: float64

Regressing global synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3700391168492013

intercept                                                  7.924708
global_aoa                                                 0.421172
global_clustering                                          4.336423
global_frequency                                          -0.035760
global_letters_count                                       0.517588
global_orthographic_density                                2.913640
global_synonyms_count                                      5.623209
rel_aoa                                                   -0.509623
rel_clustering                                            -6.953069
rel_frequency                                             -0.277431
rel_letters_count                                         -0.470761
rel_orthographic_density                                  -2.302668
rel_synonyms_count                                       -11.099976
global_aoa * global_clustering                            -0.084687
global_aoa * global_frequency                             -0.020524
global_aoa * global_letters_count                         -0.085610
global_aoa * global_orthographic_density                  -0.165210
global_aoa * global_synonyms_count                         0.027556
global_aoa * rel_aoa                                       0.001812
global_aoa * rel_clustering                                0.164539
global_aoa * rel_frequency                                 0.038530
global_aoa * rel_letters_count                             0.049190
global_aoa * rel_orthographic_density                      0.120625
global_aoa * rel_synonyms_count                            0.045016
global_clustering * global_frequency                      -0.187498
global_clustering * global_letters_count                  -0.116845
global_clustering * global_orthographic_density           -0.564955
global_clustering * global_synonyms_count                  0.492810
global_clustering * rel_aoa                               -0.025188
global_clustering * rel_clustering                        -0.128808
global_clustering * rel_frequency                          0.114200
global_clustering * rel_letters_count                      0.048360
global_clustering * rel_orthographic_density               0.401332
global_clustering * rel_synonyms_count                    -0.921216
global_frequency * global_letters_count                   -0.019327
global_frequency * global_orthographic_density            -0.394310
global_frequency * global_synonyms_count                  -0.055976
global_frequency * rel_aoa                                -0.007830
global_frequency * rel_clustering                          0.233248
global_frequency * rel_frequency                          -0.005897
global_frequency * rel_letters_count                      -0.021704
global_frequency * rel_orthographic_density                0.313659
global_frequency * rel_synonyms_count                      0.200074
global_letters_count * global_orthographic_density        -0.192026
global_letters_count * global_synonyms_count              -0.339707
global_letters_count * rel_aoa                             0.065522
global_letters_count * rel_clustering                      0.228922
global_letters_count * rel_frequency                       0.011250
global_letters_count * rel_letters_count                  -0.002191
global_letters_count * rel_orthographic_density            0.110340
global_letters_count * rel_synonyms_count                  0.423400
global_orthographic_density * global_synonyms_count       -0.277518
global_orthographic_density * rel_aoa                      0.068255
global_orthographic_density * rel_clustering               0.709088
global_orthographic_density * rel_frequency                0.359390
global_orthographic_density * rel_letters_count            0.312012
global_orthographic_density * rel_orthographic_density     0.092331
global_orthographic_density * rel_synonyms_count           0.494380
global_synonyms_count * rel_aoa                           -0.023666
global_synonyms_count * rel_clustering                    -0.332799
global_synonyms_count * rel_frequency                      0.132173
global_synonyms_count * rel_letters_count                  0.337567
global_synonyms_count * rel_orthographic_density           0.239083
global_synonyms_count * rel_synonyms_count                 0.196247
rel_aoa * rel_clustering                                  -0.022659
rel_aoa * rel_frequency                                   -0.016941
rel_aoa * rel_letters_count                                0.001610
rel_aoa * rel_orthographic_density                         0.056329
rel_aoa * rel_synonyms_count                               0.034607
rel_clustering * rel_frequency                            -0.222831
rel_clustering * rel_letters_count                        -0.167458
rel_clustering * rel_orthographic_density                 -0.476632
rel_clustering * rel_synonyms_count                        0.693807
rel_frequency * rel_letters_count                          0.017596
rel_frequency * rel_orthographic_density                  -0.232227
rel_frequency * rel_synonyms_count                        -0.272497
rel_letters_count * rel_orthographic_density              -0.196008
rel_letters_count * rel_synonyms_count                    -0.370491
rel_orthographic_density * rel_synonyms_count             -0.399319
dtype: float64

Regressing rel synonyms_count with 291 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.3040900439022398

intercept                      0.674434
global_aoa                    -0.032321
global_clustering              0.114782
global_frequency              -0.003667
global_letters_count           0.069339
global_orthographic_density    0.184293
global_synonyms_count         -0.756108
rel_aoa                        0.040730
rel_clustering                -0.119826
rel_frequency                  0.000685
rel_letters_count             -0.077250
rel_orthographic_density      -0.099139
rel_synonyms_count             1.059623
dtype: float64

Regressing rel synonyms_count with 291 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.4987189798320669

intercept                                                 15.034175
global_aoa                                                 0.129886
global_clustering                                          5.189225
global_frequency                                          -0.576173
global_letters_count                                       0.128029
global_orthographic_density                                3.266717
global_synonyms_count                                      2.680272
rel_aoa                                                   -0.108040
rel_clustering                                            -7.315851
rel_frequency                                              0.061113
rel_letters_count                                         -0.353117
rel_orthographic_density                                  -2.573687
rel_synonyms_count                                        -8.228881
global_aoa * global_clustering                            -0.109534
global_aoa * global_frequency                             -0.016180
global_aoa * global_letters_count                         -0.070999
global_aoa * global_orthographic_density                  -0.161886
global_aoa * global_synonyms_count                         0.075158
global_aoa * rel_aoa                                       0.000133
global_aoa * rel_clustering                                0.179185
global_aoa * rel_frequency                                 0.034294
global_aoa * rel_letters_count                             0.045837
global_aoa * rel_orthographic_density                      0.129422
global_aoa * rel_synonyms_count                           -0.003693
global_clustering * global_frequency                      -0.257448
global_clustering * global_letters_count                  -0.160902
global_clustering * global_orthographic_density           -0.435170
global_clustering * global_synonyms_count                  0.362161
global_clustering * rel_aoa                                0.009808
global_clustering * rel_clustering                        -0.097507
global_clustering * rel_frequency                          0.158619
global_clustering * rel_letters_count                      0.076437
global_clustering * rel_orthographic_density               0.312033
global_clustering * rel_synonyms_count                    -0.787196
global_frequency * global_letters_count                   -0.023952
global_frequency * global_orthographic_density            -0.370438
global_frequency * global_synonyms_count                   0.017294
global_frequency * rel_aoa                                -0.018357
global_frequency * rel_clustering                          0.283717
global_frequency * rel_frequency                          -0.008548
global_frequency * rel_letters_count                      -0.001641
global_frequency * rel_orthographic_density                0.294032
global_frequency * rel_synonyms_count                      0.132676
global_letters_count * global_orthographic_density        -0.140538
global_letters_count * global_synonyms_count              -0.261314
global_letters_count * rel_aoa                             0.054403
global_letters_count * rel_clustering                      0.262747
global_letters_count * rel_frequency                       0.025024
global_letters_count * rel_letters_count                  -0.005083
global_letters_count * rel_orthographic_density            0.070871
global_letters_count * rel_synonyms_count                  0.370501
global_orthographic_density * global_synonyms_count       -0.316037
global_orthographic_density * rel_aoa                      0.061235
global_orthographic_density * rel_clustering               0.599114
global_orthographic_density * rel_frequency                0.337559
global_orthographic_density * rel_letters_count            0.249503
global_orthographic_density * rel_orthographic_density     0.088336
global_orthographic_density * rel_synonyms_count           0.549021
global_synonyms_count * rel_aoa                           -0.044067
global_synonyms_count * rel_clustering                    -0.221774
global_synonyms_count * rel_frequency                      0.089158
global_synonyms_count * rel_letters_count                  0.270016
global_synonyms_count * rel_orthographic_density           0.290525
global_synonyms_count * rel_synonyms_count                 0.195582
rel_aoa * rel_clustering                                  -0.050521
rel_aoa * rel_frequency                                   -0.010169
rel_aoa * rel_letters_count                                0.004669
rel_aoa * rel_orthographic_density                         0.048469
rel_aoa * rel_synonyms_count                               0.030660
rel_clustering * rel_frequency                            -0.247075
rel_clustering * rel_letters_count                        -0.190914
rel_clustering * rel_orthographic_density                 -0.410052
rel_clustering * rel_synonyms_count                        0.557478
rel_frequency * rel_letters_count                         -0.005288
rel_frequency * rel_orthographic_density                  -0.210680
rel_frequency * rel_synonyms_count                        -0.239521
rel_letters_count * rel_orthographic_density              -0.143378
rel_letters_count * rel_synonyms_count                    -0.330496
rel_orthographic_density * rel_synonyms_count             -0.463696
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14914084279640016

intercept                      0.849488
global_aoa                     0.002483
global_clustering              0.059510
global_frequency               0.020284
global_letters_count          -0.000098
global_orthographic_density    0.354601
global_synonyms_count          0.200350
dtype: float64

Regressing global orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19903448105422372

intercept                                             -2.013203
global_aoa                                            -0.116167
global_clustering                                     -0.181520
global_frequency                                       0.302379
global_letters_count                                   0.145674
global_orthographic_density                            1.926574
global_synonyms_count                                 -0.289651
global_aoa * global_clustering                        -0.031713
global_aoa * global_frequency                         -0.018737
global_aoa * global_letters_count                      0.013405
global_aoa * global_orthographic_density               0.025969
global_aoa * global_synonyms_count                    -0.009239
global_clustering * global_frequency                   0.006495
global_clustering * global_letters_count               0.044476
global_clustering * global_orthographic_density        0.114305
global_clustering * global_synonyms_count             -0.015609
global_frequency * global_letters_count                0.001009
global_frequency * global_orthographic_density        -0.118663
global_frequency * global_synonyms_count               0.081762
global_letters_count * global_orthographic_density     0.009903
global_letters_count * global_synonyms_count          -0.025941
global_orthographic_density * global_synonyms_count   -0.101236
dtype: float64

Regressing rel orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.14993655922605487

intercept                     -1.649284
global_aoa                     0.005110
global_clustering              0.039081
global_frequency               0.045697
global_letters_count          -0.003285
global_orthographic_density    0.312918
global_synonyms_count          0.213927
dtype: float64

Regressing rel orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.18806946860134255

intercept                                             -0.538000
global_aoa                                            -0.229142
global_clustering                                      0.404518
global_frequency                                       0.074941
global_letters_count                                  -0.005467
global_orthographic_density                            1.453793
global_synonyms_count                                 -0.470341
global_aoa * global_clustering                        -0.035860
global_aoa * global_frequency                         -0.006683
global_aoa * global_letters_count                      0.015844
global_aoa * global_orthographic_density               0.002080
global_aoa * global_synonyms_count                    -0.015149
global_clustering * global_frequency                  -0.028566
global_clustering * global_letters_count               0.010508
global_clustering * global_orthographic_density        0.077266
global_clustering * global_synonyms_count             -0.050659
global_frequency * global_letters_count               -0.008895
global_frequency * global_orthographic_density        -0.090387
global_frequency * global_synonyms_count               0.064996
global_letters_count * global_orthographic_density     0.029310
global_letters_count * global_synonyms_count          -0.000626
global_orthographic_density * global_synonyms_count   -0.062596
dtype: float64

Regressing global orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13921030280776114

intercept                   1.469861
rel_aoa                    -0.000307
rel_clustering              0.060559
rel_frequency              -0.010432
rel_letters_count           0.049609
rel_orthographic_density    0.434795
rel_synonyms_count          0.213104
dtype: float64

Regressing global orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.17452418255998914

intercept                                        1.354461
rel_aoa                                          0.040302
rel_clustering                                   0.122750
rel_frequency                                   -0.083269
rel_letters_count                                0.010646
rel_orthographic_density                         0.244352
rel_synonyms_count                               0.366477
rel_aoa * rel_clustering                        -0.022460
rel_aoa * rel_frequency                         -0.002070
rel_aoa * rel_letters_count                     -0.008128
rel_aoa * rel_orthographic_density              -0.002774
rel_aoa * rel_synonyms_count                     0.002066
rel_clustering * rel_frequency                   0.012565
rel_clustering * rel_letters_count               0.054850
rel_clustering * rel_orthographic_density        0.150533
rel_clustering * rel_synonyms_count              0.025248
rel_frequency * rel_letters_count               -0.003037
rel_frequency * rel_orthographic_density        -0.087546
rel_frequency * rel_synonyms_count               0.051602
rel_letters_count * rel_orthographic_density    -0.018708
rel_letters_count * rel_synonyms_count           0.035247
rel_orthographic_density * rel_synonyms_count    0.084282
dtype: float64

Regressing rel orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.22866108828547926

intercept                  -0.532079
rel_aoa                    -0.010620
rel_clustering              0.030079
rel_frequency               0.042415
rel_letters_count           0.057254
rel_orthographic_density    0.505548
rel_synonyms_count          0.176267
dtype: float64

Regressing rel orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24936793029342674

intercept                                       -0.595091
rel_aoa                                         -0.007739
rel_clustering                                  -0.019519
rel_frequency                                    0.013845
rel_letters_count                                0.026365
rel_orthographic_density                         0.306932
rel_synonyms_count                               0.240747
rel_aoa * rel_clustering                        -0.036023
rel_aoa * rel_frequency                         -0.007676
rel_aoa * rel_letters_count                      0.004763
rel_aoa * rel_orthographic_density               0.003252
rel_aoa * rel_synonyms_count                    -0.002492
rel_clustering * rel_frequency                  -0.018102
rel_clustering * rel_letters_count               0.053668
rel_clustering * rel_orthographic_density        0.104913
rel_clustering * rel_synonyms_count              0.012471
rel_frequency * rel_letters_count               -0.010105
rel_frequency * rel_orthographic_density        -0.071182
rel_frequency * rel_synonyms_count               0.033500
rel_letters_count * rel_orthographic_density     0.009553
rel_letters_count * rel_synonyms_count           0.036968
rel_orthographic_density * rel_synonyms_count    0.049238
dtype: float64

Regressing global orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16943126061052405

intercept                      2.484930
global_aoa                     0.011573
global_clustering              0.087394
global_frequency              -0.013266
global_letters_count          -0.172249
global_orthographic_density    0.158609
global_synonyms_count          0.140735
rel_aoa                       -0.011982
rel_clustering                -0.016437
rel_frequency                  0.039313
rel_letters_count              0.183385
rel_orthographic_density       0.207503
rel_synonyms_count             0.074235
dtype: float64

Regressing global orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.396877479745554

intercept                                                -31.413725
global_aoa                                                 0.348262
global_clustering                                         -6.088490
global_frequency                                           1.420225
global_letters_count                                       0.674832
global_orthographic_density                                1.984340
global_synonyms_count                                      8.206579
rel_aoa                                                   -0.777429
rel_clustering                                             6.144063
rel_frequency                                             -2.091574
rel_letters_count                                         -0.176209
rel_orthographic_density                                  -0.518811
rel_synonyms_count                                        -2.190261
global_aoa * global_clustering                             0.208755
global_aoa * global_frequency                              0.061481
global_aoa * global_letters_count                          0.046171
global_aoa * global_orthographic_density                   0.067792
global_aoa * global_synonyms_count                        -0.445789
global_aoa * rel_aoa                                       0.015466
global_aoa * rel_clustering                               -0.278033
global_aoa * rel_frequency                                -0.063525
global_aoa * rel_letters_count                             0.008467
global_aoa * rel_orthographic_density                     -0.017417
global_aoa * rel_synonyms_count                            0.550365
global_clustering * global_frequency                       0.296586
global_clustering * global_letters_count                   0.107629
global_clustering * global_orthographic_density            0.289371
global_clustering * global_synonyms_count                  0.095649
global_clustering * rel_aoa                               -0.390195
global_clustering * rel_clustering                         0.134136
global_clustering * rel_frequency                         -0.514068
global_clustering * rel_letters_count                      0.129242
global_clustering * rel_orthographic_density               0.281323
global_clustering * rel_synonyms_count                     0.123125
global_frequency * global_letters_count                    0.015439
global_frequency * global_orthographic_density             0.011058
global_frequency * global_synonyms_count                  -0.200826
global_frequency * rel_aoa                                -0.046171
global_frequency * rel_clustering                         -0.191679
global_frequency * rel_frequency                          -0.032002
global_frequency * rel_letters_count                       0.023631
global_frequency * rel_orthographic_density                0.160035
global_frequency * rel_synonyms_count                      0.139435
global_letters_count * global_orthographic_density        -0.257329
global_letters_count * global_synonyms_count              -0.350795
global_letters_count * rel_aoa                            -0.124874
global_letters_count * rel_clustering                      0.101044
global_letters_count * rel_frequency                      -0.002759
global_letters_count * rel_letters_count                  -0.011463
global_letters_count * rel_orthographic_density            0.279482
global_letters_count * rel_synonyms_count                 -0.367725
global_orthographic_density * global_synonyms_count       -0.421897
global_orthographic_density * rel_aoa                     -0.343854
global_orthographic_density * rel_clustering              -0.651263
global_orthographic_density * rel_frequency               -0.263139
global_orthographic_density * rel_letters_count            0.271530
global_orthographic_density * rel_orthographic_density    -0.175514
global_orthographic_density * rel_synonyms_count          -0.056499
global_synonyms_count * rel_aoa                            0.380899
global_synonyms_count * rel_clustering                    -0.578796
global_synonyms_count * rel_frequency                      0.409508
global_synonyms_count * rel_letters_count                  0.611937
global_synonyms_count * rel_orthographic_density           0.292977
global_synonyms_count * rel_synonyms_count                -0.308498
rel_aoa * rel_clustering                                   0.322579
rel_aoa * rel_frequency                                    0.040235
rel_aoa * rel_letters_count                                0.039627
rel_aoa * rel_orthographic_density                         0.233153
rel_aoa * rel_synonyms_count                              -0.453840
rel_clustering * rel_frequency                             0.319920
rel_clustering * rel_letters_count                        -0.202371
rel_clustering * rel_orthographic_density                  0.378084
rel_clustering * rel_synonyms_count                        0.450633
rel_frequency * rel_letters_count                         -0.052358
rel_frequency * rel_orthographic_density                  -0.003549
rel_frequency * rel_synonyms_count                        -0.251516
rel_letters_count * rel_orthographic_density              -0.379070
rel_letters_count * rel_synonyms_count                    -0.038356
rel_orthographic_density * rel_synonyms_count              0.133456
dtype: float64

Regressing rel orthographic_density with 246 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.26664164856308115

intercept                      1.478286
global_aoa                     0.027444
global_clustering              0.090374
global_frequency              -0.005871
global_letters_count          -0.139194
global_orthographic_density   -0.475000
global_synonyms_count          0.218303
rel_aoa                       -0.036901
rel_clustering                -0.019555
rel_frequency                  0.044997
rel_letters_count              0.144561
rel_orthographic_density       0.902297
rel_synonyms_count            -0.023685
dtype: float64

Regressing rel orthographic_density with 246 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.46170525099094906

intercept                                                -24.366487
global_aoa                                                 0.741918
global_clustering                                         -5.445334
global_frequency                                           0.892718
global_letters_count                                      -0.182489
global_orthographic_density                               -0.732269
global_synonyms_count                                      6.666890
rel_aoa                                                   -0.782696
rel_clustering                                             5.057092
rel_frequency                                             -1.701469
rel_letters_count                                          1.025049
rel_orthographic_density                                   2.826971
rel_synonyms_count                                        -0.878896
global_aoa * global_clustering                             0.211747
global_aoa * global_frequency                              0.041418
global_aoa * global_letters_count                          0.052528
global_aoa * global_orthographic_density                  -0.003096
global_aoa * global_synonyms_count                        -0.465625
global_aoa * rel_aoa                                       0.003548
global_aoa * rel_clustering                               -0.258728
global_aoa * rel_frequency                                -0.044935
global_aoa * rel_letters_count                             0.012838
global_aoa * rel_orthographic_density                      0.077475
global_aoa * rel_synonyms_count                            0.602078
global_clustering * global_frequency                       0.255441
global_clustering * global_letters_count                   0.092191
global_clustering * global_orthographic_density            0.236383
global_clustering * global_synonyms_count                  0.040948
global_clustering * rel_aoa                               -0.333275
global_clustering * rel_clustering                         0.098647
global_clustering * rel_frequency                         -0.424906
global_clustering * rel_letters_count                      0.197929
global_clustering * rel_orthographic_density               0.392249
global_clustering * rel_synonyms_count                     0.216092
global_frequency * global_letters_count                    0.042749
global_frequency * global_orthographic_density             0.129811
global_frequency * global_synonyms_count                  -0.135724
global_frequency * rel_aoa                                -0.023871
global_frequency * rel_clustering                         -0.163642
global_frequency * rel_frequency                          -0.025227
global_frequency * rel_letters_count                      -0.011137
global_frequency * rel_orthographic_density                0.009730
global_frequency * rel_synonyms_count                      0.056965
global_letters_count * global_orthographic_density        -0.036242
global_letters_count * global_synonyms_count              -0.266194
global_letters_count * rel_aoa                            -0.136542
global_letters_count * rel_clustering                      0.069067
global_letters_count * rel_frequency                      -0.016754
global_letters_count * rel_letters_count                  -0.018822
global_letters_count * rel_orthographic_density            0.064351
global_letters_count * rel_synonyms_count                 -0.390448
global_orthographic_density * global_synonyms_count       -0.242286
global_orthographic_density * rel_aoa                     -0.260076
global_orthographic_density * rel_clustering              -0.427222
global_orthographic_density * rel_frequency               -0.225470
global_orthographic_density * rel_letters_count            0.092404
global_orthographic_density * rel_orthographic_density    -0.102864
global_orthographic_density * rel_synonyms_count          -0.161625
global_synonyms_count * rel_aoa                            0.341579
global_synonyms_count * rel_clustering                    -0.317568
global_synonyms_count * rel_frequency                      0.367164
global_synonyms_count * rel_letters_count                  0.434293
global_synonyms_count * rel_orthographic_density          -0.003467
global_synonyms_count * rel_synonyms_count                -0.314672
rel_aoa * rel_clustering                                   0.267698
rel_aoa * rel_frequency                                    0.018711
rel_aoa * rel_letters_count                                0.056254
rel_aoa * rel_orthographic_density                         0.123970
rel_aoa * rel_synonyms_count                              -0.461108
rel_clustering * rel_frequency                             0.258278
rel_clustering * rel_letters_count                        -0.245029
rel_clustering * rel_orthographic_density                  0.021370
rel_clustering * rel_synonyms_count                        0.115699
rel_frequency * rel_letters_count                         -0.040437
rel_frequency * rel_orthographic_density                  -0.016144
rel_frequency * rel_synonyms_count                        -0.213953
rel_letters_count * rel_orthographic_density              -0.192103
rel_letters_count * rel_synonyms_count                     0.092492
rel_orthographic_density * rel_synonyms_count              0.345642
dtype: float64

	aoa	betweenness	clustering	degree	frequency	letters_count	orthographic_density	pagerank	phonemes_count	phonological_density	syllables_count	synonyms_count
Component-0	-0.489840	0.332932	-0.098602	0.248148	0.258502	-0.410251	0.211155	0.276602	-0.363773	0.258779	-0.149618	-0.002552
Component-1	-0.286201	0.319530	-0.093017	0.254264	0.321245	0.448380	-0.162363	0.291978	0.499387	-0.229951	0.155661	-0.018348
Component-2	-0.746200	-0.287752	0.022277	-0.058305	-0.565024	0.145110	0.029877	-0.077570	0.066310	-0.027218	0.006588	0.063939

	aoa	frequency	letters_count
Component-0	-0.757650	0.349028	-0.551493
Component-1	0.390963	-0.433896	-0.811715