Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.



In [1]:

    
# When True, plot_grid and plot_overlay also write each figure to disk
# (at the path given by `settings.FIGURE`) on top of showing it inline.
SAVE_FIGURES = False

# Subset of features shown in the "paper-..." variants of each figure.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not used in the visible part of this notebook; presumably
# the number of PCA components kept further down -- confirm.
N_COMPONENTS = 3

# Number of bins first tried when binning source feature values; the
# plotting functions fall back to fewer bins if binning fails.
BIN_COUNT = 4

Imports and database setup.



In [2]:

    
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.



In [3]:

    
# Substitution model whose mined substitutions are analysed below.
model = Model(time=Time.discrete, source=Source.all, past=Past.last_bin,
              durl=Durl.exclude_past, max_distance=2)

# Fetch only the ids first, so that each substitution can then be loaded
# in its own short-lived session.
with session_scope() as session:
    substitutions = (session.query(Substitution.id)
                     .filter(Substitution.model == model))
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [row[0] for row in substitutions]

# One record per (substitution, feature) pair.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        for feature in Substitution.__features__:
            # Source/destination feature values, raw and relative to the
            # sentence median.
            source, destination = substitution.features(feature)
            source_rel, destination_rel = substitution.features(
                feature, sentence_relative='median')

            # Null-hypothesis values: plain feature average (h0) and
            # average over the source word's synonyms (h0n).
            h0 = substitution.feature_average(feature)
            h0_rel = substitution.feature_average(
                feature, sentence_relative='median')
            h0n = substitution.feature_average(
                feature, source_synonyms=True)
            h0n_rel = substitution.feature_average(
                feature, source_synonyms=True,
                sentence_relative='median')

            record = {
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                'h0': h0,
                'h0_rel': h0_rel,
                'h0n': h0n,
                'h0n_rel': h0n_rel,
            }
            data.append(record)

original_variations = pd.DataFrame(data)
del data









    



Got 8978 substitutions for model Model(time=Time.discrete, source=Source.all, past=Past.last_bin, durl=Durl.exclude_past, max_distance=2)






    



100% (8978 of 8978) |######################| Elapsed Time: 0:02:22 Time: 0:02:22

Compute cluster averages (so that substitutions from the same cluster are not treated as independent observations, which would make the confidence intervals too narrow) and crop the data so that we have acceptable CIs.



In [4]:

    
# Columns holding the per-substitution measurements that get averaged.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Average in two stages: first over rows sharing the same
# (destination, occurrence, position, feature), then over each
# (cluster, feature) pair, so that every cluster contributes one point
# per feature.
# The columns are selected with a list (not a bare tuple of labels, which
# newer pandas rejects), and the grouping key 'feature' is not selected
# again since `as_index=False` already returns it as a column.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above this cutoff.
# This crops the graphs to acceptable CIs.
# NOTE(review): the 'variation' column computed above is left untouched
# for the cropped rows -- confirm that this is intended.
AOA_SOURCE_CUTOFF = 15
variations.loc[(variations.feature == 'aoa')
               & (variations.source > AOA_SOURCE_CUTOFF),
               value_columns] = np.nan

Prepare feature ordering.



In [5]:

    
# Order features by the docstring of their transformed-feature function
# (the same docstring plot_grid uses as subplot title).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__)

What we plot about features

For a feature $\phi$, plot:

- $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
- $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
- $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
- $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

- $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
- $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
- $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
- $y = x$, i.e. what happens if there is no substitution

These values are plotted first with absolute (global) feature values, then with sentence-relative feature values; in each case first with fixed-width bins, then with quantile bins.



In [6]:

    
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For every bin, the mean of `values` is compared to the bin mean of
    `h0` (row ``H_0``) and of `h0n` (row ``H_00``). A difference is
    reported when the mean of `values` lies outside the null mean plus or
    minus the half-width of the Student-t confidence interval of the mean
    of `values`, at levels .05 (``*``), .01 (``**``) and .001 (``***``);
    ``ns.`` is printed otherwise.

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like supporting ``.max()`` and ``== i``
        Zero-based bin label of each observation (may contain NaN).
    h0, h0n : array-like, indexable by a boolean mask
        Null-hypothesis values of each observation.
    values : array-like, indexable by a boolean mask
        Observed values of each observation.
    """
    alphas = [.05, .01, .001]
    # Bin labels can be floats (e.g. when the binned data contained NaNs),
    # so cast before using the count with range() and np.zeros().
    bin_count = int(bins.max()) + 1
    print()
    print('-' * len(name))
    print(name)
    print('-' * len(name))
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    # Bin means and CI half-widths of `values` do not depend on the null
    # being tested, so compute them once.
    bin_values = np.zeros(bin_count)
    cis = np.zeros((bin_count, len(alphas)))
    for i in range(bin_count):
        indices = bins == i
        n = indices.sum()
        bin_values[i] = values[indices].mean()
        # Standard error of the mean: the sample standard deviation
        # (ddof=1) is divided by sqrt(n); dividing by sqrt(n - 1) would
        # apply the small-sample correction twice.
        sem = values[indices].std(ddof=1) / np.sqrt(n)
        for j, alpha in enumerate(alphas):
            cis[i, j] = stats.t.ppf(1 - alpha / 2, n - 1) * sem

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_nulls = np.zeros(bin_count)
        for i in range(bin_count):
            bin_nulls[i] = nulls[bins == i].mean()

        print(null_name + ' |', end='')
        # differences[i, j] is True when bin i differs from the null at
        # level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Strictest level reached decides the number of stars.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()



In [7]:

    
def plot_variation(**kwargs):
    """Plot destination vs. source feature values, binned by source value.

    Meant to be called through `FacetGrid.map_dataframe` (see plot_grid),
    drawing on the current matplotlib axes. Per bin of the source value it
    plots the mean destination value with a 95% confidence band, the mean
    of the two null hypotheses (``h0`` and ``h0n`` columns) and the
    ``y = x`` line, then prints the significance table for the feature.

    Keyword arguments
    -----------------
    data : DataFrame
        Rows for a single feature, with ``source``, ``destination``,
        ``h0`` and ``h0n`` columns (and their ``_rel`` variants).
    color : optional
        Base colour of the curves (default ``'blue'``).
    relative : bool, optional
        Use the sentence-relative (``_rel``) columns (default False).
    quantiles : bool, optional
        Use quantile bins (`pd.qcut`) instead of fixed-width bins
        (`pd.cut`) (default False).
    feature_field : str, optional
        Column holding the feature name (default ``'feature'``).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when the cut fails
    # (e.g. quantile edges that are not unique).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail on an unbound `bins`.
        raise ValueError("Could not bin source values, even into a "
                         "single bin")
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. sample std (ddof=1) over sqrt(n). Dividing by
        # sqrt(n - 1) would apply the small-sample correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently, so the upper limit is still widened when the lower
    # one was adjusted too.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)



In [8]:

    
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one subplot per feature with `plot_function`, in a grid.

    Parameters
    ----------
    data : DataFrame
        Data to plot; only rows whose `feature_field` is in `features`
        are kept.
    features : list
        Features to plot, in subplot order.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is on.
    plot_function : callable
        Function handed to `FacetGrid.map_dataframe` for each subplot.
    xlabel, ylabel : str
        Axis labels applied to the subplots.
    feature_field : str, optional
        Column used to split the data into subplots and hues.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    # Avoid sharing one mutable default dict between calls.
    if plot_kws is None:
        plot_kws = {}

    # NOTE(review): newer seaborn releases name the `size` argument
    # `height` -- adjust if upgrading.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title currently holds the raw feature name (set above);
        # replace it by the docstring of the transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)



In [9]:

    
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias of one feature on `ax`.

    The source values are binned, and for each bin the difference between
    the mean destination value and the mean ``h0n`` value is plotted,
    divided by the absolute mean of the per-bin ``h0`` means. Bins are
    spread evenly over ``[0, 1]`` on the x axis.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    data : DataFrame
        Rows for a single feature, with ``feature``, ``source``,
        ``destination``, ``h0`` and ``h0n`` columns (and their ``_rel``
        variants).
    color :
        Colour of the curve.
    ci : bool, optional
        Also draw the 95% confidence band (default True).
    relative : bool, optional
        Use the sentence-relative (``_rel``) columns (default False).
    quantiles : bool, optional
        Use quantile bins (`pd.qcut`) instead of fixed-width bins
        (`pd.cut`) (default False).

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when the cut fails
    # (e.g. quantile edges that are not unique).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Without this, the code below would fail on an unbound `bins`.
        raise ValueError("Could not bin source values, even into a "
                         "single bin")
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # 95% CI half-width: t quantile times the standard error of the
        # mean, i.e. sample std (ddof=1) over sqrt(n). Dividing by
        # sqrt(n - 1) would apply the small-sample correction twice.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    # NOTE(review): the scale can get close to zero for sentence-relative
    # values, which would blow up the curve -- confirm this is acceptable.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    ax.plot(positions,
            (values - h0ns) / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (values - h0ns - cis) / scale,
                        (values - h0ns + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)



In [10]:

    
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single pair of axes.

    Parameters
    ----------
    data : DataFrame
        Data with a ``feature`` column; rows of each feature are passed to
        `plot_function` after dropping rows containing NaNs.
    features : list
        Features to plot, one colour each.
    filename : str
        Name substituted into `settings.FIGURE` when `SAVE_FIGURES` is on.
    palette_name : str
        Name of the seaborn palette the colours are taken from.
    plot_function : callable
        Called as ``plot_function(ax, feature_data, color=..., **plot_kws)``.
    title, xlabel, ylabel : str
        Title and axis labels of the figure.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes the curves were drawn on.
    """
    # Avoid sharing one mutable default dict between calls.
    if plot_kws is None:
        plot_kws = {}

    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature, color in zip(features, palette):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above.



In [11]:

    
# All features, global values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | *   | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | **  | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *   | *** | *** |
H_00 | *   | ns. | ns. | **  |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare.



In [12]:

    
# Bias overlay for all features, global values, fixed-width bins, no CI.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws=dict(ci=False),
);



In [13]:

    
# Paper features, global values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | **  | ns. | ns. | ns. |



In [14]:

    
# Bias overlay for the paper features, global values, fixed-width bins,
# with a hand-picked y range.
(plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
).set_ylim(-2, .7));

2.1.2 Quantiles of distribution of appeared global feature values



In [15]:

    
# All features, global values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws=dict(quantiles=True),
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | **  | *   | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | ns. |
H_00 | *   | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | *   | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | **  |



In [16]:

    
# Bias overlay for all features, global values, quantile bins, no CI.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_global',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws=dict(ci=False, quantiles=True),
);



In [17]:

    
# Paper features, global values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_global',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$)$',
    ylabel=r'$\phi($appearing word$)$',
    plot_kws=dict(quantiles=True),
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | ns. | *   | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | *   | ns. | ns. | *   |



In [18]:

    
# Bias overlay for the paper features, global values, quantile bins,
# with a hand-picked y range.
(plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws=dict(quantiles=True),
).set_ylim(-1.2, .6));

2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values



In [19]:

    
# All features, sentence-relative values, fixed-width bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws=dict(relative=True),
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | **  |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | *   | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | **  |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *   | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *   | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | *** | *** | *   |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | *** |
H_00 | *   | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | ns. |



In [20]:

    
# Bias overlay for all features, sentence-relative values, fixed-width
# bins, no CI.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-fixedbins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws=dict(ci=False, relative=True),
);



In [21]:

    
# Paper features, sentence-relative values, fixed-width bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-fixedbins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws=dict(relative=True),
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | **  | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | **  |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | ns. |
H_00 | ns. | *   | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | **  |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *   | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | **  | *** | *** |
H_00 | *   | ns. | ns. | ns. |



In [22]:

    
# Bias overlay for the paper features, sentence-relative values,
# fixed-width bins, with a hand-picked y range.
(plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws=dict(relative=True),
).set_ylim(-2, .7));

2.2.2 Quantiles of distribution of appeared sentence-relative values



In [23]:

    
# All features, sentence-relative values, quantile bins.
plot_grid(
    data=variations,
    features=ordered_features,
    filename='all-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws=dict(relative=True, quantiles=True),
)









    



-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | ns. |
H_00 | ns. | ns. | ns. | ns. |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | ns. | **  | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | *   | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | ns. |



In [24]:

    
# Bias overlay for all features, sentence-relative values, quantile bins,
# no CI.
plot_overlay(
    data=variations,
    features=ordered_features,
    filename='all-variations_bias-quantilebins_sentencerel',
    palette_name='husl',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi($source word$) - \phi($sentence$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws=dict(ci=False, relative=True, quantiles=True),
);



In [25]:

    
# Paper features, sentence-relative values, quantile bins.
plot_grid(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations-quantilebins_sentencerel',
    plot_function=plot_variation,
    xlabel=r'$\phi($disappearing word$) - \phi($sentence$)$',
    ylabel=r'$\phi($appearing word$) - \phi($sentence$)$',
    plot_kws=dict(relative=True, quantiles=True),
)









    



---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | **  | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | **  |
H_00 | ns. | **  | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | ns. | *   | *   |



In [26]:

    
# Bias overlay for the paper features, sentence-relative values, quantile
# bins. Uses the 'deep' palette like the other paper-feature overlays, so
# that each feature keeps the same colour across the paper figures.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi($source word$) - \phi($sentence$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi,r} - \nu_{\phi,r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});

3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of where destination words are given a particular absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

- its slope
- its shape (e.g. several slope regimes?)
- its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note that we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)



In [27]:

    
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) space.

    Meant to be called through `FacetGrid.map_dataframe`, which supplies
    the `data` and `color` keyword arguments.

    Keyword arguments:
    data -- DataFrame with the columns 'source', 'source_rel',
            'destination', 'destination_rel' and 'h0'
    color -- base color of the two reference lines (default 'blue')
    bin_count -- number of fixed-width bins along each axis
                 (default BIN_COUNT)

    The (source, source_rel) plane is cut into bin_count x bin_count
    cells. In each cell, the arrow is the mean displacement from source to
    destination, and the color is the mean length of that displacement.
    Cells that contain no substitution get NaN values.

    """

    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.pop('bin_count', BIN_COUNT)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        for y in range(bin_count):
            # Build the cell mask and the displacements once, then reuse
            # them for the direction and the strength.
            in_cell = (x_bins == x) & (y_bins == y)
            delta_abs = dest[in_cell] - source[in_cell]
            delta_rel = dest_rel[in_cell] - source_rel[in_cell]
            u_values[y, x] = delta_abs.mean()
            v_values[y, x] = delta_rel.mean()
            strength[y, x] = np.sqrt(delta_abs ** 2
                                     + delta_rel ** 2).mean()

    # Plot.
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    # Horizontal reference: y = 0.
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    # Vertical reference: x = first 'h0' value of this facet's data.
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features



In [28]:

    
# One facet per feature (in `ordered_features` order), each with its own
# axes ranges since features live on different scales.
g = sb.FacetGrid(
    data=variations,
    col='feature', col_order=ordered_features, col_wrap=3,
    hue='feature', hue_order=ordered_features,
    sharex=False, sharey=False,
    size=4.5, aspect=1
)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    # Axes where nothing was plotted have no legend: leave them untouched.
    if legend:
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name with the feature's docstring.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)









    



/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper



In [29]:

    
# Same stream grid, restricted to the features exposed in the paper.
in_paper = variations['feature'].isin(PAPER_FEATURES)
g = sb.FacetGrid(
    data=variations[in_paper],
    col='feature', col_order=PAPER_FEATURES, col_wrap=3,
    hue='feature', hue_order=PAPER_FEATURES,
    sharex=False, sharey=False,
    size=4.5, aspect=1
)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi($word$) - \phi($sentence$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    # Axes where nothing was plotted have no legend: leave them untouched.
    if legend:
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # Replace the raw feature name with the feature's docstring.
        feature_name = ax.get_title()
        ax.set_title(Substitution._transformed_feature(feature_name).__doc__)

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-feature_streams')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)









    



/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA



In [30]:

    
# Fit the PCA on per-cluster feature variations (one column per feature),
# keeping only the clusters where every feature is defined.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
# Feature loadings of the components we keep, shown as the cell's output.
component_labels = ['Component-{}'.format(i) for i in range(N_COMPONENTS)]
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures, index=component_labels)









    



MLE estimates there are 10 components.

Those explain the following variance:
[ 0.55122311  0.17694431  0.07010802  0.06688494  0.03303531  0.02842675
  0.0197582   0.01801932  0.01455933  0.00991983]

We're plotting variation for the first 3 components:






    Out[30]:






  
    
      
      aoa
      betweenness
      clustering
      degree
      frequency
      letters_count
      orthographic_density
      pagerank
      phonemes_count
      phonological_density
      syllables_count
      synonyms_count
    
  
  
    
      Component-0
      -0.480639
      0.235220
      -0.085941
      0.223020
      0.221617
      -0.459272
      0.208394
      0.257139
      -0.429278
      0.269621
      -0.172668
      0.012316
    
    
      Component-1
      0.367462
      -0.361454
      0.135680
      -0.278595
      -0.326051
      -0.403357
      0.125370
      -0.298287
      -0.445819
      0.197887
      -0.162948
      0.011571
    
    
      Component-2
      -0.760629
      -0.491724
      0.057437
      -0.135817
      -0.315228
      0.131807
      0.047868
      -0.161522
      0.058676
      -0.079684
      0.037981
      0.039356

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.



In [31]:

    
# Collect one row per (substitution, component), for the first N_COMPONENTS
# components of the PCA fitted above.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Fields that do not depend on the component.
        shared_fields = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(N_COMPONENTS):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Component averages, without and with `source_synonyms`.
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)

            row = dict(shared_fields)
            row.update({
                'component': component,
                'source': source,
                'destination': destination,
                'h0': h0,
                'h0n': h0n,
            })
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data









    



100% (8978 of 8978) |######################| Elapsed Time: 0:01:54 Time: 0:01:54

Compute cluster averages (so as not to overestimate confidence intervals).



In [32]:

    
# Average in two stages: first over the substitutions sharing the same
# (destination, occurrence, position, component), then per cluster and
# component, so that each cluster counts once.
# The value columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names is deprecated since pandas 1.0 and removed in 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)



In [33]:

    
# One facet per PCA component, each with independent axes.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    size=3, aspect=1.5
)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='upper left')
    # Axes where nothing was plotted have no legend: leave them untouched.
    if legend:
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('all-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)









    



---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | **  | *** | *** | ns. |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | *   | *   | ns. | ns. |

4.2 On a subset of relevant features



In [34]:

    
# Feature subset used for the second PCA and for the word-count caveat
# in section 4.3.
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA



In [35]:

    
# Fit the PCA on per-cluster variations of the relevant features only,
# keeping the clusters where all of them are defined.
pcafeatures = tuple(sorted(relevant_features))
is_pcafeature = variations['feature'].isin(pcafeatures)
pcavariations = variations[is_pcafeature]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report what the fit found.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Feature loadings of every estimated component, shown as the cell's output.
component_labels = ['Component-{}'.format(i)
                    for i in range(pca.n_components_)]
pd.DataFrame(pca.components_, columns=pcafeatures, index=component_labels)









    



MLE estimates there are 2 components.

Those explain the following variance:
[ 0.66563708  0.2072879 ]







    Out[35]:






  
    
      
      aoa
      frequency
      letters_count
    
  
  
    
      Component-0
      -0.761088
      0.339546
      -0.552678
    
    
      Component-1
      0.423269
      -0.385666
      -0.819820

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.



In [36]:

    
# Collect one row per (substitution, component), for every component of the
# PCA fitted on the relevant features.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)

        # Fields that do not depend on the component.
        shared_fields = {
            'cluster_id': substitution.source.cluster.sid,
            'destination_id': substitution.destination.sid,
            'occurrence': substitution.occurrence,
            'position': substitution.position,
            'source_id': substitution.source.sid,
        }

        for component in range(pca.n_components_):
            source, destination = substitution.components(
                component, pca, pcafeatures)
            # Component averages, without and with `source_synonyms`.
            h0 = substitution.component_average(
                component, pca, pcafeatures)
            h0n = substitution.component_average(
                component, pca, pcafeatures, source_synonyms=True)

            row = dict(shared_fields)
            row.update({
                'component': component,
                'source': source,
                'destination': destination,
                'h0': h0,
                'h0n': h0n,
            })
            data.append(row)

original_component_variations = pd.DataFrame(data)
del data









    



100% (8978 of 8978) |######################| Elapsed Time: 0:00:57 Time: 0:00:57

Compute cluster averages (so as not to overestimate confidence intervals).



In [37]:

    
# Average in two stages: first over the substitutions sharing the same
# (destination, occurrence, position, component), then per cluster and
# component, so that each cluster counts once.
# The value columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names is deprecated since pandas 1.0 and removed in 2.0.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components



In [38]:

    
# One facet per PCA component, each with independent axes.
g = sb.FacetGrid(
    data=component_variations,
    col='component', col_wrap=3, hue='component',
    sharex=False, sharey=False,
    size=3, aspect=1.5
)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')

for ax in g.axes.ravel():
    legend = ax.legend(frameon=True, loc='best')
    # Axes where nothing was plotted have no legend: leave them untouched.
    if legend:
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')

if SAVE_FIGURES:
    figure_path = settings.FIGURE.format('paper-pca_variations-absolute')
    g.fig.savefig(figure_path, bbox_inches='tight', dpi=300)









    



---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | **  | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | **  |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:



In [39]:

    
# Resolve each transformed feature once, instead of once per (word, feature).
# NOTE(review): assumes `_transformed_feature` returns an equivalent callable
# on every call for a given feature name -- confirm.
tfeatures = [(feature, Substitution._transformed_feature(feature))
             for feature in relevant_features]

for feature, tfeature in tfeatures:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeature())))

# Compute the number of words that have all relevant_features defined.
# Start from the union of the words known to at least one of the features...
words = set()
for feature, tfeature in tfeatures:
    words.update(tfeature())

# ... then tabulate every feature value for every word. Rows with a missing
# value are dropped below to count the words usable by the PCA.
words_list = list(words)
wordsdf = pd.DataFrame({feature: [tfeature(word) for word in words_list]
                        for feature, tfeature in tfeatures})
wordsdf['words'] = words_list
del words_list

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))









    



Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 769 (cluster-unique) substitutions, but the PCA is in fact computed on 589 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.



In [40]:

    
# Binning strategies as (label, function) pairs. Only fixed-width binning is
# enabled; add ('quantiles', pd.qcut) to the list to also run quantile bins.
cuts = [('fixed bins', pd.cut)]
# Value kinds as (label, column-name suffix) pairs.
rels = [('global', ''), ('sentence-relative', '_rel')]


def star_level(p):
    """Return the 3-character significance marker for p-value `p`.

    '***' for p < .001, ' **' for p < .01, '  *' for p < .05, and 'ns.'
    otherwise (including when `p` is NaN, since every comparison fails).

    """

    for threshold, marker in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return marker
    return 'ns.'

Now for each feature, assess whether it interacts with the other features' destination values. We look at this for all pairs of features, with all pairs of global/sentence-relative values and all types of binning (fixed width/quantiles; only fixed-width binning is currently enabled in `cuts`). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.



In [41]:

    
# One-way Anova of feature2's destination value across bins of feature1's
# source value, for every pair of PAPER_FEATURES and every global /
# sentence-relative combination.

# Pivot each value column once up front (cluster_id x feature); these
# tables do not depend on the loop variables.
pivots = {
    kind + rel: variations.pivot(index='cluster_id', columns='feature',
                                 values=kind + rel)
    for kind in ('source', 'destination')
    for (rel_label, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins when cutting fails.
            # This only depends on the source, so do it once per rel1.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    continue

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: report it rather than silently
                    # reusing the bins of a previous iteration.
                    print('  n/a {} -> {}'
                          .format(rel1_label, rel2_label))
                    continue

                destination = pivots['destination' + rel2][feature2]
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()









    



---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
   ** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
    * sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.



In [42]:

    
# One-way Anova of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of PAPER_FEATURES and every
# global / sentence-relative combination.

# Pivot each value column once up front (cluster_id x feature); these
# tables do not depend on the loop variables.
pivots = {
    kind + rel: variations.pivot(index='cluster_id', columns='feature',
                                 values=kind + rel)
    for kind in ('source', 'destination')
    for (rel_label, rel) in rels
}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            source = pivots['source' + rel1][feature1]

            # Compute binning, retrying with fewer bins when cutting fails.
            # This only depends on the source, so do it once per rel1.
            source_bins = None
            for bin_count in range(BIN_COUNT, 0, -1):
                try:
                    source_bins = cut(source, bin_count, labels=False)
                    break
                except ValueError:
                    continue

            for (rel2_label, rel2) in rels:
                if source_bins is None:
                    # No bin count worked: report it rather than silently
                    # reusing the bins of a previous iteration.
                    print('  n/a {} -> {}'
                          .format(rel1_label, rel2_label))
                    continue

                variation = (pivots['destination' + rel2][feature2]
                             - pivots['source' + rel2][feature2])
                _, p = stats.f_oneway(*[variation[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()









    



---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
   ** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
    * global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
  *** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression



In [43]:

    
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures



In [44]:

    
# Maps a boolean "relative?" flag to a (label, column-name suffix) pair.
# NOTE(review): this rebinds `rels`, which the Anova section defines as a
# list of (label, suffix) pairs and iterates over. Re-running those Anova
# cells after this one will fail; a distinct name here would avoid that.
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    if source_rel not in [True, False, 'both']:
        raise ValueError
    if not isinstance(dest_rel, bool):
        raise ValueError
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]
    
    features = tuple(sorted(features))
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]
    
    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    destination = variations[variations.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        coeffs = pd.Series(index=['intercept'], data=[linreg.intercept_])\
            .append(coeffs)
    with pd.option_context('display.max_rows', 999):
        print(coeffs)



In [45]:

    
# Run every regression variant for each paper feature: all combinations of
# source relativeness (global / relative / both) and destination
# relativeness, first without and then with interaction terms.
relativeness_grid = list(product([False, True, 'both'], [False, True]))

for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in relativeness_grid:
        for with_interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=source_rel, dest_rel=dest_rel,
                    interactions=with_interactions)
            print()









    



----------------------------------------------------------------------
Regressing global frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07820939454795783

intercept                      7.375935
global_aoa                    -0.002024
global_clustering              0.306571
global_frequency               0.379842
global_letters_count          -0.011313
global_orthographic_density   -0.092282
global_synonyms_count         -0.017598
dtype: float64

Regressing global frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.13435825846910143

intercept                                              17.549799
global_aoa                                             -1.194802
global_clustering                                       1.235173
global_frequency                                        0.263464
global_letters_count                                   -1.043017
global_orthographic_density                             0.232688
global_synonyms_count                                   1.294773
global_aoa * global_clustering                         -0.036942
global_aoa * global_frequency                           0.035650
global_aoa * global_letters_count                       0.075294
global_aoa * global_orthographic_density                0.151608
global_aoa * global_synonyms_count                     -0.029357
global_clustering * global_frequency                   -0.027230
global_clustering * global_letters_count               -0.097556
global_clustering * global_orthographic_density         0.157455
global_clustering * global_synonyms_count              -0.060754
global_frequency * global_letters_count                -0.014916
global_frequency * global_orthographic_density         -0.079549
global_frequency * global_synonyms_count               -0.158327
global_letters_count * global_orthographic_density      0.051461
global_letters_count * global_synonyms_count           -0.017959
global_orthographic_density * global_synonyms_count     0.035034
dtype: float64

Regressing rel frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.06451404088470702

intercept                     -3.874034
global_aoa                     0.001982
global_clustering              0.334678
global_frequency               0.359657
global_letters_count           0.016606
global_orthographic_density   -0.261161
global_synonyms_count          0.083050
dtype: float64

Regressing rel frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.10804457363339215

intercept                                              7.160313
global_aoa                                            -1.033853
global_clustering                                      1.706010
global_frequency                                      -0.098514
global_letters_count                                  -0.741498
global_orthographic_density                            0.642434
global_synonyms_count                                  1.543976
global_aoa * global_clustering                        -0.022529
global_aoa * global_frequency                          0.059836
global_aoa * global_letters_count                      0.040801
global_aoa * global_orthographic_density               0.080971
global_aoa * global_synonyms_count                    -0.007253
global_clustering * global_frequency                  -0.073764
global_clustering * global_letters_count              -0.132934
global_clustering * global_orthographic_density        0.225686
global_clustering * global_synonyms_count             -0.082931
global_frequency * global_letters_count               -0.036619
global_frequency * global_orthographic_density        -0.047205
global_frequency * global_synonyms_count              -0.162830
global_letters_count * global_orthographic_density     0.061023
global_letters_count * global_synonyms_count          -0.071039
global_orthographic_density * global_synonyms_count    0.021863
dtype: float64

Regressing global frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.04574706854779842

intercept                   9.421431
rel_aoa                     0.088685
rel_clustering              0.159424
rel_frequency               0.250069
rel_letters_count          -0.022572
rel_orthographic_density   -0.036096
rel_synonyms_count         -0.007814
dtype: float64

Regressing global frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.08523143746591777

intercept                                        9.504971
rel_aoa                                          0.302442
rel_clustering                                   0.055724
rel_frequency                                    0.289289
rel_letters_count                               -0.000503
rel_orthographic_density                        -0.129607
rel_synonyms_count                              -0.030600
rel_aoa * rel_clustering                        -0.019522
rel_aoa * rel_frequency                          0.045732
rel_aoa * rel_letters_count                     -0.038416
rel_aoa * rel_orthographic_density              -0.010437
rel_aoa * rel_synonyms_count                    -0.061378
rel_clustering * rel_frequency                  -0.120648
rel_clustering * rel_letters_count               0.040624
rel_clustering * rel_orthographic_density        0.352756
rel_clustering * rel_synonyms_count              0.007208
rel_frequency * rel_letters_count               -0.016445
rel_frequency * rel_orthographic_density        -0.015192
rel_frequency * rel_synonyms_count              -0.071259
rel_letters_count * rel_orthographic_density     0.016466
rel_letters_count * rel_synonyms_count          -0.154282
rel_orthographic_density * rel_synonyms_count   -0.257652
dtype: float64

Regressing rel frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.25812254363116616

intercept                  -1.444560
rel_aoa                     0.036765
rel_clustering              0.379442
rel_frequency               0.622751
rel_letters_count          -0.095497
rel_orthographic_density   -0.326760
rel_synonyms_count          0.053941
dtype: float64

Regressing rel frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.2914446420215465

intercept                                       -1.471628
rel_aoa                                          0.124194
rel_clustering                                   0.388539
rel_frequency                                    0.664531
rel_letters_count                               -0.021078
rel_orthographic_density                        -0.534296
rel_synonyms_count                              -0.003878
rel_aoa * rel_clustering                        -0.038592
rel_aoa * rel_frequency                         -0.014328
rel_aoa * rel_letters_count                     -0.017454
rel_aoa * rel_orthographic_density               0.075329
rel_aoa * rel_synonyms_count                     0.016739
rel_clustering * rel_frequency                  -0.101797
rel_clustering * rel_letters_count              -0.017175
rel_clustering * rel_orthographic_density        0.274133
rel_clustering * rel_synonyms_count             -0.202609
rel_frequency * rel_letters_count               -0.008364
rel_frequency * rel_orthographic_density        -0.040948
rel_frequency * rel_synonyms_count              -0.119768
rel_letters_count * rel_orthographic_density     0.026389
rel_letters_count * rel_synonyms_count          -0.128791
rel_orthographic_density * rel_synonyms_count   -0.070107
dtype: float64

Regressing global frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.0899254562812205

intercept                      7.391881
global_aoa                    -0.089599
global_clustering              0.331007
global_frequency               0.345388
global_letters_count           0.086971
global_orthographic_density    0.229319
global_synonyms_count         -0.151462
rel_aoa                        0.130028
rel_clustering                -0.021432
rel_frequency                  0.044261
rel_letters_count             -0.110694
rel_orthographic_density      -0.377308
rel_synonyms_count             0.160677
dtype: float64

Regressing global frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.25442209203715627

intercept                                                  1.911478
global_aoa                                                -1.220647
global_clustering                                         -9.117169
global_frequency                                          -0.287212
global_letters_count                                      -3.867848
global_orthographic_density                               -3.112419
global_synonyms_count                                     -1.731825
rel_aoa                                                    2.233992
rel_clustering                                             8.833471
rel_frequency                                              0.211168
rel_letters_count                                          1.231000
rel_orthographic_density                                   5.517340
rel_synonyms_count                                        10.642869
global_aoa * global_clustering                             0.282052
global_aoa * global_frequency                              0.117560
global_aoa * global_letters_count                          0.324714
global_aoa * global_orthographic_density                   0.249765
global_aoa * global_synonyms_count                        -0.461564
global_aoa * rel_aoa                                      -0.007642
global_aoa * rel_clustering                               -0.389396
global_aoa * rel_frequency                                -0.023347
global_aoa * rel_letters_count                            -0.164218
global_aoa * rel_orthographic_density                      0.045401
global_aoa * rel_synonyms_count                            0.372374
global_clustering * global_frequency                       0.359158
global_clustering * global_letters_count                   0.203156
global_clustering * global_orthographic_density            1.315410
global_clustering * global_synonyms_count                  1.094072
global_clustering * rel_aoa                               -0.200546
global_clustering * rel_clustering                         0.257582
global_clustering * rel_frequency                         -0.354246
global_clustering * rel_letters_count                     -0.292932
global_clustering * rel_orthographic_density              -0.910294
global_clustering * rel_synonyms_count                    -0.667547
global_frequency * global_letters_count                    0.112791
global_frequency * global_orthographic_density             0.539483
global_frequency * global_synonyms_count                   0.344593
global_frequency * rel_aoa                                -0.166317
global_frequency * rel_clustering                         -0.076706
global_frequency * rel_frequency                          -0.027630
global_frequency * rel_letters_count                      -0.055997
global_frequency * rel_orthographic_density               -0.666766
global_frequency * rel_synonyms_count                     -0.755886
global_letters_count * global_orthographic_density         0.629463
global_letters_count * global_synonyms_count               0.964412
global_letters_count * rel_aoa                            -0.236118
global_letters_count * rel_clustering                     -0.318335
global_letters_count * rel_frequency                      -0.070348
global_letters_count * rel_letters_count                  -0.006767
global_letters_count * rel_orthographic_density           -0.748271
global_letters_count * rel_synonyms_count                 -1.307795
global_orthographic_density * global_synonyms_count        1.168823
global_orthographic_density * rel_aoa                     -0.236883
global_orthographic_density * rel_clustering              -1.431268
global_orthographic_density * rel_frequency               -0.589522
global_orthographic_density * rel_letters_count           -0.640645
global_orthographic_density * rel_orthographic_density    -0.436586
global_orthographic_density * rel_synonyms_count          -1.031695
global_synonyms_count * rel_aoa                            0.162668
global_synonyms_count * rel_clustering                    -0.472472
global_synonyms_count * rel_frequency                     -0.287950
global_synonyms_count * rel_letters_count                 -0.225411
global_synonyms_count * rel_orthographic_density          -0.379307
global_synonyms_count * rel_synonyms_count                -0.037043
rel_aoa * rel_clustering                                   0.188441
rel_aoa * rel_frequency                                    0.096098
rel_aoa * rel_letters_count                                0.082864
rel_aoa * rel_orthographic_density                         0.009481
rel_aoa * rel_synonyms_count                              -0.083511
rel_clustering * rel_frequency                             0.048353
rel_clustering * rel_letters_count                         0.363207
rel_clustering * rel_orthographic_density                  1.096152
rel_clustering * rel_synonyms_count                        0.148762
rel_frequency * rel_letters_count                         -0.003138
rel_frequency * rel_orthographic_density                   0.624776
rel_frequency * rel_synonyms_count                         0.557341
rel_letters_count * rel_orthographic_density               0.595664
rel_letters_count * rel_synonyms_count                     0.420340
rel_orthographic_density * rel_synonyms_count              0.154940
dtype: float64

Regressing rel frequency with 458 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3213747035821668

intercept                      5.850630
global_aoa                    -0.078947
global_clustering              0.415957
global_frequency              -0.503129
global_letters_count           0.159220
global_orthographic_density    0.269185
global_synonyms_count         -0.237498
rel_aoa                        0.108035
rel_clustering                -0.062549
rel_frequency                  0.935195
rel_letters_count             -0.172336
rel_orthographic_density      -0.399762
rel_synonyms_count             0.284015
dtype: float64

Regressing rel frequency with 458 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.44183091914036665

intercept                                                 -4.360033
global_aoa                                                -0.285475
global_clustering                                         -8.250478
global_frequency                                          -1.076017
global_letters_count                                      -4.119156
global_orthographic_density                                1.115727
global_synonyms_count                                     -0.932001
rel_aoa                                                    1.846573
rel_clustering                                             7.600251
rel_frequency                                              1.167695
rel_letters_count                                          1.567957
rel_orthographic_density                                   1.802362
rel_synonyms_count                                        10.912323
global_aoa * global_clustering                             0.257080
global_aoa * global_frequency                              0.056774
global_aoa * global_letters_count                          0.268386
global_aoa * global_orthographic_density                   0.168555
global_aoa * global_synonyms_count                        -0.425870
global_aoa * rel_aoa                                       0.000477
global_aoa * rel_clustering                               -0.352142
global_aoa * rel_frequency                                 0.037575
global_aoa * rel_letters_count                            -0.119494
global_aoa * rel_orthographic_density                      0.104868
global_aoa * rel_synonyms_count                            0.308495
global_clustering * global_frequency                       0.309198
global_clustering * global_letters_count                   0.183797
global_clustering * global_orthographic_density            1.419994
global_clustering * global_synonyms_count                  1.038063
global_clustering * rel_aoa                               -0.208105
global_clustering * rel_clustering                         0.219177
global_clustering * rel_frequency                         -0.257669
global_clustering * rel_letters_count                     -0.262767
global_clustering * rel_orthographic_density              -0.961130
global_clustering * rel_synonyms_count                    -0.546524
global_frequency * global_letters_count                    0.208469
global_frequency * global_orthographic_density             0.369019
global_frequency * global_synonyms_count                   0.229657
global_frequency * rel_aoa                                -0.139517
global_frequency * rel_clustering                         -0.057326
global_frequency * rel_frequency                          -0.034492
global_frequency * rel_letters_count                      -0.130273
global_frequency * rel_orthographic_density               -0.498385
global_frequency * rel_synonyms_count                     -0.657574
global_letters_count * global_orthographic_density         0.437507
global_letters_count * global_synonyms_count               0.977226
global_letters_count * rel_aoa                            -0.265582
global_letters_count * rel_clustering                     -0.240588
global_letters_count * rel_frequency                      -0.134715
global_letters_count * rel_letters_count                  -0.006601
global_letters_count * rel_orthographic_density           -0.561360
global_letters_count * rel_synonyms_count                 -1.303210
global_orthographic_density * global_synonyms_count        1.092271
global_orthographic_density * rel_aoa                     -0.223605
global_orthographic_density * rel_clustering              -1.371834
global_orthographic_density * rel_frequency               -0.364012
global_orthographic_density * rel_letters_count           -0.484624
global_orthographic_density * rel_orthographic_density    -0.366733
global_orthographic_density * rel_synonyms_count          -1.151082
global_synonyms_count * rel_aoa                            0.289205
global_synonyms_count * rel_clustering                    -0.603230
global_synonyms_count * rel_frequency                     -0.249608
global_synonyms_count * rel_letters_count                 -0.300621
global_synonyms_count * rel_orthographic_density          -0.199105
global_synonyms_count * rel_synonyms_count                -0.015966
rel_aoa * rel_clustering                                   0.185528
rel_aoa * rel_frequency                                    0.057629
rel_aoa * rel_letters_count                                0.098608
rel_aoa * rel_orthographic_density                        -0.010383
rel_aoa * rel_synonyms_count                              -0.195505
rel_clustering * rel_frequency                            -0.033382
rel_clustering * rel_letters_count                         0.268810
rel_clustering * rel_orthographic_density                  1.071810
rel_clustering * rel_synonyms_count                        0.185482
rel_frequency * rel_letters_count                          0.046970
rel_frequency * rel_orthographic_density                   0.432913
rel_frequency * rel_synonyms_count                         0.516479
rel_letters_count * rel_orthographic_density               0.470142
rel_letters_count * rel_synonyms_count                     0.471373
rel_orthographic_density * rel_synonyms_count              0.140044
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 413 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.10912451883125607

intercept                      3.527097
global_aoa                     0.361235
global_clustering             -0.321410
global_frequency              -0.114174
global_letters_count           0.047893
global_orthographic_density    0.026556
global_synonyms_count          0.290245
dtype: float64

Regressing global aoa with 413 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1749227993962882

intercept                                             -11.221632
global_aoa                                              1.404987
global_clustering                                      -0.864605
global_frequency                                        1.179245
global_letters_count                                    1.915407
global_orthographic_density                            -2.814730
global_synonyms_count                                  -2.192982
global_aoa * global_clustering                          0.045432
global_aoa * global_frequency                          -0.037426
global_aoa * global_letters_count                      -0.085612
global_aoa * global_orthographic_density               -0.012634
global_aoa * global_synonyms_count                      0.241412
global_clustering * global_frequency                    0.066669
global_clustering * global_letters_count                0.054320
global_clustering * global_orthographic_density        -0.491183
global_clustering * global_synonyms_count              -0.187827
global_frequency * global_letters_count                -0.094311
global_frequency * global_orthographic_density         -0.000088
global_frequency * global_synonyms_count               -0.091922
global_letters_count * global_orthographic_density     -0.042923
global_letters_count * global_synonyms_count           -0.027189
global_orthographic_density * global_synonyms_count     0.637091
dtype: float64

Regressing rel aoa with 413 measures, no interactions
           ^^^^^^^
R^2 = 0.05132981016620708

intercept                     -0.341677
global_aoa                     0.137713
global_clustering             -0.196634
global_frequency              -0.208251
global_letters_count           0.109461
global_orthographic_density    0.204156
global_synonyms_count          0.261222
dtype: float64

Regressing rel aoa with 413 measures, with interactions
           ^^^^^^^
R^2 = 0.1395728195808298

intercept                                             -9.902835
global_aoa                                             1.644102
global_clustering                                     -0.491854
global_frequency                                       0.813861
global_letters_count                                   0.483622
global_orthographic_density                           -3.073618
global_synonyms_count                                 -0.837552
global_aoa * global_clustering                         0.061081
global_aoa * global_frequency                         -0.121722
global_aoa * global_letters_count                     -0.038123
global_aoa * global_orthographic_density               0.068518
global_aoa * global_synonyms_count                     0.177765
global_clustering * global_frequency                   0.032056
global_clustering * global_letters_count               0.019515
global_clustering * global_orthographic_density       -0.353173
global_clustering * global_synonyms_count             -0.176389
global_frequency * global_letters_count                0.005861
global_frequency * global_orthographic_density         0.051744
global_frequency * global_synonyms_count              -0.203385
global_letters_count * global_orthographic_density    -0.016896
global_letters_count * global_synonyms_count          -0.022278
global_orthographic_density * global_synonyms_count    0.641956
dtype: float64

Regressing global aoa with 413 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.029225555674777467

intercept                   6.969289
rel_aoa                     0.090850
rel_clustering             -0.001789
rel_frequency               0.074743
rel_letters_count           0.019436
rel_orthographic_density   -0.347772
rel_synonyms_count          0.333772
dtype: float64

Regressing global aoa with 413 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.11865651772185794

intercept                                        7.003876
rel_aoa                                         -0.163680
rel_clustering                                   0.404537
rel_frequency                                    0.140203
rel_letters_count                               -0.151955
rel_orthographic_density                        -0.263598
rel_synonyms_count                               0.815487
rel_aoa * rel_clustering                         0.141254
rel_aoa * rel_frequency                         -0.114462
rel_aoa * rel_letters_count                      0.012221
rel_aoa * rel_orthographic_density               0.186191
rel_aoa * rel_synonyms_count                     0.231451
rel_clustering * rel_frequency                   0.278883
rel_clustering * rel_letters_count              -0.005749
rel_clustering * rel_orthographic_density       -0.252078
rel_clustering * rel_synonyms_count              0.019834
rel_frequency * rel_letters_count               -0.028669
rel_frequency * rel_orthographic_density         0.033305
rel_frequency * rel_synonyms_count              -0.059959
rel_letters_count * rel_orthographic_density    -0.085532
rel_letters_count * rel_synonyms_count          -0.035714
rel_orthographic_density * rel_synonyms_count    0.828669
dtype: float64

Regressing rel aoa with 413 measures, no interactions
           ^^^^^^^
R^2 = 0.20351318008300856

intercept                   0.513710
rel_aoa                     0.488269
rel_clustering             -0.262093
rel_frequency              -0.124271
rel_letters_count           0.027635
rel_orthographic_density    0.218087
rel_synonyms_count          0.192387
dtype: float64

Regressing rel aoa with 413 measures, with interactions
           ^^^^^^^
R^2 = 0.26446034147621345

intercept                                        0.790656
rel_aoa                                          0.465859
rel_clustering                                   0.027848
rel_frequency                                    0.012935
rel_letters_count                                0.010244
rel_orthographic_density                         0.743507
rel_synonyms_count                               0.521911
rel_aoa * rel_clustering                         0.019986
rel_aoa * rel_frequency                         -0.045888
rel_aoa * rel_letters_count                     -0.011929
rel_aoa * rel_orthographic_density               0.118398
rel_aoa * rel_synonyms_count                     0.110477
rel_clustering * rel_frequency                   0.195893
rel_clustering * rel_letters_count              -0.006093
rel_clustering * rel_orthographic_density       -0.210555
rel_clustering * rel_synonyms_count              0.116188
rel_frequency * rel_letters_count                0.007759
rel_frequency * rel_orthographic_density         0.209806
rel_frequency * rel_synonyms_count              -0.106274
rel_letters_count * rel_orthographic_density    -0.047619
rel_letters_count * rel_synonyms_count          -0.033837
rel_orthographic_density * rel_synonyms_count    0.665535
dtype: float64

Regressing global aoa with 413 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.13979990442146373

intercept                      3.597515
global_aoa                     0.506241
global_clustering             -0.312933
global_frequency              -0.234340
global_letters_count           0.142877
global_orthographic_density    0.108974
global_synonyms_count          0.218904
rel_aoa                       -0.226792
rel_clustering                -0.010406
rel_frequency                  0.110737
rel_letters_count             -0.089409
rel_orthographic_density      -0.004686
rel_synonyms_count             0.064090
dtype: float64

Regressing global aoa with 413 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.3381960950103884

intercept                                                 64.023615
global_aoa                                                 2.021192
global_clustering                                         17.749213
global_frequency                                          -0.369577
global_letters_count                                       0.288689
global_orthographic_density                               -9.631026
global_synonyms_count                                    -33.154143
rel_aoa                                                   -0.774091
rel_clustering                                            -4.445735
rel_frequency                                              3.159118
rel_letters_count                                          4.171518
rel_orthographic_density                                   9.345945
rel_synonyms_count                                        24.070291
global_aoa * global_clustering                            -0.211196
global_aoa * global_frequency                             -0.079873
global_aoa * global_letters_count                         -0.382876
global_aoa * global_orthographic_density                  -0.195361
global_aoa * global_synonyms_count                         0.767742
global_aoa * rel_aoa                                       0.074132
global_aoa * rel_clustering                                0.102658
global_aoa * rel_frequency                                 0.075548
global_aoa * rel_letters_count                             0.141750
global_aoa * rel_orthographic_density                     -0.154896
global_aoa * rel_synonyms_count                           -0.536262
global_clustering * global_frequency                      -0.570765
global_clustering * global_letters_count                  -0.849470
global_clustering * global_orthographic_density           -2.843150
global_clustering * global_synonyms_count                 -3.245442
global_clustering * rel_aoa                                0.224938
global_clustering * rel_clustering                        -0.016224
global_clustering * rel_frequency                          0.773868
global_clustering * rel_letters_count                      0.895057
global_clustering * rel_orthographic_density               1.865359
global_clustering * rel_synonyms_count                     2.439685
global_frequency * global_letters_count                   -0.242978
global_frequency * global_orthographic_density            -0.586822
global_frequency * global_synonyms_count                   0.119791
global_frequency * rel_aoa                                 0.049732
global_frequency * rel_clustering                         -0.273243
global_frequency * rel_frequency                           0.020154
global_frequency * rel_letters_count                       0.005573
global_frequency * rel_orthographic_density                0.192666
global_frequency * rel_synonyms_count                     -0.141986
global_letters_count * global_orthographic_density         0.008390
global_letters_count * global_synonyms_count               0.794199
global_letters_count * rel_aoa                             0.104249
global_letters_count * rel_clustering                      0.509024
global_letters_count * rel_frequency                       0.092776
global_letters_count * rel_letters_count                   0.061014
global_letters_count * rel_orthographic_density            0.194545
global_letters_count * rel_synonyms_count                 -0.138598
global_orthographic_density * global_synonyms_count        2.231599
global_orthographic_density * rel_aoa                      0.210418
global_orthographic_density * rel_clustering               2.180955
global_orthographic_density * rel_frequency                0.282163
global_orthographic_density * rel_letters_count           -0.018345
global_orthographic_density * rel_orthographic_density     0.461810
global_orthographic_density * rel_synonyms_count          -1.954724
global_synonyms_count * rel_aoa                           -0.123992
global_synonyms_count * rel_clustering                     1.555865
global_synonyms_count * rel_frequency                     -0.497489
global_synonyms_count * rel_letters_count                 -1.594530
global_synonyms_count * rel_orthographic_density          -2.692360
global_synonyms_count * rel_synonyms_count                 0.226893
rel_aoa * rel_clustering                                   0.001647
rel_aoa * rel_frequency                                   -0.050113
rel_aoa * rel_letters_count                               -0.003993
rel_aoa * rel_orthographic_density                         0.211891
rel_aoa * rel_synonyms_count                               0.079804
rel_clustering * rel_frequency                             0.160769
rel_clustering * rel_letters_count                        -0.466909
rel_clustering * rel_orthographic_density                 -1.250867
rel_clustering * rel_synonyms_count                       -0.958672
rel_frequency * rel_letters_count                          0.035480
rel_frequency * rel_orthographic_density                   0.223723
rel_frequency * rel_synonyms_count                         0.254346
rel_letters_count * rel_orthographic_density               0.047715
rel_letters_count * rel_synonyms_count                     1.033507
rel_orthographic_density * rel_synonyms_count              3.331526
dtype: float64

Regressing rel aoa with 413 measures, no interactions
           ^^^^^^^
R^2 = 0.2377316749712095

intercept                      2.593561
global_aoa                    -0.338131
global_clustering             -0.218135
global_frequency              -0.205303
global_letters_count           0.131222
global_orthographic_density    0.091689
global_synonyms_count          0.352789
rel_aoa                        0.718789
rel_clustering                 0.023519
rel_frequency                  0.035541
rel_letters_count             -0.049585
rel_orthographic_density       0.026865
rel_synonyms_count            -0.125542
dtype: float64

Regressing rel aoa with 413 measures, with interactions
           ^^^^^^^
R^2 = 0.421103796015033

intercept                                                 35.230922
global_aoa                                                 0.760934
global_clustering                                          9.675102
global_frequency                                          -0.623738
global_letters_count                                       0.674750
global_orthographic_density                               -6.665809
global_synonyms_count                                    -23.894164
rel_aoa                                                   -0.135801
rel_clustering                                             1.390938
rel_frequency                                              2.036817
rel_letters_count                                          4.419707
rel_orthographic_density                                   7.353281
rel_synonyms_count                                        17.359390
global_aoa * global_clustering                            -0.053818
global_aoa * global_frequency                             -0.063935
global_aoa * global_letters_count                         -0.222525
global_aoa * global_orthographic_density                   0.022684
global_aoa * global_synonyms_count                         0.663320
global_aoa * rel_aoa                                       0.040408
global_aoa * rel_clustering                                0.038519
global_aoa * rel_frequency                                 0.020100
global_aoa * rel_letters_count                             0.016288
global_aoa * rel_orthographic_density                     -0.308645
global_aoa * rel_synonyms_count                           -0.516482
global_clustering * global_frequency                      -0.393658
global_clustering * global_letters_count                  -0.418066
global_clustering * global_orthographic_density           -1.461746
global_clustering * global_synonyms_count                 -3.152342
global_clustering * rel_aoa                                0.003470
global_clustering * rel_clustering                        -0.013194
global_clustering * rel_frequency                          0.467001
global_clustering * rel_letters_count                      0.713889
global_clustering * rel_orthographic_density               1.001327
global_clustering * rel_synonyms_count                     2.537673
global_frequency * global_letters_count                   -0.128917
global_frequency * global_orthographic_density            -0.187792
global_frequency * global_synonyms_count                  -0.325212
global_frequency * rel_aoa                                 0.054558
global_frequency * rel_clustering                         -0.411150
global_frequency * rel_frequency                           0.039121
global_frequency * rel_letters_count                      -0.039142
global_frequency * rel_orthographic_density               -0.071034
global_frequency * rel_synonyms_count                      0.304379
global_letters_count * global_orthographic_density        -0.033703
global_letters_count * global_synonyms_count               0.421064
global_letters_count * rel_aoa                             0.007987
global_letters_count * rel_clustering                      0.257075
global_letters_count * rel_frequency                       0.025165
global_letters_count * rel_letters_count                   0.064927
global_letters_count * rel_orthographic_density            0.293729
global_letters_count * rel_synonyms_count                  0.063118
global_orthographic_density * global_synonyms_count        1.583464
global_orthographic_density * rel_aoa                      0.003114
global_orthographic_density * rel_clustering               1.041650
global_orthographic_density * rel_frequency                0.060521
global_orthographic_density * rel_letters_count           -0.006852
global_orthographic_density * rel_orthographic_density     0.305249
global_orthographic_density * rel_synonyms_count          -1.488310
global_synonyms_count * rel_aoa                           -0.005698
global_synonyms_count * rel_clustering                     1.289680
global_synonyms_count * rel_frequency                     -0.082451
global_synonyms_count * rel_letters_count                 -1.048726
global_synonyms_count * rel_orthographic_density          -1.722104
global_synonyms_count * rel_synonyms_count                 0.236067
rel_aoa * rel_clustering                                   0.088661
rel_aoa * rel_frequency                                   -0.029060
rel_aoa * rel_letters_count                                0.061989
rel_aoa * rel_orthographic_density                         0.280761
rel_aoa * rel_synonyms_count                              -0.070847
rel_clustering * rel_frequency                             0.376113
rel_clustering * rel_letters_count                        -0.384687
rel_clustering * rel_orthographic_density                 -0.473738
rel_clustering * rel_synonyms_count                       -0.825271
rel_frequency * rel_letters_count                          0.085631
rel_frequency * rel_orthographic_density                   0.315455
rel_frequency * rel_synonyms_count                        -0.172205
rel_letters_count * rel_orthographic_density              -0.006463
rel_letters_count * rel_synonyms_count                     0.666738
rel_orthographic_density * rel_synonyms_count              2.390527
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.056100451696111005

intercept                     -3.612442
global_aoa                    -0.039805
global_clustering              0.178032
global_frequency              -0.071636
global_letters_count          -0.012763
global_orthographic_density   -0.057446
global_synonyms_count         -0.058564
dtype: float64

Regressing global clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.13962366411559135

intercept                                             -4.477820
global_aoa                                             0.177607
global_clustering                                      0.125788
global_frequency                                       0.014552
global_letters_count                                  -0.002353
global_orthographic_density                           -0.449731
global_synonyms_count                                 -0.491980
global_aoa * global_clustering                         0.007714
global_aoa * global_frequency                         -0.039308
global_aoa * global_letters_count                      0.020032
global_aoa * global_orthographic_density               0.036596
global_aoa * global_synonyms_count                     0.052483
global_clustering * global_frequency                  -0.017060
global_clustering * global_letters_count               0.053183
global_clustering * global_orthographic_density       -0.093498
global_clustering * global_synonyms_count             -0.098288
global_frequency * global_letters_count                0.019235
global_frequency * global_orthographic_density        -0.030138
global_frequency * global_synonyms_count               0.023121
global_letters_count * global_orthographic_density    -0.023213
global_letters_count * global_synonyms_count          -0.080273
global_orthographic_density * global_synonyms_count   -0.213433
dtype: float64

Regressing rel clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.04745679236418221

intercept                      2.258779
global_aoa                    -0.031485
global_clustering              0.113403
global_frequency              -0.068766
global_letters_count          -0.036672
global_orthographic_density   -0.093447
global_synonyms_count         -0.106953
dtype: float64

Regressing rel clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.09704742708591774

intercept                                              3.026328
global_aoa                                             0.052043
global_clustering                                      0.158265
global_frequency                                      -0.036735
global_letters_count                                  -0.233423
global_orthographic_density                           -0.420810
global_synonyms_count                                 -1.122833
global_aoa * global_clustering                        -0.002995
global_aoa * global_frequency                         -0.034143
global_aoa * global_letters_count                      0.022316
global_aoa * global_orthographic_density               0.042730
global_aoa * global_synonyms_count                     0.060068
global_clustering * global_frequency                  -0.013134
global_clustering * global_letters_count               0.030592
global_clustering * global_orthographic_density       -0.025357
global_clustering * global_synonyms_count             -0.146122
global_frequency * global_letters_count                0.023161
global_frequency * global_orthographic_density        -0.010071
global_frequency * global_synonyms_count               0.009301
global_letters_count * global_orthographic_density    -0.005881
global_letters_count * global_synonyms_count          -0.036307
global_orthographic_density * global_synonyms_count   -0.119518
dtype: float64

Regressing global clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.01240841487836164

intercept                  -5.795464
rel_aoa                    -0.001313
rel_clustering              0.105422
rel_frequency              -0.010714
rel_letters_count          -0.018108
rel_orthographic_density   -0.029714
rel_synonyms_count         -0.022374
dtype: float64

Regressing global clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.060393825309859706

intercept                                       -5.815892
rel_aoa                                         -0.017441
rel_clustering                                   0.099940
rel_frequency                                   -0.037383
rel_letters_count                                0.013697
rel_orthographic_density                         0.019469
rel_synonyms_count                              -0.124386
rel_aoa * rel_clustering                         0.030702
rel_aoa * rel_frequency                         -0.018300
rel_aoa * rel_letters_count                      0.007666
rel_aoa * rel_orthographic_density               0.063040
rel_aoa * rel_synonyms_count                     0.075259
rel_clustering * rel_frequency                   0.036275
rel_clustering * rel_letters_count               0.005861
rel_clustering * rel_orthographic_density       -0.080572
rel_clustering * rel_synonyms_count             -0.108404
rel_frequency * rel_letters_count                0.021142
rel_frequency * rel_orthographic_density         0.013907
rel_frequency * rel_synonyms_count              -0.020137
rel_letters_count * rel_orthographic_density    -0.014395
rel_letters_count * rel_synonyms_count          -0.035992
rel_orthographic_density * rel_synonyms_count   -0.077206
dtype: float64

Regressing rel clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.10195300954931984

intercept                   0.236768
rel_aoa                    -0.033576
rel_clustering              0.327117
rel_frequency              -0.010428
rel_letters_count          -0.015624
rel_orthographic_density   -0.060327
rel_synonyms_count         -0.011853
dtype: float64

Regressing rel clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.12846509749975177

intercept                                        0.225069
rel_aoa                                         -0.037092
rel_clustering                                   0.308246
rel_frequency                                   -0.029215
rel_letters_count                                0.009861
rel_orthographic_density                        -0.040167
rel_synonyms_count                              -0.086020
rel_aoa * rel_clustering                         0.014022
rel_aoa * rel_frequency                         -0.012128
rel_aoa * rel_letters_count                     -0.004475
rel_aoa * rel_orthographic_density               0.024286
rel_aoa * rel_synonyms_count                     0.064980
rel_clustering * rel_frequency                   0.026031
rel_clustering * rel_letters_count               0.033141
rel_clustering * rel_orthographic_density       -0.014109
rel_clustering * rel_synonyms_count             -0.032270
rel_frequency * rel_letters_count                0.014469
rel_frequency * rel_orthographic_density         0.006397
rel_frequency * rel_synonyms_count              -0.004344
rel_letters_count * rel_orthographic_density    -0.006505
rel_letters_count * rel_synonyms_count          -0.016384
rel_orthographic_density * rel_synonyms_count   -0.035370
dtype: float64

Regressing global clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.09384642474474446

intercept                     -0.581262
global_aoa                    -0.063685
global_clustering              0.374033
global_frequency              -0.181112
global_letters_count          -0.067861
global_orthographic_density   -0.092408
global_synonyms_count         -0.264898
rel_aoa                        0.035843
rel_clustering                -0.218017
rel_frequency                  0.120466
rel_letters_count              0.053054
rel_orthographic_density       0.048041
rel_synonyms_count             0.214467
dtype: float64

Regressing global clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.32484880788910003

intercept                                                 28.353015
global_aoa                                                -0.215229
global_clustering                                          4.219511
global_frequency                                          -2.194864
global_letters_count                                      -1.102855
global_orthographic_density                               -4.486990
global_synonyms_count                                     -1.307056
rel_aoa                                                   -0.333758
rel_clustering                                            -4.641087
rel_frequency                                              1.661774
rel_letters_count                                          1.411185
rel_orthographic_density                                   2.876811
rel_synonyms_count                                         0.874423
global_aoa * global_clustering                            -0.080230
global_aoa * global_frequency                             -0.015599
global_aoa * global_letters_count                         -0.007135
global_aoa * global_orthographic_density                  -0.104365
global_aoa * global_synonyms_count                         0.021463
global_aoa * rel_aoa                                       0.035986
global_aoa * rel_clustering                                0.147384
global_aoa * rel_frequency                                -0.006056
global_aoa * rel_letters_count                             0.023133
global_aoa * rel_orthographic_density                      0.133306
global_aoa * rel_synonyms_count                            0.104248
global_clustering * global_frequency                      -0.247895
global_clustering * global_letters_count                   0.008285
global_clustering * global_orthographic_density           -0.428498
global_clustering * global_synonyms_count                 -0.265979
global_clustering * rel_aoa                                0.069109
global_clustering * rel_clustering                        -0.279705
global_clustering * rel_frequency                          0.203686
global_clustering * rel_letters_count                      0.137656
global_clustering * rel_orthographic_density               0.261647
global_clustering * rel_synonyms_count                     0.512024
global_frequency * global_letters_count                    0.084050
global_frequency * global_orthographic_density             0.174538
global_frequency * global_synonyms_count                   0.050980
global_frequency * rel_aoa                                 0.021496
global_frequency * rel_clustering                          0.055214
global_frequency * rel_frequency                           0.013439
global_frequency * rel_letters_count                      -0.052442
global_frequency * rel_orthographic_density               -0.115032
global_frequency * rel_synonyms_count                      0.079396
global_letters_count * global_orthographic_density         0.113511
global_letters_count * global_synonyms_count              -0.077719
global_letters_count * rel_aoa                             0.034896
global_letters_count * rel_clustering                      0.071599
global_letters_count * rel_frequency                      -0.039337
global_letters_count * rel_letters_count                   0.006656
global_letters_count * rel_orthographic_density           -0.158529
global_letters_count * rel_synonyms_count                  0.041759
global_orthographic_density * global_synonyms_count       -0.439883
global_orthographic_density * rel_aoa                      0.139311
global_orthographic_density * rel_clustering               0.693444
global_orthographic_density * rel_frequency               -0.126116
global_orthographic_density * rel_letters_count           -0.064345
global_orthographic_density * rel_orthographic_density    -0.003181
global_orthographic_density * rel_synonyms_count           0.352624
global_synonyms_count * rel_aoa                           -0.077855
global_synonyms_count * rel_clustering                    -0.302547
global_synonyms_count * rel_frequency                     -0.068884
global_synonyms_count * rel_letters_count                 -0.226782
global_synonyms_count * rel_orthographic_density          -0.322691
global_synonyms_count * rel_synonyms_count                -0.053755
rel_aoa * rel_clustering                                  -0.024952
rel_aoa * rel_frequency                                   -0.005846
rel_aoa * rel_letters_count                               -0.044845
rel_aoa * rel_orthographic_density                        -0.043006
rel_aoa * rel_synonyms_count                               0.054126
rel_clustering * rel_frequency                            -0.020635
rel_clustering * rel_letters_count                        -0.169138
rel_clustering * rel_orthographic_density                 -0.558923
rel_clustering * rel_synonyms_count                       -0.100804
rel_frequency * rel_letters_count                          0.018738
rel_frequency * rel_orthographic_density                   0.079812
rel_frequency * rel_synonyms_count                        -0.057443
rel_letters_count * rel_orthographic_density               0.098204
rel_letters_count * rel_synonyms_count                     0.191702
rel_orthographic_density * rel_synonyms_count              0.378829
dtype: float64

Regressing rel clustering with 359 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.19761085037160497

intercept                     -0.041460
global_aoa                    -0.049579
global_clustering             -0.485249
global_frequency              -0.161278
global_letters_count          -0.076308
global_orthographic_density   -0.101285
global_synonyms_count         -0.232745
rel_aoa                        0.019532
rel_clustering                 0.722552
rel_frequency                  0.102960
rel_letters_count              0.063352
rel_orthographic_density       0.044225
rel_synonyms_count             0.187358
dtype: float64

Regressing rel clustering with 359 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.38805049574665573

intercept                                                 29.228298
global_aoa                                                -0.289178
global_clustering                                          3.000968
global_frequency                                          -2.112278
global_letters_count                                      -1.659934
global_orthographic_density                               -4.485907
global_synonyms_count                                     -2.306346
rel_aoa                                                   -0.289622
rel_clustering                                            -3.307196
rel_frequency                                              1.456751
rel_letters_count                                          1.839337
rel_orthographic_density                                   3.277615
rel_synonyms_count                                         1.724893
global_aoa * global_clustering                            -0.059270
global_aoa * global_frequency                             -0.012606
global_aoa * global_letters_count                          0.018848
global_aoa * global_orthographic_density                  -0.063214
global_aoa * global_synonyms_count                         0.033604
global_aoa * rel_aoa                                       0.034526
global_aoa * rel_clustering                                0.073491
global_aoa * rel_frequency                                -0.012407
global_aoa * rel_letters_count                             0.000345
global_aoa * rel_orthographic_density                      0.103021
global_aoa * rel_synonyms_count                            0.059242
global_clustering * global_frequency                      -0.217071
global_clustering * global_letters_count                  -0.015783
global_clustering * global_orthographic_density           -0.393580
global_clustering * global_synonyms_count                 -0.331847
global_clustering * rel_aoa                                0.039736
global_clustering * rel_clustering                        -0.300934
global_clustering * rel_frequency                          0.171089
global_clustering * rel_letters_count                      0.149678
global_clustering * rel_orthographic_density               0.297968
global_clustering * rel_synonyms_count                     0.563575
global_frequency * global_letters_count                    0.105883
global_frequency * global_orthographic_density             0.169844
global_frequency * global_synonyms_count                  -0.020908
global_frequency * rel_aoa                                 0.014849
global_frequency * rel_clustering                          0.033506
global_frequency * rel_frequency                           0.008377
global_frequency * rel_letters_count                      -0.062966
global_frequency * rel_orthographic_density               -0.111425
global_frequency * rel_synonyms_count                      0.158580
global_letters_count * global_orthographic_density         0.114846
global_letters_count * global_synonyms_count               0.070590
global_letters_count * rel_aoa                             0.010684
global_letters_count * rel_clustering                      0.126865
global_letters_count * rel_frequency                      -0.036014
global_letters_count * rel_letters_count                   0.002466
global_letters_count * rel_orthographic_density           -0.152365
global_letters_count * rel_synonyms_count                 -0.059463
global_orthographic_density * global_synonyms_count       -0.103369
global_orthographic_density * rel_aoa                      0.109472
global_orthographic_density * rel_clustering               0.623953
global_orthographic_density * rel_frequency               -0.091889
global_orthographic_density * rel_letters_count           -0.081728
global_orthographic_density * rel_orthographic_density    -0.004565
global_orthographic_density * rel_synonyms_count          -0.000127
global_synonyms_count * rel_aoa                           -0.061458
global_synonyms_count * rel_clustering                    -0.240464
global_synonyms_count * rel_frequency                     -0.032324
global_synonyms_count * rel_letters_count                 -0.323878
global_synonyms_count * rel_orthographic_density          -0.518202
global_synonyms_count * rel_synonyms_count                -0.050904
rel_aoa * rel_clustering                                   0.022196
rel_aoa * rel_frequency                                    0.003632
rel_aoa * rel_letters_count                               -0.026625
rel_aoa * rel_orthographic_density                        -0.034977
rel_aoa * rel_synonyms_count                               0.038281
rel_clustering * rel_frequency                            -0.009032
rel_clustering * rel_letters_count                        -0.196245
rel_clustering * rel_orthographic_density                 -0.544960
rel_clustering * rel_synonyms_count                       -0.100988
rel_frequency * rel_letters_count                          0.019026
rel_frequency * rel_orthographic_density                   0.069108
rel_frequency * rel_synonyms_count                        -0.095599
rel_letters_count * rel_orthographic_density               0.102513
rel_letters_count * rel_synonyms_count                     0.264807
rel_orthographic_density * rel_synonyms_count              0.581752
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07621615212984911

intercept                      2.652698
global_aoa                     0.091113
global_clustering             -0.456075
global_frequency              -0.058081
global_letters_count           0.164125
global_orthographic_density   -0.119045
global_synonyms_count          0.072793
dtype: float64

Regressing global letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12142711051018018

intercept                                             -18.669056
global_aoa                                              1.490656
global_clustering                                      -3.164437
global_frequency                                        1.501962
global_letters_count                                    0.837845
global_orthographic_density                            -1.067911
global_synonyms_count                                   1.487596
global_aoa * global_clustering                          0.167132
global_aoa * global_frequency                           0.018136
global_aoa * global_letters_count                      -0.091569
global_aoa * global_orthographic_density               -0.082805
global_aoa * global_synonyms_count                      0.096223
global_clustering * global_frequency                    0.258307
global_clustering * global_letters_count               -0.105349
global_clustering * global_orthographic_density        -0.126617
global_clustering * global_synonyms_count               0.066219
global_frequency * global_letters_count                -0.054855
global_frequency * global_orthographic_density          0.120721
global_frequency * global_synonyms_count               -0.081584
global_letters_count * global_orthographic_density     -0.046088
global_letters_count * global_synonyms_count           -0.110530
global_orthographic_density * global_synonyms_count    -0.248820
dtype: float64

Regressing rel letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05060310570869053

intercept                     -1.072625
global_aoa                     0.044794
global_clustering             -0.482486
global_frequency              -0.103929
global_letters_count           0.182960
global_orthographic_density    0.048970
global_synonyms_count         -0.034429
dtype: float64

Regressing rel letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0993501734137947

intercept                                             -18.130451
global_aoa                                              1.109863
global_clustering                                      -3.657857
global_frequency                                        1.224752
global_letters_count                                    0.125929
global_orthographic_density                            -2.448938
global_synonyms_count                                   0.816282
global_aoa * global_clustering                          0.154821
global_aoa * global_frequency                           0.012902
global_aoa * global_letters_count                      -0.054322
global_aoa * global_orthographic_density               -0.022415
global_aoa * global_synonyms_count                      0.100953
global_clustering * global_frequency                    0.301227
global_clustering * global_letters_count               -0.068218
global_clustering * global_orthographic_density        -0.147342
global_clustering * global_synonyms_count              -0.003815
global_frequency * global_letters_count                 0.014686
global_frequency * global_orthographic_density          0.218205
global_frequency * global_synonyms_count               -0.145232
global_letters_count * global_orthographic_density     -0.038004
global_letters_count * global_synonyms_count           -0.026884
global_orthographic_density * global_synonyms_count    -0.110808
dtype: float64

Regressing global letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.04839202668305309

intercept                   5.782405
rel_aoa                    -0.090169
rel_clustering             -0.141479
rel_frequency               0.018542
rel_letters_count           0.139033
rel_orthographic_density   -0.327722
rel_synonyms_count          0.110489
dtype: float64

Regressing global letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07048294540599565

intercept                                        5.752054
rel_aoa                                         -0.268117
rel_clustering                                   0.198168
rel_frequency                                    0.074139
rel_letters_count                                0.232443
rel_orthographic_density                        -0.206459
rel_synonyms_count                              -0.091568
rel_aoa * rel_clustering                         0.032544
rel_aoa * rel_frequency                         -0.047057
rel_aoa * rel_letters_count                      0.005636
rel_aoa * rel_orthographic_density              -0.007875
rel_aoa * rel_synonyms_count                     0.047009
rel_clustering * rel_frequency                   0.099166
rel_clustering * rel_letters_count              -0.075869
rel_clustering * rel_orthographic_density       -0.078808
rel_clustering * rel_synonyms_count              0.053546
rel_frequency * rel_letters_count                0.002620
rel_frequency * rel_orthographic_density         0.074048
rel_frequency * rel_synonyms_count              -0.070296
rel_letters_count * rel_orthographic_density     0.041301
rel_letters_count * rel_synonyms_count          -0.052533
rel_orthographic_density * rel_synonyms_count   -0.026585
dtype: float64

Regressing rel letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.12306626391549846

intercept                   1.232036
rel_aoa                    -0.061940
rel_clustering             -0.270561
rel_frequency              -0.177437
rel_letters_count           0.346540
rel_orthographic_density    0.039945
rel_synonyms_count          0.080002
dtype: float64

Regressing rel letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.14885647371553967

intercept                                        1.278733
rel_aoa                                         -0.116274
rel_clustering                                   0.111986
rel_frequency                                   -0.144788
rel_letters_count                                0.454346
rel_orthographic_density                         0.340033
rel_synonyms_count                              -0.165467
rel_aoa * rel_clustering                         0.031194
rel_aoa * rel_frequency                         -0.011974
rel_aoa * rel_letters_count                     -0.038715
rel_aoa * rel_orthographic_density              -0.084395
rel_aoa * rel_synonyms_count                     0.078895
rel_clustering * rel_frequency                   0.165812
rel_clustering * rel_letters_count               0.016500
rel_clustering * rel_orthographic_density       -0.007644
rel_clustering * rel_synonyms_count              0.136958
rel_frequency * rel_letters_count                0.016536
rel_frequency * rel_orthographic_density         0.125304
rel_frequency * rel_synonyms_count              -0.048901
rel_letters_count * rel_orthographic_density     0.026957
rel_letters_count * rel_synonyms_count          -0.048707
rel_orthographic_density * rel_synonyms_count   -0.066020
dtype: float64

Regressing global letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10219411518453203

intercept                     -1.215732
global_aoa                     0.243269
global_clustering             -0.801429
global_frequency              -0.021281
global_letters_count           0.199806
global_orthographic_density   -0.046855
global_synonyms_count          0.211183
rel_aoa                       -0.239025
rel_clustering                 0.401234
rel_frequency                 -0.055969
rel_letters_count             -0.024889
rel_orthographic_density      -0.058998
rel_synonyms_count            -0.117385
dtype: float64

Regressing global letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.25587483103509645

intercept                                                -3.868276
global_aoa                                                4.438116
global_clustering                                         3.793670
global_frequency                                          1.699256
global_letters_count                                     -1.136281
global_orthographic_density                              -1.887159
global_synonyms_count                                    -4.687693
rel_aoa                                                  -3.055215
rel_clustering                                           -8.215833
rel_frequency                                             0.369564
rel_letters_count                                         0.663834
rel_orthographic_density                                 -2.282779
rel_synonyms_count                                        2.027730
global_aoa * global_clustering                            0.343410
global_aoa * global_frequency                            -0.068477
global_aoa * global_letters_count                        -0.342835
global_aoa * global_orthographic_density                  0.081603
global_aoa * global_synonyms_count                        0.337729
global_aoa * rel_aoa                                      0.019130
global_aoa * rel_clustering                              -0.057459
global_aoa * rel_frequency                                0.144503
global_aoa * rel_letters_count                            0.113663
global_aoa * rel_orthographic_density                    -0.379001
global_aoa * rel_synonyms_count                          -0.303811
global_clustering * global_frequency                      0.015275
global_clustering * global_letters_count                 -1.037719
global_clustering * global_orthographic_density          -0.840034
global_clustering * global_synonyms_count                -0.886157
global_clustering * rel_aoa                              -0.063858
global_clustering * rel_clustering                        0.013391
global_clustering * rel_frequency                         0.277336
global_clustering * rel_letters_count                     0.646646
global_clustering * rel_orthographic_density             -0.068236
global_clustering * rel_synonyms_count                    0.593759
global_frequency * global_letters_count                  -0.209612
global_frequency * global_orthographic_density           -0.093664
global_frequency * global_synonyms_count                 -0.172612
global_frequency * rel_aoa                                0.118340
global_frequency * rel_clustering                         0.336627
global_frequency * rel_frequency                          0.038837
global_frequency * rel_letters_count                      0.166530
global_frequency * rel_orthographic_density               0.110826
global_frequency * rel_synonyms_count                     0.243657
global_letters_count * global_orthographic_density       -0.428240
global_letters_count * global_synonyms_count              0.176314
global_letters_count * rel_aoa                            0.275958
global_letters_count * rel_clustering                     0.995183
global_letters_count * rel_frequency                     -0.000397
global_letters_count * rel_letters_count                  0.102105
global_letters_count * rel_orthographic_density           0.372405
global_letters_count * rel_synonyms_count                 0.241222
global_orthographic_density * global_synonyms_count      -0.697346
global_orthographic_density * rel_aoa                    -0.119102
global_orthographic_density * rel_clustering              0.502045
global_orthographic_density * rel_frequency               0.068932
global_orthographic_density * rel_letters_count           0.356026
global_orthographic_density * rel_orthographic_density    0.361823
global_orthographic_density * rel_synonyms_count         -0.190366
global_synonyms_count * rel_aoa                          -0.143364
global_synonyms_count * rel_clustering                    0.841660
global_synonyms_count * rel_frequency                    -0.045709
global_synonyms_count * rel_letters_count                -0.488184
global_synonyms_count * rel_orthographic_density          0.670954
global_synonyms_count * rel_synonyms_count               -0.096197
rel_aoa * rel_clustering                                 -0.048564
rel_aoa * rel_frequency                                  -0.173219
rel_aoa * rel_letters_count                              -0.170811
rel_aoa * rel_orthographic_density                        0.305987
rel_aoa * rel_synonyms_count                              0.193151
rel_clustering * rel_frequency                           -0.314208
rel_clustering * rel_letters_count                       -0.620326
rel_clustering * rel_orthographic_density                 0.513687
rel_clustering * rel_synonyms_count                      -0.649071
rel_frequency * rel_letters_count                         0.016639
rel_frequency * rel_orthographic_density                  0.103832
rel_frequency * rel_synonyms_count                       -0.115786
rel_letters_count * rel_orthographic_density              0.037922
rel_letters_count * rel_synonyms_count                    0.033496
rel_orthographic_density * rel_synonyms_count             0.212174
dtype: float64

Regressing rel letters_count with 458 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.18504662388480464

intercept                     -2.334608
global_aoa                     0.159823
global_clustering             -0.794340
global_frequency               0.022921
global_letters_count          -0.540638
global_orthographic_density   -0.046973
global_synonyms_count          0.207569
rel_aoa                       -0.154847
rel_clustering                 0.423070
rel_frequency                 -0.106184
rel_letters_count              0.756938
rel_orthographic_density      -0.075050
rel_synonyms_count            -0.150009
dtype: float64

Regressing rel letters_count with 458 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.309341313834236

intercept                                                -2.108898
global_aoa                                                2.745008
global_clustering                                         0.984267
global_frequency                                          0.902804
global_letters_count                                     -2.103878
global_orthographic_density                              -4.047861
global_synonyms_count                                    -4.166095
rel_aoa                                                  -1.673899
rel_clustering                                           -4.847460
rel_frequency                                             0.599633
rel_letters_count                                         0.655261
rel_orthographic_density                                 -3.693679
rel_synonyms_count                                        1.580031
global_aoa * global_clustering                            0.244083
global_aoa * global_frequency                            -0.004763
global_aoa * global_letters_count                        -0.259154
global_aoa * global_orthographic_density                  0.050754
global_aoa * global_synonyms_count                        0.278138
global_aoa * rel_aoa                                      0.016740
global_aoa * rel_clustering                               0.048153
global_aoa * rel_frequency                                0.088123
global_aoa * rel_letters_count                            0.080936
global_aoa * rel_orthographic_density                    -0.281137
global_aoa * rel_synonyms_count                          -0.287665
global_clustering * global_frequency                      0.124050
global_clustering * global_letters_count                 -0.703184
global_clustering * global_orthographic_density          -0.566285
global_clustering * global_synonyms_count                -0.416880
global_clustering * rel_aoa                              -0.032811
global_clustering * rel_clustering                       -0.025665
global_clustering * rel_frequency                         0.137221
global_clustering * rel_letters_count                     0.302432
global_clustering * rel_orthographic_density             -0.464632
global_clustering * rel_synonyms_count                   -0.123975
global_frequency * global_letters_count                  -0.070683
global_frequency * global_orthographic_density            0.191293
global_frequency * global_synonyms_count                 -0.022040
global_frequency * rel_aoa                                0.086407
global_frequency * rel_clustering                         0.142328
global_frequency * rel_frequency                          0.053079
global_frequency * rel_letters_count                      0.071907
global_frequency * rel_orthographic_density              -0.000716
global_frequency * rel_synonyms_count                    -0.037736
global_letters_count * global_orthographic_density       -0.280207
global_letters_count * global_synonyms_count              0.393497
global_letters_count * rel_aoa                            0.181971
global_letters_count * rel_clustering                     0.629953
global_letters_count * rel_frequency                     -0.090183
global_letters_count * rel_letters_count                  0.087161
global_letters_count * rel_orthographic_density           0.330163
global_letters_count * rel_synonyms_count                -0.044473
global_orthographic_density * global_synonyms_count      -0.719174
global_orthographic_density * rel_aoa                    -0.240275
global_orthographic_density * rel_clustering              0.270388
global_orthographic_density * rel_frequency              -0.207140
global_orthographic_density * rel_letters_count           0.342630
global_orthographic_density * rel_orthographic_density    0.358301
global_orthographic_density * rel_synonyms_count          0.015512
global_synonyms_count * rel_aoa                          -0.140583
global_synonyms_count * rel_clustering                    0.821384
global_synonyms_count * rel_frequency                    -0.163444
global_synonyms_count * rel_letters_count                -0.628967
global_synonyms_count * rel_orthographic_density          0.755581
global_synonyms_count * rel_synonyms_count               -0.167742
rel_aoa * rel_clustering                                 -0.143537
rel_aoa * rel_frequency                                  -0.132673
rel_aoa * rel_letters_count                              -0.127405
rel_aoa * rel_orthographic_density                        0.363214
rel_aoa * rel_synonyms_count                              0.277558
rel_clustering * rel_frequency                           -0.096543
rel_clustering * rel_letters_count                       -0.226455
rel_clustering * rel_orthographic_density                 0.789948
rel_clustering * rel_synonyms_count                      -0.288767
rel_frequency * rel_letters_count                         0.076127
rel_frequency * rel_orthographic_density                  0.214339
rel_frequency * rel_synonyms_count                        0.115926
rel_letters_count * rel_orthographic_density             -0.047296
rel_letters_count * rel_synonyms_count                    0.234452
rel_orthographic_density * rel_synonyms_count             0.013579
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07563838151924751

intercept                      0.871709
global_aoa                    -0.004464
global_clustering              0.029693
global_frequency              -0.027971
global_letters_count          -0.023789
global_orthographic_density    0.051906
global_synonyms_count          0.182192
dtype: float64

Regressing global synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09884838498502824

intercept                                              1.244995
global_aoa                                             0.126200
global_clustering                                      0.305444
global_frequency                                      -0.024268
global_letters_count                                   0.026816
global_orthographic_density                           -0.156217
global_synonyms_count                                 -0.613215
global_aoa * global_clustering                        -0.009499
global_aoa * global_frequency                         -0.015208
global_aoa * global_letters_count                     -0.006789
global_aoa * global_orthographic_density              -0.014325
global_aoa * global_synonyms_count                     0.038502
global_clustering * global_frequency                  -0.018512
global_clustering * global_letters_count               0.003606
global_clustering * global_orthographic_density       -0.032979
global_clustering * global_synonyms_count             -0.050736
global_frequency * global_letters_count                0.000356
global_frequency * global_orthographic_density        -0.000208
global_frequency * global_synonyms_count              -0.003873
global_letters_count * global_orthographic_density     0.015068
global_letters_count * global_synonyms_count           0.025250
global_orthographic_density * global_synonyms_count    0.091604
dtype: float64

Regressing rel synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.04690287404721161

intercept                      0.473417
global_aoa                    -0.008795
global_clustering              0.005544
global_frequency              -0.021192
global_letters_count          -0.027410
global_orthographic_density    0.026156
global_synonyms_count          0.122924
dtype: float64

Regressing rel synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.0715164844278724

intercept                                              0.711690
global_aoa                                             0.089444
global_clustering                                      0.141095
global_frequency                                      -0.086216
global_letters_count                                   0.060996
global_orthographic_density                           -0.127597
global_synonyms_count                                 -0.659782
global_aoa * global_clustering                        -0.006130
global_aoa * global_frequency                         -0.010639
global_aoa * global_letters_count                     -0.006718
global_aoa * global_orthographic_density              -0.008585
global_aoa * global_synonyms_count                     0.047603
global_clustering * global_frequency                  -0.017938
global_clustering * global_letters_count               0.014602
global_clustering * global_orthographic_density       -0.000368
global_clustering * global_synonyms_count             -0.043506
global_frequency * global_letters_count                0.004050
global_frequency * global_orthographic_density         0.015992
global_frequency * global_synonyms_count              -0.007340
global_letters_count * global_orthographic_density     0.005980
global_letters_count * global_synonyms_count           0.026086
global_orthographic_density * global_synonyms_count    0.086239
dtype: float64

Regressing global synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07579621858165364

intercept                   0.451220
rel_aoa                     0.028833
rel_clustering             -0.024612
rel_frequency              -0.023969
rel_letters_count          -0.038700
rel_orthographic_density    0.052237
rel_synonyms_count          0.169601
dtype: float64

Regressing global synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11300654236219863

intercept                                        0.507656
rel_aoa                                          0.006613
rel_clustering                                  -0.095122
rel_frequency                                   -0.006256
rel_letters_count                               -0.087348
rel_orthographic_density                         0.034977
rel_synonyms_count                               0.080366
rel_aoa * rel_clustering                        -0.009129
rel_aoa * rel_frequency                         -0.009314
rel_aoa * rel_letters_count                      0.016666
rel_aoa * rel_orthographic_density               0.036562
rel_aoa * rel_synonyms_count                     0.032059
rel_clustering * rel_frequency                  -0.006930
rel_clustering * rel_letters_count               0.011067
rel_clustering * rel_orthographic_density       -0.035312
rel_clustering * rel_synonyms_count             -0.058075
rel_frequency * rel_letters_count               -0.005832
rel_frequency * rel_orthographic_density        -0.004637
rel_frequency * rel_synonyms_count               0.000467
rel_letters_count * rel_orthographic_density    -0.009200
rel_letters_count * rel_synonyms_count           0.045150
rel_orthographic_density * rel_synonyms_count    0.037893
dtype: float64

Regressing rel synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.11762620023693592

intercept                   0.111921
rel_aoa                     0.009383
rel_clustering              0.017089
rel_frequency              -0.018711
rel_letters_count          -0.036503
rel_orthographic_density    0.010931
rel_synonyms_count          0.287694
dtype: float64

Regressing rel synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.15109672279021757

intercept                                        0.186338
rel_aoa                                         -0.006571
rel_clustering                                  -0.095358
rel_frequency                                    0.008635
rel_letters_count                               -0.083222
rel_orthographic_density                         0.017296
rel_synonyms_count                               0.278385
rel_aoa * rel_clustering                        -0.003201
rel_aoa * rel_frequency                         -0.005969
rel_aoa * rel_letters_count                      0.013074
rel_aoa * rel_orthographic_density               0.026115
rel_aoa * rel_synonyms_count                     0.016660
rel_clustering * rel_frequency                  -0.014300
rel_clustering * rel_letters_count               0.016519
rel_clustering * rel_orthographic_density       -0.044831
rel_clustering * rel_synonyms_count             -0.029480
rel_frequency * rel_letters_count               -0.006579
rel_frequency * rel_orthographic_density         0.004580
rel_frequency * rel_synonyms_count               0.012779
rel_letters_count * rel_orthographic_density    -0.005969
rel_letters_count * rel_synonyms_count           0.043557
rel_orthographic_density * rel_synonyms_count    0.063287
dtype: float64

Regressing global synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09739991386804703

intercept                      1.485727
global_aoa                    -0.038611
global_clustering              0.133709
global_frequency              -0.028038
global_letters_count           0.023418
global_orthographic_density    0.088479
global_synonyms_count          0.104291
rel_aoa                        0.052449
rel_clustering                -0.122954
rel_frequency                  0.001506
rel_letters_count             -0.051986
rel_orthographic_density      -0.038506
rel_synonyms_count             0.077633
dtype: float64

Regressing global synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.24411288746155066

intercept                                                 1.111779
global_aoa                                                0.508201
global_clustering                                         1.637346
global_frequency                                          0.128940
global_letters_count                                      0.462808
global_orthographic_density                               1.389485
global_synonyms_count                                     3.102437
rel_aoa                                                  -0.382792
rel_clustering                                           -2.057511
rel_frequency                                            -0.193722
rel_letters_count                                        -0.604056
rel_orthographic_density                                 -0.974170
rel_synonyms_count                                       -6.988279
global_aoa * global_clustering                           -0.038997
global_aoa * global_frequency                            -0.034777
global_aoa * global_letters_count                        -0.043248
global_aoa * global_orthographic_density                 -0.111394
global_aoa * global_synonyms_count                        0.011124
global_aoa * rel_aoa                                     -0.009725
global_aoa * rel_clustering                               0.077538
global_aoa * rel_frequency                                0.021073
global_aoa * rel_letters_count                            0.022092
global_aoa * rel_orthographic_density                     0.087386
global_aoa * rel_synonyms_count                           0.068664
global_clustering * global_frequency                     -0.073523
global_clustering * global_letters_count                  0.009353
global_clustering * global_orthographic_density          -0.198250
global_clustering * global_synonyms_count                 0.237701
global_clustering * rel_aoa                              -0.032664
global_clustering * rel_clustering                       -0.011240
global_clustering * rel_frequency                         0.046132
global_clustering * rel_letters_count                    -0.030519
global_clustering * rel_orthographic_density              0.196523
global_clustering * rel_synonyms_count                   -0.611403
global_frequency * global_letters_count                  -0.002109
global_frequency * global_orthographic_density           -0.164306
global_frequency * global_synonyms_count                  0.063342
global_frequency * rel_aoa                                0.017277
global_frequency * rel_clustering                         0.084468
global_frequency * rel_frequency                          0.007481
global_frequency * rel_letters_count                     -0.018273
global_frequency * rel_orthographic_density               0.131174
global_frequency * rel_synonyms_count                    -0.011105
global_letters_count * global_orthographic_density        0.000954
global_letters_count * global_synonyms_count             -0.324346
global_letters_count * rel_aoa                            0.026704
global_letters_count * rel_clustering                    -0.037092
global_letters_count * rel_frequency                      0.000886
global_letters_count * rel_letters_count                  0.013547
global_letters_count * rel_orthographic_density           0.005683
global_letters_count * rel_synonyms_count                 0.376290
global_orthographic_density * global_synonyms_count      -0.475455
global_orthographic_density * rel_aoa                    -0.000339
global_orthographic_density * rel_clustering              0.193234
global_orthographic_density * rel_frequency               0.130514
global_orthographic_density * rel_letters_count           0.165608
global_orthographic_density * rel_orthographic_density    0.134285
global_orthographic_density * rel_synonyms_count          0.606379
global_synonyms_count * rel_aoa                          -0.112175
global_synonyms_count * rel_clustering                    0.083470
global_synonyms_count * rel_frequency                    -0.025949
global_synonyms_count * rel_letters_count                 0.236101
global_synonyms_count * rel_orthographic_density          0.281052
global_synonyms_count * rel_synonyms_count                0.098957
rel_aoa * rel_clustering                                 -0.019136
rel_aoa * rel_frequency                                  -0.021719
rel_aoa * rel_letters_count                               0.007972
rel_aoa * rel_orthographic_density                        0.033322
rel_aoa * rel_synonyms_count                              0.081875
rel_clustering * rel_frequency                           -0.060588
rel_clustering * rel_letters_count                        0.068679
rel_clustering * rel_orthographic_density                -0.234911
rel_clustering * rel_synonyms_count                       0.233170
rel_frequency * rel_letters_count                         0.024320
rel_frequency * rel_orthographic_density                 -0.091722
rel_frequency * rel_synonyms_count                       -0.041471
rel_letters_count * rel_orthographic_density             -0.084109
rel_letters_count * rel_synonyms_count                   -0.212501
rel_orthographic_density * rel_synonyms_count            -0.317833
dtype: float64

Regressing rel synonyms_count with 446 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21876441895578935

intercept                      0.860340
global_aoa                    -0.040079
global_clustering              0.073995
global_frequency              -0.021106
global_letters_count           0.033462
global_orthographic_density    0.104497
global_synonyms_count         -0.622005
rel_aoa                        0.045987
rel_clustering                -0.056422
rel_frequency                 -0.001345
rel_letters_count             -0.059107
rel_orthographic_density      -0.080017
rel_synonyms_count             0.875065
dtype: float64

Regressing rel synonyms_count with 446 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.361660528557886

intercept                                                -1.974317
global_aoa                                                0.241436
global_clustering                                         1.254920
global_frequency                                          0.167670
global_letters_count                                      0.559755
global_orthographic_density                               2.783855
global_synonyms_count                                     3.166774
rel_aoa                                                  -0.213929
rel_clustering                                           -1.573258
rel_frequency                                            -0.545338
rel_letters_count                                        -0.835405
rel_orthographic_density                                 -2.582151
rel_synonyms_count                                       -7.374683
global_aoa * global_clustering                           -0.046649
global_aoa * global_frequency                            -0.019258
global_aoa * global_letters_count                        -0.031394
global_aoa * global_orthographic_density                 -0.111435
global_aoa * global_synonyms_count                        0.025209
global_aoa * rel_aoa                                     -0.007106
global_aoa * rel_clustering                               0.089604
global_aoa * rel_frequency                                0.017044
global_aoa * rel_letters_count                            0.021907
global_aoa * rel_orthographic_density                     0.100254
global_aoa * rel_synonyms_count                           0.063311
global_clustering * global_frequency                     -0.070253
global_clustering * global_letters_count                  0.017948
global_clustering * global_orthographic_density          -0.081560
global_clustering * global_synonyms_count                 0.202168
global_clustering * rel_aoa                              -0.032927
global_clustering * rel_clustering                        0.001420
global_clustering * rel_frequency                         0.012077
global_clustering * rel_letters_count                    -0.027349
global_clustering * rel_orthographic_density              0.134647
global_clustering * rel_synonyms_count                   -0.555246
global_frequency * global_letters_count                  -0.001661
global_frequency * global_orthographic_density           -0.199705
global_frequency * global_synonyms_count                 -0.035755
global_frequency * rel_aoa                                0.001322
global_frequency * rel_clustering                         0.062712
global_frequency * rel_frequency                          0.005896
global_frequency * rel_letters_count                     -0.000941
global_frequency * rel_orthographic_density               0.207027
global_frequency * rel_synonyms_count                     0.109515
global_letters_count * global_orthographic_density       -0.046432
global_letters_count * global_synonyms_count             -0.297249
global_letters_count * rel_aoa                            0.017300
global_letters_count * rel_clustering                    -0.038787
global_letters_count * rel_frequency                      0.010892
global_letters_count * rel_letters_count                  0.012312
global_letters_count * rel_orthographic_density           0.061166
global_letters_count * rel_synonyms_count                 0.363110
global_orthographic_density * global_synonyms_count      -0.499922
global_orthographic_density * rel_aoa                     0.001725
global_orthographic_density * rel_clustering              0.130020
global_orthographic_density * rel_frequency               0.185376
global_orthographic_density * rel_letters_count           0.182336
global_orthographic_density * rel_orthographic_density    0.109444
global_orthographic_density * rel_synonyms_count          0.715674
global_synonyms_count * rel_aoa                          -0.069724
global_synonyms_count * rel_clustering                    0.126366
global_synonyms_count * rel_frequency                     0.072188
global_synonyms_count * rel_letters_count                 0.240349
global_synonyms_count * rel_orthographic_density          0.410551
global_synonyms_count * rel_synonyms_count                0.121113
rel_aoa * rel_clustering                                 -0.023575
rel_aoa * rel_frequency                                  -0.015523
rel_aoa * rel_letters_count                               0.007277
rel_aoa * rel_orthographic_density                        0.025071
rel_aoa * rel_synonyms_count                              0.025409
rel_clustering * rel_frequency                           -0.021745
rel_clustering * rel_letters_count                        0.051296
rel_clustering * rel_orthographic_density                -0.213731
rel_clustering * rel_synonyms_count                       0.168513
rel_frequency * rel_letters_count                         0.003647
rel_frequency * rel_orthographic_density                 -0.159815
rel_frequency * rel_synonyms_count                       -0.162003
rel_letters_count * rel_orthographic_density             -0.123536
rel_letters_count * rel_synonyms_count                   -0.235277
rel_orthographic_density * rel_synonyms_count            -0.521550
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09789452671807675

intercept                      1.379965
global_aoa                    -0.019169
global_clustering              0.124221
global_frequency               0.026693
global_letters_count           0.015255
global_orthographic_density    0.297722
global_synonyms_count          0.082081
dtype: float64

Regressing global orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12933350576192293

intercept                                              4.794028
global_aoa                                            -0.482706
global_clustering                                      0.843030
global_frequency                                       0.291994
global_letters_count                                  -0.257916
global_orthographic_density                            0.399734
global_synonyms_count                                 -0.497204
global_aoa * global_clustering                        -0.064957
global_aoa * global_frequency                         -0.020724
global_aoa * global_letters_count                      0.034062
global_aoa * global_orthographic_density               0.068805
global_aoa * global_synonyms_count                    -0.017755
global_clustering * global_frequency                  -0.011360
global_clustering * global_letters_count              -0.011512
global_clustering * global_orthographic_density       -0.068327
global_clustering * global_synonyms_count             -0.004583
global_frequency * global_letters_count               -0.008578
global_frequency * global_orthographic_density        -0.110031
global_frequency * global_synonyms_count               0.035533
global_letters_count * global_orthographic_density     0.002906
global_letters_count * global_synonyms_count           0.042196
global_orthographic_density * global_synonyms_count    0.056491
dtype: float64

Regressing rel orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.0813553067307643

intercept                     -0.880763
global_aoa                    -0.005181
global_clustering              0.123370
global_frequency               0.050972
global_letters_count          -0.011286
global_orthographic_density    0.208783
global_synonyms_count          0.105575
dtype: float64

Regressing rel orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11272663400868632

intercept                                              3.539560
global_aoa                                            -0.376309
global_clustering                                      1.236300
global_frequency                                       0.205604
global_letters_count                                  -0.251212
global_orthographic_density                            0.562292
global_synonyms_count                                 -0.647106
global_aoa * global_clustering                        -0.046003
global_aoa * global_frequency                         -0.006889
global_aoa * global_letters_count                      0.024424
global_aoa * global_orthographic_density               0.027241
global_aoa * global_synonyms_count                    -0.016003
global_clustering * global_frequency                  -0.040338
global_clustering * global_letters_count              -0.050722
global_clustering * global_orthographic_density       -0.072374
global_clustering * global_synonyms_count             -0.061128
global_frequency * global_letters_count               -0.030626
global_frequency * global_orthographic_density        -0.119965
global_frequency * global_synonyms_count               0.030302
global_letters_count * global_orthographic_density     0.024620
global_letters_count * global_synonyms_count           0.033183
global_orthographic_density * global_synonyms_count    0.021865
dtype: float64

Regressing global orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08164117905797975

intercept                   1.490323
rel_aoa                     0.025175
rel_clustering              0.059160
rel_frequency              -0.003259
rel_letters_count           0.040477
rel_orthographic_density    0.371054
rel_synonyms_count          0.113897
dtype: float64

Regressing global orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12057947654756063

intercept                                        1.403291
rel_aoa                                          0.117358
rel_clustering                                   0.112499
rel_frequency                                   -0.085081
rel_letters_count                               -0.020606
rel_orthographic_density                         0.221208
rel_synonyms_count                               0.257453
rel_aoa * rel_clustering                        -0.037473
rel_aoa * rel_frequency                          0.015063
rel_aoa * rel_letters_count                     -0.010870
rel_aoa * rel_orthographic_density               0.001041
rel_aoa * rel_synonyms_count                    -0.009032
rel_clustering * rel_frequency                   0.024447
rel_clustering * rel_letters_count               0.048658
rel_clustering * rel_orthographic_density        0.080093
rel_clustering * rel_synonyms_count             -0.004041
rel_frequency * rel_letters_count               -0.004706
rel_frequency * rel_orthographic_density        -0.077137
rel_frequency * rel_synonyms_count               0.033396
rel_letters_count * rel_orthographic_density    -0.029182
rel_letters_count * rel_synonyms_count           0.062203
rel_orthographic_density * rel_synonyms_count    0.157057
dtype: float64

Regressing rel orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13443426102929346

intercept                  -0.536577
rel_aoa                     0.019366
rel_clustering              0.060603
rel_frequency               0.048811
rel_letters_count           0.034306
rel_orthographic_density    0.410155
rel_synonyms_count          0.085590
dtype: float64

Regressing rel orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1613139626848017

intercept                                       -0.580054
rel_aoa                                          0.096963
rel_clustering                                   0.095554
rel_frequency                                    0.011522
rel_letters_count                               -0.012033
rel_orthographic_density                         0.236959
rel_synonyms_count                               0.131123
rel_aoa * rel_clustering                        -0.030217
rel_aoa * rel_frequency                          0.011718
rel_aoa * rel_letters_count                     -0.004885
rel_aoa * rel_orthographic_density               0.011876
rel_aoa * rel_synonyms_count                    -0.016603
rel_clustering * rel_frequency                   0.008080
rel_clustering * rel_letters_count               0.023982
rel_clustering * rel_orthographic_density        0.057666
rel_clustering * rel_synonyms_count             -0.023645
rel_frequency * rel_letters_count               -0.015763
rel_frequency * rel_orthographic_density        -0.066693
rel_frequency * rel_synonyms_count               0.012807
rel_letters_count * rel_orthographic_density    -0.000895
rel_letters_count * rel_synonyms_count           0.047878
rel_orthographic_density * rel_synonyms_count    0.083221
dtype: float64

Regressing global orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11603483995054231

intercept                      3.019423
global_aoa                    -0.053408
global_clustering              0.175748
global_frequency              -0.006276
global_letters_count          -0.100659
global_orthographic_density    0.195873
global_synonyms_count         -0.064928
rel_aoa                        0.057031
rel_clustering                -0.049995
rel_frequency                  0.044915
rel_letters_count              0.123433
rel_orthographic_density       0.101817
rel_synonyms_count             0.178393
dtype: float64

Regressing global orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2630541427768328

intercept                                                -8.425130
global_aoa                                               -0.567098
global_clustering                                        -0.793244
global_frequency                                          1.009044
global_letters_count                                      1.341809
global_orthographic_density                              -0.112965
global_synonyms_count                                     3.368236
rel_aoa                                                  -0.683584
rel_clustering                                            3.363308
rel_frequency                                            -0.992991
rel_letters_count                                        -1.169316
rel_orthographic_density                                 -1.827537
rel_synonyms_count                                        1.280335
global_aoa * global_clustering                           -0.028065
global_aoa * global_frequency                            -0.008635
global_aoa * global_letters_count                         0.055000
global_aoa * global_orthographic_density                  0.106395
global_aoa * global_synonyms_count                       -0.234238
global_aoa * rel_aoa                                      0.007523
global_aoa * rel_clustering                              -0.032773
global_aoa * rel_frequency                               -0.025277
global_aoa * rel_letters_count                           -0.000891
global_aoa * rel_orthographic_density                    -0.028728
global_aoa * rel_synonyms_count                           0.288262
global_clustering * global_frequency                      0.060672
global_clustering * global_letters_count                  0.201077
global_clustering * global_orthographic_density          -0.315149
global_clustering * global_synonyms_count                -0.096936
global_clustering * rel_aoa                              -0.203445
global_clustering * rel_clustering                        0.129293
global_clustering * rel_frequency                        -0.233902
global_clustering * rel_letters_count                    -0.042127
global_clustering * rel_orthographic_density              0.596193
global_clustering * rel_synonyms_count                    0.526505
global_frequency * global_letters_count                  -0.021461
global_frequency * global_orthographic_density           -0.190034
global_frequency * global_synonyms_count                 -0.095859
global_frequency * rel_aoa                                0.003520
global_frequency * rel_clustering                        -0.070530
global_frequency * rel_frequency                         -0.022241
global_frequency * rel_letters_count                      0.069531
global_frequency * rel_orthographic_density               0.406390
global_frequency * rel_synonyms_count                     0.081968
global_letters_count * global_orthographic_density       -0.088367
global_letters_count * global_synonyms_count             -0.225526
global_letters_count * rel_aoa                           -0.059540
global_letters_count * rel_clustering                    -0.290884
global_letters_count * rel_frequency                     -0.041085
global_letters_count * rel_letters_count                 -0.010092
global_letters_count * rel_orthographic_density           0.300776
global_letters_count * rel_synonyms_count                -0.124479
global_orthographic_density * global_synonyms_count      -0.235826
global_orthographic_density * rel_aoa                    -0.146135
global_orthographic_density * rel_clustering             -0.060740
global_orthographic_density * rel_frequency              -0.055277
global_orthographic_density * rel_letters_count          -0.003965
global_orthographic_density * rel_orthographic_density   -0.131507
global_orthographic_density * rel_synonyms_count          0.076351
global_synonyms_count * rel_aoa                           0.267629
global_synonyms_count * rel_clustering                   -0.245883
global_synonyms_count * rel_frequency                     0.170695
global_synonyms_count * rel_letters_count                 0.383257
global_synonyms_count * rel_orthographic_density          0.271889
global_synonyms_count * rel_synonyms_count               -0.155279
rel_aoa * rel_clustering                                  0.107517
rel_aoa * rel_frequency                                   0.015466
rel_aoa * rel_letters_count                              -0.001904
rel_aoa * rel_orthographic_density                        0.062105
rel_aoa * rel_synonyms_count                             -0.335343
rel_clustering * rel_frequency                            0.217965
rel_clustering * rel_letters_count                        0.192970
rel_clustering * rel_orthographic_density                -0.114645
rel_clustering * rel_synonyms_count                      -0.176627
rel_frequency * rel_letters_count                        -0.025579
rel_frequency * rel_orthographic_density                 -0.274888
rel_frequency * rel_synonyms_count                       -0.094705
rel_letters_count * rel_orthographic_density             -0.274839
rel_letters_count * rel_synonyms_count                   -0.045500
rel_orthographic_density * rel_synonyms_count            -0.004228
dtype: float64

Regressing rel orthographic_density with 382 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16777321668471235

intercept                      2.298631
global_aoa                    -0.018091
global_clustering              0.227165
global_frequency              -0.000159
global_letters_count          -0.097311
global_orthographic_density   -0.437584
global_synonyms_count          0.007795
rel_aoa                        0.021170
rel_clustering                -0.109697
rel_frequency                  0.049568
rel_letters_count              0.108919
rel_orthographic_density       0.787873
rel_synonyms_count             0.083567
dtype: float64

Regressing rel orthographic_density with 382 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2958609491645262

intercept                                                -10.245073
global_aoa                                                 0.274137
global_clustering                                         -1.303126
global_frequency                                           0.952852
global_letters_count                                       0.502859
global_orthographic_density                               -1.674840
global_synonyms_count                                      3.723577
rel_aoa                                                   -1.298761
rel_clustering                                             2.923381
rel_frequency                                             -1.079200
rel_letters_count                                         -0.249440
rel_orthographic_density                                   0.159954
rel_synonyms_count                                         0.747505
global_aoa * global_clustering                             0.039259
global_aoa * global_frequency                             -0.018431
global_aoa * global_letters_count                          0.053812
global_aoa * global_orthographic_density                  -0.017548
global_aoa * global_synonyms_count                        -0.240505
global_aoa * rel_aoa                                       0.005239
global_aoa * rel_clustering                               -0.067642
global_aoa * rel_frequency                                -0.007159
global_aoa * rel_letters_count                             0.004671
global_aoa * rel_orthographic_density                      0.110866
global_aoa * rel_synonyms_count                            0.290320
global_clustering * global_frequency                       0.085010
global_clustering * global_letters_count                   0.169294
global_clustering * global_orthographic_density           -0.295289
global_clustering * global_synonyms_count                 -0.140744
global_clustering * rel_aoa                               -0.238420
global_clustering * rel_clustering                         0.108467
global_clustering * rel_frequency                         -0.230452
global_clustering * rel_letters_count                      0.000355
global_clustering * rel_orthographic_density               0.605666
global_clustering * rel_synonyms_count                     0.564941
global_frequency * global_letters_count                    0.001893
global_frequency * global_orthographic_density            -0.102985
global_frequency * global_synonyms_count                  -0.138596
global_frequency * rel_aoa                                 0.021414
global_frequency * rel_clustering                         -0.082553
global_frequency * rel_frequency                          -0.019855
global_frequency * rel_letters_count                       0.036056
global_frequency * rel_orthographic_density                0.300121
global_frequency * rel_synonyms_count                      0.113864
global_letters_count * global_orthographic_density         0.110179
global_letters_count * global_synonyms_count              -0.248993
global_letters_count * rel_aoa                            -0.051958
global_letters_count * rel_clustering                     -0.254965
global_letters_count * rel_frequency                      -0.053057
global_letters_count * rel_letters_count                  -0.015881
global_letters_count * rel_orthographic_density            0.091173
global_letters_count * rel_synonyms_count                 -0.055726
global_orthographic_density * global_synonyms_count       -0.203617
global_orthographic_density * rel_aoa                     -0.081494
global_orthographic_density * rel_clustering               0.140496
global_orthographic_density * rel_frequency               -0.038281
global_orthographic_density * rel_letters_count           -0.153381
global_orthographic_density * rel_orthographic_density    -0.093613
global_orthographic_density * rel_synonyms_count           0.069162
global_synonyms_count * rel_aoa                            0.256579
global_synonyms_count * rel_clustering                    -0.172490
global_synonyms_count * rel_frequency                      0.202751
global_synonyms_count * rel_letters_count                  0.341951
global_synonyms_count * rel_orthographic_density           0.213987
global_synonyms_count * rel_synonyms_count                -0.164573
rel_aoa * rel_clustering                                   0.135699
rel_aoa * rel_frequency                                   -0.007299
rel_aoa * rel_letters_count                               -0.008253
rel_aoa * rel_orthographic_density                        -0.011736
rel_aoa * rel_synonyms_count                              -0.327938
rel_clustering * rel_frequency                             0.213386
rel_clustering * rel_letters_count                         0.128664
rel_clustering * rel_orthographic_density                 -0.367942
rel_clustering * rel_synonyms_count                       -0.278483
rel_frequency * rel_letters_count                         -0.016071
rel_frequency * rel_orthographic_density                  -0.270674
rel_frequency * rel_synonyms_count                        -0.136683
rel_letters_count * rel_orthographic_density              -0.118996
rel_letters_count * rel_synonyms_count                    -0.054905
rel_orthographic_density * rel_synonyms_count             -0.013183
dtype: float64

	aoa	betweenness	clustering	degree	frequency	letters_count	orthographic_density	pagerank	phonemes_count	phonological_density	syllables_count	synonyms_count
Component-0	-0.480639	0.235220	-0.085941	0.223020	0.221617	-0.459272	0.208394	0.257139	-0.429278	0.269621	-0.172668	0.012316
Component-1	0.367462	-0.361454	0.135680	-0.278595	-0.326051	-0.403357	0.125370	-0.298287	-0.445819	0.197887	-0.162948	0.011571
Component-2	-0.760629	-0.491724	0.057437	-0.135817	-0.315228	0.131807	0.047868	-0.161522	0.058676	-0.079684	0.037981	0.039356

	aoa	frequency	letters_count
Component-0	-0.761088	0.339546	-0.552678
Component-1	0.423269	-0.385666	-0.819820