Feature variation by substitution ($\nu_{\phi}$)

1 Setup

Flags and settings.


In [1]:
# When True, the plotting helpers also write each figure to disk.
SAVE_FIGURES = False

# Subset of features shown in the 'paper-*' figures, in display order.
PAPER_FEATURES = [
    'frequency',
    'aoa',
    'clustering',
    'letters_count',
    'synonyms_count',
    'orthographic_density',
]

# NOTE(review): not referenced in the cells visible here; presumably the
# number of PCA components used further down the notebook -- confirm.
N_COMPONENTS = 3

# Largest number of source-feature bins tried by the plotting functions.
BIN_COUNT = 4

Imports and database setup.


In [2]:
from itertools import product

import pandas as pd
import seaborn as sb
from scipy import stats
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from progressbar import ProgressBar

%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope
engine = init_db()

2 Variation of features upon substitution

First build our data.


In [3]:
# Substitution model whose substitutions are analysed in this notebook
# (the last positional argument is `max_distance`, as shown in the printed
# repr of the model).
model = Model(Time.discrete, Source.majority, Past.last_bin, Durl.all, 1)
# One dict per (substitution, feature) pair; turned into a DataFrame below.
data = []

# First pass: only fetch the ids of the substitutions mined with `model`.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

# Second pass: load each substitution in its own session scope and record,
# for every feature, the source/destination values and the null-hypothesis
# averages.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for feature in Substitution.__features__:
            # Absolute feature values of the replaced and replacing words.
            source, destination = substitution.features(feature)
            # Same values, requested with sentence_relative='median'.
            source_rel, destination_rel = \
                substitution.features(feature, sentence_relative='median')
            data.append({
                # Identifiers used later to average per cluster.
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'feature': feature,
                'source': source,
                'source_rel': source_rel,
                'destination': destination,
                'destination_rel': destination_rel,
                # h0*: feature average (plotted as nu^0 under H_0).
                'h0': substitution.feature_average(feature),
                'h0_rel': substitution.feature_average(
                        feature, sentence_relative='median'),
                # h0n*: feature average restricted to synonyms of the
                # source word (plotted as nu^00 under H_00).
                'h0n': substitution.feature_average(
                        feature, source_synonyms=True),
                'h0n_rel': substitution.feature_average(
                        feature, source_synonyms=True,
                        sentence_relative='median')})

# Long-format frame: one row per (substitution, feature).
original_variations = pd.DataFrame(data)
# The list of dicts is no longer needed once the frame is built.
del data


Got 6318 substitutions for model Model(time=Time.discrete, source=Source.majority, past=Past.last_bin, durl=Durl.all, max_distance=1)
100% (6318 of 6318) |######################| Elapsed Time: 0:02:10 Time: 0:02:10

Compute cluster averages (so as not to overestimate confidence intervals) and crop data so that we have acceptable CIs.


In [4]:
# Columns holding feature values: averaged per cluster, then cropped below.
value_columns = ['source', 'source_rel', 'destination', 'destination_rel',
                 'h0', 'h0_rel', 'h0n', 'h0n_rel']

# Two-stage averaging: first collapse rows sharing the same
# (destination, occurrence, position, feature), then average those per
# (cluster, feature). The value columns are selected with a *list*:
# indexing a GroupBy with a bare tuple of names is deprecated/removed in
# recent pandas. The grouping key 'feature' is not part of the selection
# (it is a string column and cannot be averaged); `as_index=False` already
# returns it as a column.
variations = original_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'feature'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'feature'], as_index=False)[value_columns]\
    .mean()
variations['variation'] = variations['destination'] - variations['source']

# HARDCODED: drop values where source AoA is above 15.
# This crops the graphs to acceptable CIs.
variations.loc[(variations.feature == 'aoa') & (variations.source > 15),
               value_columns] = np.nan

Prepare feature ordering.


In [5]:
# Order features by the docstring of their transformed-feature function
# (the same docstring is used as the facet title in the grid plots).
ordered_features = list(Substitution.__features__)
ordered_features.sort(
    key=lambda name: Substitution._transformed_feature(name).__doc__
)

What we plot about features

For a feature $\phi$, plot:

  • $\nu_{\phi}$, the average feature of an appearing word upon substitution, as a function of the feature of the disappearing word: $$\nu_{\phi}(f) = \left< \phi(w') \right>_{\{w \rightarrow w' | \phi(w) = f \}}$$
  • $\nu_{\phi}^0$ (which is the average feature value), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi}^{00}$ (which is the average feature value for synonyms of the source word), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

We also plot these values relative to the sentence average, i.e.:

  • $\nu_{\phi, r}$, the average sentence-relative feature of an appearing word upon substitution as a function of the sentence-relative feature of the disappearing word, i.e. $\phi($destination$) - \phi($destination sentence$)$ as a function of $\phi($source$) - \phi($source sentence$)$
  • $\nu_{\phi, r}^0$ (which is the average feature value minus the sentence average), i.e. what happens under $\mathcal{H}_0$
  • $\nu_{\phi, r}^{00}$ (which is the average feature value for synonyms of the source word minus the sentence average), i.e. what happens under $\mathcal{H}_{00}$
  • $y = x$, i.e. what happens if there is no substitution

Those values are plotted with fixed-width bins, then quantile bins, with absolute feature values, then with relative-to-sentence features.


In [6]:
def print_significance(name, bins, h0, h0n, values):
    """Print a per-bin significance table of `values` against two nulls.

    For each bin, the mean of `values` is compared to the mean of the null
    (`h0`, then `h0n`) in that bin. The bin is flagged when its mean lies
    outside `null mean +/- CI`, where the CI half-width is the Student-t
    interval of `values` in the bin at levels .05, .01 and .001 (one, two
    or three stars; 'ns.' if none).

    Parameters
    ----------
    name : str
        Title printed above the table.
    bins : array-like
        Bin label (0-based) of each observation; NaN labels are ignored.
    h0, h0n : array-like
        Null values under H_0 and H_00, aligned with `values`.
    values : array-like
        Observed values, aligned with `bins`.
    """
    alphas = [.05, .01, .001]
    # Bin labels can come back as floats (with NaN for unbinned rows), so
    # take a NaN-safe maximum and make sure the count is an integer.
    bin_count = int(np.nanmax(np.asarray(bins, dtype=float))) + 1

    rule = '-' * len(name)
    print()
    print(rule)
    print(name)
    print(rule)
    header = ('Bin  |   '
              + ' |   '.join(map(str, range(1, bin_count + 1)))
              + ' |')
    print(header)
    print('-' * len(header))

    for null_name, nulls in [('H_0 ', h0), ('H_00', h0n)]:
        bin_values = np.zeros(bin_count)
        bin_nulls = np.zeros(bin_count)
        cis = np.zeros((bin_count, len(alphas)))

        for i in range(bin_count):
            indices = bins == i
            n = indices.sum()
            # Sample standard deviation, computed once per bin.
            s = values[indices].std(ddof=1)

            bin_values[i] = values[indices].mean()
            bin_nulls[i] = nulls[indices].mean()
            for j, alpha in enumerate(alphas):
                cis[i, j] = (stats.t.ppf(1 - alpha / 2, n - 1)
                             * s
                             / np.sqrt(n))

        print(null_name + ' |', end='')
        # differences[i, j]: bin i differs from the null at level alphas[j].
        differences = ((bin_values[:, np.newaxis]
                        < bin_nulls[:, np.newaxis] - cis)
                       | (bin_values[:, np.newaxis]
                          > bin_nulls[:, np.newaxis] + cis))
        for i in range(bin_count):
            if differences[i].any():
                # Index of the most stringent level still rejected.
                n_stars = np.where(differences[i])[0].max()
                bin_stars = '*' * (1 + n_stars) + ' ' * (2 - n_stars)
            else:
                bin_stars = 'ns.'
            print(' ' + bin_stars + ' |', end='')
        print()

In [7]:
def plot_variation(**kwargs):
    """Plot the mean destination feature against the binned source feature.

    Meant to be used through `FacetGrid.map_dataframe` (see `plot_grid`),
    hence the keyword-only interface. Draws on the current matplotlib axes
    and prints a significance table through `print_significance`.

    Keyword arguments
    -----------------
    data : DataFrame
        Must hold the columns 'source', 'destination', 'h0' and 'h0n' (or
        their '_rel' counterparts when `relative` is set) plus the
        `feature_field` column.
    color : matplotlib color, default 'blue'
    relative : bool, default False
        Use the sentence-relative ('_rel') columns.
    quantiles : bool, default False
        Bin the source values with `pd.qcut` instead of fixed-width
        `pd.cut` bins.
    feature_field : str, default 'feature'
        Column whose first value names the feature in the printed table.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    relative = kwargs.get('relative', False)
    quantiles = kwargs.get('quantiles', False)
    feature_field = kwargs.get('feature_field', 'feature')
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when the cut fails
    # (e.g. pd.qcut refuses duplicate bin edges).
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins, bins = cut(x, bin_count, labels=False,
                               retbins=True, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('Could not bin source values into {} bins or fewer'
                         .format(BIN_COUNT))
    middles = (bins[:-1] + bins[1:]) / 2

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # Half-width of the 95% Student-t confidence interval.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    nuphi = r'\nu_{\phi' + (',r' if relative else '') + '}'
    plt.plot(middles, values, '-', lw=2, color=color,
             label='${}$'.format(nuphi))
    plt.fill_between(middles, values - cis, values + cis,
                     color=sb.desaturate(color, 0.2), alpha=0.2)
    plt.plot(middles, h0s, '--', color=sb.desaturate(color, 0.2),
             label='${}^0$'.format(nuphi))
    plt.plot(middles, h0ns, linestyle='-.',
             color=sb.desaturate(color, 0.2),
             label='${}^{{00}}$'.format(nuphi))
    plt.plot(middles, middles, linestyle='dotted',
             color=sb.desaturate(color, 0.2),
             label='$y = x$')
    lmin, lmax = middles[0], middles[-1]
    h0min, h0max = min(h0s.min(), h0ns.min()), max(h0s.max(), h0ns.max())
    # Rescale limits if we're touching H0 or H00. Both ends are checked
    # independently so that a null curve overflowing on both sides is not
    # clipped at the top.
    if h0min < lmin:
        lmin = h0min - (lmax - h0min) / 10
    if h0max > lmax:
        lmax = h0max + (h0max - lmin) / 10
    # Same limits on both axes keep y = x on the diagonal.
    plt.xlim(lmin, lmax)
    plt.ylim(lmin, lmax)

    # Test for statistical significance
    print_significance(str(data.iloc[0][feature_field]),
                       x_bins, h0, h0n, y)

In [8]:
def plot_grid(data, features, filename,
              plot_function, xlabel, ylabel,
              feature_field='feature', plot_kws=None):
    """Draw one facet per feature with `plot_function`, three per row.

    Parameters
    ----------
    data : DataFrame
        Long-format data; only rows whose `feature_field` is in `features`
        are plotted.
    features : list of str
        Features to plot, in facet (and hue) order.
    filename : str
        Name interpolated into `settings.FIGURE` when `SAVE_FIGURES` is set.
    plot_function : callable
        Passed to `FacetGrid.map_dataframe`; receives the facet's rows as
        the `data` keyword.
    xlabel, ylabel : str
        Axis labels applied to every facet.
    feature_field : str, default 'feature'
        Column used to split the facets.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.
    """
    if plot_kws is None:
        plot_kws = {}
    # NOTE(review): newer seaborn releases renamed `size` to `height`;
    # kept as is to match the version this notebook was run with.
    g = sb.FacetGrid(data=data[data[feature_field].isin(features)],
                     sharex=False, sharey=False,
                     col=feature_field, hue=feature_field,
                     col_order=features, hue_order=features,
                     col_wrap=3, aspect=1.5, size=3)
    g.map_dataframe(plot_function, **plot_kws)
    g.set_titles('{col_name}')
    g.set_xlabels(xlabel)
    g.set_ylabels(ylabel)
    for ax in g.axes.ravel():
        legend = ax.legend(frameon=True, loc='best')
        if not legend:
            # Skip if nothing was plotted on these axes.
            continue
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        # The title set above is the raw feature name: swap it for the
        # docstring of the corresponding transformed feature.
        ax.set_title(Substitution._transformed_feature(ax.get_title())
                     .__doc__)
    if SAVE_FIGURES:
        g.fig.savefig(settings.FIGURE.format(filename),
                      bbox_inches='tight', dpi=300)

In [9]:
def plot_bias(ax, data, color, ci=True, relative=False, quantiles=False):
    """Plot the measured bias (binned destination mean minus H_00) on `ax`.

    The source values are binned; for each bin the mean destination value
    minus the mean `h0n` value is drawn, divided by the absolute mean of
    the per-bin `h0` averages. Bins are spread evenly over [0, 1] on the
    x axis.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : DataFrame
        Rows of a single feature, with columns 'feature', 'source',
        'destination', 'h0' and 'h0n' (or the '_rel' variants).
    color : matplotlib color for the line and its confidence band.
    ci : bool, default True
        Also draw the 95% Student-t confidence band.
    relative : bool, default False
        Use the sentence-relative ('_rel') columns.
    quantiles : bool, default False
        Bin with `pd.qcut` instead of fixed-width `pd.cut` bins.

    Raises
    ------
    ValueError
        If the source values cannot be binned, even into a single bin.
    """
    feature = data.iloc[0].feature
    rel = '_rel' if relative else ''
    x = data['source' + rel]
    y = data['destination' + rel]
    h0 = data['h0' + rel]
    h0n = data['h0n' + rel]

    # Compute binning, falling back to fewer bins when the cut fails
    # (e.g. pd.qcut refuses duplicate bin edges). Only the labels are
    # needed here, the bin edges are not used.
    cut, cut_kws = ((pd.qcut, {}) if quantiles
                    else (pd.cut, {'right': False}))
    for bin_count in range(BIN_COUNT, 0, -1):
        try:
            x_bins = cut(x, bin_count, labels=False, **cut_kws)
            break
        except ValueError:
            continue
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('Could not bin source values into {} bins or fewer'
                         .format(BIN_COUNT))

    # Compute bin values.
    h0s = np.zeros(bin_count)
    h0ns = np.zeros(bin_count)
    values = np.zeros(bin_count)
    cis = np.zeros(bin_count)
    for i in range(bin_count):
        indices = x_bins == i
        n = indices.sum()
        h0s[i] = h0[indices].mean()
        h0ns[i] = h0n[indices].mean()
        values[i] = y[indices].mean()
        # Half-width of the 95% Student-t confidence interval.
        cis[i] = (stats.t.ppf(.975, n - 1) * y[indices].std(ddof=1)
                  / np.sqrt(n))

    # Plot.
    scale = abs(h0s.mean())
    positions = np.linspace(0, 1, bin_count)
    bias = values - h0ns
    ax.plot(positions, bias / scale, '-', lw=2, color=color,
            label=Substitution._transformed_feature(feature).__doc__)
    if ci:
        ax.fill_between(positions,
                        (bias - cis) / scale,
                        (bias + cis) / scale,
                        color=sb.desaturate(color, 0.2), alpha=0.2)

In [10]:
def plot_overlay(data, features, filename, palette_name,
                 plot_function, title, xlabel, ylabel, plot_kws=None):
    """Overlay one curve per feature on a single figure and return its axes.

    Parameters
    ----------
    data : DataFrame
        Long-format data with a 'feature' column; rows containing NaN are
        dropped before plotting each feature.
    features : list of str
        Features to draw, one palette color each.
    filename : str
        Name interpolated into `settings.FIGURE` when `SAVE_FIGURES` is set.
    palette_name : str
        Seaborn palette from which `len(features)` colors are taken.
    plot_function : callable
        Called as `plot_function(ax, feature_data, color=..., **plot_kws)`.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plot_kws : dict, optional
        Extra keyword arguments forwarded to `plot_function`.

    Returns
    -------
    The matplotlib axes, so callers can tweak it (e.g. set y limits).
    """
    if plot_kws is None:
        plot_kws = {}
    palette = sb.color_palette(palette_name, len(features))
    fig, ax = plt.subplots(figsize=(12, 6))
    for feature_color, feature in zip(palette, features):
        plot_function(ax, data[data.feature == feature].dropna(),
                      color=feature_color, **plot_kws)
    ax.legend(loc='lower right')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if SAVE_FIGURES:
        fig.savefig(settings.FIGURE.format(filename),
                    bbox_inches='tight', dpi=300)
    return ax

2.1 Global feature values

2.1.1 Bins of distribution of appeared global feature values

For each feature $\phi$, we plot the variation upon substitution as explained above


In [11]:
# All features, fixed-width bins, absolute feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *   | **  |
H_00 | *** | ns. | ns. | ns. |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | *** | **  |
H_00 | *** | ns. | *   | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | **  | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | *** | *** | *** |
H_00 | **  | *** | *** | ns. |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | ns. | *** | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

Then plot $\nu_{\phi} - \nu_{\phi}^{00}$ for each feature (i.e. the measured bias) to see how they compare


In [12]:
# All features overlaid, fixed-width bins, without confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False));



In [13]:
# Paper features only, fixed-width bins, absolute feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$')


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *   |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | **  | *   |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | **  | *** | *** |
H_00 | **  | ns. | ns. | ns. |

In [14]:
# Paper features overlaid, fixed-width bins, with confidence bands;
# the y axis is then cropped by hand.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'))
bias_ax.set_ylim(-2, .7);


2.1.2 Quantiles of distribution of appeared global feature values


In [15]:
# All features, quantile bins, absolute feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | *** | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |
------------------------
H_0  | *** | *** | *** |
H_00 | **  | **  | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | *   | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | *   |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | ns. | ns. |

In [16]:
# All features overlaid, quantile bins, without confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-quantilebins_global',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all features',
             xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
             ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
                     '\n(normalised to feature average)'),
             plot_kws=dict(ci=False, quantiles=True));



In [17]:
# Paper features only, quantile bins, absolute feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_global',
          plot_function=plot_variation,
          xlabel=r'$\phi($disappearing word$)$',
          ylabel=r'$\phi($appearing word$)$',
          plot_kws=dict(quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | **  |
H_00 | ns. | *** | *   | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | **  | ns. | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | **  | ns. | ns. | *   |

In [18]:
# Paper features overlaid, quantile bins, with confidence bands;
# the y axis is then cropped by hand.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-quantilebins_global',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all features',
    xlabel=r'$\phi($source word$)$ (normalised to $[0, 1]$)',
    ylabel=(r'$\nu_{\phi} - \nu_{\phi}^{00}$'
            '\n(normalised to feature average)'),
    plot_kws=dict(quantiles=True))
bias_ax.set_ylim(-1.2, .6);


2.2 Sentence-relative feature values

2.2.1 Bins of distribution of appeared sentence-relative values


In [19]:
# All features, fixed-width bins, sentence-relative feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi_r($disappearing word$)$',
          ylabel=r'$\phi_r($appearing word$)$',
          plot_kws=dict(relative=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | **  | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *   | *** |
H_00 | ns. | *   | ns. | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | *** | ns. |
H_00 | ns. | *** | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | ns. | *** | *** |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *** | *   | *** | **  |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | **  | *   | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | *   | **  | *** | ns. |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | ns. | *   | ns. | ns. |

In [20]:
# All features overlaid, fixed-width bins, sentence-relative values,
# without confidence bands.
plot_overlay(data=variations,
             features=ordered_features,
             filename='all-variations_bias-fixedbins_sentencerel',
             palette_name='husl',
             plot_function=plot_bias,
             title='Measured bias for all sentence-relative features',
             xlabel=(r'$\phi_r($disappearing word$)$'
                     r' (normalised to $[0, 1]$)'),
             ylabel=(r'$\nu_{\phi_r} - \nu_{\phi_r}^{00}$'
                     '\n(normalised to sentence-relative feature average)'),
             plot_kws=dict(ci=False, relative=True));



In [21]:
# Paper features only, fixed-width bins, sentence-relative feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-fixedbins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi_r($disappearing word$)$',
          ylabel=r'$\phi_r($appearing word$)$',
          plot_kws=dict(relative=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | **  | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | ns. | ns. |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | ns. |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *   | *** | ns. |
H_00 | ns. | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | **  | *   | ns. | ns. |

In [22]:
# Paper features overlaid, fixed-width bins, sentence-relative values,
# with confidence bands; the y axis is then cropped by hand.
bias_ax = plot_overlay(
    data=variations,
    features=PAPER_FEATURES,
    filename='paper-variations_bias-fixedbins_sentencerel',
    palette_name='deep',
    plot_function=plot_bias,
    title='Measured bias for all sentence-relative features',
    xlabel=(r'$\phi_r($disappearing word$)$'
            r' (normalised to $[0, 1]$)'),
    ylabel=(r'$\nu_{\phi_r} - \nu_{\phi_r}^{00}$'
            '\n(normalised to sentence-relative feature average)'),
    plot_kws=dict(relative=True))
bias_ax.set_ylim(-2, .7);


2.2.2 Quantiles of distribution of appeared sentence-relative values


In [23]:
# All features, quantile bins, sentence-relative feature values.
plot_grid(data=variations,
          features=ordered_features,
          filename='all-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi_r($disappearing word$)$',
          ylabel=r'$\phi_r($appearing word$)$',
          plot_kws=dict(relative=True, quantiles=True))


-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
phonemes_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | **  |
H_00 | ns. | ns. | ns. | *   |

---------------
syllables_count
---------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | ns. | *** |
H_00 | **  | ns. | *   | *   |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | **  | ns. | *** | *** |

-----------
betweenness
-----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *   | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | ns. | *   |

------
degree
------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | *** | *** |

---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

--------
pagerank
--------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

--------------------
phonological_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | ns. | ns. | ns. |

In [24]:
# All features overlaid, quantile bins, sentence-relative values, without
# confidence bands. The y label uses the same `\phi_r` subscript for both
# terms, matching the fixed-bin sentence-relative overlays.
plot_overlay(variations, ordered_features,
             'all-variations_bias-quantilebins_sentencerel',
             'husl', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi_r($disappearing word$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi_r} - \nu_{\phi_r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'ci': False, 'relative': True, 'quantiles': True});



In [25]:
# Paper features only, quantile bins, sentence-relative feature values.
plot_grid(data=variations,
          features=PAPER_FEATURES,
          filename='paper-variations-quantilebins_sentencerel',
          plot_function=plot_variation,
          xlabel=r'$\phi_r($disappearing word$)$',
          ylabel=r'$\phi_r($appearing word$)$',
          plot_kws=dict(relative=True, quantiles=True))


---------
frequency
---------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *** |

---
aoa
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | *** | *** | *** | *   |

----------
clustering
----------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | *** |
H_00 | ns. | *** | ns. | *   |

-------------
letters_count
-------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | *** |

--------------
synonyms_count
--------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | ns. | ns. | *** |
H_00 | **  | ns. | *** | *** |

--------------------
orthographic_density
--------------------
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *   | ns. | *** | *** |
H_00 | *** | ns. | ns. | ns. |

In [26]:
# Paper features overlaid, quantile bins, sentence-relative values, with
# confidence bands. Uses the 'deep' palette like the other paper overlays,
# and the same `\phi_r` subscript for both terms of the y label.
# NOTE(review): unlike the other paper overlays, no y limits are set here
# -- confirm whether that is intended.
plot_overlay(variations, PAPER_FEATURES,
             'paper-variations_bias-quantilebins_sentencerel',
             'deep', plot_bias,
             'Measured bias for all sentence-relative features',
             r'$\phi_r($disappearing word$)$'
                 r' (normalised to $[0, 1]$)',
             r'$\nu_{\phi_r} - \nu_{\phi_r}^{00}$'
                 '\n(normalised to sentence-relative feature average)',
             plot_kws={'relative': True, 'quantiles': True});


3 Streamplots

We'd like to see what happens between absolute and relative feature values, i.e. how their effects interact. In particular, we want to know which one wins out: cognitive bias, attraction to the sentence average, or attraction to the global feature average.

To do this we plot the general direction (arrows) and strength (color) of the move from source word to destination word, for each absolute/relative source feature couple. I.e., for a given absolute feature value and relative feature value, if this word were to be substituted, where would it go in this (absolute, relative) space?

The interesting thing in these plots is the attraction front, where all arrows point to and join. We're interested in:

  • its slope
  • its shape (e.g. several slope regimes?)
  • its position w.r.t. $\nu_{\phi}^0$ and $y = 0$ (which is $\left< \phi(sentence) \right>$)

First, here's our plotting function. (Note we set the arrow size to something that turns out to be huge here, but gives normal sizes in the saved figures. There must be some dpi scaling problem with the arrows.)


In [27]:
def plot_stream(**kwargs):
    """Draw a streamplot of substitutions in (absolute, relative) feature space.

    Written for `FacetGrid.map_dataframe`, which supplies `data` and `color`
    as keyword arguments.

    Keyword arguments:
      * data: DataFrame with the columns 'source', 'source_rel',
        'destination', 'destination_rel' and 'h0'.
      * color: base colour of the two reference lines (default 'blue').
      * bin_count: number of fixed-width bins along each axis (default
        BIN_COUNT, the notebook-wide setting).

    The x axis is the absolute source feature value, the y axis the
    sentence-relative one. Arrows show the mean displacement
    (destination - source) in each bin, coloured by the mean displacement
    norm.
    """
    data = kwargs.pop('data')
    color = kwargs.get('color', 'blue')
    bin_count = kwargs.get('bin_count', BIN_COUNT)
    source = data['source']
    source_rel = data['source_rel']
    dest = data['destination']
    dest_rel = data['destination_rel']
    h0 = data['h0']

    # Compute binning: fixed-width bins on both axes; the bin middles are
    # the grid coordinates handed to streamplot.
    x_bins, x_margins = pd.cut(source, bin_count,
                               right=False, labels=False, retbins=True)
    x_middles = (x_margins[:-1] + x_margins[1:]) / 2
    y_bins, y_margins = pd.cut(source_rel, bin_count,
                               right=False, labels=False, retbins=True)
    y_middles = (y_margins[:-1] + y_margins[1:]) / 2

    # Compute bin values. Arrays are indexed [y, x] (row = relative bin,
    # column = absolute bin), as streamplot expects.
    h0s = np.ones(bin_count) * h0.iloc[0]
    u_values = np.zeros((bin_count, bin_count))
    v_values = np.zeros((bin_count, bin_count))
    strength = np.zeros((bin_count, bin_count))
    for x in range(bin_count):
        in_x = x_bins == x
        for y in range(bin_count):
            # Select the substitutions of this grid cell once, and reuse the
            # displacements for the three statistics. An empty cell yields
            # NaN (mean of an empty selection).
            in_cell = in_x & (y_bins == y)
            delta = dest[in_cell] - source[in_cell]
            delta_rel = dest_rel[in_cell] - source_rel[in_cell]
            u_values[y, x] = delta.mean()
            v_values[y, x] = delta_rel.mean()
            strength[y, x] = np.sqrt(delta ** 2 + delta_rel ** 2).mean()

    # Plot the streams, then the two reference lines: y = 0 (horizontal) and
    # x = h0 (vertical, using the first 'h0' value of the data).
    plt.streamplot(x_middles, y_middles, u_values, v_values,
                   arrowsize=4, color=strength, cmap=plt.cm.viridis)
    plt.plot(x_middles, np.zeros(bin_count), linestyle='-',
             color=sb.desaturate(color, 0.2),
             label=r'$\left< \phi(sentence) \right>$')
    plt.plot(h0s, y_middles, linestyle='--',
             color=sb.desaturate(color, 0.2), label=r'$\nu_{\phi}^0$')
    plt.xlim(x_middles[0], x_middles[-1])
    plt.ylim(y_middles[0], y_middles[-1])

Here are the plots for all features


In [28]:
# One stream plot per feature (3 per row); axes are not shared between
# facets, and each feature gets its own hue.
g = sb.FacetGrid(data=variations,
                 col='feature', col_order=ordered_features, col_wrap=3,
                 hue='feature', hue_order=ordered_features,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi_r($word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Style the legend box, then replace the raw feature name used as
        # facet title by the docstring of its transformed-feature function.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
    # No legend means nothing was plotted on these axes: leave them as is.
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

And here are the plots for the features we expose in the paper


In [29]:
# Same stream plots, restricted to the rows whose feature is one of
# PAPER_FEATURES.
g = sb.FacetGrid(data=variations[variations['feature'].isin(PAPER_FEATURES)],
                 col='feature', col_order=PAPER_FEATURES, col_wrap=3,
                 hue='feature', hue_order=PAPER_FEATURES,
                 sharex=False, sharey=False,
                 size=4.5, aspect=1)
g.map_dataframe(plot_stream)
g.set_titles('{col_name}')
g.set_xlabels(r'$\phi($word$)$')
g.set_ylabels(r'$\phi_r($word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Style the legend box, then replace the raw feature name used as
        # facet title by the docstring of its transformed-feature function.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
        ax.set_title(
            Substitution._transformed_feature(ax.get_title()).__doc__)
    # No legend means nothing was plotted on these axes: leave them as is.
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-feature_streams'),
                  dpi=300, bbox_inches='tight')


/home/sl/.virtualenvs/brainscopypaste/lib/python3.5/site-packages/numpy/ma/core.py:4144: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

4 PCA'd feature variations

Compute PCA on feature variations (note: on variations, not on features directly), and show the evolution of the first three components upon substitution.

CAVEAT: the PCA is computed on variations where all features are defined. This greatly reduces the number of words included (and also the number of substitutions -- see below for real values, but you should know it's drastic). This also has an effect on the computation of $\mathcal{H}_0$ and $\mathcal{H}_{00}$, which are computed using words for which all features are defined. This, again, hugely reduces the number of words taken into account, changing the values under the null hypotheses.

4.1 On all the features

Compute the actual PCA


In [30]:
# Compute the PCA on feature *variations*: one row per cluster, one column
# per feature.
# NOTE(review): the feature names are sorted, presumably so that
# `pcafeatures` lines up with the column order produced by `pivot` --
# confirm.
pcafeatures = tuple(sorted(Substitution.__features__))
pcavariations = variations.pivot(index='cluster_id',
                                 columns='feature', values='variation')
# Keep only the clusters for which every feature variation is defined.
pcavariations = pcavariations.dropna()
# Let scikit-learn's 'mle' setting choose the number of components.
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Show the number of components found and their explained variance ratios.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# The last expression is displayed by the notebook: the loadings of the
# first N_COMPONENTS components, labelled with the feature names.
print("We're plotting variation for the first {} components:"
      .format(N_COMPONENTS))
pd.DataFrame(pca.components_[:N_COMPONENTS],
             columns=pcafeatures,
             index=['Component-{}'.format(i) for i in range(N_COMPONENTS)])


MLE estimates there are 11 components.

Those explain the following variance:
[ 0.54601438  0.15938617  0.08145657  0.07174038  0.03516331  0.02791302
  0.02146007  0.01836326  0.01624824  0.01029141  0.00737022]

We're plotting variation for the first 3 components:
Out[30]:
aoa betweenness clustering degree frequency letters_count orthographic_density pagerank phonemes_count phonological_density syllables_count synonyms_count
Component-0 -0.492463 0.294978 -0.101583 0.244914 0.251340 -0.418117 0.213284 0.268325 -0.377521 0.280270 -0.153406 0.008792
Component-1 -0.339265 0.351267 -0.093073 0.273678 0.231802 0.420829 -0.161448 0.310660 0.481343 -0.249358 0.173202 -0.025981
Component-2 -0.652011 -0.154437 0.035430 -0.073182 -0.725758 0.046095 0.008336 -0.081155 0.022628 -0.078460 0.014312 0.045194

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [31]:
# For every mined substitution, record the source and destination values of
# each of the first N_COMPONENTS principal components, plus two reference
# averages stored under 'h0' and 'h0n'.
# NOTE(review): 'h0' / 'h0n' presumably hold the $\mathcal{H}_0$ and
# $\mathcal{H}_{00}$ values mentioned in the text -- confirm in
# `Substitution.component_average`.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A new database session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(N_COMPONENTS):
            source, destination = substitution\
                .components(component, pca, pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

# One row per (substitution, component); the list of dicts is dropped once
# converted.
original_component_variations = pd.DataFrame(data)
del data


100% (6318 of 6318) |######################| Elapsed Time: 0:01:45 Time: 0:01:45

Compute cluster averages (so as not to overestimate confidence intervals).


In [32]:
# Two-step average: first over the substitutions sharing the same
# (destination, occurrence, position, component), then over each
# (cluster, component).
# The columns are selected with a list: the former `gb['a', 'b', ...]` form
# (implicit tuple of keys) is deprecated in pandas 1.0 and rejected by
# pandas 2.0, while a list works on every version.
# NOTE(review): 'component' is deliberately kept in the selection so the
# resulting columns match what the original cell produced -- confirm it is
# still accepted alongside `as_index=False` on the pandas version in use.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components (see the caveat section below)


In [33]:
# One facet (and hue) per principal component; axes are not shared.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(loc='upper left', frameon=True)
    if legend:
        # Give the legend a light grey box with a black border.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
    # No legend means nothing was plotted on these axes: leave them as is.
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('all-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | ns. | *** | *** |
H_00 | *   | ns. | *   | **  |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | **  |
H_00 | ns. | *** | *** | *   |

---
2.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | ns. |
H_00 | ns. | *** | **  | ns. |

4.2 On a subset of relevant features


In [34]:
# Subset of features used for the PCA of this section (it is sorted into
# `pcafeatures` in the next cell).
relevant_features = ['frequency', 'aoa', 'letters_count']

Compute the actual PCA


In [35]:
# Compute the PCA on the variations of the relevant features only, keeping
# the clusters where all of them are defined.
pcafeatures = tuple(sorted(relevant_features))
pcavariations = variations[variations['feature'].isin(pcafeatures)]\
    .pivot(index='cluster_id', columns='feature', values='variation')\
    .dropna()
pca = PCA(n_components='mle')
pca.fit(pcavariations)

# Report how many components were found and the variance they explain.
print('MLE estimates there are {} components.\n'.format(pca.n_components_))
print('Those explain the following variance:')
print(pca.explained_variance_ratio_)
print()

# Displayed by the notebook: the loadings of every component.
pd.DataFrame(pca.components_,
             index=['Component-{}'.format(i)
                    for i in range(pca.n_components_)],
             columns=pcafeatures)


MLE estimates there are 2 components.

Those explain the following variance:
[ 0.64694229  0.21386368]

Out[35]:
aoa frequency letters_count
Component-0 -0.732385 0.381288 -0.564120
Component-1 0.326151 -0.530816 -0.782214

Compute the source and destination component values, along with $\mathcal{H}_0$ and $\mathcal{H}_{00}$, for each component.


In [36]:
# For every mined substitution, record the source and destination values of
# each principal component found by the PCA above, plus two reference
# averages stored under 'h0' and 'h0n'.
# NOTE(review): 'h0' / 'h0n' presumably hold the $\mathcal{H}_0$ and
# $\mathcal{H}_{00}$ values mentioned in the text -- confirm in
# `Substitution.component_average`.
data = []
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    # A new database session is opened for each substitution.
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        
        for component in range(pca.n_components_):
            source, destination = substitution.components(component, pca,
                                                          pcafeatures)
            data.append({
                'cluster_id': substitution.source.cluster.sid,
                'destination_id': substitution.destination.sid,
                'occurrence': substitution.occurrence,
                'position': substitution.position,
                'source_id': substitution.source.sid,
                'component': component,
                'source': source,
                'destination': destination,
                'h0': substitution.component_average(component, pca,
                                                     pcafeatures),
                'h0n': substitution.component_average(component, pca,
                                                      pcafeatures,
                                                      source_synonyms=True)
            })

# One row per (substitution, component); the list of dicts is dropped once
# converted.
original_component_variations = pd.DataFrame(data)
del data


100% (6318 of 6318) |######################| Elapsed Time: 0:00:50 Time: 0:00:50

Compute cluster averages (so as not to overestimate confidence intervals).


In [37]:
# Two-step average: first over the substitutions sharing the same
# (destination, occurrence, position, component), then over each
# (cluster, component).
# The columns are selected with a list: the former `gb['a', 'b', ...]` form
# (implicit tuple of keys) is deprecated in pandas 1.0 and rejected by
# pandas 2.0, while a list works on every version.
# NOTE(review): 'component' is deliberately kept in the selection so the
# resulting columns match what the original cell produced -- confirm it is
# still accepted alongside `as_index=False` on the pandas version in use.
component_variations = original_component_variations\
    .groupby(['destination_id', 'occurrence', 'position', 'component'],
             as_index=False).mean()\
    .groupby(['cluster_id', 'component'], as_index=False)\
    [['source', 'destination', 'component', 'h0', 'h0n']].mean()

Plot the actual variations of components


In [38]:
# One facet (and hue) per principal component; axes are not shared.
g = sb.FacetGrid(data=component_variations,
                 col='component', col_wrap=3, hue='component',
                 sharex=False, sharey=False,
                 size=3, aspect=1.5)
g.map_dataframe(plot_variation, feature_field='component')
g.set_xlabels(r'$c($disappearing word$)$')
g.set_ylabels(r'$c($appearing word$)$')
for ax in g.axes.ravel():
    legend = ax.legend(loc='best', frameon=True)
    if legend:
        # Give the legend a light grey box with a black border.
        frame = legend.get_frame()
        frame.set_facecolor('#f2f2f2')
        frame.set_edgecolor('#000000')
    # No legend means nothing was plotted on these axes: leave them as is.
if SAVE_FIGURES:
    g.fig.savefig(settings.FIGURE.format('paper-pca_variations-absolute'),
                  dpi=300, bbox_inches='tight')


---
0.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | ns. | *** | *** | *** |
H_00 | ns. | ns. | *** | *** |

---
1.0
---
Bin  |   1 |   2 |   3 |   4 |
------------------------------
H_0  | *** | *** | *** | ns. |
H_00 | *** | *** | *** | ns. |

4.3 CAVEAT: reduction of the numbers of words and substitutions

As explained above, this PCA analysis can only use words for which all the features are defined (in this case, the features listed in relevant_features). So note the following:


In [39]:
# Look up the transformed-feature function of each relevant feature once,
# instead of once per (word, feature) pair.
tfeatures = {feature: Substitution._transformed_feature(feature)
             for feature in relevant_features}

# Called without argument, a transformed feature gives the collection of
# words it covers.
for feature in relevant_features:
    print("Feature '{}' is based on {} words."
          .format(feature, len(tfeatures[feature]())))

# Gather every word known to at least one of the relevant features...
words = set()
for feature in relevant_features:
    words.update(tfeatures[feature]())

# ... and build one row per word with one column per relevant feature, so
# that `dropna()` below keeps only the words with all features defined.
# NOTE(review): this assumes a feature returns NaN for a word it does not
# cover -- confirm in `Substitution._transformed_feature`.
words_list = list(words)
data = {feature: [tfeatures[feature](word) for word in words_list]
        for feature in relevant_features}
wordsdf = pd.DataFrame(data)
wordsdf['words'] = words_list
del words_list, data

print()
print("Among all the set of words used by these features, "
      "only {} are used."
      .format(len(wordsdf.dropna())))

# `pcavariations` already had its incomplete rows dropped before the PCA.
print()
print("Similarly, we mined {} (cluster-unique) substitutions, "
      "but the PCA is in fact"
      " computed on {} of them (those where all features are defined)."
      .format(len(set(variations['cluster_id'])), len(pcavariations)))


Feature 'frequency' is based on 33450 words.
Feature 'aoa' is based on 30102 words.
Feature 'letters_count' is based on 42786 words.

Among all the set of words used by these features, only 14450 are used.

Similarly, we mined 698 (cluster-unique) substitutions, but the PCA is in fact computed on 525 of them (those where all features are defined).

The way $\mathcal{H}_0$ and $\mathcal{H}_{00}$ are computed makes them also affected by this.

5 Interactions between features (by Anova)

Some useful variables first.


In [40]:
# (label, binning function) pairs used by the ANOVA cells below. Only
# fixed-width binning is enabled; the ('quantiles', pd.qcut) entry was left
# commented out of the list.
cuts = [('fixed bins', pd.cut)]  # , ('quantiles', pd.qcut)]
# (label, column suffix) pairs: '' selects the 'source' / 'destination'
# columns of `variations`, '_rel' the 'source_rel' / 'destination_rel' ones.
rels = [('global', ''), ('sentence-relative', '_rel')]

def star_level(p):
    """Return the 3-character significance label for p-value `p`.

    '***' below .001, ' **' below .01, '  *' below .05, 'ns.' otherwise
    (thresholds are strict, labels are left-padded to a constant width).
    """
    for threshold, label in ((.001, '***'), (.01, ' **'), (.05, '  *')):
        if p < threshold:
            return label
    return 'ns.'

Now for each feature, assess if it has an interaction with the other features' destination value. We look at this for all pairs of features, with all pairs of global/sentence-relative value and types of binning (fixed width only for now; quantile binning is commented out in `cuts` above). So it's a lot of answers.

Three stars means $p < .001$, two $p < .01$, one $p < .05$, and ns. means not significant.


In [41]:
# One-way ANOVA of feature2's destination value across bins of feature1's
# source value, for every pair of features and every global /
# sentence-relative combination.

# Pivot each needed column once (clusters as rows, features as columns)
# instead of rebuilding the same tables inside the innermost loop.
pivots = {column: variations.pivot(index='cluster_id', columns='feature',
                                   values=column)
          for column in [prefix + rel
                         for prefix in ('source', 'destination')
                         for (_, rel) in rels]}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivots['source' + rel1][feature1]
                destination = pivots['destination' + rel2][feature2]

                # Compute binning, retrying with fewer bins whenever the
                # binning function rejects the current count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: report it rather than silently
                    # reusing the bins of a previous combination.
                    print('  {} {} -> {}'
                          .format('n/a', rel1_label, rel2_label))
                    continue

                # One group of destination values per source bin.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
   ** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
    * global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
    * global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
    * global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  ns. global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
   ** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Now for each feature, look at its interaction with the other features' variation (i.e. destination - source). Same drill, same combinations.


In [42]:
# One-way ANOVA of feature2's variation (destination - source) across bins
# of feature1's source value, for every pair of features and every global /
# sentence-relative combination.

# Pivot each needed column once (clusters as rows, features as columns)
# instead of rebuilding the same tables inside the innermost loop.
pivots = {column: variations.pivot(index='cluster_id', columns='feature',
                                   values=column)
          for column in [prefix + rel
                         for prefix in ('source', 'destination')
                         for (_, rel) in rels]}

for feature1 in PAPER_FEATURES:
    print('-' * len(feature1))
    print(feature1)
    print('-' * len(feature1))

    for feature2 in PAPER_FEATURES:
        print()
        print('-> {}'.format(feature2))
        for (cut_label, cut), (rel1_label, rel1) in product(cuts, rels):
            for (rel2_label, rel2) in rels:
                source = pivots['source' + rel1][feature1]
                # Despite its name, `destination` holds feature2's
                # variation here: destination value minus source value.
                destination = pivots['destination' + rel2][feature2]\
                    - pivots['source' + rel2][feature2]

                # Compute binning, retrying with fewer bins whenever the
                # binning function rejects the current count.
                for bin_count in range(BIN_COUNT, 0, -1):
                    try:
                        source_bins = cut(source, bin_count, labels=False)
                        break
                    except ValueError:
                        pass
                else:
                    # No bin count worked: report it rather than silently
                    # reusing the bins of a previous combination.
                    print('  {} {} -> {}'
                          .format('n/a', rel1_label, rel2_label))
                    continue

                # One group of variations per source bin.
                _, p = stats.f_oneway(*[destination[source_bins == i]
                                        .dropna()
                                        for i in range(bin_count)])
                print('  {} {} -> {}'
                      .format(star_level(p), rel1_label, rel2_label))
    print()


---------
frequency
---------

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
    * global -> global
    * global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

---
aoa
---

-> frequency
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
  *** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

----------
clustering
----------

-> frequency
  *** global -> global
  *** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
    * global -> global
    * global -> sentence-relative
    * sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> letters_count
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-------------
letters_count
-------------

-> frequency
   ** global -> global
   ** global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
   ** global -> global
   ** global -> sentence-relative
    * sentence-relative -> global
    * sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

--------------
synonyms_count
--------------

-> frequency
    * global -> global
    * global -> sentence-relative
   ** sentence-relative -> global
   ** sentence-relative -> sentence-relative

-> aoa
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> clustering
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> synonyms_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> orthographic_density
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

--------------------
orthographic_density
--------------------

-> frequency
  *** global -> global
  *** global -> sentence-relative
   ** sentence-relative -> global
    * sentence-relative -> sentence-relative

-> aoa
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> clustering
    * global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> letters_count
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

-> synonyms_count
  ns. global -> global
  ns. global -> sentence-relative
  ns. sentence-relative -> global
  ns. sentence-relative -> sentence-relative

-> orthographic_density
  *** global -> global
  *** global -> sentence-relative
  *** sentence-relative -> global
  *** sentence-relative -> sentence-relative

Ok, so this can go on for a long time, and I'm not going to look at interactions with this lens (meaning at interaction of couples of features with another feature's destination values).

6 Regression


In [43]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

In [44]:
def prettify_index(index):
    """Return a copy of `index` labelled with human-readable feature names.

    Each entry (a feature name) is replaced by the docstring of its
    transformed-feature function; the name of the index is preserved.
    """
    def describe(feature_name):
        return Substitution._transformed_feature(feature_name).__doc__

    pretty_labels = index.map(describe)
    return pd.Index(data=pretty_labels, name=index.name)

First try regressing the global feature variations based on the global source features, then the average of global feature variations.


In [ ]:


In [45]:
# Get source and destination values: one row per cluster, one column per
# paper feature, keeping only the clusters where every source feature is
# defined.
source = pd.pivot_table(
    variations,
    values='source',
    index=['cluster_id'],
    columns=['feature']
)[PAPER_FEATURES].dropna()
# Same for the destinations, restricted to the clusters kept above, then
# further restricted to those where every destination feature is defined.
destination = pd.pivot_table(
    variations,
    values='destination',
    index=['cluster_id'],
    columns=['feature']
).loc[source.index][PAPER_FEATURES].dropna()
# Re-align the sources on the clusters that survived both filters.
source = source.loc[destination.index]
# Min-max normalise each source feature (column-wise) to [0, 1].
nsource = ((source.values - source.values.min(0))
           / (source.values.max(0) - source.values.min(0)))
# The regression target is the feature *variation* (destination - source),
# also min-max normalised column-wise.
target = destination.values - source.values
target = (target - target.min(0)) / (target.max(0) - target.min(0))

# Regress all target variations on all normalised source features at once.
linreg = linear_model.LinearRegression()
linreg.fit(nsource, target)

# And print the score.
# NOTE(review): the message says "destination global values" although the
# target is the variation -- confirm the intended wording.
print('Regressing destination global values with {} measures'
      .format(len(source)))
print('R^2 = {}'.format(linreg.score(nsource, target)))
# Coefficient table: one row per target variation, one column per source
# feature, both labelled with prettified feature names.
coeffs = pd.DataFrame(data=linreg.coef_,
                      index=prettify_index(destination.columns
                                           .rename('target variation')),
                      columns=prettify_index(source.columns
                                             .rename('source feature')))
#coeffs['intercept'] = linreg.intercept_

# Plot the coefficients: first as grouped bars (legend moved to the right of
# the axes), then as an annotated heatmap in a new figure.
ax = coeffs.plot(kind='bar', rot=45)
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.figure()
axh = sb.heatmap(coeffs, annot=True, fmt=".2f", linewidths=.5, robust=True)

# Print the OLS details, and augment the heatmap texts with significance
# stars.
def stars_addendum(p):
    if p <= .001:
        return '\n***'
    elif p <= .01:
        return '\n**'
    elif p <= .05:
        return '\n*'
    else:
        return ''

print()
texts = np.array(axh.texts).reshape((len(PAPER_FEATURES),
                                     len(PAPER_FEATURES)))[::-1]
for i, feature in enumerate(PAPER_FEATURES):
    sm_results = sm.OLS(target[:, i], sm.add_constant(nsource)).fit()
    for j, p in enumerate(sm_results.pvalues[1:]):
        texts[i, j].set_text(texts[i, j].get_text()
                             + stars_addendum(p))
    print('=' * 78)
    print('Details for {}'.format(feature.upper()))
    print(sm_results.summary())

if SAVE_FIGURES:
    ax.figure.savefig(settings.FIGURE
                      .format('paper-variations_regression-'
                              'globals_to_globalsvariation'),
                      bbox_inches='tight', dpi=300)
    axh.figure.savefig(settings.FIGURE
                       .format('paper-variations_regression-'
                               'globals_to_globalsvariation-heatmap'),
                       bbox_inches='tight', dpi=300)


Regressing destination global values with 273 measures
R^2 = 0.32880088378014444

==============================================================================
Details for FREQUENCY
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.274
Method:                 Least Squares   F-statistic:                     18.07
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           1.40e-17
Time:                        16:13:44   Log-Likelihood:                 182.98
No. Observations:                 273   AIC:                            -352.0
Df Residuals:                     266   BIC:                            -326.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6603      0.063     10.409      0.000       0.535       0.785
x1            -0.3116      0.041     -7.634      0.000      -0.392      -0.231
x2             0.0180      0.049      0.372      0.710      -0.078       0.114
x3             0.0728      0.048      1.531      0.127      -0.021       0.166
x4             0.0265      0.053      0.497      0.620      -0.078       0.131
x5            -0.0264      0.050     -0.524      0.601      -0.126       0.073
x6            -0.0257      0.042     -0.604      0.546      -0.109       0.058
==============================================================================
Omnibus:                        0.448   Durbin-Watson:                   1.825
Prob(Omnibus):                  0.799   Jarque-Bera (JB):                0.267
Skew:                          -0.061   Prob(JB):                        0.875
Kurtosis:                       3.093   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
==============================================================================
Details for AOA
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.263
Model:                            OLS   Adj. R-squared:                  0.247
Method:                 Least Squares   F-statistic:                     15.83
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           1.50e-15
Time:                        16:13:44   Log-Likelihood:                 118.84
No. Observations:                 273   AIC:                            -223.7
Df Residuals:                     266   BIC:                            -198.4
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7806      0.080      9.729      0.000       0.623       0.939
x1            -0.1253      0.052     -2.426      0.016      -0.227      -0.024
x2            -0.4151      0.061     -6.764      0.000      -0.536      -0.294
x3            -0.0693      0.060     -1.153      0.250      -0.188       0.049
x4            -0.0766      0.067     -1.137      0.257      -0.209       0.056
x5            -0.0594      0.064     -0.931      0.353      -0.185       0.066
x6            -0.0260      0.054     -0.483      0.629      -0.132       0.080
==============================================================================
Omnibus:                        7.033   Durbin-Watson:                   2.145
Prob(Omnibus):                  0.030   Jarque-Bera (JB):                7.215
Skew:                           0.396   Prob(JB):                       0.0271
Kurtosis:                       2.920   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
==============================================================================
Details for CLUSTERING
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.345
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     23.34
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           4.10e-22
Time:                        16:13:44   Log-Likelihood:                 182.87
No. Observations:                 273   AIC:                            -351.7
Df Residuals:                     266   BIC:                            -326.5
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8821      0.063     13.899      0.000       0.757       1.007
x1            -0.1106      0.041     -2.709      0.007      -0.191      -0.030
x2            -0.0205      0.049     -0.421      0.674      -0.116       0.075
x3            -0.5243      0.048    -11.028      0.000      -0.618      -0.431
x4            -0.0565      0.053     -1.060      0.290      -0.161       0.048
x5            -0.0288      0.050     -0.571      0.568      -0.128       0.071
x6            -0.0022      0.043     -0.053      0.958      -0.086       0.081
==============================================================================
Omnibus:                        4.554   Durbin-Watson:                   2.095
Prob(Omnibus):                  0.103   Jarque-Bera (JB):                5.911
Skew:                          -0.053   Prob(JB):                       0.0521
Kurtosis:                       3.713   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
==============================================================================
Details for LETTERS_COUNT
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.371
Model:                            OLS   Adj. R-squared:                  0.357
Method:                 Least Squares   F-statistic:                     26.20
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           1.92e-24
Time:                        16:13:44   Log-Likelihood:                 236.50
No. Observations:                 273   AIC:                            -459.0
Df Residuals:                     266   BIC:                            -433.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6553      0.052     12.567      0.000       0.553       0.758
x1            -0.0236      0.034     -0.702      0.483      -0.090       0.042
x2             0.0615      0.040      1.543      0.124      -0.017       0.140
x3            -0.0265      0.039     -0.678      0.499      -0.103       0.050
x4            -0.3674      0.044     -8.388      0.000      -0.454      -0.281
x5            -0.0471      0.041     -1.136      0.257      -0.129       0.035
x6          5.651e-05      0.035      0.002      0.999      -0.069       0.069
==============================================================================
Omnibus:                        6.159   Durbin-Watson:                   2.084
Prob(Omnibus):                  0.046   Jarque-Bera (JB):                5.923
Skew:                           0.315   Prob(JB):                       0.0517
Kurtosis:                       3.351   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
==============================================================================
Details for SYNONYMS_COUNT
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.418
Model:                            OLS   Adj. R-squared:                  0.405
Method:                 Least Squares   F-statistic:                     31.80
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           9.36e-29
Time:                        16:13:44   Log-Likelihood:                 198.46
No. Observations:                 273   AIC:                            -382.9
Df Residuals:                     266   BIC:                            -357.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8333      0.060     13.902      0.000       0.715       0.951
x1             0.0277      0.039      0.719      0.473      -0.048       0.104
x2            -0.0323      0.046     -0.704      0.482      -0.123       0.058
x3             0.0558      0.045      1.242      0.215      -0.033       0.144
x4            -0.0109      0.050     -0.216      0.829      -0.110       0.088
x5            -0.6547      0.048    -13.741      0.000      -0.749      -0.561
x6            -0.0026      0.040     -0.066      0.948      -0.082       0.076
==============================================================================
Omnibus:                        8.272   Durbin-Watson:                   1.608
Prob(Omnibus):                  0.016   Jarque-Bera (JB):                8.981
Skew:                          -0.318   Prob(JB):                       0.0112
Kurtosis:                       3.620   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
==============================================================================
Details for ORTHOGRAPHIC_DENSITY
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.309
Method:                 Least Squares   F-statistic:                     21.31
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           2.10e-20
Time:                        16:13:44   Log-Likelihood:                 131.94
No. Observations:                 273   AIC:                            -249.9
Df Residuals:                     266   BIC:                            -224.6
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6214      0.076      8.125      0.000       0.471       0.772
x1             0.0281      0.049      0.570      0.569      -0.069       0.125
x2            -0.0186      0.058     -0.317      0.751      -0.134       0.097
x3            -0.0173      0.057     -0.302      0.763      -0.130       0.096
x4             0.0018      0.064      0.028      0.977      -0.125       0.128
x5             0.0583      0.061      0.959      0.338      -0.061       0.178
x6            -0.3551      0.051     -6.934      0.000      -0.456      -0.254
==============================================================================
Omnibus:                       18.191   Durbin-Watson:                   2.041
Prob(Omnibus):                  0.000   Jarque-Bera (JB):                8.035
Skew:                           0.176   Prob(JB):                       0.0180
Kurtosis:                       2.236   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [46]:
# Get source and destination values.
# Each pivot yields one row per cluster_id and one column per feature; only
# the PAPER_FEATURES columns are kept, and incomplete rows are dropped.
source = pd.pivot_table(
    variations,
    values='source',
    index=['cluster_id'],
    columns=['feature']
)[PAPER_FEATURES].dropna()
destination = pd.pivot_table(
    variations,
    values='destination',
    index=['cluster_id'],
    columns=['feature']
).loc[source.index][PAPER_FEATURES].dropna()
# Re-align source on the clusters that survived the destination dropna.
source = source.loc[destination.index]
# Min-max rescale each source column to [0, 1].
nsource = ((source.values - source.values.min(0))
           / (source.values.max(0) - source.values.min(0)))
# The target is the per-cluster mean, across features, of the min-max
# rescaled variations (destination - source).
target = destination.values - source.values
target = ((target - target.min(0)) / (target.max(0) - target.min(0))).mean(1)

# Regress.
linreg = linear_model.LinearRegression()
linreg.fit(nsource, target)

# And print the score and coefficients.
print('Regressing destination global values with {} measures'
      .format(len(source)))
print('R^2 = {}'.format(linreg.score(nsource, target)))
print()
print(sm.OLS(target, sm.add_constant(nsource)).fit().summary())
coeffs = pd.Series(data=linreg.coef_,
                   index=prettify_index(source.columns))
#coeffs['intercept'] = linreg.intercept_
# One colour per coefficient, so the palette follows PAPER_FEATURES instead
# of a hard-coded count.
ax = coeffs.plot(kind='bar',
                 color=sb.color_palette(n_colors=len(coeffs)),
                 rot=45)
if SAVE_FIGURES:
    ax.figure.savefig(settings.FIGURE
                      .format('paper-variations_regression-'
                              'globals_to_meanvariation'),
                      bbox_inches='tight', dpi=300)


Regressing destination global values with 273 measures
R^2 = 0.36079572187138587

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.346
Method:                 Least Squares   F-statistic:                     25.02
Date:                Fri, 29 Jul 2016   Prob (F-statistic):           1.71e-23
Time:                        16:13:46   Log-Likelihood:                 488.32
No. Observations:                 273   AIC:                            -962.6
Df Residuals:                     266   BIC:                            -937.4
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7388      0.021     35.640      0.000       0.698       0.780
x1            -0.0859      0.013     -6.439      0.000      -0.112      -0.060
x2            -0.0678      0.016     -4.276      0.000      -0.099      -0.037
x3            -0.0848      0.016     -5.460      0.000      -0.115      -0.054
x4            -0.0805      0.017     -4.624      0.000      -0.115      -0.046
x5            -0.1264      0.016     -7.668      0.000      -0.159      -0.094
x6            -0.0686      0.014     -4.941      0.000      -0.096      -0.041
==============================================================================
Omnibus:                        1.700   Durbin-Watson:                   1.996
Prob(Omnibus):                  0.427   Jarque-Bera (JB):                1.744
Skew:                          -0.188   Prob(JB):                        0.418
Kurtosis:                       2.889   Cond. No.                         17.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

The following details a humongous list of possible regressions, which we won't use.


In [47]:
# Maps a "sentence-relative?" flag to (label used in printed names,
# suffix appended to the 'source' / 'destination' value column).
rels = {False: ('global', ''),
        True: ('rel', '_rel')}

def regress(data, features, target,
            source_rel=False, dest_rel=False, interactions=False):
    """Regress one destination feature on source features and print the fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table read through its `cluster_id` and `feature`
        columns, plus the `source[_rel]` and `destination[_rel]` value
        columns selected by the flags below.
    features : iterable of str
        Source features used as regressors (used in sorted order).
    target : str
        Feature whose destination value is regressed.
    source_rel : bool or 'both'
        Use global (False) or sentence-relative (True) source values, or
        both sets side by side ('both').
    dest_rel : bool
        Use the global (False) or sentence-relative (True) destination value.
    interactions : bool
        If True, add all pairwise products of regressors
        (`PolynomialFeatures(degree=2, interaction_only=True)`).

    Returns
    -------
    None. The R^2 score and the coefficients are printed.

    Raises
    ------
    ValueError
        If `source_rel` is not True, False or 'both', or if `dest_rel` is
        not a bool.
    """
    if source_rel not in [True, False, 'both']:
        raise ValueError("source_rel must be True, False or 'both', got {!r}"
                         .format(source_rel))
    if not isinstance(dest_rel, bool):
        raise ValueError('dest_rel must be a bool, got {!r}'
                         .format(dest_rel))
    # Process source/destination relativeness arguments.
    if isinstance(source_rel, bool):
        source_rel = [source_rel]
    else:
        source_rel = [False, True]
    dest_rel_name, dest_rel = rels[dest_rel]

    features = tuple(sorted(features))
    # (value column, feature) pairs selecting the regressors in the pivot,
    # and the matching human-readable names, in the same order.
    feature_tuples = [('source' + rels[rel][1], feature)
                      for rel in source_rel
                      for feature in features]
    feature_names = [rels[rel][0] + '_' + feature
                     for rel in source_rel
                     for feature in features]

    # Get source and destination values.
    source = pd.pivot_table(
        data,
        values=['source' + rels[rel][1] for rel in source_rel],
        index=['cluster_id'],
        columns=['feature']
    )[feature_tuples].dropna()
    # Read the destination from `data` (not from a notebook global) so the
    # function honours the frame it was given.
    destination = data[data.feature == target]\
        .pivot(index='cluster_id', columns='feature',
               values='destination' + dest_rel)\
        .loc[source.index][target].dropna()
    source = source.loc[destination.index].values
    destination = destination.values

    # If asked to, get polynomial features.
    if interactions:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        source = poly.fit_transform(source)
        # Name each generated column after the regressors it multiplies; the
        # all-zero powers row is the bias column.
        regress_features = [' * '.join([feature_names[j]
                                        for j, p in enumerate(powers)
                                        if p > 0]) or 'intercept'
                            for powers in poly.powers_]
    else:
        regress_features = feature_names

    # Regress.
    # With interactions the bias column already plays the intercept's role.
    linreg = linear_model.LinearRegression(fit_intercept=not interactions)
    linreg.fit(source, destination)

    # And print the score and coefficients.
    print('Regressing {} with {} measures, {} interactions'
          .format(dest_rel_name + ' ' + target, len(source),
                  'with' if interactions else 'no'))
    print('           ' + '^' * len(dest_rel_name + ' ' + target))
    print('R^2 = {}'
          .format(linreg.score(source, destination)))
    print()
    coeffs = pd.Series(index=regress_features, data=linreg.coef_)
    if not interactions:
        # Series.append was removed in pandas 2.0; concat is equivalent.
        coeffs = pd.concat([
            pd.Series(index=['intercept'], data=[linreg.intercept_]),
            coeffs,
        ])
    with pd.option_context('display.max_rows', 999):
        print(coeffs)

In [48]:
# Run every regression variant for each paper feature: all combinations of
# source relativeness and destination relativeness, first without and then
# with interaction terms.
for target in PAPER_FEATURES:
    print('-' * 70)
    for source_rel, dest_rel in product([False, True, 'both'],
                                        [False, True]):
        for interactions in (False, True):
            regress(variations, PAPER_FEATURES, target,
                    source_rel=source_rel, dest_rel=dest_rel,
                    interactions=interactions)
            print()


----------------------------------------------------------------------
Regressing global frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.09853263729254713

intercept                      5.681805
global_aoa                     0.098454
global_clustering              0.169474
global_frequency               0.419654
global_letters_count          -0.035324
global_orthographic_density   -0.105466
global_synonyms_count         -0.173456
dtype: float64

Regressing global frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.12882465936510978

intercept                                              8.718810
global_aoa                                            -0.606622
global_clustering                                      1.369041
global_frequency                                       0.822327
global_letters_count                                   0.094275
global_orthographic_density                            1.383686
global_synonyms_count                                 -0.080281
global_aoa * global_clustering                        -0.073016
global_aoa * global_frequency                          0.015214
global_aoa * global_letters_count                      0.019222
global_aoa * global_orthographic_density              -0.037485
global_aoa * global_synonyms_count                     0.098774
global_clustering * global_frequency                  -0.033704
global_clustering * global_letters_count              -0.044744
global_clustering * global_orthographic_density       -0.024197
global_clustering * global_synonyms_count             -0.020333
global_frequency * global_letters_count               -0.065203
global_frequency * global_orthographic_density        -0.217019
global_frequency * global_synonyms_count              -0.038801
global_letters_count * global_orthographic_density     0.105381
global_letters_count * global_synonyms_count          -0.085432
global_orthographic_density * global_synonyms_count    0.010950
dtype: float64

Regressing rel frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.06409705512194153

intercept                     -5.470672
global_aoa                     0.125243
global_clustering              0.151611
global_frequency               0.334279
global_letters_count          -0.011557
global_orthographic_density   -0.194056
global_synonyms_count         -0.095133
dtype: float64

Regressing rel frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.08678399300447925

intercept                                              2.626390
global_aoa                                            -0.620521
global_clustering                                      1.744601
global_frequency                                       0.441703
global_letters_count                                  -0.330133
global_orthographic_density                           -0.097994
global_synonyms_count                                 -1.869709
global_aoa * global_clustering                        -0.082577
global_aoa * global_frequency                         -0.004581
global_aoa * global_letters_count                      0.043847
global_aoa * global_orthographic_density              -0.023997
global_aoa * global_synonyms_count                     0.102205
global_clustering * global_frequency                  -0.063214
global_clustering * global_letters_count              -0.049916
global_clustering * global_orthographic_density       -0.001973
global_clustering * global_synonyms_count             -0.223953
global_frequency * global_letters_count               -0.046765
global_frequency * global_orthographic_density        -0.104591
global_frequency * global_synonyms_count              -0.035116
global_letters_count * global_orthographic_density     0.172520
global_letters_count * global_synonyms_count          -0.000905
global_orthographic_density * global_synonyms_count    0.108433
dtype: float64

Regressing global frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.07003097580617534

intercept                   9.434106
rel_aoa                     0.144055
rel_clustering              0.120198
rel_frequency               0.290976
rel_letters_count          -0.041879
rel_orthographic_density   -0.059478
rel_synonyms_count         -0.296137
dtype: float64

Regressing global frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11061822670784549

intercept                                        9.397489
rel_aoa                                          0.336078
rel_clustering                                  -0.046320
rel_frequency                                    0.309812
rel_letters_count                               -0.064660
rel_orthographic_density                        -0.497039
rel_synonyms_count                               0.285341
rel_aoa * rel_clustering                        -0.192748
rel_aoa * rel_frequency                          0.034800
rel_aoa * rel_letters_count                     -0.030892
rel_aoa * rel_orthographic_density              -0.064057
rel_aoa * rel_synonyms_count                     0.065730
rel_clustering * rel_frequency                  -0.049980
rel_clustering * rel_letters_count               0.050896
rel_clustering * rel_orthographic_density       -0.027699
rel_clustering * rel_synonyms_count              0.146044
rel_frequency * rel_letters_count               -0.057196
rel_frequency * rel_orthographic_density        -0.133976
rel_frequency * rel_synonyms_count               0.071667
rel_letters_count * rel_orthographic_density     0.093762
rel_letters_count * rel_synonyms_count          -0.214881
rel_orthographic_density * rel_synonyms_count   -0.078854
dtype: float64

Regressing rel frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.26382110021773497

intercept                  -1.359780
rel_aoa                     0.128675
rel_clustering              0.318422
rel_frequency               0.650721
rel_letters_count          -0.127629
rel_orthographic_density   -0.271098
rel_synonyms_count         -0.240708
dtype: float64

Regressing rel frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.3052997419684169

intercept                                       -1.366130
rel_aoa                                          0.200335
rel_clustering                                  -0.028015
rel_frequency                                    0.718987
rel_letters_count                               -0.107284
rel_orthographic_density                        -0.745880
rel_synonyms_count                               0.168885
rel_aoa * rel_clustering                        -0.215017
rel_aoa * rel_frequency                         -0.041996
rel_aoa * rel_letters_count                      0.008066
rel_aoa * rel_orthographic_density               0.105147
rel_aoa * rel_synonyms_count                     0.127867
rel_clustering * rel_frequency                  -0.121601
rel_clustering * rel_letters_count               0.033883
rel_clustering * rel_orthographic_density       -0.124748
rel_clustering * rel_synonyms_count              0.040174
rel_frequency * rel_letters_count               -0.037660
rel_frequency * rel_orthographic_density        -0.118624
rel_frequency * rel_synonyms_count               0.024012
rel_letters_count * rel_orthographic_density     0.094604
rel_letters_count * rel_synonyms_count          -0.144801
rel_orthographic_density * rel_synonyms_count    0.100963
dtype: float64

Regressing global frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.11471972522067997

intercept                      3.702336
global_aoa                    -0.005124
global_clustering             -0.000685
global_frequency               0.472286
global_letters_count           0.172489
global_orthographic_density   -0.104075
global_synonyms_count          0.287224
rel_aoa                        0.144418
rel_clustering                 0.205497
rel_frequency                 -0.056937
rel_letters_count             -0.229190
rel_orthographic_density      -0.013805
rel_synonyms_count            -0.559468
dtype: float64

Regressing global frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^
R^2 = 0.2564536329485744

intercept                                                -12.273075
global_aoa                                                -0.187552
global_clustering                                         -2.412826
global_frequency                                           1.971381
global_letters_count                                       1.819225
global_orthographic_density                               -1.789975
global_synonyms_count                                      3.449530
rel_aoa                                                    1.986795
rel_clustering                                             0.261432
rel_frequency                                             -3.488910
rel_letters_count                                         -1.701372
rel_orthographic_density                                   8.427170
rel_synonyms_count                                        -0.993259
global_aoa * global_clustering                             0.330150
global_aoa * global_frequency                              0.163669
global_aoa * global_letters_count                          0.020621
global_aoa * global_orthographic_density                   0.245871
global_aoa * global_synonyms_count                        -0.106001
global_aoa * rel_aoa                                      -0.073299
global_aoa * rel_clustering                               -0.102811
global_aoa * rel_frequency                                 0.042480
global_aoa * rel_letters_count                            -0.009189
global_aoa * rel_orthographic_density                     -0.413183
global_aoa * rel_synonyms_count                            0.176233
global_clustering * global_frequency                       0.137648
global_clustering * global_letters_count                  -0.125906
global_clustering * global_orthographic_density           -0.529404
global_clustering * global_synonyms_count                  0.728423
global_clustering * rel_aoa                               -0.460054
global_clustering * rel_clustering                         0.184358
global_clustering * rel_frequency                         -0.307388
global_clustering * rel_letters_count                      0.179927
global_clustering * rel_orthographic_density               0.856428
global_clustering * rel_synonyms_count                    -0.609633
global_frequency * global_letters_count                   -0.285853
global_frequency * global_orthographic_density            -0.300035
global_frequency * global_synonyms_count                  -0.341672
global_frequency * rel_aoa                                -0.347392
global_frequency * rel_clustering                          0.093633
global_frequency * rel_frequency                           0.033165
global_frequency * rel_letters_count                       0.249468
global_frequency * rel_orthographic_density               -0.085738
global_frequency * rel_synonyms_count                      0.307868
global_letters_count * global_orthographic_density        -0.030830
global_letters_count * global_synonyms_count               0.878443
global_letters_count * rel_aoa                            -0.089243
global_letters_count * rel_clustering                     -0.077398
global_letters_count * rel_frequency                       0.126977
global_letters_count * rel_letters_count                   0.048602
global_letters_count * rel_orthographic_density            0.031900
global_letters_count * rel_synonyms_count                 -1.173906
global_orthographic_density * global_synonyms_count        0.793295
global_orthographic_density * rel_aoa                     -0.107252
global_orthographic_density * rel_clustering               0.670715
global_orthographic_density * rel_frequency                0.312739
global_orthographic_density * rel_letters_count            0.025699
global_orthographic_density * rel_orthographic_density    -0.181026
global_orthographic_density * rel_synonyms_count          -0.932305
global_synonyms_count * rel_aoa                            0.189272
global_synonyms_count * rel_clustering                    -1.061133
global_synonyms_count * rel_frequency                      0.259173
global_synonyms_count * rel_letters_count                 -0.265853
global_synonyms_count * rel_orthographic_density          -0.218773
global_synonyms_count * rel_synonyms_count                -0.082062
rel_aoa * rel_clustering                                   0.052865
rel_aoa * rel_frequency                                    0.136280
rel_aoa * rel_letters_count                                0.049736
rel_aoa * rel_orthographic_density                         0.037369
rel_aoa * rel_synonyms_count                              -0.173914
rel_clustering * rel_frequency                             0.126440
rel_clustering * rel_letters_count                         0.074941
rel_clustering * rel_orthographic_density                 -1.001577
rel_clustering * rel_synonyms_count                        1.103892
rel_frequency * rel_letters_count                         -0.144258
rel_frequency * rel_orthographic_density                  -0.099035
rel_frequency * rel_synonyms_count                        -0.122825
rel_letters_count * rel_orthographic_density               0.165257
rel_letters_count * rel_synonyms_count                     0.358910
rel_orthographic_density * rel_synonyms_count              0.170909
dtype: float64

Regressing rel frequency with 391 measures, no interactions
           ^^^^^^^^^^^^^
R^2 = 0.3407938525314035

intercept                      3.706670
global_aoa                     0.002655
global_clustering              0.110871
global_frequency              -0.459169
global_letters_count           0.167333
global_orthographic_density   -0.107415
global_synonyms_count          0.225380
rel_aoa                        0.120371
rel_clustering                 0.137184
rel_frequency                  0.916074
rel_letters_count             -0.218363
rel_orthographic_density       0.031076
rel_synonyms_count            -0.460482
dtype: float64

Regressing rel frequency with 391 measures, with interactions
           ^^^^^^^^^^^^^
R^2 = 0.4442234540278843

intercept                                                -14.317460
global_aoa                                                 0.116623
global_clustering                                         -2.602370
global_frequency                                           1.214939
global_letters_count                                       2.012600
global_orthographic_density                               -2.490474
global_synonyms_count                                      2.354218
rel_aoa                                                    1.513352
rel_clustering                                             0.787663
rel_frequency                                             -2.626188
rel_letters_count                                         -2.454743
rel_orthographic_density                                   8.749125
rel_synonyms_count                                        -0.030571
global_aoa * global_clustering                             0.322841
global_aoa * global_frequency                              0.140424
global_aoa * global_letters_count                          0.002967
global_aoa * global_orthographic_density                   0.240574
global_aoa * global_synonyms_count                        -0.099199
global_aoa * rel_aoa                                      -0.070490
global_aoa * rel_clustering                               -0.104719
global_aoa * rel_frequency                                 0.063446
global_aoa * rel_letters_count                             0.027111
global_aoa * rel_orthographic_density                     -0.380512
global_aoa * rel_synonyms_count                            0.170651
global_clustering * global_frequency                       0.180165
global_clustering * global_letters_count                  -0.032281
global_clustering * global_orthographic_density           -0.656948
global_clustering * global_synonyms_count                  0.532209
global_clustering * rel_aoa                               -0.480837
global_clustering * rel_clustering                         0.145403
global_clustering * rel_frequency                         -0.316917
global_clustering * rel_letters_count                      0.086095
global_clustering * rel_orthographic_density               1.028314
global_clustering * rel_synonyms_count                    -0.478042
global_frequency * global_letters_count                   -0.228001
global_frequency * global_orthographic_density            -0.289012
global_frequency * global_synonyms_count                  -0.271671
global_frequency * rel_aoa                                -0.312373
global_frequency * rel_clustering                          0.022531
global_frequency * rel_frequency                           0.051239
global_frequency * rel_letters_count                       0.220767
global_frequency * rel_orthographic_density               -0.055212
global_frequency * rel_synonyms_count                      0.198471
global_letters_count * global_orthographic_density        -0.060367
global_letters_count * global_synonyms_count               0.711925
global_letters_count * rel_aoa                            -0.094187
global_letters_count * rel_clustering                     -0.184608
global_letters_count * rel_frequency                       0.079970
global_letters_count * rel_letters_count                   0.057296
global_letters_count * rel_orthographic_density            0.081679
global_letters_count * rel_synonyms_count                 -0.975236
global_orthographic_density * global_synonyms_count        0.600366
global_orthographic_density * rel_aoa                     -0.140831
global_orthographic_density * rel_clustering               0.772739
global_orthographic_density * rel_frequency                0.311930
global_orthographic_density * rel_letters_count            0.126051
global_orthographic_density * rel_orthographic_density    -0.117238
global_orthographic_density * rel_synonyms_count          -0.726816
global_synonyms_count * rel_aoa                            0.171536
global_synonyms_count * rel_clustering                    -0.838406
global_synonyms_count * rel_frequency                      0.143543
global_synonyms_count * rel_letters_count                 -0.174740
global_synonyms_count * rel_orthographic_density          -0.110222
global_synonyms_count * rel_synonyms_count                -0.070820
rel_aoa * rel_clustering                                   0.077911
rel_aoa * rel_frequency                                    0.097422
rel_aoa * rel_letters_count                                0.025272
rel_aoa * rel_orthographic_density                         0.035710
rel_aoa * rel_synonyms_count                              -0.163896
rel_clustering * rel_frequency                             0.158584
rel_clustering * rel_letters_count                         0.175804
rel_clustering * rel_orthographic_density                 -1.137827
rel_clustering * rel_synonyms_count                        0.873932
rel_frequency * rel_letters_count                         -0.112342
rel_frequency * rel_orthographic_density                  -0.104361
rel_frequency * rel_synonyms_count                         0.011580
rel_letters_count * rel_orthographic_density               0.081045
rel_letters_count * rel_synonyms_count                     0.254768
rel_orthographic_density * rel_synonyms_count              0.060840
dtype: float64

----------------------------------------------------------------------
Regressing global aoa with 354 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.12259316110787877

intercept                      4.679834
global_aoa                     0.370550
global_clustering             -0.133492
global_frequency              -0.090202
global_letters_count           0.008836
global_orthographic_density   -0.048018
global_synonyms_count          0.090882
dtype: float64

Regressing global aoa with 354 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.18985607711054098

intercept                                             -15.337419
global_aoa                                              2.323744
global_clustering                                      -2.537080
global_frequency                                        0.124904
global_letters_count                                    1.764313
global_orthographic_density                             1.036875
global_synonyms_count                                  -3.735516
global_aoa * global_clustering                          0.181417
global_aoa * global_frequency                          -0.072131
global_aoa * global_letters_count                      -0.054066
global_aoa * global_orthographic_density                0.011065
global_aoa * global_synonyms_count                      0.152028
global_clustering * global_frequency                   -0.019871
global_clustering * global_letters_count                0.225555
global_clustering * global_orthographic_density         0.070146
global_clustering * global_synonyms_count              -0.413787
global_frequency * global_letters_count                 0.015499
global_frequency * global_orthographic_density         -0.002095
global_frequency * global_synonyms_count                0.097437
global_letters_count * global_orthographic_density     -0.154113
global_letters_count * global_synonyms_count           -0.136751
global_orthographic_density * global_synonyms_count     0.283097
dtype: float64

Regressing rel aoa with 354 measures, no interactions
           ^^^^^^^
R^2 = 0.04149600917525842

intercept                      0.253627
global_aoa                     0.139876
global_clustering             -0.086273
global_frequency              -0.163675
global_letters_count           0.068419
global_orthographic_density    0.133377
global_synonyms_count          0.007264
dtype: float64

Regressing rel aoa with 354 measures, with interactions
           ^^^^^^^
R^2 = 0.12265370608408421

intercept                                             -14.505431
global_aoa                                              2.540140
global_clustering                                      -1.116019
global_frequency                                        0.328564
global_letters_count                                    0.607294
global_orthographic_density                             0.306129
global_synonyms_count                                  -2.695675
global_aoa * global_clustering                          0.213786
global_aoa * global_frequency                          -0.091791
global_aoa * global_letters_count                      -0.080367
global_aoa * global_orthographic_density                0.051592
global_aoa * global_synonyms_count                      0.168336
global_clustering * global_frequency                   -0.005628
global_clustering * global_letters_count               -0.026328
global_clustering * global_orthographic_density        -0.161306
global_clustering * global_synonyms_count              -0.203785
global_frequency * global_letters_count                 0.017153
global_frequency * global_orthographic_density         -0.035056
global_frequency * global_synonyms_count                0.056290
global_letters_count * global_orthographic_density     -0.232562
global_letters_count * global_synonyms_count           -0.087773
global_orthographic_density * global_synonyms_count     0.326252
dtype: float64

Regressing global aoa with 354 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.04714015374073077

intercept                   6.847413
rel_aoa                     0.160958
rel_clustering              0.048207
rel_frequency               0.046419
rel_letters_count          -0.049308
rel_orthographic_density   -0.443998
rel_synonyms_count         -0.036288
dtype: float64

Regressing global aoa with 354 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.1363757948920139

intercept                                        6.863601
rel_aoa                                         -0.155919
rel_clustering                                  -0.191600
rel_frequency                                    0.063034
rel_letters_count                               -0.152745
rel_orthographic_density                        -0.375896
rel_synonyms_count                               0.682220
rel_aoa * rel_clustering                         0.191557
rel_aoa * rel_frequency                         -0.134500
rel_aoa * rel_letters_count                      0.037021
rel_aoa * rel_orthographic_density               0.230671
rel_aoa * rel_synonyms_count                     0.297098
rel_clustering * rel_frequency                  -0.004483
rel_clustering * rel_letters_count               0.245536
rel_clustering * rel_orthographic_density        0.361079
rel_clustering * rel_synonyms_count             -0.373554
rel_frequency * rel_letters_count                0.019513
rel_frequency * rel_orthographic_density         0.025354
rel_frequency * rel_synonyms_count              -0.028635
rel_letters_count * rel_orthographic_density    -0.092632
rel_letters_count * rel_synonyms_count          -0.095293
rel_orthographic_density * rel_synonyms_count    0.687596
dtype: float64

Regressing rel aoa with 354 measures, no interactions
           ^^^^^^^
R^2 = 0.21896757436599212

intercept                   0.484917
rel_aoa                     0.537430
rel_clustering             -0.109995
rel_frequency              -0.107215
rel_letters_count          -0.001743
rel_orthographic_density    0.168822
rel_synonyms_count         -0.067580
dtype: float64

Regressing rel aoa with 354 measures, with interactions
           ^^^^^^^
R^2 = 0.2718064059074077

intercept                                        0.778331
rel_aoa                                          0.464888
rel_clustering                                  -0.270903
rel_frequency                                   -0.005612
rel_letters_count                               -0.135181
rel_orthographic_density                         0.644460
rel_synonyms_count                               0.377028
rel_aoa * rel_clustering                         0.100321
rel_aoa * rel_frequency                         -0.060606
rel_aoa * rel_letters_count                     -0.049131
rel_aoa * rel_orthographic_density               0.039805
rel_aoa * rel_synonyms_count                     0.138810
rel_clustering * rel_frequency                  -0.025094
rel_clustering * rel_letters_count               0.094304
rel_clustering * rel_orthographic_density        0.160154
rel_clustering * rel_synonyms_count             -0.364832
rel_frequency * rel_letters_count               -0.001086
rel_frequency * rel_orthographic_density         0.084389
rel_frequency * rel_synonyms_count              -0.026291
rel_letters_count * rel_orthographic_density    -0.154506
rel_letters_count * rel_synonyms_count          -0.023109
rel_orthographic_density * rel_synonyms_count    0.437353
dtype: float64

Regressing global aoa with 354 measures, no interactions
           ^^^^^^^^^^
R^2 = 0.1410221981909605

intercept                      3.363531
global_aoa                     0.464890
global_clustering             -0.199585
global_frequency              -0.163951
global_letters_count           0.158857
global_orthographic_density    0.195762
global_synonyms_count          0.479601
rel_aoa                       -0.158281
rel_clustering                 0.054184
rel_frequency                  0.064040
rel_letters_count             -0.158967
rel_orthographic_density      -0.212848
rel_synonyms_count            -0.479687
dtype: float64

Regressing global aoa with 354 measures, with interactions
           ^^^^^^^^^^
R^2 = 0.35715047427890223

intercept                                                 109.153246
global_aoa                                                  0.722841
global_clustering                                          12.165278
global_frequency                                           -5.775888
global_letters_count                                       -6.935886
global_orthographic_density                               -13.475406
global_synonyms_count                                     -26.863343
rel_aoa                                                     1.393902
rel_clustering                                            -10.738642
rel_frequency                                               8.691170
rel_letters_count                                           5.395120
rel_orthographic_density                                    6.101009
rel_synonyms_count                                         10.996769
global_aoa * global_clustering                             -0.121147
global_aoa * global_frequency                              -0.185069
global_aoa * global_letters_count                           0.044587
global_aoa * global_orthographic_density                    0.071012
global_aoa * global_synonyms_count                          0.498098
global_aoa * rel_aoa                                       -0.020420
global_aoa * rel_clustering                                 0.109160
global_aoa * rel_frequency                                 -0.083737
global_aoa * rel_letters_count                             -0.101389
global_aoa * rel_orthographic_density                      -0.136599
global_aoa * rel_synonyms_count                            -0.417761
global_clustering * global_frequency                       -0.576528
global_clustering * global_letters_count                   -0.163146
global_clustering * global_orthographic_density            -1.491364
global_clustering * global_synonyms_count                  -3.112586
global_clustering * rel_aoa                                 0.350871
global_clustering * rel_clustering                          0.023931
global_clustering * rel_frequency                           0.939328
global_clustering * rel_letters_count                      -0.030770
global_clustering * rel_orthographic_density                0.235098
global_clustering * rel_synonyms_count                      2.339121
global_frequency * global_letters_count                     0.518447
global_frequency * global_orthographic_density              0.272742
global_frequency * global_synonyms_count                    0.559322
global_frequency * rel_aoa                                  0.222129
global_frequency * rel_clustering                           0.273571
global_frequency * rel_frequency                           -0.120745
global_frequency * rel_letters_count                       -0.537708
global_frequency * rel_orthographic_density                -0.386654
global_frequency * rel_synonyms_count                       0.161142
global_letters_count * global_orthographic_density          0.203616
global_letters_count * global_synonyms_count               -0.408657
global_letters_count * rel_aoa                             -0.167593
global_letters_count * rel_clustering                       0.625334
global_letters_count * rel_frequency                       -0.151827
global_letters_count * rel_letters_count                    0.128182
global_letters_count * rel_orthographic_density            -0.034926
global_letters_count * rel_synonyms_count                   1.200736
global_orthographic_density * global_synonyms_count         0.212105
global_orthographic_density * rel_aoa                      -0.293666
global_orthographic_density * rel_clustering                1.326879
global_orthographic_density * rel_frequency                -0.133076
global_orthographic_density * rel_letters_count            -0.050447
global_orthographic_density * rel_orthographic_density      0.325059
global_orthographic_density * rel_synonyms_count           -0.304980
global_synonyms_count * rel_aoa                            -0.636485
global_synonyms_count * rel_clustering                      3.247160
global_synonyms_count * rel_frequency                      -0.904744
global_synonyms_count * rel_letters_count                  -0.584996
global_synonyms_count * rel_orthographic_density           -0.942784
global_synonyms_count * rel_synonyms_count                  0.313750
rel_aoa * rel_clustering                                   -0.149779
rel_aoa * rel_frequency                                    -0.139397
rel_aoa * rel_letters_count                                 0.093173
rel_aoa * rel_orthographic_density                          0.280139
rel_aoa * rel_synonyms_count                                0.625527
rel_clustering * rel_frequency                             -0.804956
rel_clustering * rel_letters_count                         -0.099500
rel_clustering * rel_orthographic_density                   0.507130
rel_clustering * rel_synonyms_count                        -2.731203
rel_frequency * rel_letters_count                           0.219523
rel_frequency * rel_orthographic_density                    0.229414
rel_frequency * rel_synonyms_count                          0.394282
rel_letters_count * rel_orthographic_density                0.062510
rel_letters_count * rel_synonyms_count                     -0.058440
rel_orthographic_density * rel_synonyms_count               1.745118
dtype: float64

Regressing rel aoa with 354 measures, no interactions
           ^^^^^^^
R^2 = 0.26403027152097447

intercept                      1.384438
global_aoa                    -0.349238
global_clustering             -0.174566
global_frequency              -0.080439
global_letters_count           0.132248
global_orthographic_density    0.108115
global_synonyms_count          0.590038
rel_aoa                        0.776135
rel_clustering                 0.123767
rel_frequency                 -0.025335
rel_letters_count             -0.101261
rel_orthographic_density      -0.115799
rel_synonyms_count            -0.645633
dtype: float64

Regressing rel aoa with 354 measures, with interactions
           ^^^^^^^
R^2 = 0.4485645023344458

intercept                                                 107.195373
global_aoa                                                 -1.912088
global_clustering                                          12.561734
global_frequency                                           -5.257112
global_letters_count                                       -5.605520
global_orthographic_density                               -13.052296
global_synonyms_count                                     -26.260995
rel_aoa                                                     3.656163
rel_clustering                                             -9.588066
rel_frequency                                               7.514378
rel_letters_count                                           4.702344
rel_orthographic_density                                    7.188713
rel_synonyms_count                                         10.831199
global_aoa * global_clustering                             -0.345210
global_aoa * global_frequency                              -0.152805
global_aoa * global_letters_count                           0.027990
global_aoa * global_orthographic_density                    0.117617
global_aoa * global_synonyms_count                          0.736970
global_aoa * rel_aoa                                       -0.044874
global_aoa * rel_clustering                                 0.267171
global_aoa * rel_frequency                                 -0.096334
global_aoa * rel_letters_count                             -0.090864
global_aoa * rel_orthographic_density                      -0.240205
global_aoa * rel_synonyms_count                            -0.619544
global_clustering * global_frequency                       -0.560083
global_clustering * global_letters_count                   -0.145233
global_clustering * global_orthographic_density            -1.315222
global_clustering * global_synonyms_count                  -3.279997
global_clustering * rel_aoa                                 0.342854
global_clustering * rel_clustering                          0.061258
global_clustering * rel_frequency                           0.767177
global_clustering * rel_letters_count                       0.004033
global_clustering * rel_orthographic_density                0.053534
global_clustering * rel_synonyms_count                      2.512050
global_frequency * global_letters_count                     0.414929
global_frequency * global_orthographic_density              0.288856
global_frequency * global_synonyms_count                    0.216020
global_frequency * rel_aoa                                  0.169932
global_frequency * rel_clustering                           0.289862
global_frequency * rel_frequency                           -0.073751
global_frequency * rel_letters_count                       -0.454008
global_frequency * rel_orthographic_density                -0.493033
global_frequency * rel_synonyms_count                       0.399952
global_letters_count * global_orthographic_density          0.150486
global_letters_count * global_synonyms_count               -0.305385
global_letters_count * rel_aoa                             -0.232352
global_letters_count * rel_clustering                       0.454777
global_letters_count * rel_frequency                       -0.137254
global_letters_count * rel_letters_count                    0.121963
global_letters_count * rel_orthographic_density            -0.026033
global_letters_count * rel_synonyms_count                   0.974273
global_orthographic_density * global_synonyms_count         0.440338
global_orthographic_density * rel_aoa                      -0.439247
global_orthographic_density * rel_clustering                1.004870
global_orthographic_density * rel_frequency                -0.324174
global_orthographic_density * rel_letters_count            -0.033732
global_orthographic_density * rel_orthographic_density      0.156378
global_orthographic_density * rel_synonyms_count           -0.221040
global_synonyms_count * rel_aoa                            -0.663324
global_synonyms_count * rel_clustering                      2.864416
global_synonyms_count * rel_frequency                      -0.597167
global_synonyms_count * rel_letters_count                  -0.441629
global_synonyms_count * rel_orthographic_density           -0.856749
global_synonyms_count * rel_synonyms_count                  0.228102
rel_aoa * rel_clustering                                   -0.136382
rel_aoa * rel_frequency                                    -0.078515
rel_aoa * rel_letters_count                                 0.128854
rel_aoa * rel_orthographic_density                          0.340404
rel_aoa * rel_synonyms_count                                0.574143
rel_clustering * rel_frequency                             -0.616625
rel_clustering * rel_letters_count                         -0.023499
rel_clustering * rel_orthographic_density                   0.751168
rel_clustering * rel_synonyms_count                        -2.534633
rel_frequency * rel_letters_count                           0.187202
rel_frequency * rel_orthographic_density                    0.437791
rel_frequency * rel_synonyms_count                          0.071817
rel_letters_count * rel_orthographic_density                0.082607
rel_letters_count * rel_synonyms_count                     -0.153455
rel_orthographic_density * rel_synonyms_count               1.090841
dtype: float64

----------------------------------------------------------------------
Regressing global clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.0943291406463459

intercept                     -3.176724
global_aoa                    -0.027766
global_clustering              0.217588
global_frequency              -0.081800
global_letters_count          -0.051657
global_orthographic_density   -0.053771
global_synonyms_count         -0.016606
dtype: float64

Regressing global clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.22022783952943023

intercept                                             -7.386086
global_aoa                                             0.512310
global_clustering                                     -0.113372
global_frequency                                      -0.483373
global_letters_count                                   0.744185
global_orthographic_density                            1.414330
global_synonyms_count                                 -1.762566
global_aoa * global_clustering                         0.038864
global_aoa * global_frequency                         -0.030673
global_aoa * global_letters_count                     -0.005445
global_aoa * global_orthographic_density              -0.005774
global_aoa * global_synonyms_count                     0.044487
global_clustering * global_frequency                  -0.098869
global_clustering * global_letters_count               0.138651
global_clustering * global_orthographic_density        0.167412
global_clustering * global_synonyms_count             -0.276015
global_frequency * global_letters_count                0.011051
global_frequency * global_orthographic_density        -0.015777
global_frequency * global_synonyms_count              -0.008596
global_letters_count * global_orthographic_density    -0.053629
global_letters_count * global_synonyms_count          -0.005934
global_orthographic_density * global_synonyms_count   -0.036378
dtype: float64

Regressing rel clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.062350572906168855

intercept                      2.425091
global_aoa                    -0.024810
global_clustering              0.163380
global_frequency              -0.058348
global_letters_count          -0.052328
global_orthographic_density   -0.023333
global_synonyms_count         -0.051982
dtype: float64

Regressing rel clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.1431565555829506

intercept                                              0.547775
global_aoa                                             0.246680
global_clustering                                      0.197975
global_frequency                                      -0.524571
global_letters_count                                   0.658201
global_orthographic_density                            1.670432
global_synonyms_count                                 -1.510994
global_aoa * global_clustering                         0.003465
global_aoa * global_frequency                         -0.019056
global_aoa * global_letters_count                     -0.007810
global_aoa * global_orthographic_density              -0.023150
global_aoa * global_synonyms_count                     0.030859
global_clustering * global_frequency                  -0.102099
global_clustering * global_letters_count               0.118810
global_clustering * global_orthographic_density        0.181099
global_clustering * global_synonyms_count             -0.247825
global_frequency * global_letters_count                0.007977
global_frequency * global_orthographic_density        -0.026576
global_frequency * global_synonyms_count              -0.011240
global_letters_count * global_orthographic_density    -0.035635
global_letters_count * global_synonyms_count          -0.000183
global_orthographic_density * global_synonyms_count   -0.066696
dtype: float64

Regressing global clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.043005798455538535

intercept                  -5.832695
rel_aoa                    -0.006385
rel_clustering              0.150936
rel_frequency              -0.039964
rel_letters_count          -0.062993
rel_orthographic_density   -0.059810
rel_synonyms_count         -0.019290
dtype: float64

Regressing global clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.15600209408441468

intercept                                       -5.708479
rel_aoa                                         -0.011859
rel_clustering                                  -0.150919
rel_frequency                                   -0.013573
rel_letters_count                               -0.122261
rel_orthographic_density                         0.042854
rel_synonyms_count                               0.072246
rel_aoa * rel_clustering                         0.005435
rel_aoa * rel_frequency                         -0.031432
rel_aoa * rel_letters_count                     -0.014452
rel_aoa * rel_orthographic_density               0.044402
rel_aoa * rel_synonyms_count                     0.080016
rel_clustering * rel_frequency                  -0.062757
rel_clustering * rel_letters_count               0.124248
rel_clustering * rel_orthographic_density        0.106632
rel_clustering * rel_synonyms_count             -0.191764
rel_frequency * rel_letters_count                0.008294
rel_frequency * rel_orthographic_density         0.003482
rel_frequency * rel_synonyms_count              -0.012048
rel_letters_count * rel_orthographic_density    -0.061510
rel_letters_count * rel_synonyms_count          -0.037752
rel_orthographic_density * rel_synonyms_count    0.023199
dtype: float64

Regressing rel clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.13333109968870172

intercept                   0.327169
rel_aoa                    -0.016617
rel_clustering              0.362212
rel_frequency              -0.018552
rel_letters_count          -0.042343
rel_orthographic_density   -0.001661
rel_synonyms_count          0.001937
dtype: float64

Regressing rel clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.23590563723372726

intercept                                        0.442875
rel_aoa                                          0.013782
rel_clustering                                   0.020368
rel_frequency                                    0.008700
rel_letters_count                               -0.080574
rel_orthographic_density                         0.046137
rel_synonyms_count                               0.062265
rel_aoa * rel_clustering                        -0.035911
rel_aoa * rel_frequency                         -0.018118
rel_aoa * rel_letters_count                     -0.023297
rel_aoa * rel_orthographic_density              -0.000197
rel_aoa * rel_synonyms_count                     0.052195
rel_clustering * rel_frequency                  -0.089105
rel_clustering * rel_letters_count               0.117366
rel_clustering * rel_orthographic_density        0.104139
rel_clustering * rel_synonyms_count             -0.267323
rel_frequency * rel_letters_count                0.005593
rel_frequency * rel_orthographic_density        -0.013631
rel_frequency * rel_synonyms_count              -0.009048
rel_letters_count * rel_orthographic_density    -0.040729
rel_letters_count * rel_synonyms_count           0.002735
rel_orthographic_density * rel_synonyms_count    0.032795
dtype: float64

Regressing global clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11782294078612032

intercept                     -1.305446
global_aoa                    -0.042135
global_clustering              0.376596
global_frequency              -0.151912
global_letters_count          -0.060046
global_orthographic_density   -0.003617
global_synonyms_count         -0.050341
rel_aoa                        0.016759
rel_clustering                -0.179647
rel_frequency                  0.084803
rel_letters_count              0.007052
rel_orthographic_density      -0.042172
rel_synonyms_count             0.030200
dtype: float64

Regressing global clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.3852809613487748

intercept                                                 24.956625
global_aoa                                                -0.439395
global_clustering                                          7.066711
global_frequency                                          -1.623660
global_letters_count                                       1.058943
global_orthographic_density                                1.281369
global_synonyms_count                                     -6.313443
rel_aoa                                                    0.151314
rel_clustering                                            -7.301884
rel_frequency                                              0.659273
rel_letters_count                                         -1.035386
rel_orthographic_density                                  -1.434655
rel_synonyms_count                                         2.112366
global_aoa * global_clustering                            -0.252673
global_aoa * global_frequency                             -0.042375
global_aoa * global_letters_count                         -0.071479
global_aoa * global_orthographic_density                  -0.244190
global_aoa * global_synonyms_count                         0.038047
global_aoa * rel_aoa                                       0.031255
global_aoa * rel_clustering                                0.338665
global_aoa * rel_frequency                                 0.020301
global_aoa * rel_letters_count                             0.066043
global_aoa * rel_orthographic_density                      0.172690
global_aoa * rel_synonyms_count                            0.076748
global_clustering * global_frequency                      -0.361265
global_clustering * global_letters_count                   0.087797
global_clustering * global_orthographic_density           -0.642179
global_clustering * global_synonyms_count                 -1.202674
global_clustering * rel_aoa                                0.132472
global_clustering * rel_clustering                        -0.059867
global_clustering * rel_frequency                          0.245630
global_clustering * rel_letters_count                     -0.036358
global_clustering * rel_orthographic_density               0.376330
global_clustering * rel_synonyms_count                     0.985276
global_frequency * global_letters_count                    0.026961
global_frequency * global_orthographic_density            -0.232742
global_frequency * global_synonyms_count                   0.064165
global_frequency * rel_aoa                                 0.005879
global_frequency * rel_clustering                          0.234013
global_frequency * rel_frequency                           0.026441
global_frequency * rel_letters_count                      -0.017469
global_frequency * rel_orthographic_density                0.170983
global_frequency * rel_synonyms_count                      0.063574
global_letters_count * global_orthographic_density        -0.193604
global_letters_count * global_synonyms_count              -0.227528
global_letters_count * rel_aoa                             0.028396
global_letters_count * rel_clustering                      0.027979
global_letters_count * rel_frequency                       0.031274
global_letters_count * rel_letters_count                   0.022306
global_letters_count * rel_orthographic_density            0.105705
global_letters_count * rel_synonyms_count                  0.336045
global_orthographic_density * global_synonyms_count       -0.693272
global_orthographic_density * rel_aoa                      0.232506
global_orthographic_density * rel_clustering               0.790965
global_orthographic_density * rel_frequency                0.248829
global_orthographic_density * rel_letters_count            0.169708
global_orthographic_density * rel_orthographic_density    -0.062537
global_orthographic_density * rel_synonyms_count           0.783278
global_synonyms_count * rel_aoa                            0.025817
global_synonyms_count * rel_clustering                     0.497442
global_synonyms_count * rel_frequency                     -0.182603
global_synonyms_count * rel_letters_count                  0.117184
global_synonyms_count * rel_orthographic_density           0.342574
global_synonyms_count * rel_synonyms_count                 0.031805
rel_aoa * rel_clustering                                  -0.213817
rel_aoa * rel_frequency                                    0.007953
rel_aoa * rel_letters_count                               -0.050844
rel_aoa * rel_orthographic_density                        -0.121501
rel_aoa * rel_synonyms_count                              -0.061219
rel_clustering * rel_frequency                            -0.236326
rel_clustering * rel_letters_count                         0.139538
rel_clustering * rel_orthographic_density                 -0.226624
rel_clustering * rel_synonyms_count                       -0.608356
rel_frequency * rel_letters_count                         -0.023289
rel_frequency * rel_orthographic_density                  -0.166381
rel_frequency * rel_synonyms_count                         0.002870
rel_letters_count * rel_orthographic_density              -0.108346
rel_letters_count * rel_synonyms_count                    -0.241696
rel_orthographic_density * rel_synonyms_count             -0.379820
dtype: float64

Regressing rel clustering with 316 measures, no interactions
           ^^^^^^^^^^^^^^
R^2 = 0.21187936657639783

intercept                     -0.708405
global_aoa                    -0.035755
global_clustering             -0.477023
global_frequency              -0.131756
global_letters_count          -0.060689
global_orthographic_density    0.004440
global_synonyms_count         -0.105066
rel_aoa                        0.011962
rel_clustering                 0.748900
rel_frequency                  0.073081
rel_letters_count              0.014803
rel_orthographic_density      -0.057928
rel_synonyms_count             0.083972
dtype: float64

Regressing rel clustering with 316 measures, with interactions
           ^^^^^^^^^^^^^^
R^2 = 0.45264251370926756

intercept                                                 19.082660
global_aoa                                                -0.354199
global_clustering                                          4.599595
global_frequency                                          -1.264512
global_letters_count                                       0.935572
global_orthographic_density                                1.119493
global_synonyms_count                                     -5.490294
rel_aoa                                                    0.247598
rel_clustering                                            -5.552350
rel_frequency                                              0.389599
rel_letters_count                                         -0.869767
rel_orthographic_density                                  -0.970634
rel_synonyms_count                                         1.297074
global_aoa * global_clustering                            -0.188256
global_aoa * global_frequency                             -0.033017
global_aoa * global_letters_count                         -0.044585
global_aoa * global_orthographic_density                  -0.191744
global_aoa * global_synonyms_count                         0.032513
global_aoa * rel_aoa                                       0.028374
global_aoa * rel_clustering                                0.266052
global_aoa * rel_frequency                                 0.012611
global_aoa * rel_letters_count                             0.051610
global_aoa * rel_orthographic_density                      0.149934
global_aoa * rel_synonyms_count                            0.050960
global_clustering * global_frequency                      -0.274045
global_clustering * global_letters_count                   0.127656
global_clustering * global_orthographic_density           -0.581270
global_clustering * global_synonyms_count                 -1.088320
global_clustering * rel_aoa                                0.101178
global_clustering * rel_clustering                        -0.088105
global_clustering * rel_frequency                          0.184447
global_clustering * rel_letters_count                     -0.049980
global_clustering * rel_orthographic_density               0.447237
global_clustering * rel_synonyms_count                     0.839578
global_frequency * global_letters_count                    0.048166
global_frequency * global_orthographic_density            -0.209111
global_frequency * global_synonyms_count                   0.018396
global_frequency * rel_aoa                                -0.004971
global_frequency * rel_clustering                          0.178314
global_frequency * rel_frequency                           0.023311
global_frequency * rel_letters_count                      -0.033835
global_frequency * rel_orthographic_density                0.167744
global_frequency * rel_synonyms_count                      0.104752
global_letters_count * global_orthographic_density        -0.205069
global_letters_count * global_synonyms_count              -0.219434
global_letters_count * rel_aoa                             0.009601
global_letters_count * rel_clustering                      0.011035
global_letters_count * rel_frequency                       0.018261
global_letters_count * rel_letters_count                   0.021044
global_letters_count * rel_orthographic_density            0.138554
global_letters_count * rel_synonyms_count                  0.339820
global_orthographic_density * global_synonyms_count       -0.603880
global_orthographic_density * rel_aoa                      0.197226
global_orthographic_density * rel_clustering               0.805840
global_orthographic_density * rel_frequency                0.253977
global_orthographic_density * rel_letters_count            0.182230
global_orthographic_density * rel_orthographic_density    -0.075581
global_orthographic_density * rel_synonyms_count           0.668012
global_synonyms_count * rel_aoa                           -0.017348
global_synonyms_count * rel_clustering                     0.588214
global_synonyms_count * rel_frequency                     -0.166592
global_synonyms_count * rel_letters_count                  0.123209
global_synonyms_count * rel_orthographic_density           0.223689
global_synonyms_count * rel_synonyms_count                 0.033280
rel_aoa * rel_clustering                                  -0.212472
rel_aoa * rel_frequency                                    0.011576
rel_aoa * rel_letters_count                               -0.041220
rel_aoa * rel_orthographic_density                        -0.119070
rel_aoa * rel_synonyms_count                              -0.003718
rel_clustering * rel_frequency                            -0.210917
rel_clustering * rel_letters_count                         0.130498
rel_clustering * rel_orthographic_density                 -0.411874
rel_clustering * rel_synonyms_count                       -0.641984
rel_frequency * rel_letters_count                         -0.014562
rel_frequency * rel_orthographic_density                  -0.199088
rel_frequency * rel_synonyms_count                        -0.000484
rel_letters_count * rel_orthographic_density              -0.134625
rel_letters_count * rel_synonyms_count                    -0.242495
rel_orthographic_density * rel_synonyms_count             -0.274920
dtype: float64

----------------------------------------------------------------------
Regressing global letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10237465242451349

intercept                      4.435163
global_aoa                    -0.016332
global_clustering             -0.099083
global_frequency              -0.044911
global_letters_count           0.328025
global_orthographic_density   -0.042187
global_synonyms_count         -0.271415
dtype: float64

Regressing global letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.15528126956564137

intercept                                             -13.533142
global_aoa                                              1.026911
global_clustering                                      -3.353339
global_frequency                                        0.267862
global_letters_count                                    1.195802
global_orthographic_density                             0.802829
global_synonyms_count                                  -0.456611
global_aoa * global_clustering                          0.203400
global_aoa * global_frequency                           0.089951
global_aoa * global_letters_count                      -0.086284
global_aoa * global_orthographic_density               -0.133582
global_aoa * global_synonyms_count                      0.131001
global_clustering * global_frequency                    0.186774
global_clustering * global_letters_count               -0.037305
global_clustering * global_orthographic_density         0.135232
global_clustering * global_synonyms_count               0.381919
global_frequency * global_letters_count                -0.031532
global_frequency * global_orthographic_density          0.165659
global_frequency * global_synonyms_count                0.241069
global_letters_count * global_orthographic_density     -0.105060
global_letters_count * global_synonyms_count           -0.105704
global_orthographic_density * global_synonyms_count     0.112515
dtype: float64

Regressing rel letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.05814651349775524

intercept                      1.695721
global_aoa                    -0.083987
global_clustering             -0.083872
global_frequency              -0.101442
global_letters_count           0.284502
global_orthographic_density    0.055356
global_synonyms_count         -0.351880
dtype: float64

Regressing rel letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.11974347905965786

intercept                                             -23.951170
global_aoa                                              1.539036
global_clustering                                      -4.458311
global_frequency                                        0.761521
global_letters_count                                    1.130454
global_orthographic_density                             1.066598
global_synonyms_count                                  -0.249744
global_aoa * global_clustering                          0.295139
global_aoa * global_frequency                           0.094618
global_aoa * global_letters_count                      -0.106482
global_aoa * global_orthographic_density               -0.111682
global_aoa * global_synonyms_count                      0.113534
global_clustering * global_frequency                    0.277019
global_clustering * global_letters_count               -0.077293
global_clustering * global_orthographic_density         0.084117
global_clustering * global_synonyms_count               0.357578
global_frequency * global_letters_count                -0.033184
global_frequency * global_orthographic_density          0.130005
global_frequency * global_synonyms_count                0.226347
global_letters_count * global_orthographic_density     -0.151890
global_letters_count * global_synonyms_count           -0.118875
global_orthographic_density * global_synonyms_count     0.015949
dtype: float64

Regressing global letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.07739949093025555

intercept                   5.724765
rel_aoa                    -0.091602
rel_clustering              0.070153
rel_frequency               0.036317
rel_letters_count           0.228103
rel_orthographic_density   -0.271357
rel_synonyms_count         -0.248586
dtype: float64

Regressing global letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10288737830512362

intercept                                        5.619644
rel_aoa                                         -0.278109
rel_clustering                                   0.118088
rel_frequency                                    0.054537
rel_letters_count                                0.280176
rel_orthographic_density                        -0.360548
rel_synonyms_count                               0.203797
rel_aoa * rel_clustering                         0.113012
rel_aoa * rel_frequency                         -0.025541
rel_aoa * rel_letters_count                     -0.007260
rel_aoa * rel_orthographic_density              -0.066533
rel_aoa * rel_synonyms_count                     0.178610
rel_clustering * rel_frequency                  -0.006008
rel_clustering * rel_letters_count               0.050785
rel_clustering * rel_orthographic_density        0.308617
rel_clustering * rel_synonyms_count              0.418141
rel_frequency * rel_letters_count               -0.003162
rel_frequency * rel_orthographic_density         0.005636
rel_frequency * rel_synonyms_count               0.104937
rel_letters_count * rel_orthographic_density     0.029147
rel_letters_count * rel_synonyms_count           0.034886
rel_orthographic_density * rel_synonyms_count    0.459359
dtype: float64

Regressing rel letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.15696526650928355

intercept                   1.172833
rel_aoa                    -0.087302
rel_clustering             -0.009277
rel_frequency              -0.181162
rel_letters_count           0.440010
rel_orthographic_density    0.125737
rel_synonyms_count         -0.259910
dtype: float64

Regressing rel letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.20483882872841253

intercept                                        0.975009
rel_aoa                                         -0.185556
rel_clustering                                   0.252156
rel_frequency                                   -0.222177
rel_letters_count                                0.595848
rel_orthographic_density                         0.198962
rel_synonyms_count                               0.310477
rel_aoa * rel_clustering                         0.177372
rel_aoa * rel_frequency                          0.013760
rel_aoa * rel_letters_count                     -0.079640
rel_aoa * rel_orthographic_density              -0.208859
rel_aoa * rel_synonyms_count                     0.182997
rel_clustering * rel_frequency                   0.089630
rel_clustering * rel_letters_count               0.028767
rel_clustering * rel_orthographic_density        0.303924
rel_clustering * rel_synonyms_count              0.417493
rel_frequency * rel_letters_count                0.008492
rel_frequency * rel_orthographic_density         0.028074
rel_frequency * rel_synonyms_count               0.158669
rel_letters_count * rel_orthographic_density     0.027903
rel_letters_count * rel_synonyms_count           0.023136
rel_orthographic_density * rel_synonyms_count    0.353708
dtype: float64

Regressing global letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11300227509562566

intercept                      0.777066
global_aoa                     0.066177
global_clustering             -0.427271
global_frequency              -0.019366
global_letters_count           0.440065
global_orthographic_density    0.102443
global_synonyms_count         -0.245064
rel_aoa                       -0.136094
rel_clustering                 0.352137
rel_frequency                 -0.054836
rel_letters_count             -0.115287
rel_orthographic_density      -0.152556
rel_synonyms_count            -0.015954
dtype: float64

Regressing global letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^
R^2 = 0.3056813269210735

intercept                                                -48.982439
global_aoa                                                 4.342587
global_clustering                                         -9.256466
global_frequency                                           0.331751
global_letters_count                                      -0.975636
global_orthographic_density                                6.796509
global_synonyms_count                                      8.484282
rel_aoa                                                   -8.237674
rel_clustering                                             4.877958
rel_frequency                                              0.796034
rel_letters_count                                          2.857039
rel_orthographic_density                                 -11.086919
rel_synonyms_count                                       -10.193034
global_aoa * global_clustering                             0.574713
global_aoa * global_frequency                             -0.018714
global_aoa * global_letters_count                         -0.088446
global_aoa * global_orthographic_density                  -0.100265
global_aoa * global_synonyms_count                        -0.086081
global_aoa * rel_aoa                                       0.029274
global_aoa * rel_clustering                               -0.469264
global_aoa * rel_frequency                                -0.041520
global_aoa * rel_letters_count                            -0.030230
global_aoa * rel_orthographic_density                      0.018111
global_aoa * rel_synonyms_count                            0.031085
global_clustering * global_frequency                       0.303769
global_clustering * global_letters_count                   0.020568
global_clustering * global_orthographic_density            1.185113
global_clustering * global_synonyms_count                 -0.673410
global_clustering * rel_aoa                               -0.206439
global_clustering * rel_clustering                        -0.086276
global_clustering * rel_frequency                          0.138177
global_clustering * rel_letters_count                     -0.077161
global_clustering * rel_orthographic_density              -1.557061
global_clustering * rel_synonyms_count                     0.485319
global_frequency * global_letters_count                    0.332785
global_frequency * global_orthographic_density             0.340768
global_frequency * global_synonyms_count                  -0.806931
global_frequency * rel_aoa                                 0.418048
global_frequency * rel_clustering                         -0.077925
global_frequency * rel_frequency                          -0.015400
global_frequency * rel_letters_count                      -0.427514
global_frequency * rel_orthographic_density               -0.077763
global_frequency * rel_synonyms_count                      0.966949
global_letters_count * global_orthographic_density        -0.409755
global_letters_count * global_synonyms_count              -0.786433
global_letters_count * rel_aoa                             0.371841
global_letters_count * rel_clustering                      0.277914
global_letters_count * rel_frequency                      -0.025622
global_letters_count * rel_letters_count                   0.057814
global_letters_count * rel_orthographic_density            0.334908
global_letters_count * rel_synonyms_count                  0.712908
global_orthographic_density * global_synonyms_count        0.339562
global_orthographic_density * rel_aoa                      0.219666
global_orthographic_density * rel_clustering              -1.316103
global_orthographic_density * rel_frequency               -0.019989
global_orthographic_density * rel_letters_count            0.313151
global_orthographic_density * rel_orthographic_density     0.140262
global_orthographic_density * rel_synonyms_count          -0.384693
global_synonyms_count * rel_aoa                            0.404052
global_synonyms_count * rel_clustering                     1.094472
global_synonyms_count * rel_frequency                      0.904339
global_synonyms_count * rel_letters_count                  0.222915
global_synonyms_count * rel_orthographic_density          -0.670539
global_synonyms_count * rel_synonyms_count                -0.100576
rel_aoa * rel_clustering                                   0.415736
rel_aoa * rel_frequency                                   -0.301055
rel_aoa * rel_letters_count                               -0.343755
rel_aoa * rel_orthographic_density                        -0.187177
rel_aoa * rel_synonyms_count                              -0.075914
rel_clustering * rel_frequency                            -0.173481
rel_clustering * rel_letters_count                        -0.130600
rel_clustering * rel_orthographic_density                  2.049898
rel_clustering * rel_synonyms_count                       -0.425647
rel_frequency * rel_letters_count                          0.149123
rel_frequency * rel_orthographic_density                  -0.039733
rel_frequency * rel_synonyms_count                        -0.813133
rel_letters_count * rel_orthographic_density              -0.153963
rel_letters_count * rel_synonyms_count                    -0.251732
rel_orthographic_density * rel_synonyms_count              0.869590
dtype: float64

Regressing rel letters_count with 391 measures, no interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.19702252709175538

intercept                      0.317020
global_aoa                     0.030918
global_clustering             -0.439816
global_frequency              -0.014215
global_letters_count          -0.451669
global_orthographic_density    0.077261
global_synonyms_count         -0.178481
rel_aoa                       -0.099244
rel_clustering                 0.378060
rel_frequency                 -0.089752
rel_letters_count              0.794833
rel_orthographic_density      -0.160354
rel_synonyms_count            -0.099738
dtype: float64

Regressing rel letters_count with 391 measures, with interactions
           ^^^^^^^^^^^^^^^^^
R^2 = 0.36011596662647716

intercept                                                -58.851984
global_aoa                                                 3.505340
global_clustering                                        -11.615220
global_frequency                                           0.732633
global_letters_count                                      -0.162915
global_orthographic_density                                6.168905
global_synonyms_count                                      5.861147
rel_aoa                                                   -6.797640
rel_clustering                                             5.681126
rel_frequency                                              0.137529
rel_letters_count                                          2.469079
rel_orthographic_density                                 -10.147087
rel_synonyms_count                                        -8.805664
global_aoa * global_clustering                             0.514766
global_aoa * global_frequency                              0.018791
global_aoa * global_letters_count                         -0.064709
global_aoa * global_orthographic_density                  -0.133679
global_aoa * global_synonyms_count                        -0.068668
global_aoa * rel_aoa                                       0.029360
global_aoa * rel_clustering                               -0.346440
global_aoa * rel_frequency                                -0.060227
global_aoa * rel_letters_count                            -0.063535
global_aoa * rel_orthographic_density                      0.024137
global_aoa * rel_synonyms_count                           -0.048412
global_clustering * global_frequency                       0.418974
global_clustering * global_letters_count                   0.325750
global_clustering * global_orthographic_density            1.219351
global_clustering * global_synonyms_count                 -0.637197
global_clustering * rel_aoa                               -0.166880
global_clustering * rel_clustering                        -0.140373
global_clustering * rel_frequency                         -0.020525
global_clustering * rel_letters_count                     -0.358205
global_clustering * rel_orthographic_density              -1.572889
global_clustering * rel_synonyms_count                     0.320146
global_frequency * global_letters_count                    0.299500
global_frequency * global_orthographic_density             0.397061
global_frequency * global_synonyms_count                  -0.593262
global_frequency * rel_aoa                                 0.322520
global_frequency * rel_clustering                         -0.171423
global_frequency * rel_frequency                          -0.027585
global_frequency * rel_letters_count                      -0.402522
global_frequency * rel_orthographic_density               -0.146619
global_frequency * rel_synonyms_count                      0.803654
global_letters_count * global_orthographic_density        -0.303957
global_letters_count * global_synonyms_count              -0.547527
global_letters_count * rel_aoa                             0.302888
global_letters_count * rel_clustering                     -0.009283
global_letters_count * rel_frequency                      -0.029236
global_letters_count * rel_letters_count                   0.049493
global_letters_count * rel_orthographic_density            0.233084
global_letters_count * rel_synonyms_count                  0.553091
global_orthographic_density * global_synonyms_count        0.173097
global_orthographic_density * rel_aoa                      0.303323
global_orthographic_density * rel_clustering              -1.159460
global_orthographic_density * rel_frequency               -0.048254
global_orthographic_density * rel_letters_count            0.185829
global_orthographic_density * rel_orthographic_density     0.094065
global_orthographic_density * rel_synonyms_count          -0.261193
global_synonyms_count * rel_aoa                            0.493916
global_synonyms_count * rel_clustering                     1.112921
global_synonyms_count * rel_frequency                      0.748125
global_synonyms_count * rel_letters_count                 -0.009522
global_synonyms_count * rel_orthographic_density          -0.303638
global_synonyms_count * rel_synonyms_count                -0.082887
rel_aoa * rel_clustering                                   0.291378
rel_aoa * rel_frequency                                   -0.227900
rel_aoa * rel_letters_count                               -0.277041
rel_aoa * rel_orthographic_density                        -0.254598
rel_aoa * rel_synonyms_count                              -0.107304
rel_clustering * rel_frequency                            -0.075262
rel_clustering * rel_letters_count                         0.122409
rel_clustering * rel_orthographic_density                  1.854904
rel_clustering * rel_synonyms_count                       -0.309923
rel_frequency * rel_letters_count                          0.154720
rel_frequency * rel_orthographic_density                  -0.028621
rel_frequency * rel_synonyms_count                        -0.693602
rel_letters_count * rel_orthographic_density              -0.046942
rel_letters_count * rel_synonyms_count                    -0.087203
rel_orthographic_density * rel_synonyms_count              0.556044
dtype: float64

----------------------------------------------------------------------
Regressing global synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.09624268566954097

intercept                      0.797144
global_aoa                    -0.019863
global_clustering              0.056196
global_frequency               0.010267
global_letters_count          -0.018256
global_orthographic_density   -0.005183
global_synonyms_count          0.267010
dtype: float64

Regressing global synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13729392993648581

intercept                                              1.671486
global_aoa                                            -0.050333
global_clustering                                      0.519805
global_frequency                                      -0.040289
global_letters_count                                   0.179693
global_orthographic_density                            0.153648
global_synonyms_count                                  0.610414
global_aoa * global_clustering                        -0.014634
global_aoa * global_frequency                         -0.009585
global_aoa * global_letters_count                     -0.000918
global_aoa * global_orthographic_density               0.029871
global_aoa * global_synonyms_count                     0.024265
global_clustering * global_frequency                  -0.046233
global_clustering * global_letters_count               0.005504
global_clustering * global_orthographic_density       -0.022179
global_clustering * global_synonyms_count              0.139821
global_frequency * global_letters_count               -0.016288
global_frequency * global_orthographic_density        -0.040466
global_frequency * global_synonyms_count               0.024057
global_letters_count * global_orthographic_density    -0.024718
global_letters_count * global_synonyms_count           0.005098
global_orthographic_density * global_synonyms_count    0.039247
dtype: float64

Regressing rel synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.06183951952074185

intercept                      0.522578
global_aoa                    -0.017035
global_clustering              0.052977
global_frequency               0.009449
global_letters_count          -0.021330
global_orthographic_density   -0.009922
global_synonyms_count          0.198360
dtype: float64

Regressing rel synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.10032283382141871

intercept                                              2.039544
global_aoa                                            -0.106662
global_clustering                                      0.613511
global_frequency                                      -0.132158
global_letters_count                                   0.208408
global_orthographic_density                            0.225892
global_synonyms_count                                  0.750562
global_aoa * global_clustering                        -0.031228
global_aoa * global_frequency                         -0.006459
global_aoa * global_letters_count                     -0.005548
global_aoa * global_orthographic_density               0.011092
global_aoa * global_synonyms_count                     0.000817
global_clustering * global_frequency                  -0.051588
global_clustering * global_letters_count               0.012423
global_clustering * global_orthographic_density       -0.016282
global_clustering * global_synonyms_count              0.153002
global_frequency * global_letters_count               -0.012242
global_frequency * global_orthographic_density        -0.033044
global_frequency * global_synonyms_count               0.030985
global_letters_count * global_orthographic_density    -0.016053
global_letters_count * global_synonyms_count           0.006692
global_orthographic_density * global_synonyms_count    0.003798
dtype: float64

Regressing global synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.08482915021321491

intercept                   0.469979
rel_aoa                     0.016214
rel_clustering             -0.017993
rel_frequency               0.007507
rel_letters_count          -0.021081
rel_orthographic_density    0.016416
rel_synonyms_count          0.281545
dtype: float64

Regressing global synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1312403152024506

intercept                                        0.564503
rel_aoa                                          0.024379
rel_clustering                                  -0.176745
rel_frequency                                    0.037196
rel_letters_count                               -0.085451
rel_orthographic_density                         0.039932
rel_synonyms_count                               0.344152
rel_aoa * rel_clustering                        -0.017696
rel_aoa * rel_frequency                         -0.009875
rel_aoa * rel_letters_count                      0.013195
rel_aoa * rel_orthographic_density               0.053086
rel_aoa * rel_synonyms_count                     0.033556
rel_clustering * rel_frequency                  -0.042581
rel_clustering * rel_letters_count               0.020658
rel_clustering * rel_orthographic_density       -0.013589
rel_clustering * rel_synonyms_count              0.140315
rel_frequency * rel_letters_count               -0.004783
rel_frequency * rel_orthographic_density        -0.000360
rel_frequency * rel_synonyms_count               0.040392
rel_letters_count * rel_orthographic_density    -0.027659
rel_letters_count * rel_synonyms_count           0.009669
rel_orthographic_density * rel_synonyms_count    0.050788
dtype: float64

Regressing rel synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.1628864725647401

intercept                   0.146223
rel_aoa                     0.005699
rel_clustering              0.027051
rel_frequency               0.011873
rel_letters_count          -0.020722
rel_orthographic_density   -0.006867
rel_synonyms_count          0.391674
dtype: float64

Regressing rel synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.21300174760211765

intercept                                        0.242751
rel_aoa                                          0.020423
rel_clustering                                  -0.163175
rel_frequency                                    0.049572
rel_letters_count                               -0.086473
rel_orthographic_density                        -0.009563
rel_synonyms_count                               0.540448
rel_aoa * rel_clustering                        -0.017783
rel_aoa * rel_frequency                         -0.003633
rel_aoa * rel_letters_count                      0.010296
rel_aoa * rel_orthographic_density               0.029800
rel_aoa * rel_synonyms_count                     0.002870
rel_clustering * rel_frequency                  -0.051897
rel_clustering * rel_letters_count               0.022046
rel_clustering * rel_orthographic_density       -0.021181
rel_clustering * rel_synonyms_count              0.144935
rel_frequency * rel_letters_count               -0.012819
rel_frequency * rel_orthographic_density        -0.007994
rel_frequency * rel_synonyms_count               0.063359
rel_letters_count * rel_orthographic_density    -0.012312
rel_letters_count * rel_synonyms_count           0.019122
rel_orthographic_density * rel_synonyms_count    0.069496
dtype: float64

Regressing global synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.11916231100231212

intercept                      2.186279
global_aoa                    -0.055138
global_clustering              0.208448
global_frequency              -0.011170
global_letters_count          -0.012381
global_orthographic_density    0.023516
global_synonyms_count          0.116347
rel_aoa                        0.054099
rel_clustering                -0.168829
rel_frequency                  0.031478
rel_letters_count             -0.008310
rel_orthographic_density      -0.029235
rel_synonyms_count             0.167611
dtype: float64

Regressing global synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2776120307805565

intercept                                                 8.283804
global_aoa                                                0.554674
global_clustering                                         2.838613
global_frequency                                         -0.315599
global_letters_count                                     -0.291987
global_orthographic_density                               1.342327
global_synonyms_count                                     7.644378
rel_aoa                                                  -0.884513
rel_clustering                                           -2.593545
rel_frequency                                             0.046847
rel_letters_count                                         0.288471
rel_orthographic_density                                 -0.023086
rel_synonyms_count                                       -7.287141
global_aoa * global_clustering                           -0.032167
global_aoa * global_frequency                            -0.028693
global_aoa * global_letters_count                        -0.036930
global_aoa * global_orthographic_density                 -0.124058
global_aoa * global_synonyms_count                       -0.124520
global_aoa * rel_aoa                                     -0.007730
global_aoa * rel_clustering                               0.036119
global_aoa * rel_frequency                                0.018677
global_aoa * rel_letters_count                            0.041746
global_aoa * rel_orthographic_density                     0.168139
global_aoa * rel_synonyms_count                           0.188203
global_clustering * global_frequency                     -0.142869
global_clustering * global_letters_count                 -0.103498
global_clustering * global_orthographic_density          -0.260261
global_clustering * global_synonyms_count                 0.480522
global_clustering * rel_aoa                              -0.080972
global_clustering * rel_clustering                        0.065288
global_clustering * rel_frequency                         0.064931
global_clustering * rel_letters_count                     0.096595
global_clustering * rel_orthographic_density              0.371371
global_clustering * rel_synonyms_count                   -0.311288
global_frequency * global_letters_count                  -0.005426
global_frequency * global_orthographic_density           -0.167032
global_frequency * global_synonyms_count                 -0.051651
global_frequency * rel_aoa                                0.028359
global_frequency * rel_clustering                         0.143190
global_frequency * rel_frequency                         -0.004565
global_frequency * rel_letters_count                     -0.012091
global_frequency * rel_orthographic_density               0.125411
global_frequency * rel_synonyms_count                     0.091047
global_letters_count * global_orthographic_density       -0.025353
global_letters_count * global_synonyms_count             -0.242865
global_letters_count * rel_aoa                            0.034870
global_letters_count * rel_clustering                     0.119823
global_letters_count * rel_frequency                      0.014307
global_letters_count * rel_letters_count                 -0.011143
global_letters_count * rel_orthographic_density          -0.050626
global_letters_count * rel_synonyms_count                 0.317638
global_orthographic_density * global_synonyms_count      -0.988442
global_orthographic_density * rel_aoa                    -0.001576
global_orthographic_density * rel_clustering              0.188212
global_orthographic_density * rel_frequency               0.119922
global_orthographic_density * rel_letters_count           0.093238
global_orthographic_density * rel_orthographic_density    0.021289
global_orthographic_density * rel_synonyms_count          0.944693
global_synonyms_count * rel_aoa                           0.046617
global_synonyms_count * rel_clustering                   -0.082181
global_synonyms_count * rel_frequency                     0.121348
global_synonyms_count * rel_letters_count                 0.120551
global_synonyms_count * rel_orthographic_density          0.639812
global_synonyms_count * rel_synonyms_count                0.110124
rel_aoa * rel_clustering                                  0.041408
rel_aoa * rel_frequency                                  -0.034664
rel_aoa * rel_letters_count                              -0.012154
rel_aoa * rel_orthographic_density                        0.028688
rel_aoa * rel_synonyms_count                             -0.080051
rel_clustering * rel_frequency                           -0.116269
rel_clustering * rel_letters_count                       -0.119879
rel_clustering * rel_orthographic_density                -0.327560
rel_clustering * rel_synonyms_count                       0.026971
rel_frequency * rel_letters_count                        -0.007768
rel_frequency * rel_orthographic_density                 -0.090157
rel_frequency * rel_synonyms_count                       -0.130438
rel_letters_count * rel_orthographic_density             -0.062017
rel_letters_count * rel_synonyms_count                   -0.142007
rel_orthographic_density * rel_synonyms_count            -0.440033
dtype: float64

Regressing rel synonyms_count with 379 measures, no interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.27883911264102823

intercept                      1.934544
global_aoa                    -0.047831
global_clustering              0.181939
global_frequency              -0.017334
global_letters_count          -0.013202
global_orthographic_density    0.043734
global_synonyms_count         -0.726517
rel_aoa                        0.045217
rel_clustering                -0.145637
rel_frequency                  0.033896
rel_letters_count             -0.001630
rel_orthographic_density      -0.052361
rel_synonyms_count             1.081120
dtype: float64

Regressing rel synonyms_count with 379 measures, with interactions
           ^^^^^^^^^^^^^^^^^^
R^2 = 0.40999867706349624

intercept                                                 8.186277
global_aoa                                                0.500829
global_clustering                                         2.674944
global_frequency                                         -0.386791
global_letters_count                                     -0.347513
global_orthographic_density                               1.582150
global_synonyms_count                                     5.646378
rel_aoa                                                  -0.644274
rel_clustering                                           -2.649892
rel_frequency                                             0.001386
rel_letters_count                                         0.184430
rel_orthographic_density                                 -0.272890
rel_synonyms_count                                       -5.433254
global_aoa * global_clustering                           -0.027148
global_aoa * global_frequency                            -0.022577
global_aoa * global_letters_count                        -0.033801
global_aoa * global_orthographic_density                 -0.125392
global_aoa * global_synonyms_count                       -0.077321
global_aoa * rel_aoa                                     -0.003294
global_aoa * rel_clustering                               0.029407
global_aoa * rel_frequency                                0.016282
global_aoa * rel_letters_count                            0.036288
global_aoa * rel_orthographic_density                     0.160967
global_aoa * rel_synonyms_count                           0.143825
global_clustering * global_frequency                     -0.142777
global_clustering * global_letters_count                 -0.120368
global_clustering * global_orthographic_density          -0.174056
global_clustering * global_synonyms_count                 0.411963
global_clustering * rel_aoa                              -0.068892
global_clustering * rel_clustering                        0.059073
global_clustering * rel_frequency                         0.058458
global_clustering * rel_letters_count                     0.107123
global_clustering * rel_orthographic_density              0.308590
global_clustering * rel_synonyms_count                   -0.272587
global_frequency * global_letters_count                  -0.015742
global_frequency * global_orthographic_density           -0.147090
global_frequency * global_synonyms_count                 -0.042424
global_frequency * rel_aoa                                0.012944
global_frequency * rel_clustering                         0.144657
global_frequency * rel_frequency                         -0.009840
global_frequency * rel_letters_count                      0.008358
global_frequency * rel_orthographic_density               0.115036
global_frequency * rel_synonyms_count                     0.090150
global_letters_count * global_orthographic_density       -0.001793
global_letters_count * global_synonyms_count             -0.198277
global_letters_count * rel_aoa                            0.022971
global_letters_count * rel_clustering                     0.148114
global_letters_count * rel_frequency                      0.027378
global_letters_count * rel_letters_count                 -0.004770
global_letters_count * rel_orthographic_density          -0.059394
global_letters_count * rel_synonyms_count                 0.272063
global_orthographic_density * global_synonyms_count      -0.866379
global_orthographic_density * rel_aoa                     0.010789
global_orthographic_density * rel_clustering              0.174559
global_orthographic_density * rel_frequency               0.129683
global_orthographic_density * rel_letters_count           0.073550
global_orthographic_density * rel_orthographic_density    0.034219
global_orthographic_density * rel_synonyms_count          0.812971
global_synonyms_count * rel_aoa                           0.017227
global_synonyms_count * rel_clustering                   -0.085701
global_synonyms_count * rel_frequency                     0.140972
global_synonyms_count * rel_letters_count                 0.098273
global_synonyms_count * rel_orthographic_density          0.573736
global_synonyms_count * rel_synonyms_count                0.110936
rel_aoa * rel_clustering                                  0.034104
rel_aoa * rel_frequency                                  -0.021725
rel_aoa * rel_letters_count                              -0.007759
rel_aoa * rel_orthographic_density                        0.004618
rel_aoa * rel_synonyms_count                             -0.069659
rel_clustering * rel_frequency                           -0.109447
rel_clustering * rel_letters_count                       -0.142708
rel_clustering * rel_orthographic_density                -0.327228
rel_clustering * rel_synonyms_count                       0.034774
rel_frequency * rel_letters_count                        -0.029320
rel_frequency * rel_orthographic_density                 -0.101452
rel_frequency * rel_synonyms_count                       -0.150150
rel_letters_count * rel_orthographic_density             -0.029504
rel_letters_count * rel_synonyms_count                   -0.120271
rel_orthographic_density * rel_synonyms_count            -0.363777
dtype: float64

----------------------------------------------------------------------
Regressing global orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.13638815294211815

intercept                      0.653642
global_aoa                    -0.010247
global_clustering             -0.034078
global_frequency               0.005351
global_letters_count          -0.008404
global_orthographic_density    0.314615
global_synonyms_count          0.049104
dtype: float64

Regressing global orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19045755878068116

intercept                                              3.283970
global_aoa                                            -0.417983
global_clustering                                      0.244934
global_frequency                                       0.266424
global_letters_count                                  -0.471259
global_orthographic_density                            0.198705
global_synonyms_count                                  0.684792
global_aoa * global_clustering                        -0.049066
global_aoa * global_frequency                         -0.024235
global_aoa * global_letters_count                      0.035170
global_aoa * global_orthographic_density               0.112043
global_aoa * global_synonyms_count                    -0.038786
global_clustering * global_frequency                   0.013124
global_clustering * global_letters_count              -0.011062
global_clustering * global_orthographic_density        0.013690
global_clustering * global_synonyms_count              0.060453
global_frequency * global_letters_count                0.013359
global_frequency * global_orthographic_density        -0.064668
global_frequency * global_synonyms_count               0.000336
global_letters_count * global_orthographic_density     0.002971
global_letters_count * global_synonyms_count           0.002009
global_orthographic_density * global_synonyms_count   -0.050642
dtype: float64

Regressing rel orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.10152351281196448

intercept                     -1.721882
global_aoa                     0.002225
global_clustering             -0.088364
global_frequency               0.004665
global_letters_count          -0.024654
global_orthographic_density    0.232992
global_synonyms_count          0.052198
dtype: float64

Regressing rel orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1586847898007061

intercept                                              4.281925
global_aoa                                            -0.462087
global_clustering                                      0.404027
global_frequency                                       0.008165
global_letters_count                                  -0.834499
global_orthographic_density                           -0.460350
global_synonyms_count                                  0.467995
global_aoa * global_clustering                        -0.038617
global_aoa * global_frequency                         -0.026889
global_aoa * global_letters_count                      0.055861
global_aoa * global_orthographic_density               0.125865
global_aoa * global_synonyms_count                    -0.042572
global_clustering * global_frequency                  -0.014289
global_clustering * global_letters_count              -0.022207
global_clustering * global_orthographic_density        0.046464
global_clustering * global_synonyms_count              0.038760
global_frequency * global_letters_count                0.023873
global_frequency * global_orthographic_density        -0.012144
global_frequency * global_synonyms_count              -0.030814
global_letters_count * global_orthographic_density     0.034281
global_letters_count * global_synonyms_count           0.053808
global_orthographic_density * global_synonyms_count    0.020458
dtype: float64

Regressing global orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.12525662723169184

intercept                   1.479781
rel_aoa                    -0.029897
rel_clustering             -0.026606
rel_frequency              -0.002357
rel_letters_count           0.044172
rel_orthographic_density    0.385708
rel_synonyms_count          0.054236
dtype: float64

Regressing global orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.1550153325741197

intercept                                        1.420615
rel_aoa                                          0.027298
rel_clustering                                   0.211617
rel_frequency                                   -0.029002
rel_letters_count                                0.054360
rel_orthographic_density                         0.328430
rel_synonyms_count                              -0.008935
rel_aoa * rel_clustering                         0.083327
rel_aoa * rel_frequency                          0.016494
rel_aoa * rel_letters_count                      0.007858
rel_aoa * rel_orthographic_density               0.060051
rel_aoa * rel_synonyms_count                    -0.072474
rel_clustering * rel_frequency                   0.035393
rel_clustering * rel_letters_count              -0.080593
rel_clustering * rel_orthographic_density        0.058642
rel_clustering * rel_synonyms_count             -0.016854
rel_frequency * rel_letters_count               -0.002381
rel_frequency * rel_orthographic_density        -0.005434
rel_frequency * rel_synonyms_count              -0.002651
rel_letters_count * rel_orthographic_density    -0.005372
rel_letters_count * rel_synonyms_count           0.035703
rel_orthographic_density * rel_synonyms_count   -0.025838
dtype: float64

Regressing rel orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.19540477784234456

intercept                  -0.533849
rel_aoa                    -0.020673
rel_clustering             -0.046773
rel_frequency               0.043821
rel_letters_count           0.034386
rel_orthographic_density    0.431575
rel_synonyms_count          0.021228
dtype: float64

Regressing rel orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.22421811961147653

intercept                                       -0.498608
rel_aoa                                          0.015727
rel_clustering                                   0.060991
rel_frequency                                    0.073524
rel_letters_count                                0.019688
rel_orthographic_density                         0.339676
rel_synonyms_count                              -0.138469
rel_aoa * rel_clustering                         0.077227
rel_aoa * rel_frequency                          0.009477
rel_aoa * rel_letters_count                      0.026077
rel_aoa * rel_orthographic_density               0.090993
rel_aoa * rel_synonyms_count                    -0.063996
rel_clustering * rel_frequency                  -0.007451
rel_clustering * rel_letters_count              -0.047056
rel_clustering * rel_orthographic_density        0.089586
rel_clustering * rel_synonyms_count              0.006125
rel_frequency * rel_letters_count               -0.011288
rel_frequency * rel_orthographic_density         0.011738
rel_frequency * rel_synonyms_count              -0.029262
rel_letters_count * rel_orthographic_density     0.017970
rel_letters_count * rel_synonyms_count           0.036392
rel_orthographic_density * rel_synonyms_count   -0.014717
dtype: float64

Regressing global orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.16141040667448314

intercept                      2.789983
global_aoa                     0.034232
global_clustering              0.022587
global_frequency              -0.070697
global_letters_count          -0.220691
global_orthographic_density    0.172442
global_synonyms_count          0.067902
rel_aoa                       -0.052587
rel_clustering                -0.067186
rel_frequency                  0.090467
rel_letters_count              0.223119
rel_orthographic_density       0.151832
rel_synonyms_count            -0.010056
dtype: float64

Regressing global orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.33102937438691815

intercept                                                -15.740977
global_aoa                                                -0.982353
global_clustering                                         -2.579401
global_frequency                                           2.044793
global_letters_count                                       1.989682
global_orthographic_density                               -2.916403
global_synonyms_count                                      1.609564
rel_aoa                                                    1.397585
rel_clustering                                             4.800554
rel_frequency                                             -2.527996
rel_letters_count                                         -1.657609
rel_orthographic_density                                   6.048030
rel_synonyms_count                                         0.861792
global_aoa * global_clustering                            -0.180094
global_aoa * global_frequency                             -0.033496
global_aoa * global_letters_count                         -0.006789
global_aoa * global_orthographic_density                   0.187141
global_aoa * global_synonyms_count                        -0.084031
global_aoa * rel_aoa                                       0.019146
global_aoa * rel_clustering                               -0.005097
global_aoa * rel_frequency                                 0.011196
global_aoa * rel_letters_count                             0.010197
global_aoa * rel_orthographic_density                     -0.140588
global_aoa * rel_synonyms_count                            0.108413
global_clustering * global_frequency                       0.276964
global_clustering * global_letters_count                   0.260475
global_clustering * global_orthographic_density           -0.388833
global_clustering * global_synonyms_count                  0.448834
global_clustering * rel_aoa                                0.083619
global_clustering * rel_clustering                         0.124134
global_clustering * rel_frequency                         -0.482548
global_clustering * rel_letters_count                     -0.203030
global_clustering * rel_orthographic_density               0.813542
global_clustering * rel_synonyms_count                    -0.148470
global_frequency * global_letters_count                   -0.050807
global_frequency * global_orthographic_density            -0.036697
global_frequency * global_synonyms_count                   0.069355
global_frequency * rel_aoa                                -0.032749
global_frequency * rel_clustering                         -0.148912
global_frequency * rel_frequency                           0.022587
global_frequency * rel_letters_count                       0.091771
global_frequency * rel_orthographic_density               -0.001491
global_frequency * rel_synonyms_count                     -0.066655
global_letters_count * global_orthographic_density        -0.046650
global_letters_count * global_synonyms_count               0.176814
global_letters_count * rel_aoa                            -0.074105
global_letters_count * rel_clustering                     -0.240635
global_letters_count * rel_frequency                      -0.021973
global_letters_count * rel_letters_count                  -0.032549
global_letters_count * rel_orthographic_density            0.043882
global_letters_count * rel_synonyms_count                 -0.317580
global_orthographic_density * global_synonyms_count        0.264968
global_orthographic_density * rel_aoa                     -0.176216
global_orthographic_density * rel_clustering              -0.372392
global_orthographic_density * rel_frequency               -0.232224
global_orthographic_density * rel_letters_count           -0.126230
global_orthographic_density * rel_orthographic_density    -0.086322
global_orthographic_density * rel_synonyms_count          -0.383266
global_synonyms_count * rel_aoa                            0.042984
global_synonyms_count * rel_clustering                    -0.840525
global_synonyms_count * rel_frequency                      0.083899
global_synonyms_count * rel_letters_count                 -0.075607
global_synonyms_count * rel_orthographic_density          -0.411529
global_synonyms_count * rel_synonyms_count                -0.148100
rel_aoa * rel_clustering                                   0.104314
rel_aoa * rel_frequency                                    0.078525
rel_aoa * rel_letters_count                                0.088841
rel_aoa * rel_orthographic_density                         0.232692
rel_aoa * rel_synonyms_count                              -0.128345
rel_clustering * rel_frequency                             0.426655
rel_clustering * rel_letters_count                         0.177819
rel_clustering * rel_orthographic_density                  0.042542
rel_clustering * rel_synonyms_count                        0.534785
rel_frequency * rel_letters_count                         -0.036640
rel_frequency * rel_orthographic_density                   0.208136
rel_frequency * rel_synonyms_count                        -0.134531
rel_letters_count * rel_orthographic_density               0.035238
rel_letters_count * rel_synonyms_count                     0.150365
rel_orthographic_density * rel_synonyms_count              0.496096
dtype: float64

Regressing rel orthographic_density with 330 measures, no interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.2473490399455708

intercept                      1.678306
global_aoa                     0.041821
global_clustering              0.003397
global_frequency              -0.051549
global_letters_count          -0.158487
global_orthographic_density   -0.564524
global_synonyms_count          0.068065
rel_aoa                       -0.053964
rel_clustering                -0.045626
rel_frequency                  0.076657
rel_letters_count              0.146719
rel_orthographic_density       0.944535
rel_synonyms_count            -0.037504
dtype: float64

Regressing rel orthographic_density with 330 measures, with interactions
           ^^^^^^^^^^^^^^^^^^^^^^^^
R^2 = 0.4062882079236916

intercept                                                -9.496221
global_aoa                                               -0.659341
global_clustering                                        -1.614080
global_frequency                                          1.499805
global_letters_count                                      1.075515
global_orthographic_density                              -3.898501
global_synonyms_count                                     2.139999
rel_aoa                                                   1.008413
rel_clustering                                            3.314980
rel_frequency                                            -2.182571
rel_letters_count                                        -0.971132
rel_orthographic_density                                  6.754247
rel_synonyms_count                                       -0.495446
global_aoa * global_clustering                           -0.163014
global_aoa * global_frequency                            -0.043060
global_aoa * global_letters_count                        -0.000629
global_aoa * global_orthographic_density                  0.113212
global_aoa * global_synonyms_count                       -0.120414
global_aoa * rel_aoa                                      0.019039
global_aoa * rel_clustering                              -0.014941
global_aoa * rel_frequency                                0.013143
global_aoa * rel_letters_count                            0.023614
global_aoa * rel_orthographic_density                    -0.046782
global_aoa * rel_synonyms_count                           0.189559
global_clustering * global_frequency                      0.208346
global_clustering * global_letters_count                  0.181556
global_clustering * global_orthographic_density          -0.358947
global_clustering * global_synonyms_count                 0.434840
global_clustering * rel_aoa                               0.107638
global_clustering * rel_clustering                        0.117091
global_clustering * rel_frequency                        -0.383648
global_clustering * rel_letters_count                    -0.099832
global_clustering * rel_orthographic_density              0.807908
global_clustering * rel_synonyms_count                   -0.152295
global_frequency * global_letters_count                  -0.019759
global_frequency * global_orthographic_density            0.011560
global_frequency * global_synonyms_count                  0.013963
global_frequency * rel_aoa                               -0.005241
global_frequency * rel_clustering                        -0.093707
global_frequency * rel_frequency                          0.024960
global_frequency * rel_letters_count                      0.070607
global_frequency * rel_orthographic_density              -0.033451
global_frequency * rel_synonyms_count                    -0.012545
global_letters_count * global_orthographic_density        0.050716
global_letters_count * global_synonyms_count              0.195275
global_letters_count * rel_aoa                           -0.048656
global_letters_count * rel_clustering                    -0.116888
global_letters_count * rel_frequency                     -0.009563
global_letters_count * rel_letters_count                 -0.033844
global_letters_count * rel_orthographic_density          -0.015239
global_letters_count * rel_synonyms_count                -0.326928
global_orthographic_density * global_synonyms_count       0.287002
global_orthographic_density * rel_aoa                    -0.109194
global_orthographic_density * rel_clustering             -0.233623
global_orthographic_density * rel_frequency              -0.159484
global_orthographic_density * rel_letters_count          -0.145676
global_orthographic_density * rel_orthographic_density   -0.008704
global_orthographic_density * rel_synonyms_count         -0.239210
global_synonyms_count * rel_aoa                           0.024728
global_synonyms_count * rel_clustering                   -0.628699
global_synonyms_count * rel_frequency                     0.110790
global_synonyms_count * rel_letters_count                -0.142140
global_synonyms_count * rel_orthographic_density         -0.517899
global_synonyms_count * rel_synonyms_count               -0.172934
rel_aoa * rel_clustering                                  0.084123
rel_aoa * rel_frequency                                   0.057294
rel_aoa * rel_letters_count                               0.057450
rel_aoa * rel_orthographic_density                        0.163680
rel_aoa * rel_synonyms_count                             -0.151913
rel_clustering * rel_frequency                            0.340964
rel_clustering * rel_letters_count                        0.022873
rel_clustering * rel_orthographic_density                -0.140605
rel_clustering * rel_synonyms_count                       0.331129
rel_frequency * rel_letters_count                        -0.058477
rel_frequency * rel_orthographic_density                  0.126105
rel_frequency * rel_synonyms_count                       -0.171140
rel_letters_count * rel_orthographic_density              0.032546
rel_letters_count * rel_synonyms_count                    0.208795
rel_orthographic_density * rel_synonyms_count             0.403528
dtype: float64