Hierarchical clustering based on estimated N400 amplitude


In [1]:
# Import Pandas data handing module
import pandas as pd

# For pretty display of tables
from IPython.display import display

# Load the data
data = pd.read_csv('data.csv', index_col=['subject', 'cue-english', 'association-english'])

# Transform the "raw" N400 amplitudes into distance measurements according to the equation above
data['distance'] = data['N400'] - data.groupby(level=['subject', 'association-english'])['N400'].transform('mean')

# Show the first 10 rows
display(data.head(10))

# Compute the list of stimuli and subjects
subjects = data.index.levels[0]
n_subjects = len(subjects)
stimuli = data.index.levels[2]
n_stimuli = len(stimuli)


cue association language button N400 distance
subject cue-english association-english
subject01 zebra couch zebra zetel NL 1 0.043147 -0.126011
couch hippopotamus zetel nijlpaard NL 1 -0.725864 -0.082516
giraffe closet giraf kast NL 1 0.252211 -0.004391
desk tiger bureau tijger NL 1 0.563608 0.452885
table rhinoceros tafel neushoorn NL 1 -0.765238 -0.593084
elephant couch olifant zetel NL 1 0.041667 -0.127491
zebra lion zebra leeuw NL 0 -0.530273 -0.267699
chair tiger stoel tijger NL 1 0.189504 0.078781
lion door leeuw deur NL 1 1.515943 1.127102
desk elephant bureau olifant NL 1 1.090760 1.472297

In [2]:
import numpy as np

# Compute the full word2word distance matrix for each subject
distance = np.zeros((n_stimuli, n_stimuli, n_subjects))

for i, subject in enumerate(subjects):
    data_subject = data.xs(subject, level='subject').reset_index()
    
    # Reshape the data into a cue-association word distance matrix
    matrix = data_subject.pivot(index='cue-english', columns='association-english', values='distance')
    
    # The diagonal is currently NaN (=missing), the distance from a word to itself is considered zero
    matrix = matrix.fillna(0)

    # Collect the distance matrices in a big array
    distance[:, :, i] = matrix
    
# Average across the subjects
mean_distance = distance.mean(axis=2)

# Make positive and symmetric
mean_distance = (mean_distance + mean_distance.T) / 2.
mean_distance -= np.min(mean_distance)

# The distance from a word to itself is 0
np.fill_diagonal(mean_distance, 0)

# Plot the result
from matplotlib import pyplot as plt
%matplotlib inline

plt.figure(figsize=(5, 4))
plt.matshow(mean_distance, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli);
cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')



In [3]:
# Load modules for hierarchical clustering
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette

# Perform hierarchical clustering
dist = squareform(mean_distance)
Z = linkage(dist, 'average')

# For display, abbreviate hippopotamus and rhinoceros
stimuli_abbr = ['hippo' if s == 'hippopotamus' else s for s in stimuli]
stimuli_abbr = ['rhino' if s == 'rhinoceros' else s for s in stimuli_abbr]

# Define a colorscheme based on the "magma" colormap we use for the distance matrix
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
cmap = get_cmap('magma')
set_link_color_palette([rgb2hex(color) for color in [cmap(0.25), cmap(0.5), cmap(0.75)]])

# Plot the dendrogram
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
    d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)



In [4]:
# Sort the distance matrix according to the dendrogram
order = d['leaves']
stimuli_sorted = [stimuli_abbr[i] for i in order]
distance_sorted = mean_distance[order, :][:, order]

plt.figure(figsize=(5, 4))
plt.matshow(distance_sorted, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli_sorted, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli_sorted)
plt.xlabel('target')
plt.ylabel('cue')

# Add thick lines along the cluster boundaries
plt.axhline(6.5, linewidth=3, color='black')
plt.axvline(6.5, linewidth=3, color='black')

cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')

plt.tight_layout()
plt.savefig('figure2.tiff', dpi=300)



In [7]:
def group(data, group1, group2):
    """Assigns 'between' or 'within' cluster labels to each cue-association pair"""
    labels = []
    for cue, association in zip(data.index.get_level_values(level='cue-english'),
                                data.index.get_level_values(level='association-english')):
        if ((cue in group1 and association in group1) or
            (cue in group2 and association in group2)):
            labels.append('within')
        elif ((cue in group1 and association in group2) or
              (cue in group2 and association in group1)):
            labels.append('between')
        else:
            labels.append(np.NaN)
    data_ = data.copy()
    data_['label'] = labels
    
    # Drop all rows that were neither within-cluster or between-cluster
    data_ = data_.dropna()
    
    return data_

# Bring in a bridge to R for statistics
from rpy2 import robjects as r
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# R-packages
importr('lme4')
importr('lmerTest')

# Student's T distribution
from scipy.stats import t as tdist

# The R code produces some harmless warnings that clutter up the page.
# This disables printing of the warnings. When modifying this notebook, you may want to turn
# this back on.
import warnings
warnings.filterwarnings('ignore')

def lme_test(group1, group2, data):
    """Performs a statistical test using an LME model to obtain a p-value for
    'between' and 'within' cluster values
    """
    # We can only do a test if the group sizes are both > 1
    if len(group1) == 1 or len(group2) == 1:
        return ''
    
    # Annotate data with between-cluster and within-cluster labels
    data = group(data, group1, group2)
    
    # Send the data to R
    r.globalenv['data'] = data.reset_index()
    
    # Fit the LME model.
    # Fit random slopes and intercepts for subjects.
    r.r('m = lmer(distance ~ label + (label | subject), data=data)')
    try:
        df, t, p = r.r('summary(m)$coefficients["labelwithin", 3:5]')
    except:
        # Estimation of degress of freedom failed. Take n_subjects - 1 as ddof
        n_subjects = len(data.index.levels[0])
        df = n_subjects - 1
        print('Estimation of degrees of freedom failed for cluster (%s-%s), using ddof=%d' %
              (group1, group2, df))
        
        t = r.r('summary(m)$coefficients["labelwithin", "t value"]')[0]
        p = tdist.sf(np.abs(t), df)
    
    # Bonferroni correction for p values
    p *= 4  # There are 4 clusters we can test in this dataset
    
    # Construct a text label with the results
    if p < 0.001:
        return 't(%d)=%.3f\np<0.001' % (df, t)
    else:
        return 't(%d)=%.3f\np=%.3f' % (df, t, p)

In [8]:
from annotate_dendrogram import annotate_dendrogram

# Plot the dendrogram like before
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
    d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)

# Annotate the dendrogram with t- and p-values
annotate_dendrogram(Z, d, stimuli, lme_test, args=[data])

plt.savefig('figure3.tiff', dpi=300)


Estimation of degrees of freedom failed for cluster (['door', 'table']-['chair', 'couch']), using ddof=15
Estimation of degrees of freedom failed for cluster (['lion', 'tiger', 'rhinoceros', 'zebra']-['elephant', 'giraffe', 'hippopotamus']), using ddof=15