Hierarchical clustering based on estimated N400 amplitude



In [1]:

    
# Import Pandas data handing module
import pandas as pd

# For pretty display of tables
from IPython.display import display

# Load the data
data = pd.read_csv('data.csv', index_col=['subject', 'cue-english', 'association-english'])

# Transform the "raw" N400 amplitudes into distance measurements according to the equation above
data['distance'] = data['N400'] - data.groupby(level=['subject', 'association-english'])['N400'].transform('mean')

# Show the first 10 rows
display(data.head(10))

# Compute the list of stimuli and subjects
subjects = data.index.levels[0]
n_subjects = len(subjects)
stimuli = data.index.levels[2]
n_stimuli = len(stimuli)









    







  
    
      
      
      
      cue
      association
      language
      button
      N400
      distance
    
    
      subject
      cue-english
      association-english
      
      
      
      
      
      
    
  
  
    
      subject01
      zebra
      couch
      zebra
      zetel
      NL
      1
      0.043147
      -0.126011
    
    
      couch
      hippopotamus
      zetel
      nijlpaard
      NL
      1
      -0.725864
      -0.082516
    
    
      giraffe
      closet
      giraf
      kast
      NL
      1
      0.252211
      -0.004391
    
    
      desk
      tiger
      bureau
      tijger
      NL
      1
      0.563608
      0.452885
    
    
      table
      rhinoceros
      tafel
      neushoorn
      NL
      1
      -0.765238
      -0.593084
    
    
      elephant
      couch
      olifant
      zetel
      NL
      1
      0.041667
      -0.127491
    
    
      zebra
      lion
      zebra
      leeuw
      NL
      0
      -0.530273
      -0.267699
    
    
      chair
      tiger
      stoel
      tijger
      NL
      1
      0.189504
      0.078781
    
    
      lion
      door
      leeuw
      deur
      NL
      1
      1.515943
      1.127102
    
    
      desk
      elephant
      bureau
      olifant
      NL
      1
      1.090760
      1.472297



In [2]:

    
import numpy as np

# Compute the full word2word distance matrix for each subject
distance = np.zeros((n_stimuli, n_stimuli, n_subjects))

for i, subject in enumerate(subjects):
    data_subject = data.xs(subject, level='subject').reset_index()
    
    # Reshape the data into a cue-association word distance matrix
    matrix = data_subject.pivot(index='cue-english', columns='association-english', values='distance')
    
    # The diagonal is currently NaN (=missing), the distance from a word to itself is considered zero
    matrix = matrix.fillna(0)

    # Collect the distance matrices in a big array
    distance[:, :, i] = matrix
    
# Average across the subjects
mean_distance = distance.mean(axis=2)

# Make positive and symmetric
mean_distance = (mean_distance + mean_distance.T) / 2.
mean_distance -= np.min(mean_distance)

# The distance from a word to itself is 0
np.fill_diagonal(mean_distance, 0)

# Plot the result
from matplotlib import pyplot as plt
%matplotlib inline

plt.figure(figsize=(5, 4))
plt.matshow(mean_distance, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli);
cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')



In [3]:

    
# Load modules for hierarchical clustering
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette

# Perform hierarchical clustering
dist = squareform(mean_distance)
Z = linkage(dist, 'average')

# For display, abbreviate hippopotamus and rhinoceros
stimuli_abbr = ['hippo' if s == 'hippopotamus' else s for s in stimuli]
stimuli_abbr = ['rhino' if s == 'rhinoceros' else s for s in stimuli_abbr]

# Define a colorscheme based on the "magma" colormap we use for the distance matrix
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
cmap = get_cmap('magma')
set_link_color_palette([rgb2hex(color) for color in [cmap(0.25), cmap(0.5), cmap(0.75)]])

# Plot the dendrogram
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
    d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)



In [4]:

    
# Sort the distance matrix according to the dendrogram
order = d['leaves']
stimuli_sorted = [stimuli_abbr[i] for i in order]
distance_sorted = mean_distance[order, :][:, order]

plt.figure(figsize=(5, 4))
plt.matshow(distance_sorted, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli_sorted, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli_sorted)
plt.xlabel('target')
plt.ylabel('cue')

# Add thick lines along the cluster boundaries
plt.axhline(6.5, linewidth=3, color='black')
plt.axvline(6.5, linewidth=3, color='black')

cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')

plt.tight_layout()
plt.savefig('figure2.tiff', dpi=300)



In [7]:

    
def group(data, group1, group2):
    """Assigns 'between' or 'within' cluster labels to each cue-association pair"""
    labels = []
    for cue, association in zip(data.index.get_level_values(level='cue-english'),
                                data.index.get_level_values(level='association-english')):
        if ((cue in group1 and association in group1) or
            (cue in group2 and association in group2)):
            labels.append('within')
        elif ((cue in group1 and association in group2) or
              (cue in group2 and association in group1)):
            labels.append('between')
        else:
            labels.append(np.NaN)
    data_ = data.copy()
    data_['label'] = labels
    
    # Drop all rows that were neither within-cluster or between-cluster
    data_ = data_.dropna()
    
    return data_

# Bring in a bridge to R for statistics
from rpy2 import robjects as r
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# R-packages
importr('lme4')
importr('lmerTest')

# Student's T distribution
from scipy.stats import t as tdist

# The R code produces some harmless warnings that clutter up the page.
# This disables printing of the warnings. When modifying this notebook, you may want to turn
# this back on.
import warnings
warnings.filterwarnings('ignore')

def lme_test(group1, group2, data):
    """Performs a statistical test using an LME model to obtain a p-value for
    'between' and 'within' cluster values
    """
    # We can only do a test if the group sizes are both > 1
    if len(group1) == 1 or len(group2) == 1:
        return ''
    
    # Annotate data with between-cluster and within-cluster labels
    data = group(data, group1, group2)
    
    # Send the data to R
    r.globalenv['data'] = data.reset_index()
    
    # Fit the LME model.
    # Fit random slopes and intercepts for subjects.
    r.r('m = lmer(distance ~ label + (label | subject), data=data)')
    try:
        df, t, p = r.r('summary(m)$coefficients["labelwithin", 3:5]')
    except:
        # Estimation of degress of freedom failed. Take n_subjects - 1 as ddof
        n_subjects = len(data.index.levels[0])
        df = n_subjects - 1
        print('Estimation of degrees of freedom failed for cluster (%s-%s), using ddof=%d' %
              (group1, group2, df))
        
        t = r.r('summary(m)$coefficients["labelwithin", "t value"]')[0]
        p = tdist.sf(np.abs(t), df)
    
    # Bonferroni correction for p values
    p *= 4  # There are 4 clusters we can test in this dataset
    
    # Construct a text label with the results
    if p < 0.001:
        return 't(%d)=%.3f\np<0.001' % (df, t)
    else:
        return 't(%d)=%.3f\np=%.3f' % (df, t, p)



In [8]:

    
from annotate_dendrogram import annotate_dendrogram

# Plot the dendrogram like before
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
    d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)

# Annotate the dendrogram with t- and p-values
annotate_dendrogram(Z, d, stimuli, lme_test, args=[data])

plt.savefig('figure3.tiff', dpi=300)









    



Estimation of degrees of freedom failed for cluster (['door', 'table']-['chair', 'couch']), using ddof=15
Estimation of degrees of freedom failed for cluster (['lion', 'tiger', 'rhinoceros', 'zebra']-['elephant', 'giraffe', 'hippopotamus']), using ddof=15

			cue	association	language	button	N400	distance
subject	cue-english	association-english
subject01	zebra	couch	zebra	zetel	NL	1	0.043147	-0.126011
	couch	hippopotamus	zetel	nijlpaard	NL	1	-0.725864	-0.082516
	giraffe	closet	giraf	kast	NL	1	0.252211	-0.004391
	desk	tiger	bureau	tijger	NL	1	0.563608	0.452885
	table	rhinoceros	tafel	neushoorn	NL	1	-0.765238	-0.593084
	elephant	couch	olifant	zetel	NL	1	0.041667	-0.127491
	zebra	lion	zebra	leeuw	NL	0	-0.530273	-0.267699
	chair	tiger	stoel	tijger	NL	1	0.189504	0.078781
	lion	door	leeuw	deur	NL	1	1.515943	1.127102
	desk	elephant	bureau	olifant	NL	1	1.090760	1.472297