In [1]:
# Import Pandas data handing module
import pandas as pd
# For pretty display of tables
from IPython.display import display
# Load the data
data = pd.read_csv('data.csv', index_col=['subject', 'cue-english', 'association-english'])
# Transform the "raw" N400 amplitudes into distance measurements according to the equation above
data['distance'] = data['N400'] - data.groupby(level=['subject', 'association-english'])['N400'].transform('mean')
# Show the first 10 rows
display(data.head(10))
# Compute the list of stimuli and subjects
subjects = data.index.levels[0]
n_subjects = len(subjects)
stimuli = data.index.levels[2]
n_stimuli = len(stimuli)
In [2]:
import numpy as np
# Compute the full word2word distance matrix for each subject
distance = np.zeros((n_stimuli, n_stimuli, n_subjects))
for i, subject in enumerate(subjects):
data_subject = data.xs(subject, level='subject').reset_index()
# Reshape the data into a cue-association word distance matrix
matrix = data_subject.pivot(index='cue-english', columns='association-english', values='distance')
# The diagonal is currently NaN (=missing), the distance from a word to itself is considered zero
matrix = matrix.fillna(0)
# Collect the distance matrices in a big array
distance[:, :, i] = matrix
# Average across the subjects
mean_distance = distance.mean(axis=2)
# Make positive and symmetric
mean_distance = (mean_distance + mean_distance.T) / 2.
mean_distance -= np.min(mean_distance)
# The distance from a word to itself is 0
np.fill_diagonal(mean_distance, 0)
# Plot the result
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(figsize=(5, 4))
plt.matshow(mean_distance, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli);
cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')
In [3]:
# Load modules for hierarchical clustering
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette
# Perform hierarchical clustering
dist = squareform(mean_distance)
Z = linkage(dist, 'average')
# For display, abbreviate hippopotamus and rhinoceros
stimuli_abbr = ['hippo' if s == 'hippopotamus' else s for s in stimuli]
stimuli_abbr = ['rhino' if s == 'rhinoceros' else s for s in stimuli_abbr]
# Define a colorscheme based on the "magma" colormap we use for the distance matrix
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
cmap = get_cmap('magma')
set_link_color_palette([rgb2hex(color) for color in [cmap(0.25), cmap(0.5), cmap(0.75)]])
# Plot the dendrogram
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)
In [4]:
# Sort the distance matrix according to the dendrogram
order = d['leaves']
stimuli_sorted = [stimuli_abbr[i] for i in order]
distance_sorted = mean_distance[order, :][:, order]
plt.figure(figsize=(5, 4))
plt.matshow(distance_sorted, cmap='magma', fignum=0)
plt.xticks(np.arange(n_stimuli), stimuli_sorted, rotation=90)
plt.yticks(np.arange(n_stimuli), stimuli_sorted)
plt.xlabel('target')
plt.ylabel('cue')
# Add thick lines along the cluster boundaries
plt.axhline(6.5, linewidth=3, color='black')
plt.axvline(6.5, linewidth=3, color='black')
cb = plt.colorbar()
cb.set_label('distance based on N400 amplitude')
plt.tight_layout()
plt.savefig('figure2.tiff', dpi=300)
In [7]:
def group(data, group1, group2):
"""Assigns 'between' or 'within' cluster labels to each cue-association pair"""
labels = []
for cue, association in zip(data.index.get_level_values(level='cue-english'),
data.index.get_level_values(level='association-english')):
if ((cue in group1 and association in group1) or
(cue in group2 and association in group2)):
labels.append('within')
elif ((cue in group1 and association in group2) or
(cue in group2 and association in group1)):
labels.append('between')
else:
labels.append(np.NaN)
data_ = data.copy()
data_['label'] = labels
# Drop all rows that were neither within-cluster or between-cluster
data_ = data_.dropna()
return data_
# Bring in a bridge to R for statistics
from rpy2 import robjects as r
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
# R-packages
importr('lme4')
importr('lmerTest')
# Student's T distribution
from scipy.stats import t as tdist
# The R code produces some harmless warnings that clutter up the page.
# This disables printing of the warnings. When modifying this notebook, you may want to turn
# this back on.
import warnings
warnings.filterwarnings('ignore')
def lme_test(group1, group2, data):
"""Performs a statistical test using an LME model to obtain a p-value for
'between' and 'within' cluster values
"""
# We can only do a test if the group sizes are both > 1
if len(group1) == 1 or len(group2) == 1:
return ''
# Annotate data with between-cluster and within-cluster labels
data = group(data, group1, group2)
# Send the data to R
r.globalenv['data'] = data.reset_index()
# Fit the LME model.
# Fit random slopes and intercepts for subjects.
r.r('m = lmer(distance ~ label + (label | subject), data=data)')
try:
df, t, p = r.r('summary(m)$coefficients["labelwithin", 3:5]')
except:
# Estimation of degress of freedom failed. Take n_subjects - 1 as ddof
n_subjects = len(data.index.levels[0])
df = n_subjects - 1
print('Estimation of degrees of freedom failed for cluster (%s-%s), using ddof=%d' %
(group1, group2, df))
t = r.r('summary(m)$coefficients["labelwithin", "t value"]')[0]
p = tdist.sf(np.abs(t), df)
# Bonferroni correction for p values
p *= 4 # There are 4 clusters we can test in this dataset
# Construct a text label with the results
if p < 0.001:
return 't(%d)=%.3f\np<0.001' % (df, t)
else:
return 't(%d)=%.3f\np=%.3f' % (df, t, p)
In [8]:
from annotate_dendrogram import annotate_dendrogram
# Plot the dendrogram like before
plt.figure(figsize=(14, 4))
with plt.rc_context({'lines.linewidth': 2}):
d = dendrogram(Z, labels=stimuli_abbr, above_threshold_color='black')
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.ylim(-0.01, 0.8)
plt.title('Dendrogram for distance based on N400 amplitude')
plt.yticks([])
plt.grid(False)
# Annotate the dendrogram with t- and p-values
annotate_dendrogram(Z, d, stimuli, lme_test, args=[data])
plt.savefig('figure3.tiff', dpi=300)