MEG Signal Analysis Using PLV, PCA and Hierarchical Clustering

This notebook demonstrates MEG analysis using windowed Phase Locking Value (PLV), Principal Component Analysis (PCA) and hierarchical clustering. The notebook:

  • Reads the MEG ICANN 2011 dataset,
  • Estimates an interaction matrix for each sample,
  • Projects the data onto 2- and 3-dimensional subspaces, and
  • Performs hierarchical clustering of the data.

For more information, please refer to https://nextonsblog.wordpress.com/?p=3087


In [1]:
import numpy as np
import scipy.io

from scipy.signal import hilbert
from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn.decomposition import PCA

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [2]:
# Global configuration.
FILENAME = "../../data/megicann_train_v2.mat"
DATASET_X = 'train_day1'
DATASET_Y = 'class_day1'
FREQ_DOMAINS = [5]
VALIDATION_SAMPLES = 0.2

In [3]:
# The original data is in the format train[frequency][sample, channel, time].
# Make the different frequency bands appear as channels, then swap the channel and time axes.
def preprocess_x(raw_data):
    # Move different frequencies to channels
    samples = raw_data[0, FREQ_DOMAINS[0]][:]
    for freq_domain in FREQ_DOMAINS[1:]:
        samples = np.concatenate([samples, raw_data[0, freq_domain][:]], axis=1)
        
    # Z-score each sample per channel (zero mean, unit variance over time);
    # assigning back in place keeps the array's original dtype
    for i, sample in enumerate(samples):
        sample_mean = sample.mean(axis=1)
        sample_std = sample.std(axis=1)
        samples[i] = ((sample.transpose() - sample_mean) / sample_std).transpose()

    # Analytic signal via the Hilbert transform (applied along the time axis)
    samples = hilbert(samples)
    
    # Transpose the data to [sample, time, channel] format
    samples = np.transpose(samples, axes=(0, 2, 1))
    
    return samples

In [4]:
def plv(samples):
    plvs = []
    for sample in samples:
        # Cross-products of the analytic signals, averaged over the 200 time points
        plv_matrix = np.abs(np.dot(sample.conj().T, sample)) / 200
        # The connectivity matrix is symmetric, so keep only the upper triangle
        idx = np.triu_indices(plv_matrix.shape[0])
        plv_matrix = plv_matrix[idx[0], idx[1]]
        # Avoid memory exhaustion by thresholding here and storing the result as uint8
        plv_matrix[plv_matrix < 0.5] = 0
        plv_matrix[plv_matrix >= 0.5] = 1
        plvs.append(np.uint8(plv_matrix))
        
    return np.asarray(plvs)
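
The quantity computed above approximates the phase locking value. For channels i and j with instantaneous phases phi_i(t) and phi_j(t) taken from the analytic (Hilbert-transformed) signal, the textbook definition over T time points is

    PLV_{ij} = \left| \frac{1}{T} \sum_{t=1}^{T} e^{i(\phi_i(t) - \phi_j(t))} \right|

Because the code uses the z-scored analytic signals directly rather than unit-magnitude phase factors, the result is an amplitude-weighted variant of this formula (with T assumed to be 200 time points per sample), which is then binarized at 0.5.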

In [5]:
# Load samples and preprocess them
train_dict = scipy.io.loadmat(FILENAME)
train_dataset_x = preprocess_x(train_dict[DATASET_X])
train_dataset_y = train_dict[DATASET_Y][:,0]

# Remove train_dict to save some memory
train_dict = None
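
As a quick sanity check (the exact sizes depend on the dataset file, so the comments below describe expected shapes rather than verified output):

print(train_dataset_x.shape, train_dataset_x.dtype)  # expected: (n_samples, n_times, n_channels), complex analytic signal
print(train_dataset_y.shape)                         # expected: (n_samples,)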

In [6]:
# Compute the thresholded PLV features (plv() already binarizes the values)
train_dataset_x = plv(train_dataset_x)

# Split samples into training and validation sets
num_val_samples = int(np.ceil(VALIDATION_SAMPLES * len(train_dataset_x) - 1))
val_dataset_x = train_dataset_x[:num_val_samples]
val_dataset_y = train_dataset_y[:num_val_samples]
train_dataset_x = train_dataset_x[num_val_samples:]
train_dataset_y = train_dataset_y[num_val_samples:]
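
The split above takes the first num_val_samples samples in file order; if the recordings are stored grouped by class, the resulting validation set would not be representative. A minimal sketch of a shuffled split is given below (an alternative, not what the original notebook does; the fixed seed is arbitrary, and these lines would replace the four slice assignments above rather than follow them):

rng = np.random.RandomState(0)                # arbitrary fixed seed for reproducibility
perm = rng.permutation(len(train_dataset_x))  # random ordering of the sample indices
val_idx, train_idx = perm[:num_val_samples], perm[num_val_samples:]
val_dataset_x, val_dataset_y = train_dataset_x[val_idx], train_dataset_y[val_idx]
train_dataset_x, train_dataset_y = train_dataset_x[train_idx], train_dataset_y[train_idx]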

In [8]:
# Class labels, colors, and colorbar tick positions (centered on the five class color bins)
label_locations = np.arange(1, 5, 4.0 / 5.0) + (4.0 / 10.0)
label_colors = ['cyan', 'blue', 'gray', 'red', 'green']
labels = ['Artificial', 'Football', 'Nature', 'Bean', 'Chaplin']

In [9]:
# Project the preprocessed samples onto a 2-dimensional plane

pca = PCA(n_components=2)
train_dataset_x_projected = pca.fit_transform(train_dataset_x)

fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111)
scatter = plt.scatter(train_dataset_x_projected[:,0],
                      train_dataset_x_projected[:,1],
                      c=train_dataset_y,
                      cmap=matplotlib.colors.ListedColormap(label_colors))

colorbar = plt.colorbar(scatter)
colorbar.set_ticks(label_locations)
colorbar.set_ticklabels(labels)

plt.show()
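
To gauge how much structure the 2-dimensional view preserves, the fitted PCA object reports the fraction of variance captured by each component:

print(pca.explained_variance_ratio_)  # two values, one per retained component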



In [10]:
# Project the preprocessed samples into 3-dimensional space

pca = PCA(n_components=3)
train_dataset_x_projected = pca.fit_transform(train_dataset_x)

fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(train_dataset_x_projected[:,0],
                     train_dataset_x_projected[:,1],
                     train_dataset_x_projected[:,2],
                     c=train_dataset_y,
                     cmap=matplotlib.colors.ListedColormap(label_colors))

colorbar = plt.colorbar(scatter)
colorbar.set_ticks(label_locations)
colorbar.set_ticklabels(labels)

plt.show()



In [11]:
# Agglomerative clustering (Ward linkage) on the binarized PLV features
train_dataset_x_linkage = linkage(train_dataset_x, 'ward')

plt.figure(figsize=(15, 70))
plt.title('Hierarchical Clustering of the Samples')
plt.xlabel('Distance')
plt.ylabel('Sample (class label)')
dendrogram(
    train_dataset_x_linkage,
    leaf_font_size=10,
    labels=[labels[c - 1] for c in train_dataset_y],
    orientation='left',
    color_threshold=0)

plt.show()
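
The dendrogram above is purely visual. To obtain hard cluster assignments from the same linkage and compare them with the five stimulus classes, scipy's fcluster can cut the tree; a minimal sketch (cutting into five clusters mirrors the five labels and is an assumption, not part of the original analysis):

from scipy.cluster.hierarchy import fcluster

# Cut the Ward linkage into five flat clusters and cross-tabulate against the true labels
cluster_ids = fcluster(train_dataset_x_linkage, t=5, criterion='maxclust')
for c in np.unique(cluster_ids):
    counts = np.bincount(train_dataset_y[cluster_ids == c], minlength=6)[1:]
    print("cluster", c, dict(zip(labels, counts)))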


