Analyze Distances Between Various Groups of Word Pairs

By: Adam Li



In [2]:

    
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random
import operator
import collections

import itertools
import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
import scipy.stats as stats
import scipy.io
from scipy.spatial import distance as Distance

# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')

%matplotlib inline



In [3]:

    
######## Get list of files (.mat) we want to work with ########
filedir = '../condensed_data/groups/'
files = []
groups = []

# get all unique word match pairs and store in 'groups' list
for file in os.listdir(filedir):
    groups.append(file)
    if file.endswith('.mat'):
        files.append(file)

######## Load in EVENTS struct to find correct events
eventsDir = '../NIH034/behavioral/paRemap/' + 'events.mat'

events = scipy.io.loadmat(eventsDir)
events = events['events']

# print number of incorrect events and which words they belonged to
incorrectIndices = events['isCorrect'] == 0
incorrectEvents = events[incorrectIndices]
incorrectWords = []
wordList = {}
for i in range(0, len(incorrectEvents)):
    incorrectWords.append(incorrectEvents['probeWord'][i][0])

for word in np.unique(incorrectEvents['probeWord']):
    wordList[str(word)] = sum(incorrectWords == word)
    
print "There were ",len(incorrectEvents), " number of incorrect events."
print "The list of incorrect probe words: \n", wordList
# 
# get only correct events
correctIndices = events['isCorrect'] == 1
events = events[correctIndices]

print "\nThis is the length of the events struct with only correct responses: ", len(events)
print "The group of word pairings are: ", groups









    



There were  49  number of incorrect events.
The list of incorrect probe words: 
{"[u'PANTS']": 7, "[u'JUICE']": 8, "[u'BRICK']": 12, "[u'CLOCK']": 13, "[u'GLASS']": 9}

This is the length of the events struct with only correct responses:  1431
The group of word pairings are:  ['BRICK_CLOCK', 'BRICK_JUICE', 'BRICK_PANTS', 'CLOCK_BRICK', 'CLOCK_GLASS', 'GLASS_CLOCK', 'GLASS_JUICE', 'GLASS_PANTS', 'JUICE_BRICK', 'JUICE_GLASS', 'PANTS_BRICK', 'PANTS_GLASS']



In [4]:

    
# Accumulators for the four kinds of comparisons; every entry added to them is a
# (first word-pair name, second word-pair name) tuple. All four are lists so
# they can be handled the same way.
diff_words_groups = []
reverse_words_groups = []
probe_words_groups = []
target_words_groups = []

def inGroup(group, names):
    """Report whether `names` is already present in `group`.

    Every member of `group` is compared against `names` with cmpT, so two
    entries holding the same items in a different order count as a match.
    Returns True on the first match and False when there is none.
    """
    return any(cmpT(member, names) for member in group)

def cmpT(t1, t2):
    """Return True when t1 and t2 contain the same items, regardless of order."""
    ordered_first = sorted(t1)
    ordered_second = sorted(t2)
    return ordered_first == ordered_second

# Create different groups
for pair_first in groups:
    # split words by delimiter '_' to determine groups
    firstpair = pair_first.split('_')
    
    for pair_second in groups:
        secondpair = pair_second.split('_')

        # make directory names for each word pair
        firstname = '_'.join(firstpair)
        secondname = '_'.join(secondpair)
        names = (firstname, secondname)
        
        ## 01: Different words group
        if not any(x in secondpair for x in firstpair) and not inGroup(diff_words_groups,names):
            diff_words_groups += (names,)
                
        ## 02: Probe Word Overlap Group
        if firstpair[0] == secondpair[0] and firstpair[1] != secondpair[1] and not inGroup(probe_words_groups,names):
            probe_words_groups += (names,)
        
        ## 03: Target Word Overlap Group
        if firstpair[1] == secondpair[1] and firstpair[0] != secondpair[0] and not inGroup(target_words_groups,names):
            target_words_groups += (names,)
            
        ## 04: Reverse words Group
        reverse_firstpair = firstpair
        reverse_firstpair.reverse()
        if '_'.join(reverse_firstpair) == secondname and not inGroup(reverse_words_groups,names):
            reverse_words_groups += (names,)

## printing 
print "For different words:"
for i in range(0, len(diff_words_groups)):
    print diff_words_groups[i]
    
print "For reverse words:"
for i in range(0, len(reverse_words_groups)):
    print reverse_words_groups[i]
    
print "For probe words:"
for i in range(0, len(probe_words_groups)):
    print probe_words_groups[i]
    
print "For target words:"
for i in range(0, len(target_words_groups)):
    print target_words_groups[i]









    



For different words:
('BRICK_CLOCK', 'GLASS_JUICE')
('CLOCK_BRICK', 'GLASS_PANTS')
('CLOCK_BRICK', 'JUICE_GLASS')
('CLOCK_BRICK', 'PANTS_GLASS')
('BRICK_JUICE', 'CLOCK_GLASS')
('JUICE_BRICK', 'GLASS_CLOCK')
('JUICE_BRICK', 'GLASS_PANTS')
('JUICE_BRICK', 'PANTS_GLASS')
('BRICK_PANTS', 'CLOCK_GLASS')
('PANTS_BRICK', 'GLASS_CLOCK')
('BRICK_PANTS', 'GLASS_JUICE')
('PANTS_BRICK', 'JUICE_GLASS')
('CLOCK_BRICK', 'GLASS_JUICE')
('BRICK_CLOCK', 'GLASS_PANTS')
('BRICK_CLOCK', 'JUICE_GLASS')
('BRICK_CLOCK', 'PANTS_GLASS')
('GLASS_CLOCK', 'BRICK_JUICE')
('CLOCK_GLASS', 'JUICE_BRICK')
('CLOCK_GLASS', 'PANTS_BRICK')
('GLASS_CLOCK', 'BRICK_PANTS')
('GLASS_JUICE', 'PANTS_BRICK')
('PANTS_GLASS', 'BRICK_JUICE')
('BRICK_JUICE', 'GLASS_PANTS')
('JUICE_GLASS', 'BRICK_PANTS')
For reverse words:
('JUICE_BRICK', 'BRICK_JUICE')
('BRICK_PANTS', 'PANTS_BRICK')
('CLOCK_BRICK', 'BRICK_CLOCK')
('GLASS_CLOCK', 'CLOCK_GLASS')
('PANTS_GLASS', 'GLASS_PANTS')
('JUICE_GLASS', 'GLASS_JUICE')
For probe words:
('BRICK_CLOCK', 'BRICK_PANTS')
('BRICK_JUICE', 'BRICK_CLOCK')
('BRICK_JUICE', 'BRICK_PANTS')
('JUICE_BRICK', 'JUICE_GLASS')
('PANTS_BRICK', 'PANTS_GLASS')
('CLOCK_BRICK', 'CLOCK_GLASS')
('GLASS_CLOCK', 'GLASS_PANTS')
('GLASS_CLOCK', 'GLASS_JUICE')
('GLASS_PANTS', 'GLASS_JUICE')
For target words:
('JUICE_BRICK', 'CLOCK_BRICK')
('BRICK_JUICE', 'GLASS_JUICE')
('PANTS_BRICK', 'CLOCK_BRICK')
('BRICK_CLOCK', 'GLASS_CLOCK')
('CLOCK_GLASS', 'JUICE_GLASS')
('CLOCK_GLASS', 'PANTS_GLASS')
('JUICE_GLASS', 'PANTS_GLASS')
('GLASS_PANTS', 'BRICK_PANTS')
('JUICE_BRICK', 'PANTS_BRICK')

Now we have lists of the word-pair combinations that we want to compare within each group. We hypothesize that the "different word pairs" will be the farthest apart, while the reversed word pairs and the pairs that share a probe or target word will be closer together.

Next, using ANOVA to perform a cutoff on channels and only taking the delta, theta and high gamma frequencies, we want to uncover this result.

Load Data
Extract Features
Plot Distance Metrics

Different Word Groups

The struct from MATLAB has

data.powerMatZ = thisPowMat;
data.chanNum = thisChan;
data.chanStr = thisChanStr;
data.probeWord = THIS_TRIGGER;
data.targetWord = targetWord;
data.timeZero = 45;
data.vocalization = data.timeZero + round([metaEvents.responseTime]/Overlap);

Input: A list of directories, one per subgroup. Each directory contains one struct per channel holding that channel's data. To segment the data further (for example by block number), the data-extraction code would need a few extra lines that keep only the events from the block of interest.

Algorithm:

Loop through each channel:
- extract probewords, targetwords, Z scored power matrix, channel #, channel string, time zero(probe on), vocalization
Create Feature Vectors
- extract delta, theta and high gamma frequencies
- run ANOVA on each time/freq window
- compute a threshold to include channel for feature vector
Plot Histogram of Distances From the Other's centroid

Functions To Use Definitions



In [5]:

    
# Perform a type of grid search over the channels to determine the most significant channels
def gridSearchChannels(epsilon, first_group_files, second_group_files,
                       first_dir=None, second_dir=None, freq_bands=(0, 1, 5)):
    """Score every channel by how much two groups differ and keep the top scorers.

    For each pair of channel files, a one-way ANOVA (scipy.stats.f_oneway) is run
    between the two groups' events at every selected frequency band / time bin.
    The values ``1 - p`` are summed into one score per channel, and the channels
    whose score is strictly above the ``epsilon``-th percentile of all scores
    are returned.

    Parameters
    ----------
    epsilon : float
        Percentile (0-100) of the channel scores used as the cutoff.
    first_group_files, second_group_files : list of str
        .mat file names of the two groups; entry i of both lists is treated as
        the same channel. If the lists differ in length, the extra entries of
        the longer one are ignored.
    first_dir, second_dir : str, optional
        Directories holding the files. When omitted, the module-level variables
        ``first_filedir`` and ``second_filedir`` are used.
    freq_bands : sequence of int, optional
        Indices along the second axis of ``powerMatZ`` to test. The default
        (0, 1, 5) is what the original comment calls delta, theta and high gamma.

    Returns
    -------
    list of str
        Channel numbers (as strings, read from the first group's ``chanNum``)
        that passed the cutoff; empty when no files were given.
    """
    # fall back to the notebook-level directory variables
    if first_dir is None:
        first_dir = first_filedir
    if second_dir is None:
        second_dir = second_filedir
    bands = list(freq_bands)

    ################## LOOPING THROUGH EACH CHANNEL ##################
    chan_pvals = {}   # channel number (str) -> score
    chan_pval = []    # all scores, used to compute the percentile cutoff
    for first_name, second_name in zip(first_group_files, second_group_files):
        #################### Set up data from the channel's mat file ####################
        data_first = scipy.io.loadmat(os.path.join(first_dir, first_name))['data']
        data_second = scipy.io.loadmat(os.path.join(second_dir, second_name))['data']

        ## get channel number
        first_chanNum = data_first['chanNum'][0][0][0][0]

        ## get power matrix Z is a #events X #freq. bands X #time bins,
        ## restricted to the requested frequency bands
        first_matrix = data_first['powerMatZ'][0][0][:, bands, :]
        second_matrix = data_second['powerMatZ'][0][0][:, bands, :]

        ### Run ANOVA on each frequency/time window; 1 - p is large where the
        ### two groups differ
        p_vals = []
        for freq in range(0, first_matrix.shape[1]):
            for time_bin in range(0, first_matrix.shape[2]):
                _, p_val = stats.f_oneway(first_matrix[:, freq, time_bin],
                                          second_matrix[:, freq, time_bin])
                p_vals.append(1. - p_val)

        score = sum(p_vals)
        chan_pvals[str(first_chanNum)] = score
        chan_pval.append(score)

    # nothing to rank
    if not chan_pval:
        return []

    # keep the channels scoring above the epsilon-th percentile
    thresh = np.percentile(chan_pval, epsilon)
    chans_to_analyze = [k for k in chan_pvals if chan_pvals[k] > thresh]
    return chans_to_analyze

def add_list_to_set(my_list, my_set):
    """Add every item of `my_list` to `my_set` in place and return that same set."""
    my_set.update(my_list)
    return my_set

Different Word Groups Analysis



In [5]:

    
###### Create
print diff_words_groups

chans_to_analyze = set()
for group in diff_words_groups:
    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # get list of files for both word pairs
    first_files = []
    second_files = []
    for file in os.listdir(first_filedir):
        if file.endswith('.mat'):
            first_files.append(file)
    for file in os.listdir(second_filedir):
        if file.endswith('.mat'):
            second_files.append(file)

    add_list_to_set(gridSearchChannels(95, first_files, second_files), chans_to_analyze)
    
print len(chans_to_analyze)
tuple(chans_to_analyze)
print len(chans_to_analyze)
print set(chans_to_analyze)









    



[('BRICK_CLOCK', 'GLASS_JUICE'), ('CLOCK_BRICK', 'GLASS_PANTS'), ('CLOCK_BRICK', 'JUICE_GLASS'), ('CLOCK_BRICK', 'PANTS_GLASS'), ('BRICK_JUICE', 'CLOCK_GLASS'), ('JUICE_BRICK', 'GLASS_CLOCK'), ('JUICE_BRICK', 'GLASS_PANTS'), ('JUICE_BRICK', 'PANTS_GLASS'), ('BRICK_PANTS', 'CLOCK_GLASS'), ('PANTS_BRICK', 'GLASS_CLOCK'), ('BRICK_PANTS', 'GLASS_JUICE'), ('PANTS_BRICK', 'JUICE_GLASS'), ('CLOCK_BRICK', 'GLASS_JUICE'), ('BRICK_CLOCK', 'GLASS_PANTS'), ('BRICK_CLOCK', 'JUICE_GLASS'), ('BRICK_CLOCK', 'PANTS_GLASS'), ('GLASS_CLOCK', 'BRICK_JUICE'), ('CLOCK_GLASS', 'JUICE_BRICK'), ('CLOCK_GLASS', 'PANTS_BRICK'), ('GLASS_CLOCK', 'BRICK_PANTS'), ('GLASS_JUICE', 'PANTS_BRICK'), ('PANTS_GLASS', 'BRICK_JUICE'), ('BRICK_JUICE', 'GLASS_PANTS'), ('JUICE_GLASS', 'BRICK_PANTS')]
63
63
set(['30', '58', '60', '61', '62', '63', '64', '67', '68', '69', '80', '81', '85', '24', '26', '27', '20', '21', '48', '46', '44', '45', '42', '43', '40', '41', '1', '3', '5', '6', '96', '18', '13', '77', '74', '73', '71', '70', '91', '90', '93', '92', '95', '94', '78', '10', '39', '59', '14', '17', '16', '55', '54', '31', '56', '51', '36', '34', '19', '89', '32', '57', '50'])



In [6]:

    
##### HYPER-PARAMETERS TO TUNE
anova_threshold = 90   # percentile cutoff for gridSearchChannels: channels scoring above it are kept
distances = Distance.cosine # define distance metric to use
num_time_windows = 5   # number of time bins taken right before vocalization
# frequency band indices 0-6; use [0, 1, 5] to keep only delta, theta and high gamma
freq_bands = np.arange(0,7,1)
print(freq_bands)









    



[0 1 2 3 4 5 6]



In [7]:

    
# For every "different words" combination, histogram how far each channel's
# feature vector lies from the centroid of the other word pair's feature vectors.
# NOTE(review): this cell is repeated almost verbatim for the other comparison
# groups; a shared function taking the group list would remove the duplication.
num_groups = len(diff_words_groups)
fig = plt.figure(figsize=(7, 5*num_groups))

for idx, group in enumerate(diff_words_groups):

    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # List each directory on its own (they need not hold the same number of
    # entries) and sort, so that index i refers to the same channel in both.
    # NOTE(review): assumes both directories use the same "<channel>_..." naming -- confirm
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))

    # Analyze every channel that has a file. Ids are kept as a set of strings so
    # the membership test below is an exact match, not a substring search.
    # To restrict to ANOVA-selected channels instead, use:
    #     chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    chans_to_analyze = set(name.split('_')[0] for name in first_files)

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # make sure this is a channel we want (id = file-name prefix before '_')
        if first_name.split('_')[0] not in chans_to_analyze:
            continue

        channel_features = []
        for channel_dir, channel_file in ((first_filedir, first_name), (second_filedir, second_name)):
            data = scipy.io.loadmat(channel_dir + channel_file)['data']

            # power matrix Z is #events X #freq. bands X #time bins; keep the
            # configured frequency bands only
            power = data['powerMatZ'][0][0][:, freq_bands, :]

            # time bin of vocalization for every event; cast to plain ints so the
            # slice arithmetic below does not depend on the dtype loadmat returns
            vocalization = data['vocalization'][0][0][0].astype(int)

            # per event: the num_time_windows bins right before vocalization,
            # flattened into one vector
            windows = [power[event, :, onset - num_time_windows:onset].flatten()
                       for event, onset in enumerate(vocalization)]

            # average over events -> one feature vector for this channel
            channel_features.append(np.mean(windows, axis=0))

        first_feature.append(channel_features[0])
        second_feature.append(channel_features[1])

    ### after looping through all channels, stack the per-channel feature vectors
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    # centroid of each word pair's feature vectors
    first_centroid = np.mean(first_feature, axis=0)
    second_centroid = np.mean(second_feature, axis=0)

    # distance of every channel's vector from the *other* word pair's centroid
    first_hist = [distances(x, second_centroid) for x in first_feature]
    second_hist = [distances(x, first_centroid) for x in second_feature]

    ### Plot Histogram of Distances
    ax = fig.add_subplot(num_groups, 1, idx+1)
    ax.hist(first_hist, label='Distance for ' + group[0], lw=3, alpha=0.75)
    ax.hist(second_hist, label='Distance For ' + group[1], lw=3, alpha=0.5)
    ax.set_title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
    # label with the metric actually configured in `distances`
    metric_name = getattr(distances, '__name__', 'configured metric')
    ax.set_xlabel('Distance (' + metric_name + ') from "other" centroid')
    ax.set_ylabel('Count')
    ax.legend(loc='upper right')

plt.tight_layout()

Reversed Word Pairs Overlap



In [8]:

    
# For every reversed word-pair combination, histogram how far each channel's
# feature vector lies from the centroid of the other word pair's feature vectors.
# NOTE(review): this cell is repeated almost verbatim for the other comparison
# groups; a shared function taking the group list would remove the duplication.
num_groups = len(reverse_words_groups)
fig = plt.figure(figsize=(7, 5*num_groups))

for idx, group in enumerate(reverse_words_groups):

    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # List each directory on its own (they need not hold the same number of
    # entries) and sort, so that index i refers to the same channel in both.
    # NOTE(review): assumes both directories use the same "<channel>_..." naming -- confirm
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))

    # Analyze every channel that has a file. Ids are kept as a set of strings so
    # the membership test below is an exact match, not a substring search.
    # To restrict to ANOVA-selected channels instead, use:
    #     chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    chans_to_analyze = set(name.split('_')[0] for name in first_files)

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # make sure this is a channel we want (id = file-name prefix before '_')
        if first_name.split('_')[0] not in chans_to_analyze:
            continue

        channel_features = []
        for channel_dir, channel_file in ((first_filedir, first_name), (second_filedir, second_name)):
            data = scipy.io.loadmat(channel_dir + channel_file)['data']

            # power matrix Z is #events X #freq. bands X #time bins; keep the
            # configured frequency bands only
            power = data['powerMatZ'][0][0][:, freq_bands, :]

            # time bin of vocalization for every event; cast to plain ints so the
            # slice arithmetic below does not depend on the dtype loadmat returns
            vocalization = data['vocalization'][0][0][0].astype(int)

            # per event: the num_time_windows bins right before vocalization,
            # flattened into one vector
            windows = [power[event, :, onset - num_time_windows:onset].flatten()
                       for event, onset in enumerate(vocalization)]

            # average over events -> one feature vector for this channel
            channel_features.append(np.mean(windows, axis=0))

        first_feature.append(channel_features[0])
        second_feature.append(channel_features[1])

    ### after looping through all channels, stack the per-channel feature vectors
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    # centroid of each word pair's feature vectors
    first_centroid = np.mean(first_feature, axis=0)
    second_centroid = np.mean(second_feature, axis=0)

    # distance of every channel's vector from the *other* word pair's centroid
    first_hist = [distances(x, second_centroid) for x in first_feature]
    second_hist = [distances(x, first_centroid) for x in second_feature]

    ### Plot Histogram of Distances
    ax = fig.add_subplot(num_groups, 1, idx+1)
    ax.hist(first_hist, label='Distance for ' + group[0], lw=3, alpha=0.75)
    ax.hist(second_hist, label='Distance For ' + group[1], lw=3, alpha=0.5)
    ax.set_title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
    # label with the metric actually configured in `distances`
    metric_name = getattr(distances, '__name__', 'configured metric')
    ax.set_xlabel('Distance (' + metric_name + ') from "other" centroid')
    ax.set_ylabel('Count')
    ax.legend(loc='upper right')

plt.tight_layout()

Probe Words Overlap Analysis



In [9]:

    
# For every probe-word-overlap combination, histogram how far each channel's
# feature vector lies from the centroid of the other word pair's feature vectors.
# NOTE(review): this cell is repeated almost verbatim for the other comparison
# groups; a shared function taking the group list would remove the duplication.
num_groups = len(probe_words_groups)
fig = plt.figure(figsize=(7, 5*num_groups))

for idx, group in enumerate(probe_words_groups):

    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # List each directory on its own (they need not hold the same number of
    # entries) and sort, so that index i refers to the same channel in both.
    # NOTE(review): assumes both directories use the same "<channel>_..." naming -- confirm
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))

    # Analyze every channel that has a file. Ids are kept as a set of strings so
    # the membership test below is an exact match, not a substring search.
    # To restrict to ANOVA-selected channels instead, use:
    #     chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    chans_to_analyze = set(name.split('_')[0] for name in first_files)

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # make sure this is a channel we want (id = file-name prefix before '_')
        if first_name.split('_')[0] not in chans_to_analyze:
            continue

        channel_features = []
        for channel_dir, channel_file in ((first_filedir, first_name), (second_filedir, second_name)):
            data = scipy.io.loadmat(channel_dir + channel_file)['data']

            # power matrix Z is #events X #freq. bands X #time bins; keep the
            # configured frequency bands only
            power = data['powerMatZ'][0][0][:, freq_bands, :]

            # time bin of vocalization for every event; cast to plain ints so the
            # slice arithmetic below does not depend on the dtype loadmat returns
            vocalization = data['vocalization'][0][0][0].astype(int)

            # per event: the num_time_windows bins right before vocalization,
            # flattened into one vector
            windows = [power[event, :, onset - num_time_windows:onset].flatten()
                       for event, onset in enumerate(vocalization)]

            # average over events -> one feature vector for this channel
            channel_features.append(np.mean(windows, axis=0))

        first_feature.append(channel_features[0])
        second_feature.append(channel_features[1])

    ### after looping through all channels, stack the per-channel feature vectors
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    # centroid of each word pair's feature vectors
    first_centroid = np.mean(first_feature, axis=0)
    second_centroid = np.mean(second_feature, axis=0)

    # distance of every channel's vector from the *other* word pair's centroid
    first_hist = [distances(x, second_centroid) for x in first_feature]
    second_hist = [distances(x, first_centroid) for x in second_feature]

    ### Plot Histogram of Distances
    ax = fig.add_subplot(num_groups, 1, idx+1)
    ax.hist(first_hist, label='Distance for ' + group[0], lw=3, alpha=0.75)
    ax.hist(second_hist, label='Distance For ' + group[1], lw=3, alpha=0.5)
    ax.set_title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
    # label with the metric actually configured in `distances`
    metric_name = getattr(distances, '__name__', 'configured metric')
    ax.set_xlabel('Distance (' + metric_name + ') from "other" centroid')
    ax.set_ylabel('Count')
    ax.legend(loc='upper right')

plt.tight_layout()

Target Words Overlap Analysis



In [10]:

    
num_groups = len(target_words_groups)
fig = plt.figure(figsize=(7, 5*num_groups))

for idx, group in enumerate(target_words_groups):
    
    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # get list of files for both word pairs
    first_files = []
    second_files = []
    
    files01 = os.listdir(first_filedir)
    files02 = os.listdir(second_filedir)
    for i in range(0, len(files01)):
        if files01[i].endswith('.mat'):
            first_files.append(files01[i])
        if files02[i].endswith('.mat'):
            second_files.append(files02[i])

#     chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    chans_to_analyze = str(np.arange(1,96,1))
    
    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for f in range(0, len(first_files)):
        # make sure this is a channel we want
        chan_num = first_files[f].split('_')[0]
        
        if chan_num in chans_to_analyze:
            #################### Set up data from the channel's mat file ####################
            # Go through each .mat file
            first_mat_file = first_filedir + first_files[f]
            second_mat_file = second_filedir + second_files[f]

            data_first = scipy.io.loadmat(first_mat_file)
            data_first = data_first['data']
            data_second = scipy.io.loadmat(second_mat_file)
            data_second = data_second['data']

            ## 01: Grab probewords for each struct
            first_probeWord = data_first['probeWord'][0][0][0]
            second_probeWord = data_second['probeWord'][0][0][0]

            ## 02: Grab targetwords for each struct
            first_targetWord = data_first['targetWord'][0][0][0]
            second_targetWord = data_second['targetWord'][0][0][0]

            ## 03: get channel number
            first_chanNum = data_first['chanNum'][0][0][0][0]
            second_chanNum = data_second['chanNum'][0][0][0][0]

            ## 04: get channel string
            first_chanStr = data_first['chanStr'][0][0][0]
            second_chanStr = data_second['chanStr'][0][0][0]

            ## 05: get power matrix Z is a #events X #freq. bands X #time bins
            first_matrix = data_first['powerMatZ'][0][0]
            second_matrix = data_second['powerMatZ'][0][0]

            ## 06: get the time point for probeword on
            first_timeZero = data_first['timeZero'][0][0][0]
            second_timeZero = data_second['timeZero'][0][0][0]

            ## 07: get the time point of vocalization
            first_vocalization = data_first['vocalization'][0][0][0]
            second_vocalization = data_second['vocalization'][0][0][0]
        
            ########### FINISHED EXTRACTING DATA FROM STRUCTS ###################
            ### 01: create power matrix with only delta, theta and high gamma
            first_matrix = first_matrix[:, freq_bands,:]
            second_matrix = second_matrix[:, freq_bands,:]
        
            ### 02: get only the time point before vocalization
            first_mean = []
            second_mean = []
            for i in range(0, len(first_vocalization)):
#                 first_mean.append(np.mean(first_matrix[i,:,first_vocalization[i]-10:first_vocalization[i]], axis=1))
                
                first_mean.append(np.ndarray.flatten(first_matrix[i,:,first_vocalization[i]-num_time_windows:first_vocalization[i]]))
            for i in range(0, len(second_vocalization)):
#                 second_mean.append(np.mean(second_matrix[i,:,second_vocalization[i]-10:second_vocalization[i]], axis=1))
                
                second_mean.append(np.ndarray.flatten(second_matrix[i,:,second_vocalization[i]-num_time_windows:second_vocalization[i]]))
               
            # append the averaged time and event vector = frequency band feature
            first_feature.append(np.mean(first_mean,axis=0))
            second_feature.append(np.mean(second_mean,axis=0))
                
        #### end of if/else
    #### end of for loop
    
    ### 03: after looping through all significant channels, create list of feature vectors from each channel
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)
    
#     print first_feature.shape
#     print second_feature.shape
    
    # compute freqX1 centroid vector
    first_centroid = np.mean(first_feature,axis=0)
    second_centroid = np.mean(second_feature,axis=0)
     
    # compute list of distances from other centroid
    first_hist = [distances(x, second_centroid) for x in first_feature]
    second_hist = [distances(x, first_centroid) for x in second_feature]
    
    ## log transform
#     first_hist = np.log(first_hist)
#     second_hist = np.log(second_hist)
    
    ### 04: Plot Histogram of Distances
    plt.subplot(num_groups, 1, idx+1)
#     fig = plt.figure()
    first_label = 'Distance for ' + group[0]
    second_label = 'Distance For ' + group[1]
    plt.hist(first_hist, label=first_label, lw=3, alpha = 0.75)
    plt.hist(second_hist, label=second_label, lw=3, alpha = 0.5)
    plt.title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
    plt.xlabel('Euclidean Distance From "other" centroid')
    plt.ylabel('Count')
    plt.legend(loc='upper right')
    
plt.tight_layout()
#     break

Discussion

I tried both cosine and Euclidean distance measures, and varied the number of channels included by changing the threshold hyperparameter. Many of the resulting histograms seem to overlap each other.

What I want to try below is to take a pair of very different words and a word pair that should be more similar, and plot the distance histograms for those two cases together. My worry is that so many of the Euclidean distances are close to 0.



In [6]:

    
##### HYPER-PARAMETERS TO TUNE
# Module-level settings read by the analysis cells of this notebook.
anova_threshold = 90          # how many channels we want to keep (passed to gridSearchChannels)
distances = Distance.cosine   # distance metric to use; called as distances(vector_a, vector_b)
num_time_windows = 5          # number of time bins kept immediately before vocalization
freq_bands = [0, 1, 5]        # indices into freq_labels of the frequency bands to keep
# freq_bands = np.arange(0,7,1)   # alternative: keep all seven bands

freq_labels = ['delta', 'theta', 'alpha', 'beta', 'low gamma', 'high gamma', 'HFO']

# Single-argument print(...) produces the same text on Python 2 and Python 3,
# so this cell no longer fails with a SyntaxError on a Python 3 kernel.
print(freq_bands)
print([freq_labels[i] for i in freq_bands])

# Each channel contributes one flattened (bands x time bins) vector, hence the product.
print("The length of the feature vector for each channel will be:  %d"
      % (num_time_windows * len(freq_bands)))









    



[0, 1, 5]
['delta', 'theta', 'high gamma']
The length of the feature vector for each channel will be:  15

Principal Component Analysis (PCA)



In [37]:

    
## Plot every pair of "different" word groups against one another in a shared
## PCA space, using all 96 channels.
num_groups = len(diff_words_groups)
groups_root = '../condensed_data/groups/'


def _mat_files(directory):
    """Return the '.mat' file names in `directory`, sorted by name.

    os.listdir gives no ordering guarantee, so the names are sorted to make
    the index-based pairing of the two groups deterministic.
    """
    return sorted(name for name in os.listdir(directory) if name.endswith('.mat'))


def _channel_feature(mat_path):
    """Build one channel's feature vector from its .mat file.

    For every event, the z-scored power of the selected `freq_bands` in the
    `num_time_windows` bins right before vocalization is flattened into a
    vector; the vectors are then averaged across events.
    """
    data = scipy.io.loadmat(mat_path)['data']
    # powerMatZ is #events X #freq. bands X #time bins; keep the selected bands only
    power = data['powerMatZ'][0][0][:, freq_bands, :]
    vocalization = data['vocalization'][0][0][0]

    per_event = []
    for event, voc in enumerate(vocalization):
        # int() keeps the slice arithmetic in plain Python ints whatever integer
        # dtype loadmat chose, so `stop - num_time_windows` cannot wrap around
        # in an unsigned type.
        stop = int(voc)
        per_event.append(power[event, :, stop - num_time_windows:stop].flatten())
    return np.mean(per_event, axis=0)


for group in diff_words_groups:

    ######## Get list of files (.mat) we want to work with ########
    first_filedir = groups_root + group[0] + '/'
    second_filedir = groups_root + group[1] + '/'

    # Each directory is filtered on its own: indexing the second listing with
    # the first listing's length raised IndexError or silently dropped files.
    first_files = _mat_files(first_filedir)
    second_files = _mat_files(second_filedir)
    if len(first_files) != len(second_files):
        raise ValueError('Different number of channel files for ' + str(group))
    # NOTE(review): pairing by sorted position assumes both directories name
    # their files with the same '<channel>_...' prefix -- TODO confirm.

#     chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    # Every channel 1..96, kept as exact strings. The previous str(np.arange(...))
    # turned the membership test below into a substring search on the printed array.
    chans_to_analyze = set(str(chan) for chan in range(1, 97))

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # make sure this is a channel we want
        chan_num = first_name.split('_')[0]
        if chan_num in chans_to_analyze:
            first_feature.append(_channel_feature(first_filedir + first_name))
            second_feature.append(_channel_feature(second_filedir + second_name))

    # one row per analyzed channel
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    ### PCA On Features
    # Fit ONE projection on both groups together. Fitting each group separately
    # centres and rotates each cloud independently, so the two scatters would
    # not share coordinates and overlap on the plot would carry no meaning.
    pca = PCA(n_components=3)
    pca.fit(np.vstack((first_feature, second_feature)))
    first_pca = pca.transform(first_feature)
    second_pca = pca.transform(second_feature)

    ### Scatter-matrix of the principal components
    plt.figure(figsize=(20,15))
    num_components = first_pca.shape[1]
    for i in range(num_components):
        for j in range(num_components):
            plt.subplot(num_components, num_components, i * num_components + j + 1)
            if i == j:
                # diagonal panels only carry a label
                axes = plt.gca()
                ymin, ymax = axes.get_ylim()
                xmin, xmax = axes.get_xlim()
                plt.text((xmax-xmin)/4.5, (ymax-ymin)/2, r'PCA Component %d vs. %d'%(i+1, i+1), fontsize=20)
                plt.title('PCA Plot for ' + str(group))
                plt.grid(False)
            else:
                plt.scatter(first_pca[:,i], first_pca[:,j], color='g', label='first_feature pca')
                plt.scatter(second_pca[:,i], second_pca[:,j], color='b', label='second feature pca')
                plt.title('PCA Plot for ' + str(group))
                plt.legend()

    # tidy this group's figure; calling it after the loop only touched the last one
    plt.tight_layout()



In [38]:

    
## Plot every pair of "different" word groups against one another in a shared
## PCA space, restricted to the channels chosen by gridSearchChannels.
num_groups = len(diff_words_groups)
groups_root = '../condensed_data/groups/'


def _mat_files(directory):
    """Return the '.mat' file names in `directory`, sorted by name.

    os.listdir gives no ordering guarantee, so the names are sorted to make
    the index-based pairing of the two groups deterministic.
    """
    return sorted(name for name in os.listdir(directory) if name.endswith('.mat'))


def _channel_feature(mat_path):
    """Build one channel's feature vector from its .mat file.

    For every event, the z-scored power of the selected `freq_bands` in the
    `num_time_windows` bins right before vocalization is flattened into a
    vector; the vectors are then averaged across events.
    """
    data = scipy.io.loadmat(mat_path)['data']
    # powerMatZ is #events X #freq. bands X #time bins; keep the selected bands only
    power = data['powerMatZ'][0][0][:, freq_bands, :]
    vocalization = data['vocalization'][0][0][0]

    per_event = []
    for event, voc in enumerate(vocalization):
        # int() keeps the slice arithmetic in plain Python ints whatever integer
        # dtype loadmat chose, so `stop - num_time_windows` cannot wrap around
        # in an unsigned type.
        stop = int(voc)
        per_event.append(power[event, :, stop - num_time_windows:stop].flatten())
    return np.mean(per_event, axis=0)


for group in diff_words_groups:

    ######## Get list of files (.mat) we want to work with ########
    first_filedir = groups_root + group[0] + '/'
    second_filedir = groups_root + group[1] + '/'

    # Each directory is filtered on its own: indexing the second listing with
    # the first listing's length raised IndexError or silently dropped files.
    first_files = _mat_files(first_filedir)
    second_files = _mat_files(second_filedir)
    if len(first_files) != len(second_files):
        raise ValueError('Different number of channel files for ' + str(group))
    # NOTE(review): pairing by sorted position assumes both directories name
    # their files with the same '<channel>_...' prefix -- TODO confirm.

    # NOTE(review): the return type of gridSearchChannels is not visible here;
    # if it is a string, the `in` test below is a substring match -- verify.
    chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
#     chans_to_analyze = str(np.arange(1,97,1))

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # make sure this is a channel we want
        chan_num = first_name.split('_')[0]
        if chan_num in chans_to_analyze:
            first_feature.append(_channel_feature(first_filedir + first_name))
            second_feature.append(_channel_feature(second_filedir + second_name))

    # one row per analyzed channel
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    ### PCA On Features
    # Fit ONE projection on both groups together. Fitting each group separately
    # centres and rotates each cloud independently, so the two scatters would
    # not share coordinates and overlap on the plot would carry no meaning.
    pca = PCA(n_components=3)
    pca.fit(np.vstack((first_feature, second_feature)))
    first_pca = pca.transform(first_feature)
    second_pca = pca.transform(second_feature)

    ### Scatter-matrix of the principal components
    plt.figure(figsize=(20,15))
    # single-argument print(...) emits the same text on Python 2 and Python 3
    print('PCA Plot for ' + str(group))
    num_components = first_pca.shape[1]
    for i in range(num_components):
        for j in range(num_components):
            plt.subplot(num_components, num_components, i * num_components + j + 1)
            if i == j:
                # diagonal panels only carry a label
                axes = plt.gca()
                ymin, ymax = axes.get_ylim()
                xmin, xmax = axes.get_xlim()
                plt.text((xmax-xmin)/4.5, (ymax-ymin)/2, r'PCA Component %d vs. %d'%(i+1, i+1), fontsize=20)
                plt.title('PCA Plot for ' + str(group))
                plt.grid(False)
            else:
                plt.scatter(first_pca[:,i], first_pca[:,j], color='g', label='first_feature pca')
                plt.scatter(second_pca[:,i], second_pca[:,j], color='b', label='second feature pca')
                plt.title('PCA Plot for ' + str(group))
                plt.legend()

    # tidy this group's figure; calling it after the loop only touched the last one
    plt.tight_layout()









    



PCA Plot for ('BRICK_CLOCK', 'GLASS_JUICE')
PCA Plot for ('CLOCK_BRICK', 'GLASS_PANTS')
PCA Plot for ('CLOCK_BRICK', 'JUICE_GLASS')
PCA Plot for ('CLOCK_BRICK', 'PANTS_GLASS')
PCA Plot for ('BRICK_JUICE', 'CLOCK_GLASS')
PCA Plot for ('JUICE_BRICK', 'GLASS_CLOCK')
PCA Plot for ('JUICE_BRICK', 'GLASS_PANTS')
PCA Plot for ('JUICE_BRICK', 'PANTS_GLASS')
PCA Plot for ('BRICK_PANTS', 'CLOCK_GLASS')
PCA Plot for ('PANTS_BRICK', 'GLASS_CLOCK')
PCA Plot for ('BRICK_PANTS', 'GLASS_JUICE')
PCA Plot for ('PANTS_BRICK', 'JUICE_GLASS')
PCA Plot for ('CLOCK_BRICK', 'GLASS_JUICE')
PCA Plot for ('BRICK_CLOCK', 'GLASS_PANTS')
PCA Plot for ('BRICK_CLOCK', 'JUICE_GLASS')
PCA Plot for ('BRICK_CLOCK', 'PANTS_GLASS')
PCA Plot for ('GLASS_CLOCK', 'BRICK_JUICE')
PCA Plot for ('CLOCK_GLASS', 'JUICE_BRICK')
PCA Plot for ('CLOCK_GLASS', 'PANTS_BRICK')
PCA Plot for ('GLASS_CLOCK', 'BRICK_PANTS')
PCA Plot for ('GLASS_JUICE', 'PANTS_BRICK')
PCA Plot for ('PANTS_GLASS', 'BRICK_JUICE')
PCA Plot for ('BRICK_JUICE', 'GLASS_PANTS')
PCA Plot for ('JUICE_GLASS', 'BRICK_PANTS')

Comparing Distances Relatively

Since there is no clear separation among these "different" word groups, I want to look at their distances relative to those of other kinds of word groupings.

For example, completely different word groups vs. reversed word pair vs. probe word vs. target word vs. same word overlap.