Exploratory Response Times

Looking at spectrograms of different triggers with different response times. My hypothesis is that there is some difference in power depending on how the long it takes for the patient to vocalize a response. This would be related to sort of inherent memory vs. actively searching for the memory.

The different response times can all be categorized by looking at the events struct. Split it correspondingly by different probe words, targetwords, or word match pairs and analyze the distribution.

I will determine if the distributions of response times for different probewords are the same, and for different unique word match pairs.

Things to Change:

Patient dir if you analyze a new patient



In [1]:

    
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random

import itertools
import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt

import scipy.io
from sklearn.decomposition import PCA

# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')

%matplotlib inline



In [2]:

    
######## Get list of files (.mat) we want to work with ########
filedir = '../condensed_data/freq_probeTovocal_binned/'
files = []

for file in os.listdir(filedir):
    if file.endswith('.mat'):
        files.append(file)

######## Load in EVENTS struct to find correct events
eventsDir = '../NIH034/behavioral/paRemap/' + 'events.mat'

events = scipy.io.loadmat(eventsDir)
events = events['events']

# print number of incorrect events and which words they belonged to
incorrectIndices = events['isCorrect'] == 0
incorrectEvents = events[incorrectIndices]
incorrectWords = []
wordList = {}
for i in range(0, len(incorrectEvents)):
    incorrectWords.append(incorrectEvents['probeWord'][i][0])

for word in np.unique(incorrectEvents['probeWord']):
    wordList[str(word)] = sum(incorrectWords == word)
    
print "There were ",len(incorrectEvents), " number of incorrect events."
print "The list of incorrect probe words: \n", wordList
# 
# get only correct events
correctIndices = events['isCorrect'] == 1
events = events[correctIndices]

print "\nThis is the length of the events struct with only correct responses: ", len(events)
print "There are ", len(files), " files inside our directory"
print "This should match the number of channels."









    



There were  49  number of incorrect events.
The list of incorrect probe words: 
{"[u'PANTS']": 7, "[u'JUICE']": 8, "[u'BRICK']": 12, "[u'CLOCK']": 13, "[u'GLASS']": 9}

This is the length of the events struct with only correct responses:  1431
There are  96  files inside our directory
This should match the number of channels.

Extract Important Events Information

Response Times, etc.



In [3]:

    
# convert responsetime into a list
responseTimes = []
for i in range(0, len(events)):
    responseTimes.append(events['responseTime'][i][0][0])
#     print events['responseTime']
#     break
responseTimes = np.array(responseTimes)
    
# create a dictionary of response times for each probe word
wordResponseTimes = {}
for word in np.unique(incorrectEvents['probeWord']):
    wordIndices = events['probeWord'] == word
    mask = np.where(wordIndices==True)[0]
    wordResponseTimes[str(word[0])] = responseTimes[mask]/1000
print events.dtype









    



[('experiment', 'O'), ('subject', 'O'), ('sessionName', 'O'), ('sessionNum', 'O'), ('type', 'O'), ('msoffset', 'O'), ('mstime', 'O'), ('probeWord', 'O'), ('targetWord', 'O'), ('isCorrect', 'O'), ('blocknumber', 'O'), ('miniblocknumber', 'O'), ('matchOnTime', 'O'), ('probeOffTime', 'O'), ('fixationOnTime', 'O'), ('fixationOffTime', 'O'), ('responseTime', 'O'), ('vocalizedWord', 'O'), ('eegfile', 'O'), ('eegoffset', 'O')]



In [4]:

    
################## LOOPING THROUGH EACH CHANNEL ##################
feature_dict = {} # the dict to hold the feature matrix for each channel

for f in range(0, len(files)):
    #################### Set up data from the channel's mat file ####################
    # Go through each .mat file
    mat_file = filedir + files[f]

    data = scipy.io.loadmat(mat_file)
    data = data['data']


    ## 01: reformat unique trigger types
    uniqueTrigTypes = data['uniqueTrigType'][0][0][0]
    buff = []
    for trig in uniqueTrigTypes:
        buff.append(str(trig[0]))
    uniqueTrigTypes = buff

    ## 02: reformat trigger types
    trigTypes = data['trigType'][0][0][0]
    buff = []
    for trig in trigTypes:
        buff.append(str(trig[0]))
    trigTypes = buff

    ## 03: get channel number
    chanNum = data['chanNum'][0][0][0][0]

    ## 04: get channel string
    chanStr = data['chanStr'][0][0][0]

    ## 05: get power matrix Z is a #events X #freq. bands
    matrix = data['powerMatZ'][0][0]

    ## 06: get freq band ticks and ylabels
    freqBandYtick = data['freqBandYtick'][0][0][0]
    freqBandYlabel = data['freqBandYlabel'][0][0][0]
    buff = []
    for freq in freqBandYlabel:
        buff.append(str(freq[0]))
    freqBandYlabel = buff

    #################### Getting those events and the corresonding averaged powermat  ####################
    ## Get events of interest
    TRIGGER_TYPES = uniqueTrigTypes
    probeWords = events['probeWord']
    targetWords = events['targetWord']

    # number of frequency bins
    num_freqs = len(freqBandYtick) - 1
    # total number of "data centers"
#     num_features = len(TRIGGER_TYPES)*len(tempTargets)
    features = {}
    
    for i in range(0,len(TRIGGER_TYPES)): # LOOP THRU EACH PROBEWORD
        current_trig = TRIGGER_TYPES[i]

        ## 01: get indices of the current trigger and get those events
        tempInd = events['probeWord'] == current_trig
        tempEvents = events[tempInd]
        tempTargets = np.unique(tempEvents['targetWord'])
    
        ## 02: go through each target word for this probeword
        for j in range(0, len(tempTargets)):
            targetWord = tempTargets[j][0] # set target word
    
            # get the indices of the events we want probe/target match
            eventInd = events['probeWord'] == current_trig
            eventInd2 = events['targetWord'] == targetWord
            eventInd = np.array([any(tup) for tup in zip(eventInd, eventInd2)])

            # get the matrix we want and average across all events 
            thisMat = np.mean(matrix[eventInd,:], axis=0)
            # -> a 7x1 vector that represents this match for this channel 
            
            feature_key = str(current_trig) + '_' + str(targetWord)
            
            features[feature_key] = thisMat
            # clear vars
            eventInd2 = 0
    
    # turn features into np array and append to a dict of the features
#     features = np.array(features)
    feature_dict[str(f+1)] = features

print "The final feature dictionary will have features from the "
print "following channels: ", sorted(feature_dict.keys())
print "Each channel has these features: ", feature_dict['2'].keys()
print "With shape of: ", feature_dict['2']['JUICE_GLASS'].shape









    



The final feature dictionary will have features from the 
following channels:  ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96']
Each channel has these features:  ['JUICE_GLASS', 'BRICK_JUICE', 'CLOCK_GLASS', 'PANTS_GLASS', 'GLASS_JUICE', 'BRICK_CLOCK', 'GLASS_PANTS', 'PANTS_BRICK', 'GLASS_CLOCK', 'JUICE_BRICK', 'BRICK_PANTS', 'CLOCK_BRICK']
With shape of:  (7,)



In [31]:

    
# test = responseTimes != 0
print max(responseTimes)
print min(responseTimes)
print responseTimes
plt.hist(responseTimes)









    



3830.65771484
676.918457031
[ 1739.0793457   1396.83227539  2609.80737305 ...,  1144.66210938
  1151.40600586  1184.86621094]






    Out[31]:





(array([ 165.,  745.,  317.,  104.,   45.,   21.,   16.,   10.,    6.,    2.]),
 array([  676.91845703,   992.29238281,  1307.66630859,  1623.04023438,
         1938.41416016,  2253.78808594,  2569.16201172,  2884.5359375 ,
         3199.90986328,  3515.28378906,  3830.65771484]),
 <a list of 10 Patch objects>)



In [38]:

    
print wordResponseTimes.keys()
plt.figure(figsize=(10, 10))
i = 1
for key in wordResponseTimes.keys():
    responseTime = wordResponseTimes[key]
    plt.subplot(5, 1, i)
    plt.hist(responseTime)
    plt.title("For Probeword: " + key)
    i += 1
plt.tight_layout()









    



['JUICE', 'GLASS', 'BRICK', 'PANTS', 'CLOCK']



In [19]:

    
print wordResponseTimes.keys()

percentile_responseTimes = {}

plt.figure(figsize=(10, 10))
i = 1
for key in wordResponseTimes.keys():
    percentile = np.percentile(wordResponseTimes[key], 90)
    responseTime = wordResponseTimes[key][wordResponseTimes[key] < percentile]
    plt.subplot(5, 1, i)
    plt.hist(responseTime)
    plt.title("For Probeword: " + key)
    i += 1
    
    percentile_responseTimes[key] = responseTime
plt.tight_layout()

juice = percentile_responseTimes['JUICE']
glass = percentile_responseTimes['GLASS']
brick = percentile_responseTimes['BRICK']
pants = percentile_responseTimes['PANTS']
clock = percentile_responseTimes['CLOCK']

# Ks 1 sample test
print scipy.stats.ks_2samp(juice, glass)
print scipy.stats.ks_2samp(juice, brick)
print scipy.stats.ks_2samp(juice, pants)
print scipy.stats.ks_2samp(glass, pants)
print scipy.stats.ks_2samp(brick, pants)
print scipy.stats.ks_2samp(glass, clock)
print scipy.stats.ks_2samp(brick, glass)
print scipy.stats.ks_2samp(brick, clock)









    



['JUICE', 'GLASS', 'BRICK', 'PANTS', 'CLOCK']
Ks_2sampResult(statistic=0.11768219832735966, pvalue=0.050345319614906216)
Ks_2sampResult(statistic=0.17481466639951915, pvalue=0.0006041054830149018)
Ks_2sampResult(statistic=0.14450598232782313, pvalue=0.019004368003896797)
Ks_2sampResult(statistic=0.23887189942235815, pvalue=4.9520399098733666e-07)
Ks_2sampResult(statistic=0.10556157045985526, pvalue=0.1032780558121716)
Ks_2sampResult(statistic=0.1122004357298475, pvalue=0.079476030461080108)
Ks_2sampResult(statistic=0.25452419292999007, pvalue=1.068255129327557e-09)
Ks_2sampResult(statistic=0.33111070515162588, pvalue=1.3517785242495158e-12)

Testing Differences in Response Time Distribution

Here, I use the nonparametric Kolmogorov-smirnov test statistic to determine whether the various response time distributions are the same, or not.

Should I run a quantile filter? Since there seems to be a decent amount of outliers in terms of response times



In [56]:

    
print wordResponseTimes.keys()

for key in wordResponseTimes.keys():
    print wordResponseTimes[key].shape

juice = wordResponseTimes['JUICE']
glass = wordResponseTimes['GLASS']
brick = wordResponseTimes['BRICK']
pants = wordResponseTimes['PANTS']
clock = wordResponseTimes['CLOCK']

# Ks 1 sample test
print scipy.stats.ks_2samp(juice, glass)
print scipy.stats.ks_2samp(juice, brick)
print scipy.stats.ks_2samp(juice, pants)
print scipy.stats.ks_2samp(glass, pants)
print scipy.stats.ks_2samp(brick, pants)
print scipy.stats.ks_2samp(glass, clock)
print scipy.stats.ks_2samp(brick, glass)
print scipy.stats.ks_2samp(brick, clock)









    



['JUICE', 'GLASS', 'BRICK', 'PANTS', 'CLOCK']
(242,)
(361,)
(358,)
(243,)
(227,)
Ks_2sampResult(statistic=0.10514869165083224, pvalue=0.075800257106005922)
Ks_2sampResult(statistic=0.15864074980377674, pvalue=0.0011924007999215126)
Ks_2sampResult(statistic=0.12985069550726119, pvalue=0.030415943669708533)
Ks_2sampResult(statistic=0.21411716425566846, pvalue=2.4710818977871132e-06)
Ks_2sampResult(statistic=0.095638779685955355, pvalue=0.1336845235499069)
Ks_2sampResult(statistic=0.099942645856468282, pvalue=0.11619386767953627)
Ks_2sampResult(statistic=0.22985499620854544, pvalue=7.8317130441076093e-09)
Ks_2sampResult(statistic=0.29809514434080675, pvalue=2.1998681227156155e-11)

Testing Simulation For KS Test Statistic

Here, I demonstrate the power of the Kolmogorov-Smirnov Test Statistic with varying sample sizes.

Ideally we would have around 1000 samples per each distribution, but that is not feasible.

Other ways we could improve the power of this test would be to bootstrap our sample data to increase our overall sample size.



In [48]:

    
# define number of subjects per class
np.random.seed(123456789)  # for reproducibility, set random seed

S = np.array((4, 6, 8, 10, 14, 18, 20, 26, 30, 40,
              50, 60, 70, 80, 100, 120, 150, 200, 250,
              300, 400, 500, 750, 1000, 1500, 2000,
              3000, 5000))
alpha = 0.05
N = 50 # # samples at each iteration



In [51]:

    
pow_null = np.array((), dtype=np.dtype('float64'))
powks_null = np.array((), dtype=np.dtype('float64'))
powks2_null = np.array((), dtype=np.dtype('float64'))

# compute this statistic for various sizes of datasets
for s in S:
    # compute this many times for each operating point to get average
    kspval = np.array((), dtype=np.dtype('float64')) 
    ks2pval = np.array((), dtype=np.dtype('float64')) 
    
    for _ in itertools.repeat(None,N):
        g0 = np.random.uniform(0, 1, s) # (null)
        g1 = np.random.uniform(0, 1, s) # null
        
        # compute Kolmogorov-test statistic on generated data
        ks2test_stat = scipy.stats.ks_2samp(g0, g1)
        ks2pval = np.append(pval, ks2test_stat.pvalue)
        
        kstest_stat = scipy.stats.kstest(g0, 'uniform', args=(0,1))
        kspval = np.append(pval, kstest_stat.pvalue)       
    
    # record average p value at operating point
    powks_null = np.append(powks_null, np.sum(1.0*(kspval < alpha))/N)
    powks2_null = np.append(powks2_null, np.sum(1.0*(ks2pval < alpha))/N)









    



/Users/adam2392/anaconda/lib/python2.7/site-packages/scipy/stats/morestats.py:2384: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")



In [52]:

    
pow_alt = np.array((), dtype=np.dtype('float64'))
powks_alt = np.array((), dtype=np.dtype('float64'))
powks2_alt = np.array((), dtype=np.dtype('float64'))

# compute this statistic for various sizes of datasets
for s in S:

    # compute this many times for each operating point to get average    
    kspval = np.array((), dtype=np.dtype('float64')) 
    ks2pval = np.array((), dtype=np.dtype('float64')) 
    
    for _ in itertools.repeat(None,N):
        g0 = np.random.uniform(0.2, 0.9, s) # (null)
        g1 = np.random.uniform(0, 1, s) # alternative
        
        # compute Kolmogorov-test statistic on generated data
        ks2test_stat = scipy.stats.ks_2samp(g0, g1)
        ks2pval = np.append(pval, ks2test_stat.pvalue)
        
        kstest_stat = scipy.stats.kstest(g0, 'uniform', args=(0,1))
        kspval = np.append(pval, kstest_stat.pvalue)
        
    # record average p value at operating point
    powks_alt = np.append(powks_alt, np.sum(1.0*(kspval < alpha))/N)
    powks2_alt = np.append(powks2_alt, np.sum(1.0*(ks2pval < alpha))/N)



In [59]:

    
# Plotting power vs. n on the null set
fig = plt.figure(figsize=(10,10))
plt.subplot(312)
plt.scatter(S, powks_null, hold=True, label='null')
plt.scatter(S, powks_alt, color='green', hold=True, label='alt')
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('power')
plt.title('Strength of Uniform Classification under With Ks one sample test')
plt.axhline(alpha, color='red', linestyle='--', label='alpha')
plt.axvline(350, color='black', linestyle='--', label='n of our data')
plt.axvline(227, color='black', linestyle='--', label='n of our data')
plt.legend(loc=5)
plt.show()

# Plotting power vs. n on the null set
fig = plt.figure(figsize=(10,10))
plt.subplot(313)
plt.scatter(S, powks2_null, hold=True, label='null')
plt.scatter(S, powks2_alt, color='green', hold=True, label='alt')
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('power')
plt.title('Strength of Uniform Classification under With Ks two sample test')
plt.axhline(alpha, color='red', linestyle='--', label='alpha')
plt.axvline(350, color='black', linestyle='--', label='n of our data')
plt.axvline(227, color='black', linestyle='--', label='n of our data')
plt.legend(loc=5)
plt.show()



In [ ]: