Looking at spectrograms of different triggers with different response times. My hypothesis is that there is some difference in power depending on how the long it takes for the patient to vocalize a response. This would be related to sort of inherent memory vs. actively searching for the memory.
The different response times can all be categorized by looking at the events struct. Split it correspondingly by different probe words, targetwords, or word match pairs and analyze the distribution.
I will determine if the distributions of response times for different probewords are the same, and for different unique word match pairs.
Things to Change:
In [1]:
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random
import itertools
import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt
import scipy.io
from sklearn.decomposition import PCA
# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')
%matplotlib inline
In [2]:
######## Get list of files (.mat) we want to work with ########
filedir = '../condensed_data/freq_probeTovocal_binned/'
files = []
for file in os.listdir(filedir):
if file.endswith('.mat'):
files.append(file)
######## Load in EVENTS struct to find correct events
eventsDir = '../NIH034/behavioral/paRemap/' + 'events.mat'
events = scipy.io.loadmat(eventsDir)
events = events['events']
# print number of incorrect events and which words they belonged to
incorrectIndices = events['isCorrect'] == 0
incorrectEvents = events[incorrectIndices]
incorrectWords = []
wordList = {}
for i in range(0, len(incorrectEvents)):
incorrectWords.append(incorrectEvents['probeWord'][i][0])
for word in np.unique(incorrectEvents['probeWord']):
wordList[str(word)] = sum(incorrectWords == word)
print "There were ",len(incorrectEvents), " number of incorrect events."
print "The list of incorrect probe words: \n", wordList
#
# get only correct events
correctIndices = events['isCorrect'] == 1
events = events[correctIndices]
print "\nThis is the length of the events struct with only correct responses: ", len(events)
print "There are ", len(files), " files inside our directory"
print "This should match the number of channels."
In [3]:
# convert responsetime into a list
responseTimes = []
for i in range(0, len(events)):
responseTimes.append(events['responseTime'][i][0][0])
# print events['responseTime']
# break
responseTimes = np.array(responseTimes)
# create a dictionary of response times for each probe word
wordResponseTimes = {}
for word in np.unique(incorrectEvents['probeWord']):
wordIndices = events['probeWord'] == word
mask = np.where(wordIndices==True)[0]
wordResponseTimes[str(word[0])] = responseTimes[mask]/1000
print events.dtype
In [4]:
################## LOOPING THROUGH EACH CHANNEL ##################
feature_dict = {} # the dict to hold the feature matrix for each channel
for f in range(0, len(files)):
#################### Set up data from the channel's mat file ####################
# Go through each .mat file
mat_file = filedir + files[f]
data = scipy.io.loadmat(mat_file)
data = data['data']
## 01: reformat unique trigger types
uniqueTrigTypes = data['uniqueTrigType'][0][0][0]
buff = []
for trig in uniqueTrigTypes:
buff.append(str(trig[0]))
uniqueTrigTypes = buff
## 02: reformat trigger types
trigTypes = data['trigType'][0][0][0]
buff = []
for trig in trigTypes:
buff.append(str(trig[0]))
trigTypes = buff
## 03: get channel number
chanNum = data['chanNum'][0][0][0][0]
## 04: get channel string
chanStr = data['chanStr'][0][0][0]
## 05: get power matrix Z is a #events X #freq. bands
matrix = data['powerMatZ'][0][0]
## 06: get freq band ticks and ylabels
freqBandYtick = data['freqBandYtick'][0][0][0]
freqBandYlabel = data['freqBandYlabel'][0][0][0]
buff = []
for freq in freqBandYlabel:
buff.append(str(freq[0]))
freqBandYlabel = buff
#################### Getting those events and the corresonding averaged powermat ####################
## Get events of interest
TRIGGER_TYPES = uniqueTrigTypes
probeWords = events['probeWord']
targetWords = events['targetWord']
# number of frequency bins
num_freqs = len(freqBandYtick) - 1
# total number of "data centers"
# num_features = len(TRIGGER_TYPES)*len(tempTargets)
features = {}
for i in range(0,len(TRIGGER_TYPES)): # LOOP THRU EACH PROBEWORD
current_trig = TRIGGER_TYPES[i]
## 01: get indices of the current trigger and get those events
tempInd = events['probeWord'] == current_trig
tempEvents = events[tempInd]
tempTargets = np.unique(tempEvents['targetWord'])
## 02: go through each target word for this probeword
for j in range(0, len(tempTargets)):
targetWord = tempTargets[j][0] # set target word
# get the indices of the events we want probe/target match
eventInd = events['probeWord'] == current_trig
eventInd2 = events['targetWord'] == targetWord
eventInd = np.array([any(tup) for tup in zip(eventInd, eventInd2)])
# get the matrix we want and average across all events
thisMat = np.mean(matrix[eventInd,:], axis=0)
# -> a 7x1 vector that represents this match for this channel
feature_key = str(current_trig) + '_' + str(targetWord)
features[feature_key] = thisMat
# clear vars
eventInd2 = 0
# turn features into np array and append to a dict of the features
# features = np.array(features)
feature_dict[str(f+1)] = features
print "The final feature dictionary will have features from the "
print "following channels: ", sorted(feature_dict.keys())
print "Each channel has these features: ", feature_dict['2'].keys()
print "With shape of: ", feature_dict['2']['JUICE_GLASS'].shape
In [31]:
# test = responseTimes != 0
print max(responseTimes)
print min(responseTimes)
print responseTimes
plt.hist(responseTimes)
Out[31]:
In [38]:
print wordResponseTimes.keys()
plt.figure(figsize=(10, 10))
i = 1
for key in wordResponseTimes.keys():
responseTime = wordResponseTimes[key]
plt.subplot(5, 1, i)
plt.hist(responseTime)
plt.title("For Probeword: " + key)
i += 1
plt.tight_layout()
In [19]:
print wordResponseTimes.keys()
percentile_responseTimes = {}
plt.figure(figsize=(10, 10))
i = 1
for key in wordResponseTimes.keys():
percentile = np.percentile(wordResponseTimes[key], 90)
responseTime = wordResponseTimes[key][wordResponseTimes[key] < percentile]
plt.subplot(5, 1, i)
plt.hist(responseTime)
plt.title("For Probeword: " + key)
i += 1
percentile_responseTimes[key] = responseTime
plt.tight_layout()
juice = percentile_responseTimes['JUICE']
glass = percentile_responseTimes['GLASS']
brick = percentile_responseTimes['BRICK']
pants = percentile_responseTimes['PANTS']
clock = percentile_responseTimes['CLOCK']
# Ks 1 sample test
print scipy.stats.ks_2samp(juice, glass)
print scipy.stats.ks_2samp(juice, brick)
print scipy.stats.ks_2samp(juice, pants)
print scipy.stats.ks_2samp(glass, pants)
print scipy.stats.ks_2samp(brick, pants)
print scipy.stats.ks_2samp(glass, clock)
print scipy.stats.ks_2samp(brick, glass)
print scipy.stats.ks_2samp(brick, clock)
Here, I use the nonparametric Kolmogorov-smirnov test statistic to determine whether the various response time distributions are the same, or not.
Should I run a quantile filter? Since there seems to be a decent amount of outliers in terms of response times
In [56]:
print wordResponseTimes.keys()
for key in wordResponseTimes.keys():
print wordResponseTimes[key].shape
juice = wordResponseTimes['JUICE']
glass = wordResponseTimes['GLASS']
brick = wordResponseTimes['BRICK']
pants = wordResponseTimes['PANTS']
clock = wordResponseTimes['CLOCK']
# Ks 1 sample test
print scipy.stats.ks_2samp(juice, glass)
print scipy.stats.ks_2samp(juice, brick)
print scipy.stats.ks_2samp(juice, pants)
print scipy.stats.ks_2samp(glass, pants)
print scipy.stats.ks_2samp(brick, pants)
print scipy.stats.ks_2samp(glass, clock)
print scipy.stats.ks_2samp(brick, glass)
print scipy.stats.ks_2samp(brick, clock)
Here, I demonstrate the power of the Kolmogorov-Smirnov Test Statistic with varying sample sizes.
Ideally we would have around 1000 samples per each distribution, but that is not feasible.
Other ways we could improve the power of this test would be to bootstrap our sample data to increase our overall sample size.
In [48]:
# define number of subjects per class
np.random.seed(123456789) # for reproducibility, set random seed
S = np.array((4, 6, 8, 10, 14, 18, 20, 26, 30, 40,
50, 60, 70, 80, 100, 120, 150, 200, 250,
300, 400, 500, 750, 1000, 1500, 2000,
3000, 5000))
alpha = 0.05
N = 50 # # samples at each iteration
In [51]:
pow_null = np.array((), dtype=np.dtype('float64'))
powks_null = np.array((), dtype=np.dtype('float64'))
powks2_null = np.array((), dtype=np.dtype('float64'))
# compute this statistic for various sizes of datasets
for s in S:
# compute this many times for each operating point to get average
kspval = np.array((), dtype=np.dtype('float64'))
ks2pval = np.array((), dtype=np.dtype('float64'))
for _ in itertools.repeat(None,N):
g0 = np.random.uniform(0, 1, s) # (null)
g1 = np.random.uniform(0, 1, s) # null
# compute Kolmogorov-test statistic on generated data
ks2test_stat = scipy.stats.ks_2samp(g0, g1)
ks2pval = np.append(pval, ks2test_stat.pvalue)
kstest_stat = scipy.stats.kstest(g0, 'uniform', args=(0,1))
kspval = np.append(pval, kstest_stat.pvalue)
# record average p value at operating point
powks_null = np.append(powks_null, np.sum(1.0*(kspval < alpha))/N)
powks2_null = np.append(powks2_null, np.sum(1.0*(ks2pval < alpha))/N)
In [52]:
pow_alt = np.array((), dtype=np.dtype('float64'))
powks_alt = np.array((), dtype=np.dtype('float64'))
powks2_alt = np.array((), dtype=np.dtype('float64'))
# compute this statistic for various sizes of datasets
for s in S:
# compute this many times for each operating point to get average
kspval = np.array((), dtype=np.dtype('float64'))
ks2pval = np.array((), dtype=np.dtype('float64'))
for _ in itertools.repeat(None,N):
g0 = np.random.uniform(0.2, 0.9, s) # (null)
g1 = np.random.uniform(0, 1, s) # alternative
# compute Kolmogorov-test statistic on generated data
ks2test_stat = scipy.stats.ks_2samp(g0, g1)
ks2pval = np.append(pval, ks2test_stat.pvalue)
kstest_stat = scipy.stats.kstest(g0, 'uniform', args=(0,1))
kspval = np.append(pval, kstest_stat.pvalue)
# record average p value at operating point
powks_alt = np.append(powks_alt, np.sum(1.0*(kspval < alpha))/N)
powks2_alt = np.append(powks2_alt, np.sum(1.0*(ks2pval < alpha))/N)
In [59]:
# Plotting power vs. n on the null set
fig = plt.figure(figsize=(10,10))
plt.subplot(312)
plt.scatter(S, powks_null, hold=True, label='null')
plt.scatter(S, powks_alt, color='green', hold=True, label='alt')
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('power')
plt.title('Strength of Uniform Classification under With Ks one sample test')
plt.axhline(alpha, color='red', linestyle='--', label='alpha')
plt.axvline(350, color='black', linestyle='--', label='n of our data')
plt.axvline(227, color='black', linestyle='--', label='n of our data')
plt.legend(loc=5)
plt.show()
# Plotting power vs. n on the null set
fig = plt.figure(figsize=(10,10))
plt.subplot(313)
plt.scatter(S, powks2_null, hold=True, label='null')
plt.scatter(S, powks2_alt, color='green', hold=True, label='alt')
plt.xscale('log')
plt.xlabel('number of samples')
plt.ylabel('power')
plt.title('Strength of Uniform Classification under With Ks two sample test')
plt.axhline(alpha, color='red', linestyle='--', label='alpha')
plt.axvline(350, color='black', linestyle='--', label='n of our data')
plt.axvline(227, color='black', linestyle='--', label='n of our data')
plt.legend(loc=5)
plt.show()
In [ ]: