The paRemap dataset has too many frequency bands and channels to analyze exhaustively. To reduce this dimensionality, we take a systematic approach to selecting relevant channels and frequency bands.
Using various classification techniques, perform feature selection on the groups of word pairs.
Labels: same_word_group = 0, diff_word_group = 1.
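As a minimal sketch of that labeling convention (the same construction appears in the analysis cell near the end of this notebook), with hypothetical stand-in arrays in place of the real per-event feature matrices built below:
In [ ]:
import numpy as np
# hypothetical stand-ins for the real (events x band groups x time bins) feature arrays built below
same_word_data = np.random.randn(20, 4, 6)
diff_word_data = np.random.randn(30, 4, 6)
# diff_word_group events are labeled 1, same_word_group events are labeled 0
features = np.append(diff_word_data, same_word_data, axis=0)
y = np.concatenate((np.ones(diff_word_data.shape[0]), np.zeros(same_word_data.shape[0])))
print features.shape, y.shape   # (50, 4, 6) (50,)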
In [1]:
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random
import itertools
import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt
import scipy.io
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')
%matplotlib inline
In [2]:
np.random.seed(12345678) # for reproducibility, set random seed
names = ["Linear SVM",
"Random Forest",
"Quadratic Discriminant Analysis",
"Logistic Regression"]
classifiers = [
SVC(kernel="linear", C=0.5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
QuadraticDiscriminantAnalysis(),
LogisticRegression()]
In [3]:
#### Extract wordpairs data into a dictionary for a subject/session/block
#### dictionary{wordpair:{channels}}
def extractSubjSessionBlockData(subj, session, block):
    # file directory for a subj/session/block
    filedir = '../../condensed_data_' + subj + '/sessions/' + session + '/' + block
    wordpairs = os.listdir(filedir)

    # initialize data dictionary with meta data
    data_dict = {}
    data_dict['meta'] = {'subject': subj,
                         'session': session,
                         'block': block}
    data_dict['data'] = {}

    for wordpair in wordpairs:              # loop thru all wordpairs
        wordpair_dir = filedir + '/' + wordpair
        all_channel_mats = os.listdir(wordpair_dir)
        data_dict['data'][wordpair] = {}

        for channel in all_channel_mats:    # loop thru all channels
            chan_file = wordpair_dir + '/' + channel

            ## 00: load in data
            data = scipy.io.loadmat(chan_file)
            data = data['data']
            ## 01: get the time point for probeword on
            timeZero = data['timeZero'][0][0][0]
            ## 02: get the time point of vocalization
            vocalization = data['vocalization'][0][0][0]
            ## 03: Get Power Matrix
            power_matrix = data['powerMatZ'][0][0]
            # alternative: collapse frequency bins here instead of later on
            # lowbands = np.mean(power_matrix[:,0:2,:], axis=1, keepdims=True)
            # medbands = power_matrix[:,2:4,:]
            # highbands = np.mean(power_matrix[:,4:,:], axis=1, keepdims=True)
            # power_matrix = np.concatenate((lowbands, medbands, highbands), axis=1)

            if np.isnan(power_matrix).any():
                print channel, " contains NaNs in its power matrix."

            chan = channel.split('_')[0]
            # convert channel data into a json dict
            data_dict['data'][wordpair][chan] = {'timeZero': timeZero,
                                                 'timeVocalization': vocalization,
                                                 'powerMat': power_matrix}

    data_dict['meta']['description'] = data['description'][0][0][0]
    # print "The size of power matrices are: ", power_matrix.shape
    return data_dict
def isReverse(pair1, pair2):
    pair1split = pair1.split('_')
    pair2split = pair2.split('_')
    if pair1split[0] == pair2split[1] and pair1split[1] == pair2split[0]:
        return True
    else:
        return False
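For example, isReverse just compares the two halves of the 'FIRSTWORD_SECONDWORD' folder names (the word pairs below are made up purely for illustration):
In [ ]:
# illustration only: word-pair directory names follow a 'FIRSTWORD_SECONDWORD' convention
print isReverse('JUICE_BRICK', 'BRICK_JUICE')   # True  -> reverse pair
print isReverse('JUICE_BRICK', 'JUICE_GLASS')   # False -> different pair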
In [41]:
print baseaccuracy.shape
print 1288/1961.
freq_labels = ['delta', 'theta', 'alpha', 'beta', 'low gamma', 'high gamma', 'HFO']
lowbands = powerMat[:,0:4,:]
highbands = np.mean(powerMat[:,4:,:], axis=1, keepdims=True)
power_matrix = np.concatenate((lowbands, highbands), axis=1)
print power_matrix.shape
freq_bands = np.array(['delta_theta', 'alpha', 'beta', 'lowgamma_hfo'])
print freq_bands
Now we have a base accuracy for each channel. We can start removing features and see how the accuracy changes, which gives an idea of how important each frequency band (delta through HFO) is. The seven bands are collapsed into four groups: delta/theta, alpha, beta, and low gamma/HFO.
For j = 1, ..., 4 we store a dictionary of feature-subset scores, where Dictionary[j] holds the scores for every combination of j frequency-band groups.
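The search in the next cells enumerates every combination of the four band groups and keys its scores by the band names; a small standalone sketch of that bookkeeping (with placeholder scores) looks like this:
In [ ]:
import itertools

freq_bands = ['delta_theta', 'alpha', 'beta', 'lowgamma_hfo']
m = 4
score_dict = []
for j in range(1, m+1):
    score_dict.append({})
    for comb in itertools.combinations(range(m), j):
        # keys look like 'alpha&beta'; the full set of four groups is keyed 'all'
        key = 'all' if len(comb) == m else '&'.join(freq_bands[c] for c in comb)
        score_dict[j-1][key] = None  # placeholder for [mean score, std dev]
print [len(d) for d in score_dict]  # [4, 6, 4, 1] combinations per j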
In [ ]:
np.random.seed(12345678) # for reproducibility, set random seed
names = ["Linear SVM",
"Random Forest",
"Quadratic Discriminant Analysis",
"Logistic Regression"]
classifiers = [
SVC(kernel="linear", C=0.5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
QuadraticDiscriminantAnalysis(),
LogisticRegression()]
In [47]:
import random
np.random.seed(123456789) # for reproducibility, set random seed

####### Function that performs forward feature selection over the frequency-band groups
# Input: - features: array of samples x frequency-band groups x time bins
#        - y: the vector of binary labels (0, 1)
def forwardFeatureSelection(features, y):
    ################# 03: Perform Classification and Feature Selection #################
    X = features
    numPartitions = 20
    partitionLength = X.shape[0]/numPartitions

    ## Shuffle feature set and labels in unison (shuffle before splitting back apart)
    c = np.c_[X.reshape(len(X), -1), y.reshape(len(y), -1)]
    np.random.shuffle(c)
    X2 = c[:, :X.size//len(X)].reshape(X.shape)
    y2 = c[:, X.size//len(X)].reshape(y.shape)

    m = 4  # number of frequency-band groups
    # loop through all partitions of dataset
    indice_range = range(0, len(y2))
    for i in range(0, numPartitions):
        score_dict = []
        max_score = 0

        ## 01: set outer test set and labels
        if i == numPartitions-1:
            indices = range(i*partitionLength, len(y2))  # last partition takes the remainder
        else:
            indices = range(i*partitionLength, (i+1)*partitionLength)
        outertestset = X2[indices,:,:]
        outertestlabels = y2[indices]

        ## 02: Create outertrainset from everything not in the outer test set
        indices = list(set(indice_range) - set(indices))
        outertrainset = X2[indices,:,:]
        outertrainlabels = y2[indices]

        ## 03: Create innertrainset and innertestset
        # create a random 70% of the indices
        indices = np.random.choice(range(outertrainlabels.shape[0]),
                                   size=int(np.floor(outertrainlabels.shape[0]*0.7)),
                                   replace=False)
        innertrainset = outertrainset[indices,:,:]
        innertrainlabels = outertrainlabels[indices]
        # innertestset created from the other 30% of the partition
        inner_range = range(0, outertrainlabels.shape[0])
        indices = list(set(inner_range) - set(indices))
        innertestset = outertrainset[indices,:,:]
        innertestlabels = outertrainlabels[indices]

        ## Search for the best feature set through all combinations of the m band groups
        features = np.reshape(innertrainset, (innertrainset.shape[0], 6*4))
        y = innertrainlabels
        testrange = np.arange(0, 4, 1)  # band-group indices fed to itertools.combinations
        for j in range(1, m+1):
            score_dict.append({})
            combs = list(itertools.combinations(testrange, j))
            for cdx, comb in enumerate(combs):
                # each selected band group contributes its 6 time-bin features
                indices = [range(comb[k]*6, comb[k]*6+6) for k in range(0, len(comb))]
                new_features = np.reshape(features[:, indices], (features.shape[0], 6*len(comb)))

                ## perform leave-one-out cross-validation on this feature subset
                clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1).fit(new_features, y)
                loo = LeaveOneOut(len(new_features))
                scores = cross_validation.cross_val_score(clf, new_features, y, cv=loo)

                ## Store performance in dictionary, keyed by the band-group names
                if len(comb) == 1:
                    key = freq_bands[comb[0]]
                elif len(comb) == 2:
                    key = freq_bands[comb[0]] + '&' + freq_bands[comb[1]]
                elif len(comb) == 3:
                    key = freq_bands[comb[0]] + '&' + freq_bands[comb[1]] + '&' + freq_bands[comb[2]]
                elif len(comb) == 4:
                    key = 'all'
                if max_score < scores.mean():
                    max_score = scores.mean()
                    max_id = comb
                score_dict[j-1][key] = [scores.mean(), scores.std()]
        break  # only run the first outer partition for now
    print max_id
    print max_score
In [48]:
######## Get list of files (.mat) we want to work with ########
subj = 'NIH039'  # change the directories if you want
filedir = '../../condensed_data_' + subj + '/sessions/'
sessions = os.listdir(filedir)
# sessions = sessions[2:] # change which sessions we want

print "Analyzing subject: ", subj
print "The sessions are: ", sessions

baseaccuracy = np.zeros((72,2))
print baseaccuracy.shape

debug_on = 0
channels = np.arange(1, 73, 1)
for cdx, channel in enumerate(channels):
    channel = str(channel)
    # initialize arrays to store all the feature freq. vectors for each word group
    same_word_data = np.array(())
    diff_word_data = np.array(())

    # loop through each session
    for idx, session in enumerate(sessions):
        # the session directory
        sessiondir = filedir + sessions[idx]
        # get all blocks for this session
        blocks = os.listdir(sessiondir)
        if idx == 0:
            print "Each session has: \n", blocks, ' \n'
        if len(blocks) != 6:  # error check on the directories
            print blocks
            print "Error in the # of blocks. There should be 6."
            break

        # loop through each block one at a time, analyze
        for i in range(0, 6):
            block = blocks[i]
            block_dir = sessiondir + '/' + block

            # in each block, get list of word pairs from first and second block
            wordpairs = os.listdir(block_dir)
            # within-groups analysis only has: SAME, REVERSE, DIFFERENT
            diff_word_group = []
            reverse_word_group = []
            same_word_group = []

            ################# 01: Create WordPair Groups #################
            # create same group pairs
            for pair in wordpairs:
                same_word_group.append([pair, pair])
            # create reverse and different groups
            for pairs in itertools.combinations(wordpairs, 2):
                if isReverse(pairs[0], pairs[1]):
                    reverse_word_group.append([pairs[0], pairs[1]])
                else:
                    diff_word_group.append([pairs[0], pairs[1]])

            # print meta about this code
            if debug_on:
                print "Analyzing block ", blocks[i]
                print 'Subject: ', subj
                print 'Session: ', session
                print 'Block: ', block, '\n'

            ## EXTRACT DATA INTO A DICTIONARY
            block_data = extractSubjSessionBlockData(subj, session, block)
            # the list of channels for this subject could also be read from the dictionary:
            # wordpair = block_data['data'].keys()[0]
            # channels = block_data['data'][wordpair].keys()

            ################# 02a: Same Words Feature Construction #################
            # extract channel data for same word group
            for wdx, same_words in enumerate(same_word_group):
                same_word_key = same_words[0]
                probeOnTime = block_data['data'][same_word_key][channel]['timeZero']
                vocalizationTime = block_data['data'][same_word_key][channel]['timeVocalization']
                numevents = block_data['data'][same_word_key][channel]['powerMat'].shape[0]
                if numevents % 2 != 0:  # drop one event so the halves are even
                    block_data['data'][same_word_key][channel]['powerMat'] = block_data['data'][same_word_key][channel]['powerMat'][0:numevents-1,:,:]
                    numevents = numevents - 1

                ## split same_word data within groups in half
                powerMat_first = block_data['data'][same_word_key][channel]['powerMat'][0:numevents/2,:,:]
                powerMat_second = block_data['data'][same_word_key][channel]['powerMat'][numevents/2:numevents,:,:]
                powerMat = powerMat_second - powerMat_first  # get the difference in spectral power

                # append each event
                for edx in range(0, numevents/2):
                    # either go from timezero -> vocalization, or some other timewindow
                    if same_word_data.size == 0:
                        same_word_data = np.reshape(powerMat[edx,:,probeOnTime:probeOnTime+6], (1, 7, 6))
                    else:
                        same_word_data = np.append(same_word_data, np.reshape(powerMat[edx,:,probeOnTime:probeOnTime+6], (1, 7, 6)), axis=0)

            ################# 02b: Different Words Feature Construction #################
            for diff_words in diff_word_group:
                # extract wordKey and data from MAIN block dictionary
                diff_word_one = diff_words[0]
                diff_word_two = diff_words[1]
                probeOnTime = block_data['data'][diff_word_one][channel]['timeZero']
                vocalizationTime = block_data['data'][diff_word_one][channel]['timeVocalization']
                powerMat_first = block_data['data'][diff_word_one][channel]['powerMat']
                powerMat_second = block_data['data'][diff_word_two][channel]['powerMat']

                # check events and get the number of events X frequency X time
                numevents_first = powerMat_first.shape[0]
                numevents_second = powerMat_second.shape[0]
                numevents = min(numevents_first, numevents_second)
                powerMat = powerMat_second[0:numevents,:,:] - powerMat_first[0:numevents,:,:]

                # append each event
                for edx in range(0, numevents):
                    # either go from timezero -> vocalization, or some other timewindow
                    if diff_word_data.size == 0:
                        diff_word_data = np.reshape(powerMat[edx,:,probeOnTime:probeOnTime+6], (1, 7, 6))
                    else:
                        diff_word_data = np.append(diff_word_data, np.reshape(powerMat[edx,:,probeOnTime:probeOnTime+6], (1, 7, 6)), axis=0)
            # break # only do 1 block
        # break # only do 1 session

    # collapse the 7 frequency bins into 4 groups: delta/theta, alpha, beta, lowgamma/HFO
    buff1 = np.mean(same_word_data[:,0:2,:], axis=1, keepdims=True)
    buff2 = same_word_data[:,2:4,:]
    buff3 = np.mean(same_word_data[:,4:,:], axis=1, keepdims=True)
    same_word_data = np.concatenate((buff1, buff2, buff3), axis=1)

    buff1 = np.mean(diff_word_data[:,0:2,:], axis=1, keepdims=True)
    buff2 = diff_word_data[:,2:4,:]
    buff3 = np.mean(diff_word_data[:,4:,:], axis=1, keepdims=True)
    diff_word_data = np.concatenate((buff1, buff2, buff3), axis=1)

    ################# 03: Perform Classification and Feature Selection #################
    # Create classes and feature vects: diff_word_group = 1, same_word_group = 0
    first_data = diff_word_data
    zero_data = same_word_data
    features = np.append(first_data, zero_data, axis=0)
    y = np.ones((first_data.shape[0],))
    y = np.concatenate((y, np.zeros((zero_data.shape[0],))))

    print "channel: ", channel
    forwardFeatureSelection(features, y)

    ## baseline accuracy per channel with the full feature set (commented out)
    # for idx, cla in enumerate(classifiers):
    #     print names[idx]
    #     X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, y, test_size=0.4, random_state=0)
    #     clf = cla.fit(X_train, y_train)
    #     loo = LeaveOneOut(len(features))
    #     scores = cross_validation.cross_val_score(clf, features, y, cv=loo)
    #     baseaccuracy[cdx,] = [scores.mean(), scores.std()]
    #     print "Channel: ", channel
    #     print baseaccuracy[cdx,]
    # break # only do 1 channel
Here I went through each channel and performed forward feature selection using Logistic Regression. The best accuracy score I got was around 68%, on channel 41, though that could simply be due to variance.
The frequency-band groups that were generally selected were delta_theta, alpha, beta, and lowgamma_hfo.
One thing I could try differently is the time window analyzed after the probe word comes on. The difficulty is that not all events have vocalization at the same time, so if I time-lock to vocalization I can only use roughly the window from vocalization minus ~600 ms up to vocalization, because some subjects responded within about half a second.
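As a rough, standalone sketch of what a vocalization-locked window might look like (synthetic data only; the real per-event timeVocalization values from the data dictionary would replace voc_times):
In [ ]:
import numpy as np

# synthetic example: 10 events x 7 frequency bands x 50 time bins of spectral power
powerMat = np.random.randn(10, 7, 50)
# hypothetical per-event vocalization time bins (they differ across events)
voc_times = np.random.randint(20, 45, size=10)

# keep the ~600 ms (here, 6 time bins) immediately preceding each event's vocalization
window = 6
voc_locked = np.array([powerMat[e, :, voc_times[e]-window:voc_times[e]]
                       for e in range(powerMat.shape[0])])
print voc_locked.shape   # (10, 7, 6)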