In [87]:
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random
import itertools
import matplotlib
from matplotlib import pyplot as plt
import scipy.io
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')
%matplotlib inline
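Note that `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20; this notebook assumes the old API. On newer installs the equivalents live in `model_selection` (a sketch, not run in this environment):
In [ ]:
# scikit-learn >= 0.18 equivalents of the cross_validation imports above
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
# note: this LeaveOneOut takes no constructor argument -- pass LeaveOneOut() as cv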
In [83]:
np.random.seed(12345678) # for reproducibility, set random seed
names = ["Nearest Neighbors", "Linear SVM", "Random Forest",
         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis",
         "Logistic Regression"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]
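A quick sanity check that the display names line up one-to-one with the estimators (purely illustrative):
In [ ]:
# each entry in names should describe the classifier at the same index
assert len(names) == len(classifiers)
for n, c in zip(names, classifiers):
    print n, "->", type(c).__name__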
In [5]:
######## Get list of files (.mat) we want to work with ########
filedir = '../condensed_data/freq_probeTovocal_binned/'
files = []
for file in os.listdir(filedir):
    if file.endswith('.mat'):
        files.append(file)
print "There are ", len(files), " files inside our directory"
print "This should match the number of channels."

######## Load in events struct to find correct events ########
eventsDir = '../NIH034/behavioral/paRemap/' + 'events.mat'
events = scipy.io.loadmat(eventsDir)
events = events['events']

# keep only the correctly answered events
correctIndices = events['isCorrect'] == 1
events = events[correctIndices]
print "This is the length of the events struct with only correct responses: ", len(events)
In [33]:
################## LOOPING THROUGH EACH CHANNEL ##################
probe_dict = {} # the dict to hold the feature matrix for each channel
for f in range(0, len(files)):
#################### Set up data from the channel's mat file ####################
# Go through each .mat file
mat_file = filedir + files[f]
data = scipy.io.loadmat(mat_file)
data = data['data']
## 01: reformat unique trigger types
uniqueTrigTypes = data['uniqueTrigType'][0][0][0]
buff = []
for trig in uniqueTrigTypes:
buff.append(str(trig[0]))
uniqueTrigTypes = buff
## 02: reformat trigger types
trigTypes = data['trigType'][0][0][0]
buff = []
for trig in trigTypes:
buff.append(str(trig[0]))
trigTypes = buff
## 05: get power matrix Z is a #events X #freq. bands
matrix = data['powerMatZ'][0][0]
########### Getting those events and the corresonding averaged powermat ################
## Get events of interest
TRIGGER_TYPES = uniqueTrigTypes
probeWords = events['probeWord']
features = {}
for i in range(0,len(TRIGGER_TYPES)): # LOOP THRU EACH PROBEWORD
current_trig = TRIGGER_TYPES[i]
## 01: get indices of the current trigger and get those events
tempInd = events['probeWord'] == current_trig
tempEvents = events[tempInd]
# # average across events
# thisMat = np.mean(matrix[tempInd,:,], axis=0)
thisMat = matrix[tempInd,:]
features[current_trig] = thisMat
probe_dict[str(f+1)] = features
print "The final feature dictionary will have features from "
print "the following channels: ", sorted(probe_dict.keys())
In [119]:
summ = 0
for key in sorted(probe_dict['1'].keys()):
    print key
    summ += probe_dict['1'][key].shape[0]
    print probe_dict['1'][key].shape
print summ
In [51]:
### Concatenate a dict of all feature match/pairs
match_features = dict.fromkeys(probe_dict['1'].keys())
for key in match_features.keys():
    match_features[key] = []
print "These are the keys in our dict: ", match_features.keys(), "\n"

# loop through each probe_dict key (channel)
channels = probe_dict.keys()
for idx, chan in enumerate(channels):
    probe_channel = probe_dict[chan] # get the features for this channel
    # loop through each probe word
    probes = probe_channel.keys()
    for probe in probes:
        # get the feature matrix for this probe word
        feature = probe_channel[probe]
        if idx == 0:
            match_features[probe] = np.array(feature)
        else:
            match_features[probe] = np.append(match_features[probe], feature, axis=1)

# convert everything into np arrays
for key in match_features.keys():
    match_features[key] = np.array(match_features[key])
    print "Shape for Probeword: ", key, " ", match_features[key].shape

print "This is the shape of our new feature matrix for a certain word pair match: "
print match_features[key].shape
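Note that np.append copies the accumulated array on every channel, which makes this loop quadratic in the number of channels. Collecting the per-channel blocks in a list and concatenating once is equivalent and cheaper (a sketch reusing probe_dict, verified against the result above):
In [ ]:
# equivalent concatenation with one copy per probe word instead of one per channel
for key in probe_dict['1'].keys():
    blocks = [probe_dict[chan][key] for chan in probe_dict.keys()]
    alt = np.concatenate(blocks, axis=1)
    assert np.array_equal(alt, match_features[key])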
In [105]:
# all unordered pairs of probe words
comb = map(list, itertools.combinations(match_features.keys(), 2))
print comb
In [104]:
accuracy = np.zeros((len(comb), len(classifiers), 2))
print accuracy.shape

for i, pair in enumerate(comb):
    # Create classes and feature vects
    firstprobe = match_features[pair[0]]
    secondprobe = match_features[pair[1]]
    features = np.append(firstprobe, secondprobe, axis=0)
    y = np.ones((firstprobe.shape[0],))
    y = np.concatenate((y, np.zeros((secondprobe.shape[0],))))

    print "Accuracy for pair: ", pair
    for idx, cla in enumerate(classifiers):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            features, y, test_size=0.4, random_state=0)
        clf = cla.fit(X_train, y_train)
        # leave-one-out CV over the full data; cross_val_score clones clf and
        # refits it on every fold, so the train/test split above is not reused here
        loo = LeaveOneOut(len(features))
        scores = cross_validation.cross_val_score(clf, features, y, cv=loo)
        accuracy[i, idx, :] = [scores.mean(), scores.std()]
        # print("Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], scores.mean(), scores.std() * 2))
In [116]:
################## LOOPING THROUGH EACH CHANNEL ##################
feature_dict = {} # the dict to hold the feature matrix for each channel
for f in range(0, len(files)):
    ############### Set up data from the channel's mat file ##############
    # Go through each .mat file
    mat_file = filedir + files[f]
    data = scipy.io.loadmat(mat_file)
    data = data['data']

    ## 01: reformat unique trigger types
    uniqueTrigTypes = data['uniqueTrigType'][0][0][0]
    buff = []
    for trig in uniqueTrigTypes:
        buff.append(str(trig[0]))
    uniqueTrigTypes = buff

    ## 02: reformat trigger types
    trigTypes = data['trigType'][0][0][0]
    buff = []
    for trig in trigTypes:
        buff.append(str(trig[0]))
    trigTypes = buff

    ## 03: get channel number
    chanNum = data['chanNum'][0][0][0][0]

    ## 04: get channel string
    chanStr = data['chanStr'][0][0][0]

    ## 05: get power matrix Z (#events x #freq. bands)
    matrix = data['powerMatZ'][0][0]

    ## 06: get freq band ticks and ylabels
    freqBandYtick = data['freqBandYtick'][0][0][0]
    freqBandYlabel = data['freqBandYlabel'][0][0][0]
    buff = []
    for freq in freqBandYlabel:
        buff.append(str(freq[0]))
    freqBandYlabel = buff

    ################ Getting those events and the corresponding averaged powermat #############
    ## Get events of interest
    TRIGGER_TYPES = uniqueTrigTypes
    probeWords = events['probeWord']
    targetWords = events['targetWord']

    # number of frequency bins
    num_freqs = len(freqBandYtick) - 1
    # total number of "data centers"
    # num_features = len(TRIGGER_TYPES)*len(tempTargets)
    features = {}
    for i in range(0, len(TRIGGER_TYPES)): # LOOP THRU EACH PROBEWORD
        current_trig = TRIGGER_TYPES[i]

        ## 01: get indices of the current trigger and get those events
        tempInd = events['probeWord'] == current_trig
        tempEvents = events[tempInd]
        tempTargets = np.unique(tempEvents['targetWord'])

        ## 02: go through each target word for this probeword
        for j in range(0, len(tempTargets)):
            targetWord = tempTargets[j][0] # set target word

            # get the indices of the events we want: probe/target match
            eventInd = events['probeWord'] == current_trig
            eventInd2 = events['targetWord'] == targetWord
            eventInd = eventInd & eventInd2

            # get the matrix we want and average across all events
            # thisMat = np.mean(matrix[eventInd,:], axis=0)
            # -> a 7x1 vector that represents this match for this channel
            thisMat = matrix[eventInd, :]

            feature_key = str(current_trig) + '_' + str(targetWord)
            features[feature_key] = thisMat

            # clear vars
            eventInd2 = 0

    # turn features into np array and append to a dict of the features
    # features = np.array(features)
    feature_dict[str(f+1)] = features

print "The final feature dictionary will have features"
print "from the following channels: ", sorted(feature_dict.keys())
In [120]:
# summ = 0
# for key in sorted(feature_dict['1'].keys()):
# print key
# summ += feature_dict['1'][key].shape[0]
# print feature_dict['1'][key].shape
# print summ
In [124]:
### Concatenate a dict of all feature match/pairs
match_features = dict.fromkeys(feature_dict['1'].keys())
for key in match_features.keys():
    match_features[key] = []
print "These are the keys in our dict: ", match_features.keys(), "\n"

# loop through each feature_dict key (channel)
channels = feature_dict.keys()
for idx, chan in enumerate(channels):
    match_channel = feature_dict[chan] # get the features for this channel
    # loop through each probe/target match pair
    matches = match_channel.keys()
    for match in matches:
        # get the feature matrix for this match pair
        feature = match_channel[match]
        if idx == 0:
            match_features[match] = np.array(feature)
        else:
            match_features[match] = np.append(match_features[match], feature, axis=1)

# convert everything into np arrays
for key in match_features.keys():
    match_features[key] = np.array(match_features[key])
    print "Shape for word pair match: ", key, " ", match_features[key].shape

print "This is the shape of our new feature matrix for a certain word pair match: "
print match_features['JUICE_GLASS'].shape
print "#events X (frequency vector X 96 channels)"
In [126]:
# all unordered pairs of probe/target match keys
comb = map(list, itertools.combinations(match_features.keys(), 2))
In [127]:
accuracy = np.zeros((len(comb), len(classifiers), 2))
print accuracy.shape

for i, pair in enumerate(comb):
    # Create classes and feature vects
    firstprobe = match_features[pair[0]]
    secondprobe = match_features[pair[1]]
    features = np.append(firstprobe, secondprobe, axis=0)
    y = np.ones((firstprobe.shape[0],))
    y = np.concatenate((y, np.zeros((secondprobe.shape[0],))))

    print "Accuracy for pair: ", pair
    for idx, cla in enumerate(classifiers):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            features, y, test_size=0.4, random_state=0)
        clf = cla.fit(X_train, y_train)
        loo = LeaveOneOut(len(features))
        scores = cross_validation.cross_val_score(clf, features, y, cv=loo)
        accuracy[i, idx, :] = [scores.mean(), scores.std()]
        print "Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], scores.mean(), scores.std() * 2)
In [130]:
for i, pair in enumerate(comb):
    print pair
    for idx, cla in enumerate(classifiers):
        print("Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], accuracy[i,idx,0], accuracy[i,idx,1] * 2))
In [136]:
# Plotting accuracy per classifier across all word-pair combinations
print accuracy.shape

colors = [None, 'green', 'red', 'black', 'brown', 'orange']  # None -> default color
x = np.arange(len(comb))
for idx in range(len(classifiers)):
    plt.errorbar(x, accuracy[:, idx, 0], yerr=accuracy[:, idx, 1],
                 color=colors[idx], label=names[idx])
# plt.xscale('log')
plt.xlabel('Combination Of Match Pairs')
plt.ylabel('Accuracy')
plt.title('Accuracy of Word Pair Classification Under Real Data')
plt.axhline(1, color='red', linestyle='--')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
We see here that all of our classifiers still performed essentially at chance level.
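For reference, chance accuracy for each pairwise problem depends on its class balance; with roughly equal event counts per word pair it sits near 0.5. A quick check of the per-pair chance level (reusing the variables above):
In [ ]:
# chance accuracy ~ the majority-class fraction for each pairwise problem
for pair in comb:
    n0 = match_features[pair[0]].shape[0]
    n1 = match_features[pair[1]].shape[0]
    print pair, "chance ~ %0.2f" % (max(n0, n1) / float(n0 + n1))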