In [2]:
# Import Necessary Libraries
import numpy as np
import os, csv, json
import math
import random
import operator
import collections
import itertools
import matplotlib
from matplotlib import *
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import scipy.stats as stats
import scipy.io
from scipy.spatial import distance as Distance
# pretty charting
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')
%matplotlib inline
In [3]:
######## Get list of files (.mat) we want to work with ########
filedir = '../condensed_data/groups/'
# every entry found in the groups directory is kept as a word-pair group name
groups = list(os.listdir(filedir))
# entries that are MATLAB files, in the same order as the directory listing
files = [entry for entry in groups if entry.endswith('.mat')]
######## Load in EVENTS struct to find correct events
# NOTE: despite its name, `eventsDir` is the path of the events.mat file itself.
eventsDir = '../NIH034/behavioral/paRemap/' + 'events.mat'
events = scipy.io.loadmat(eventsDir)
# keep only the 'events' variable out of everything stored in the .mat file
events = events['events']
# print number of incorrect events and which words they belonged to
# boolean mask selecting the events whose 'isCorrect' field equals 0
incorrectIndices = events['isCorrect'] == 0
incorrectEvents = events[incorrectIndices]
incorrectWords = []  # probe word of every incorrect event, one entry per event
wordList = {}        # str(probe word) -> number of incorrect events with that word
for i in range(0, len(incorrectEvents)):
    incorrectWords.append(incorrectEvents['probeWord'][i][0])
for word in np.unique(incorrectEvents['probeWord']):
    # NOTE(review): `incorrectWords` is a plain Python list, so this count presumably
    # relies on NumPy turning `list == word` into an elementwise comparison -- verify
    # the printed counts against the data.
    wordList[str(word)] = sum(incorrectWords == word)
print "There were ",len(incorrectEvents), " number of incorrect events."
print "The list of incorrect probe words: \n", wordList
#
# get only correct events
# from here on `events` holds only the entries whose 'isCorrect' field equals 1
correctIndices = events['isCorrect'] == 1
events = events[correctIndices]
print "\nThis is the length of the events struct with only correct responses: ", len(events)
print "The group of word pairings are: ", groups
In [4]:
diff_words_groups = []
reverse_words_groups = []
probe_words_groups = ()
target_words_groups = []
def inGroup(group, names):
    """Tell whether `names` already appears in `group`, ignoring element order.

    group: sequence of name collections gathered so far.
    names: candidate collection, compared against every member with cmpT.
    Returns True as soon as one member matches, otherwise False.
    """
    return any(cmpT(member, names) for member in group)
def cmpT(t1, t2):
    """Order-insensitive equality: True when both collections sort to the same list."""
    first_sorted = list(t1)
    first_sorted.sort()
    second_sorted = list(t2)
    second_sorted.sort()
    return first_sorted == second_sorted
# Create different groups
# Every ordered combination of two entries of `groups` is classified into the
# four accumulators; inGroup() keeps each unordered pairing only once.
for pair_first in groups:
    # split words by delimiter '_' to determine groups
    firstpair = pair_first.split('_')
    # entries without two '_'-separated words cannot be indexed as [0]/[1] below
    if len(firstpair) < 2:
        continue
    # make directory names for the first word pair
    firstname = '_'.join(firstpair)
    # Build the reversed name from a *copy*. Reversing `firstpair` itself (as an
    # alias) would flip it in place on every inner iteration and corrupt both
    # `firstname` and the probe/target index checks.
    reverse_firstname = '_'.join(firstpair[::-1])

    for pair_second in groups:
        secondpair = pair_second.split('_')
        if len(secondpair) < 2:
            continue
        secondname = '_'.join(secondpair)
        names = (firstname, secondname)

        ## 01: Different words group -- the two pairs share no word at all
        if not any(x in secondpair for x in firstpair) and not inGroup(diff_words_groups, names):
            diff_words_groups += (names,)

        ## 02: Probe Word Overlap Group -- same first word, different second word
        if firstpair[0] == secondpair[0] and firstpair[1] != secondpair[1] and not inGroup(probe_words_groups, names):
            probe_words_groups += (names,)

        ## 03: Target Word Overlap Group -- same second word, different first word
        if firstpair[1] == secondpair[1] and firstpair[0] != secondpair[0] and not inGroup(target_words_groups, names):
            target_words_groups += (names,)

        ## 04: Reverse words Group -- second pair is the first pair written backwards
        if reverse_firstname == secondname and not inGroup(reverse_words_groups, names):
            reverse_words_groups += (names,)
## printing
# one heading per accumulator, followed by one line per collected pairing
for heading, word_groups in (("For different words:", diff_words_groups),
                             ("For reverse words:", reverse_words_groups),
                             ("For probe words:", probe_words_groups),
                             ("For target words:", target_words_groups)):
    print(heading)
    for pairing in word_groups:
        print(pairing)
Now we have lists of word-pair combinations that we want to compare within each group. Our hypothesis is that the "different word pairs" will be the farthest apart, while the reversed word pairs, as well as the pairs that share a probe or a target word, will be closer together.
Next, we try to uncover this result by using an ANOVA-based cutoff to select channels and by keeping only the delta, theta and high gamma frequency bands.
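The struct from MATLAB has the following fields, which the cells below read: `probeWord`, `targetWord`, `chanNum`, `chanStr`, `powerMatZ` (#events X #freq. bands X #time bins), `timeZero` and `vocalization`.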
Input: A list of directories that correspond to each subgroup. Within each directory there is a list of structs that represent the data from each channel. If we want to further segment our data... say by block#, we would have to go into the data extraction and add a few lines of code that filter out the events by which block we're interested in.
Algorithm:
In [5]:
# Perform a type of grid search over the channels to determine the most significant channels
def gridSearchChannels(epsilon, first_group_files, second_group_files):
################## LOOPING THROUGH EACH CHANNEL ##################
chan_pvals = {}
chan_pval = []
chan_sig = []
for f in range(0, len(first_group_files)):
#################### Set up data from the channel's mat file ####################
# Go through each .mat file
first_mat_file = first_filedir + first_group_files[f]
second_mat_file = second_filedir + second_group_files[f]
data_first = scipy.io.loadmat(first_mat_file)
data_first = data_first['data']
data_second = scipy.io.loadmat(second_mat_file)
data_second = data_second['data']
## 03: get channel number
first_chanNum = data_first['chanNum'][0][0][0][0]
second_chanNum = data_second['chanNum'][0][0][0][0]
## 04: get channel string
first_chanStr = data_first['chanStr'][0][0][0]
second_chanStr = data_second['chanStr'][0][0][0]
## 05: get power matrix Z is a #events X #freq. bands X #time bins
first_matrix = data_first['powerMatZ'][0][0]
second_matrix = data_second['powerMatZ'][0][0]
########### FINISHED EXTRACTING DATA FROM STRUCTS ###################
### 01: create power matrix with only delta, theta and high gamma
first_matrix = first_matrix[:, [0,1,5],:]
second_matrix = second_matrix[:, [0,1,5],:]
### 02: Run ANOVA on each frequency/time window
p_vals = []
for freq in range(0, first_matrix.shape[1]):
for time in range(0, first_matrix.shape[2]):
stat, p_val = stats.f_oneway(first_matrix[:,freq,time], second_matrix[:,freq,time])
p_vals.append(1. - p_val)
chan_pvals[str(first_chanNum)] = sum(p_vals)
chan_pval.append(sum(p_vals))
# compute quantile of 75% of most significant channel
thresh = np.percentile(chan_pval, epsilon)
chans_to_analyze = [k for k in chan_pvals if chan_pvals[k] > thresh]
return chans_to_analyze
def add_list_to_set(my_list, my_set):
    """Add every element of `my_list` to `my_set` in place.

    Returns the same set object that was passed in, so the call can be chained.
    """
    # set.update replaces the former list comprehension that was run only for
    # its side effects (and built a throw-away list of None values).
    my_set.update(my_list)
    return my_set
In [5]:
###### Collect the union of channels selected for any "different words" pairing
print(diff_words_groups)
chans_to_analyze = set()
for group in diff_words_groups:
    ######## Get list of files (.mat) we want to work with ########
    # These two names stay at notebook level on purpose: gridSearchChannels
    # reads `first_filedir` / `second_filedir` as globals.
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # get list of files for both word pairs; os.listdir returns entries in
    # arbitrary order, so sort to make the positional pairing of the two
    # lists reproducible
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))

    add_list_to_set(gridSearchChannels(95, first_files, second_files), chans_to_analyze)
    # running size of the union after each pairing
    print(len(chans_to_analyze))

print(len(chans_to_analyze))
print(chans_to_analyze)
In [6]:
##### HYPER-PARAMETERS TO TUNE
# percentile cutoff handed to gridSearchChannels as `epsilon`; channels scoring
# above this percentile are kept
anova_threshold = 90
distances = Distance.cosine # define distance metric to use
# number of time bins taken immediately before vocalization
num_time_windows = 5
# All seven frequency bands are used here. The previous `freq_bands = [0, 1, 5]`
# assignment was dead code (overwritten on the very next line); use that list
# instead of the arange to restrict the analysis to three bands.
freq_bands = np.arange(0,7,1)
print(freq_bands)
In [7]:
# The four histogram cells (different / reverse / probe / target word groups)
# were copies of one another that differed only in the list they iterated
# over. They are folded into one helper that is called once per list.

def _mat_files_by_channel(directory):
    """Map channel prefix -> file name for the .mat files inside `directory`.

    The channel prefix is the part of the file name before the first '_'.
    """
    return dict((name.split('_')[0], name)
                for name in sorted(os.listdir(directory))
                if name.endswith('.mat'))


def _channel_feature_vector(mat_path, bands, num_windows):
    """Event-averaged power in the `num_windows` time bins before vocalization.

    Loads the 'data' struct of one channel file, keeps the frequency bands
    `bands` of `powerMatZ` (#events X #freq. bands X #time bins), flattens the
    window that ends at each event's `vocalization` index, and averages those
    flattened windows over events.

    Returns a 1-D array, or None when no event has a complete window.
    """
    data = scipy.io.loadmat(mat_path)['data']
    power = data['powerMatZ'][0][0][:, bands, :]
    onsets = data['vocalization'][0][0][0]

    windows = []
    for event, onset in enumerate(onsets):
        stop = int(onset)  # slice bounds must be Python/NumPy integers
        start = stop - num_windows
        # A negative start would wrap around to the end of the time axis and a
        # stop past the end would shorten the window; skip such events.
        if start < 0 or stop > power.shape[2]:
            continue
        windows.append(power[event, :, start:stop].flatten())
    if not windows:
        return None
    return np.mean(windows, axis=0)


def plot_centroid_distance_histograms(word_groups, metric=None, bands=None,
                                      num_windows=None, channels=None,
                                      base_dir='../condensed_data/groups/'):
    """Plot, for each pairing, histograms of distances to the other group's centroid.

    For every (first, second) pairing in `word_groups`, one feature vector per
    channel is built for each side. Each channel vector is then compared, using
    `metric`, with the centroid (mean over channels) of the *other* side, and
    the two resulting distance lists are drawn as overlaid histograms.

    Parameters
    ----------
    word_groups : sequence of (first_directory_name, second_directory_name).
    metric : callable(u, v) -> float; defaults to the notebook-level `distances`.
    bands : frequency-band indices; defaults to the notebook-level `freq_bands`.
    num_windows : number of time bins before vocalization; defaults to the
        notebook-level `num_time_windows`.
    channels : collection of channel prefixes (strings) to include. Defaults to
        '1'..'95', matching the range the original cells used.
        NOTE(review): the PCA cell further down uses 1..96 -- confirm which
        range is intended.
    base_dir : directory that holds one sub-directory per word pair.

    Returns
    -------
    The matplotlib figure, or None when `word_groups` is empty.
    """
    metric = distances if metric is None else metric
    bands = freq_bands if bands is None else bands
    num_windows = num_time_windows if num_windows is None else num_windows
    if channels is None:
        # A real collection of strings: membership in str(np.arange(...)) was a
        # substring test against the printed array, not a channel lookup.
        channels = set(str(chan) for chan in range(1, 96))

    num_groups = len(word_groups)
    if num_groups == 0:
        return None  # a figure of height 0 cannot be created

    # label the axis with the metric actually in use (it used to say
    # "Euclidean" regardless of the configured metric)
    metric_name = getattr(metric, '__name__', 'distance').capitalize()

    fig, axes = plt.subplots(num_groups, 1, figsize=(7, 5 * num_groups), squeeze=False)
    for ax, group in zip(axes[:, 0], word_groups):
        ######## Get list of files (.mat) we want to work with ########
        first_dir = base_dir + group[0] + '/'
        second_dir = base_dir + group[1] + '/'
        # Pair the two directories by channel prefix instead of by listing
        # position, so each comparison uses the same channel on both sides.
        first_by_chan = _mat_files_by_channel(first_dir)
        second_by_chan = _mat_files_by_channel(second_dir)

        ################## LOOPING THROUGH EACH CHANNEL ##################
        first_feature = []
        second_feature = []
        for chan_num in sorted(first_by_chan):
            # make sure this is a channel we want and that both sides have it
            if chan_num not in channels or chan_num not in second_by_chan:
                continue
            first_vec = _channel_feature_vector(first_dir + first_by_chan[chan_num],
                                                bands, num_windows)
            second_vec = _channel_feature_vector(second_dir + second_by_chan[chan_num],
                                                 bands, num_windows)
            if first_vec is None or second_vec is None:
                continue
            first_feature.append(first_vec)
            second_feature.append(second_vec)

        ax.set_title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
        ax.set_xlabel('%s distance from "other" centroid' % metric_name)
        ax.set_ylabel('Count')
        if not first_feature:
            continue  # no usable channel for this pairing; leave the axes empty

        ### one row per channel
        first_feature = np.array(first_feature)
        second_feature = np.array(second_feature)

        # compute centroid vector of each side
        first_centroid = np.mean(first_feature, axis=0)
        second_centroid = np.mean(second_feature, axis=0)

        # compute list of distances from other centroid
        first_hist = [metric(x, second_centroid) for x in first_feature]
        second_hist = [metric(x, first_centroid) for x in second_feature]

        ### Plot Histogram of Distances
        ax.hist(first_hist, label='Distance for ' + group[0], lw=3, alpha=0.75)
        ax.hist(second_hist, label='Distance For ' + group[1], lw=3, alpha=0.5)
        ax.legend(loc='upper right')

    fig.tight_layout()
    return fig


# one figure per comparison type: different, reversed, probe-overlap, target-overlap
for word_groups in (diff_words_groups, reverse_words_groups,
                    probe_words_groups, target_words_groups):
    plot_centroid_distance_histograms(word_groups)
I tried using both cosine and Euclidean distance measures, and varied the number of channels included by changing the threshold hyperparameter. Many of the resulting histograms seem to overlay each other.
What I want to try underneath here is to show the distances for one of the very different words vs. a word pair that should be more similar and plot those two histograms. My worry is that there are so many euclidean distances close to 0.
In [6]:
##### HYPER-PARAMETERS TO TUNE
# label of each frequency-band index, used below to name the selected bands
freq_labels = ['delta', 'theta', 'alpha', 'beta', 'low gamma', 'high gamma', 'HFO']
freq_bands = [0, 1, 5]
# freq_bands = np.arange(0,7,1)
num_time_windows = 5
distances = Distance.cosine # define distance metric to use
anova_threshold = 90 # how many channels we want to keep

print(freq_bands)
print([freq_labels[band] for band in freq_bands])
# message and value are joined by a single space, as a two-argument print does
print("%s %s" % ("The length of the feature vector for each channel will be: ",
                 num_time_windows*len(freq_bands)))
In [37]:
## plot all in pca space vs. another
def _pca_channel_feature(mat_path, bands, num_windows):
    """Event-averaged power in the `num_windows` time bins before vocalization.

    Reads `powerMatZ` (#events X #freq. bands X #time bins) and `vocalization`
    from the 'data' struct in `mat_path`, keeps the bands in `bands`, flattens
    the window ending at each event's vocalization index and averages over
    events. Returns None when no event has a complete window.
    """
    data = scipy.io.loadmat(mat_path)['data']
    power = data['powerMatZ'][0][0][:, bands, :]
    onsets = data['vocalization'][0][0][0]

    windows = []
    for event, onset in enumerate(onsets):
        stop = int(onset)  # slice bounds must be integers
        start = stop - num_windows
        # skip events whose window would wrap around or run past the time axis
        if start < 0 or stop > power.shape[2]:
            continue
        windows.append(power[event, :, start:stop].flatten())
    if not windows:
        return None
    return np.mean(windows, axis=0)


# A real collection of channel names: membership in str(np.arange(...)) was a
# substring test against the printed array.
chans_to_analyze = set(str(chan) for chan in range(1, 97))

for group in diff_words_groups:
    ######## Get list of files (.mat) we want to work with ########
    first_filedir = '../condensed_data/groups/' + group[0] + '/'
    second_filedir = '../condensed_data/groups/' + group[1] + '/'

    # get list of files for both word pairs; each directory is filtered on its
    # own so that differing directory sizes cannot raise an IndexError
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))
    # channel prefix (text before the first '_') -> file name, used to pair
    # both sides by channel rather than by listing position
    second_by_chan = dict((name.split('_')[0], name) for name in second_files)

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name in first_files:
        # make sure this is a channel we want and that both sides have it
        chan_num = first_name.split('_')[0]
        if chan_num not in chans_to_analyze or chan_num not in second_by_chan:
            continue
        first_vec = _pca_channel_feature(first_filedir + first_name,
                                         freq_bands, num_time_windows)
        second_vec = _pca_channel_feature(second_filedir + second_by_chan[chan_num],
                                          freq_bands, num_time_windows)
        if first_vec is None or second_vec is None:
            continue
        first_feature.append(first_vec)
        second_feature.append(second_vec)

    ### create array of feature vectors, one row per channel
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)
    if len(first_feature) + len(second_feature) < 3:
        continue  # not enough samples for a 3-component PCA

    ### PCA On Features
    # Fit ONE projection on both groups together and apply it to each group.
    # Fitting separately gives each group its own basis, so their scatter
    # points would not share axes and could not be compared.
    pca = PCA(n_components=3)
    pca.fit(np.vstack((first_feature, second_feature)))
    first_pca = pca.transform(first_feature)
    second_pca = pca.transform(second_feature)

    ### grid of pairwise component scatter plots, one figure per word-pair group
    plt.figure(figsize=(20,15))
    num_components = first_pca.shape[1]
    pca_plot_i = 1
    for i in range(0, num_components):
        for j in range(0, num_components):
            plt.subplot(num_components, num_components, pca_plot_i)
            if i==j:
                # diagonal cells only carry a text label placed relative to
                # the (empty) axes limits
                axes = plt.gca()
                ymin, ymax = axes.get_ylim()
                xmin, xmax = axes.get_xlim()
                plt.text((xmax-xmin)/4.5, (ymax-ymin)/2, r'PCA Component %d vs. %d'%(i+1, i+1), fontsize=20)
                plt.title('PCA Plot for ' + str(group))
                plt.grid(False)
            else:
                plt.scatter(first_pca[:,i], first_pca[:,j], color='g', label='first_feature pca')
                plt.scatter(second_pca[:,i], second_pca[:,j], color='b', label='second feature pca')
                plt.title('PCA Plot for ' + str(group))
                plt.legend()
            pca_plot_i += 1 # increment index to plot subplot
    plt.tight_layout()
In [38]:
## Plot every "different words" group pair in one shared PCA space
def _mean_prevocal_feature(mat_path, bands, n_bins):
    """Build one channel's feature vector from a single .mat file.

    Loads the `data` struct with scipy.io.loadmat, keeps only the frequency
    bands listed in `bands` from `powerMatZ` (#events X #freq. bands X #time
    bins), takes for each event the `n_bins` time bins that end at that
    event's `vocalization` index, flattens them, and averages the flattened
    vectors across events.

    Returns a 1-D array of length len(bands) * n_bins.
    """
    data = scipy.io.loadmat(mat_path)['data']
    power = data['powerMatZ'][0][0][:, bands, :]
    vocalization = data['vocalization'][0][0][0]
    # NOTE(review): if an event's vocalization index is smaller than n_bins the
    # slice start goes negative and no longer selects the n_bins bins before
    # vocalization -- confirm this never happens in the data.
    per_event = [
        power[ev, :, vocalization[ev] - n_bins:vocalization[ev]].flatten()
        for ev in range(len(vocalization))
    ]
    return np.mean(per_event, axis=0)


groups_dir = '../condensed_data/groups/'
n_components = 3
num_groups = len(diff_words_groups)
for group in diff_words_groups:
    ######## Get list of files (.mat) we want to work with ########
    first_filedir = groups_dir + group[0] + '/'
    second_filedir = groups_dir + group[1] + '/'

    # Filter each directory independently (the old loop indexed the second
    # listing with the first listing's length) and sort, because os.listdir
    # returns entries in arbitrary order and the files are paired by position.
    first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
    second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))
    if len(first_files) != len(second_files):
        print('WARNING: %s has %d .mat files but %s has %d; only the first %d sorted pairs are used'
              % (group[0], len(first_files), group[1], len(second_files),
                 min(len(first_files), len(second_files))))

    chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
    # chans_to_analyze = str(np.arange(1,97,1))

    ################## LOOPING THROUGH EACH CHANNEL ##################
    first_feature = []
    second_feature = []
    for first_name, second_name in zip(first_files, second_files):
        # the channel number is the filename prefix before the first '_'
        chan_num = first_name.split('_')[0]
        if chan_num not in chans_to_analyze:
            continue
        first_feature.append(
            _mean_prevocal_feature(first_filedir + first_name, freq_bands, num_time_windows))
        second_feature.append(
            _mean_prevocal_feature(second_filedir + second_name, freq_bands, num_time_windows))

    # one row per selected channel
    first_feature = np.array(first_feature)
    second_feature = np.array(second_feature)

    ### PCA on features
    if len(first_feature) + len(second_feature) < n_components:
        print('Skipping PCA plot for ' + str(group) + ': not enough selected channels')
        continue
    # Fit ONE basis on both conditions and project each into it. Fitting a
    # separate PCA per condition puts the two clouds in different coordinate
    # systems, so overlaying them on the same axes would not be comparable.
    pca = PCA(n_components=n_components)
    pca.fit(np.vstack((first_feature, second_feature)))
    first_pca = pca.transform(first_feature)
    second_pca = pca.transform(second_feature)

    print('PCA Plot for ' + str(group))
    n_pcs = first_pca.shape[1]
    fig, panel_axes = plt.subplots(n_pcs, n_pcs, figsize=(20, 15), squeeze=False)
    for i in range(n_pcs):
        for j in range(n_pcs):
            ax = panel_axes[i, j]
            if i == j:
                # diagonal panels only carry a text label
                ymin, ymax = ax.get_ylim()
                xmin, xmax = ax.get_xlim()
                ax.text((xmax-xmin)/4.5, (ymax-ymin)/2,
                        r'PCA Component %d vs. %d' % (i+1, i+1), fontsize=20)
                ax.grid(False)
            else:
                ax.scatter(first_pca[:, i], first_pca[:, j], color='g', label='first_feature pca')
                ax.scatter(second_pca[:, i], second_pca[:, j], color='b', label='second feature pca')
                ax.set_xlabel('PCA Component %d' % (i+1))
                ax.set_ylabel('PCA Component %d' % (j+1))
                ax.legend()
            ax.set_title('PCA Plot for ' + str(group))
    fig.tight_layout()
Since the PCA plots show no clear separation among these "different" word groups, I want to look at how the distances for each kind of word pairing compare with one another.
For example: completely different word groups vs. reversed word pairs vs. probe-word groups vs. target-word groups vs. same-word overlap.
Algorithm:
In [22]:
##### HYPER-PARAMETERS TO TUNE
# Every analysis cell reads these names, so this cell has to be run before them.
anova_threshold = 90                # how many channels we want to keep
distances = Distance.cosine         # define distance metric to use
num_time_windows = 10               # number of time bins taken right before vocalization
freq_labels = ['delta', 'theta', 'alpha', 'beta', 'low gamma', 'high gamma', 'HFO']
freq_bands = [0, 1, 5]              # indices into freq_labels
# freq_bands = np.arange(0,7,1)

# Fail fast on a bad band index: a negative index would silently wrap around
# when it is used to index freq_labels or the power matrix.
assert all(0 <= band < len(freq_labels) for band in freq_bands), \
    "freq_bands must index into freq_labels"

# Single-argument print calls give the same output under Python 2 and Python 3.
print(freq_bands)
print([freq_labels[band] for band in freq_bands])
print("The length of the feature vector for each channel will be:  %d"
      % (num_time_windows*len(freq_bands)))
In [31]:
print(len(groups))


def _mean_prevocal_feature(mat_path, bands, n_bins):
    """Build one channel's feature vector from a single .mat file.

    Loads the `data` struct with scipy.io.loadmat, keeps only the frequency
    bands listed in `bands` from `powerMatZ` (#events X #freq. bands X #time
    bins), takes for each event the `n_bins` time bins that end at that
    event's `vocalization` index, flattens them, and averages the flattened
    vectors across events.

    Returns a 1-D array of length len(bands) * n_bins.
    """
    data = scipy.io.loadmat(mat_path)['data']
    power = data['powerMatZ'][0][0][:, bands, :]
    vocalization = data['vocalization'][0][0][0]
    # NOTE(review): if an event's vocalization index is smaller than n_bins the
    # slice start goes negative and no longer selects the n_bins bins before
    # vocalization -- confirm this never happens in the data.
    per_event = [
        power[ev, :, vocalization[ev] - n_bins:vocalization[ev]].flatten()
        for ev in range(len(vocalization))
    ]
    return np.mean(per_event, axis=0)


groups_dir = '../condensed_data/groups/'
# Name the x axis after the metric actually configured in `distances`.
metric_name = getattr(distances, '__name__', 'distance')

# loop through each unique word pairing
for word_pair in groups:
    check_diff = [item for item in diff_words_groups if word_pair in item]
    check_reverse = [item for item in reverse_words_groups if word_pair in item]
    check_probe = [item for item in probe_words_groups if word_pair in item]
    check_target = [item for item in target_words_groups if word_pair in item]

    # only word pairs with a match in all four categories are plotted
    matches = (check_diff, check_reverse, check_probe, check_target)
    if not all(matches):
        continue
    # first match from each category: different, reversed, probe, target
    group_pairings = [match[0] for match in matches]

    # stand-alone figure that only carries the title text for this word pair
    fig = plt.figure()
    axes = plt.gca()
    ymin, ymax = axes.get_ylim()
    xmin, xmax = axes.get_xlim()
    plt.text((xmax-xmin)/4.5, (ymax-ymin)/2, r'Relative Distance Histogram For %s'%(word_pair), fontsize=20)

    # one histogram row per group pairing
    fig = plt.figure(figsize=(10,10))
    num_groups = len(group_pairings)
    for idx, group in enumerate(group_pairings):
        ######## Get list of files (.mat) we want to work with ########
        first_filedir = groups_dir + group[0] + '/'
        second_filedir = groups_dir + group[1] + '/'

        # Filter each directory independently (the old loop indexed the second
        # listing with the first listing's length) and sort, because os.listdir
        # returns entries in arbitrary order and the files are paired by position.
        first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
        second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))
        if len(first_files) != len(second_files):
            print('WARNING: %s has %d .mat files but %s has %d; only the first %d sorted pairs are used'
                  % (group[0], len(first_files), group[1], len(second_files),
                     min(len(first_files), len(second_files))))

        chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
        # chans_to_analyze = str(np.arange(1,96,1))

        ################## LOOPING THROUGH EACH CHANNEL ##################
        first_feature = []
        second_feature = []
        for first_name, second_name in zip(first_files, second_files):
            # the channel number is the filename prefix before the first '_'
            chan_num = first_name.split('_')[0]
            if chan_num not in chans_to_analyze:
                continue
            first_feature.append(
                _mean_prevocal_feature(first_filedir + first_name, freq_bands, num_time_windows))
            second_feature.append(
                _mean_prevocal_feature(second_filedir + second_name, freq_bands, num_time_windows))

        # one row per selected channel
        first_feature = np.array(first_feature)
        second_feature = np.array(second_feature)
        print(first_feature.shape)
        print(second_feature.shape)

        # centroid of each condition across channels
        first_centroid = np.mean(first_feature,axis=0)
        second_centroid = np.mean(second_feature,axis=0)

        # distance of every channel's feature vector from the OTHER condition's centroid
        first_hist = [distances(x, second_centroid) for x in first_feature]
        second_hist = [distances(x, first_centroid) for x in second_feature]
        ## log transform
        # first_hist = np.log(first_hist)
        # second_hist = np.log(second_hist)

        ## Plot Histogram of Distances
        plt.subplot(num_groups, 1, idx+1)
        first_label = 'Distance for ' + group[0]
        second_label = 'Distance For ' + group[1]
        plt.hist(first_hist, label=first_label, lw=3, alpha = 0.75)
        plt.hist(second_hist, label=second_label, lw=3, alpha = 0.5)
        plt.title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
        # [0, 2] is the range of the cosine distance; adjust if `distances` changes
        plt.xlim([0, 2.0])
        # the label used to say "Euclidean" although the metric comes from `distances`
        plt.xlabel('%s distance from "other" centroid' % metric_name.capitalize())
        plt.ylabel('Count')
        plt.legend(loc='upper right')
    plt.tight_layout()
In [32]:
print(len(groups))


def _mean_prevocal_feature(mat_path, bands, n_bins):
    """Build one channel's feature vector from a single .mat file.

    Loads the `data` struct with scipy.io.loadmat, keeps only the frequency
    bands listed in `bands` from `powerMatZ` (#events X #freq. bands X #time
    bins), takes for each event the `n_bins` time bins that end at that
    event's `vocalization` index, flattens them, and averages the flattened
    vectors across events.

    Returns a 1-D array of length len(bands) * n_bins.
    """
    data = scipy.io.loadmat(mat_path)['data']
    power = data['powerMatZ'][0][0][:, bands, :]
    vocalization = data['vocalization'][0][0][0]
    # NOTE(review): if an event's vocalization index is smaller than n_bins the
    # slice start goes negative and no longer selects the n_bins bins before
    # vocalization -- confirm this never happens in the data.
    per_event = [
        power[ev, :, vocalization[ev] - n_bins:vocalization[ev]].flatten()
        for ev in range(len(vocalization))
    ]
    return np.mean(per_event, axis=0)


groups_dir = '../condensed_data/groups/'
# Name the x axis after the metric actually configured in `distances`.
metric_name = getattr(distances, '__name__', 'distance')

# This cell skips the ANOVA channel selection and analyzes channels 1..95.
# The channels are kept as a list of strings so that `chan_num in ...` is an
# exact match; str(np.arange(...)) made it a substring search on the printed
# array (e.g. '0' matched through "10").
# NOTE(review): np.arange(1,96,1) stops at 95 -- confirm channel 96 is meant to be excluded.
# chans_to_analyze = gridSearchChannels(anova_threshold, first_files, second_files)
chans_to_analyze = [str(chan) for chan in np.arange(1,96,1)]

# loop through each unique word pairing
for word_pair in groups:
    check_diff = [item for item in diff_words_groups if word_pair in item]
    check_reverse = [item for item in reverse_words_groups if word_pair in item]
    check_probe = [item for item in probe_words_groups if word_pair in item]
    check_target = [item for item in target_words_groups if word_pair in item]

    # only word pairs with a match in all four categories are plotted
    matches = (check_diff, check_reverse, check_probe, check_target)
    if not all(matches):
        continue
    # first match from each category: different, reversed, probe, target
    group_pairings = [match[0] for match in matches]

    # stand-alone figure that only carries the title text for this word pair
    fig = plt.figure()
    axes = plt.gca()
    ymin, ymax = axes.get_ylim()
    xmin, xmax = axes.get_xlim()
    plt.text((xmax-xmin)/4.5, (ymax-ymin)/2, r'Relative Distance Histogram For %s'%(word_pair), fontsize=20)

    # one histogram row per group pairing
    fig = plt.figure(figsize=(10,10))
    num_groups = len(group_pairings)
    for idx, group in enumerate(group_pairings):
        ######## Get list of files (.mat) we want to work with ########
        first_filedir = groups_dir + group[0] + '/'
        second_filedir = groups_dir + group[1] + '/'

        # Filter each directory independently (the old loop indexed the second
        # listing with the first listing's length) and sort, because os.listdir
        # returns entries in arbitrary order and the files are paired by position.
        first_files = sorted(name for name in os.listdir(first_filedir) if name.endswith('.mat'))
        second_files = sorted(name for name in os.listdir(second_filedir) if name.endswith('.mat'))
        if len(first_files) != len(second_files):
            print('WARNING: %s has %d .mat files but %s has %d; only the first %d sorted pairs are used'
                  % (group[0], len(first_files), group[1], len(second_files),
                     min(len(first_files), len(second_files))))

        ################## LOOPING THROUGH EACH CHANNEL ##################
        first_feature = []
        second_feature = []
        for first_name, second_name in zip(first_files, second_files):
            # the channel number is the filename prefix before the first '_'
            chan_num = first_name.split('_')[0]
            if chan_num not in chans_to_analyze:
                continue
            first_feature.append(
                _mean_prevocal_feature(first_filedir + first_name, freq_bands, num_time_windows))
            second_feature.append(
                _mean_prevocal_feature(second_filedir + second_name, freq_bands, num_time_windows))

        # one row per analyzed channel
        first_feature = np.array(first_feature)
        second_feature = np.array(second_feature)
        print(first_feature.shape)
        print(second_feature.shape)

        # centroid of each condition across channels
        first_centroid = np.mean(first_feature,axis=0)
        second_centroid = np.mean(second_feature,axis=0)

        # distance of every channel's feature vector from the OTHER condition's centroid
        first_hist = [distances(x, second_centroid) for x in first_feature]
        second_hist = [distances(x, first_centroid) for x in second_feature]
        ## log transform
        # first_hist = np.log(first_hist)
        # second_hist = np.log(second_hist)

        ## Plot Histogram of Distances
        plt.subplot(num_groups, 1, idx+1)
        first_label = 'Distance for ' + group[0]
        second_label = 'Distance For ' + group[1]
        plt.hist(first_hist, label=first_label, lw=3, alpha = 0.75)
        plt.hist(second_hist, label=second_label, lw=3, alpha = 0.5)
        plt.title('Comparing Distances for: ' + group[0] + ' and ' + group[1])
        # [0, 2] is the range of the cosine distance; adjust if `distances` changes
        plt.xlim([0, 2.0])
        # the label used to say "Euclidean" although the metric comes from `distances`
        plt.xlabel('%s distance from "other" centroid' % metric_name.capitalize())
        plt.ylabel('Count')
        plt.legend(loc='upper right')
    plt.tight_layout()
In [ ]: