In [10]:
import numpy as np
import pandas as pd
from math import log
from os import listdir
from os.path import isfile, join
from scipy.stats import linregress
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from time import time
from timeit import timeit
# Returns 1 if a distance is within the radius, 0 otherwise
def dist(p, r):
    return 1 if p <= r else 0

# Makes the distance function applicable element-wise to NumPy arrays
check_dist = np.vectorize(dist)
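For instance (a small sketch with made-up distance values, not taken from the experiment), `check_dist` maps an array of pairwise distances to 1s and 0s for a given radius:
In [ ]:
# Hypothetical distances; values less than or equal to the radius 1.0 count as inside
check_dist(np.array([0.2, 1.5, 0.9]), 1.0)  # -> array([1, 0, 1])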
The algorithm is adjusted to the dataset of this experiment, so the number of attributes must be modified for other data. On each iteration it computes the approximate fractal dimension after deleting one attribute; if the resulting change stays within the threshold, the attribute is eliminated. The process ends when no attribute can be deleted from the dataset.
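Concretely, on each pass the attribute whose removal changes the dimension the least is considered first, and it is dropped when the relative change satisfies $\left|\frac{D_{\text{partial}}}{D_{\text{current}}} - 1\right| < \text{threshold}$, where $D_{\text{current}}$ denotes the current reference fractal dimension and $D_{\text{partial}}$ the value computed without the candidate attribute (these symbols are just labels for the quantities used in the code below).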
In [11]:
def fractal_feature_selection(df, threshold=0.09):
    # Obtains the approximate fractal dimension of the original dataset
    base_fd = fractal_dimension(df)
    print('Whole dataset approximate fractal dimension: {}'.format(base_fd))
    # Reference fractal dimension, updated every time an attribute is deleted
    indicator_fd = base_fd
    # List for keeping track of the attribute indices in their starting order
    # (adjust its length to the number of attributes in the dataset)
    sorted_attrib = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5],
                     [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13]]
    attribute_not_deleted = True
    while attribute_not_deleted:
        fd_list = []
        for i in sorted_attrib:
            # Deletes attribute i from the dataset
            X = np.delete(df, i[0], axis=1)
            partial_fd = fractal_dimension(X)
            # Records the approximate fractal dimension of the reduced dataset so the
            # attribute that contributes the least to the whole dataset can be found
            fd_list.append([i[0],
                            partial_fd,
                            abs((partial_fd / indicator_fd) - 1),
                            abs((partial_fd / indicator_fd) - 1) < threshold])
        # Sorts by the variation of the partial fractal dimension
        fd_list.sort(key=lambda row: row[2])
        for i in fd_list:
            # Checks if the variation of the fractal dimension value is inside the threshold
            if i[3]:
                # Updates the reference fractal dimension value
                indicator_fd = i[1]
                # Deletes the attribute that contributes less than the threshold value
                # to the fractal dimension
                df = np.delete(df, i[0], axis=1)
                # Deletes attribute i from the reference list
                sorted_attrib = np.delete(sorted_attrib, i[0], axis=0)
                # Decrements the relative index of the attributes to the right of the deleted one
                for j in range(i[0], len(sorted_attrib)):
                    sorted_attrib[j][0] -= 1
                break
        else:
            # No attribute was deleted
            attribute_not_deleted = False
    return sorted_attrib
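The function below approximates the fractal dimension through the correlation integral (the quantity behind the Grassberger–Procaccia correlation dimension): for a radius $r$, $C(r) = \frac{2}{N(N-1)} \sum_{i<j} H(r - \lVert x_i - x_j \rVert)$ is the fraction of point pairs closer than $r$, with the Heaviside step $H$ implemented by the `check_dist` function above. The radius is repeatedly halved, and the approximate fractal dimension is the slope of the linear regression of $\ln C(r)$ against $\ln r$.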
In [12]:
def fractal_dimension(dataset):
    # Dataset cardinality
    N = len(dataset)
    # List of correlation integral values
    cm = []
    # List of radii used to test the distance between points
    r = [1.0]
    r_index = 0
    # Executes while the summation is greater than 0 and still changing
    temp_summation = 0
    while True:
        # Number of point pairs that return 1 in the Heaviside function
        summation = 0
        # Obtains the distance between point Xi and all the points after it
        for j in range(N - 1):
            euclidean_dist_array = euclidean_distances(dataset[j].reshape(1, -1), dataset[j+1:])
            summation += np.sum(check_dist(euclidean_dist_array, r[r_index]))
        if summation <= 0 or temp_summation == summation:
            break
        cm.append((2.0 * summation) / (N * (N - 1.0)))
        r.append(r[r_index] / 2.0)
        temp_summation = summation
        r_index += 1
    # Deletes the extra radius appended to r
    del r[-1]
    # Calculates the natural log of both r and cm
    ln_r = [log(x) for x in r]
    ln_cm = [log(x) for x in cm]
    # Calculates the linear regression of the points
    slope_as_fd, _, _, _, _ = linregress(ln_r, ln_cm)
    # Returns the slope as the approximate fractal dimension
    return slope_as_fd
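As a quick sanity check (a minimal sketch on synthetic data, not part of the original experiment), points sampled along a straight line should give an approximate fractal dimension close to 1:
In [ ]:
# Hypothetical example: 200 evenly spaced points on a line embedded in 3-D space
line = np.linspace(0, 1, 200).reshape(-1, 1) * np.array([[1.0, 2.0, -1.0]])
print(fractal_dimension(line))  # expected to be roughly 1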
In [13]:
start_time = time()
path = "..\..\Data\The Tesis EEG\Train"
files = [f for f in listdir(path) if isfile(join(path, f))]
print(files)
In [14]:
threshold_values = [0.005]
# Applies the fractal dimension feature selection to all the datasets in the folder
# for each one of the threshold values
for i in threshold_values:
    results = []
    for j in files:
        print(j)
        stdsc = StandardScaler()
        df = pd.read_csv(join(path, j))
        X = df.iloc[:, 0:14]
        X_std = stdsc.fit_transform(X)
        X_std = np.array(X_std)
        results.append(fractal_feature_selection(X_std, i))
    # Interpretation of the obtained results
    print('Threshold = {}'.format(i))
    for k in results:
        ref = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
        for l in k:
            # Marks the attributes that were kept by the selection
            ref[l[1]] = -1
        for l in ref:
            # Prints 0 for deleted attributes and 1 for the ones that were kept
            if l >= 0:
                print('0', end=' ')
            else:
                print('1', end=' ')
        print('')
print('\nElapsed time: {}'.format(time() - start_time))
Here we see that on both datasets (filtered and unfiltered) three attributes are deleted; interestingly, the first and the sixth are eliminated in both of them.
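For reference, a minimal sketch (reusing the `results`, `path`, and `files` variables from the cells above) of how the surviving original column indices could be used to keep only the selected attributes of a dataset:
In [ ]:
# Each row of the returned list is [current index, original index];
# the second entry identifies the surviving column in the original dataset
kept = [int(attrib[1]) for attrib in results[0]]
reduced = pd.read_csv(join(path, files[0])).iloc[:, kept]
print(reduced.shape)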