Libraries, utilities and definitions


In [10]:
import numpy as np
import pandas as pd

from math import log
from os import listdir
from os.path import isfile, join
from scipy.stats import linregress
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from time import time
from timeit import timeit

#Returns 1 if a distance p is within the radius r, 0 otherwise
def dist(p, r):
    return 1 if p <= r else 0

#Makes the distance function applicable to numpy arrays
check_dist = np.vectorize(dist)
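
A quick illustrative check of the helper (my own example, not part of the original experiment): counting how many entries of a pairwise distance matrix fall within a given radius, reusing the imports above.

#Three sample points; euclidean_distances comes from the imports above
points = np.array([[0.0, 0.0], [0.3, 0.4], [3.0, 4.0]])
dist_matrix = euclidean_distances(points)   #3x3 symmetric distance matrix
#Counts entries <= 0.5: the 3 diagonal zeros plus the pair at distance 0.5
#counted twice by symmetry, so this prints 5
print(np.sum(check_dist(dist_matrix, 0.5)))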

Fractal dimension feature selection algorithm

The algorithm calculates the approximate fractal dimension of the dataset after deleting each attribute in turn. If removing an attribute changes the value by less than the threshold, the attribute contributing the least is eliminated; the procedure ends when no attribute can be deleted from the dataset.


In [11]:
def fractal_feature_selection(df, threshold=0.09):
    
    #Obtains the approximate fractal dimension of the original dataset
    base_fd = fractal_dimension(df)
    print('Whole dataset approximate fractal dimension: {}'.format(base_fd))
    
    #Reference value that each partial fractal dimension is compared against
    indicator_fd = base_fd
    
    #List of [current_index, original_index] pairs for keeping track of the
    #attributes in their starting order, built from the number of columns
    sorted_attrib = [[i, i] for i in range(df.shape[1])]
    
    attribute_not_deleted = True
    while attribute_not_deleted:
        fd_list = []
        for i in sorted_attrib:
            #Deletes the i-th attribute from the dataset
            X = np.delete(df, i[0], axis=1)
            partial_fd = fractal_dimension(X)
            #Stores the partial fractal dimension and its relative variation
            #to later find the attribute that contributes the least
            variation = abs((partial_fd / indicator_fd) - 1)
            fd_list.append([i[0], partial_fd, variation, variation < threshold])
            
        #Sort by relative variation of the fractal dimension
        fd_list.sort(key=lambda row: row[2])
        
        for i in fd_list:
            #Checks if the variation of the fractal dimension is inside the threshold
            if i[3]:
                #Updates the reference fractal dimension value
                indicator_fd = i[1]
                #Deletes the attribute whose removal changes the fractal
                #dimension by less than the threshold
                df = np.delete(df, i[0], axis=1)
                #Deletes the i-th attribute from our reference list
                del sorted_attrib[i[0]]
                #Decrements the current index of the attributes to the right of the deleted one
                for j in range(i[0], len(sorted_attrib)):
                    sorted_attrib[j][0] -= 1
                break
            #No attribute was inside the threshold, so none was deleted
            attribute_not_deleted = False
    return sorted_attrib
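
An illustrative usage sketch on synthetic data (hypothetical, not part of the experiment): a small dataset where the last column is a linear copy of the first, so it adds little to the fractal dimension and should be a candidate for removal.

#Synthetic dataset: three random attributes plus one redundant linear copy
rng = np.random.RandomState(0)
base = rng.rand(300, 3)
synthetic = np.hstack([base, base[:, :1] * 2.0])
#Returns the surviving attributes as [current_index, original_index] pairs;
#the exact result depends on the sample and the threshold
kept = fractal_feature_selection(synthetic, threshold=0.05)
print(kept)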

Fractal dimension of a dataset

This algorithm calculates the approximate fractal dimension of the given dataset, which must be loaded into a NumPy array. For a decreasing series of radii r it computes the correlation integral C(r), the fraction of point pairs whose distance is at most r, and returns the slope of ln C(r) versus ln r as the approximate (correlation) fractal dimension.


In [12]:
def fractal_dimension(dataset):
    #Dataset cardinality
    N = len(dataset)
    #Results list of correlation integral values
    cm = []
    #List of radii used to test the distance between points
    r = [1.0]
    r_index = 0
    
    #Executes while the summation is greater than 0 and still changing
    tempSummation = 0
    while True:
        #Number of point pairs that return 1 in the Heaviside function
        summation = 0
        #Obtains the distance between point Xi and every point after it
        for j in range(N-1):
            euclidean_dist_array = euclidean_distances(dataset[j].reshape(1, -1), dataset[j+1:])
            summation += np.sum(check_dist(euclidean_dist_array, r[r_index]))
        if summation <= 0 or tempSummation == summation:
            break
        cm.append((2.0 * summation) / (N * (N - 1.0)))
        r.append(r[r_index] / 2.0)
        tempSummation = summation
        r_index += 1
    
    #Deletes the extra radius appended after the last accepted value
    del r[-1]
    
    #Calculates the natural log of both r and cm
    ln_r = np.log(r)
    ln_cm = np.log(cm)
    
    #Calculates a linear regression over the log-log points
    slope_as_fd, _, _, _, _ = linregress(ln_r, ln_cm)

    #Returns the slope as the approximate fractal dimension
    return slope_as_fd
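
As a quick sanity check of the estimator (an illustrative example, not from the original run), points spread uniformly along a straight line in 3-D space should yield an approximate fractal dimension close to 1.

#500 points along the segment t*(1, 1, 1) for t in [0, 1]
line = np.linspace(0, 1, 500).reshape(-1, 1) * np.array([[1.0, 1.0, 1.0]])
#The log-log slope should come out near 1.0 for a one-dimensional object
print(fractal_dimension(line))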

Fetching files


In [13]:
start_time = time()
path = r"..\..\Data\The Tesis EEG\Train"
files = [f for f in listdir(path) if isfile(join(path, f))]
print(files)


['EEG_Train_Filtered_0.5_30Hz_l4.csv', 'EEG_Train_Sorted.csv']

Processing the dataset

In this experiment, the training data are analyzed using a threshold value of 0.005.


In [14]:
threshold_values = [0.005]
#Apply fractal dimension feature selection to all the datasets in the folder for each one of the threshold values
for i in threshold_values:
    results = []
    for j in files:
        print(j)
        stdsc = StandardScaler()
        df = pd.read_csv(join(path, j))
        X = df.iloc[:, 0:14]
        X_std = stdsc.fit_transform(X)
        X_std = np.array(X_std)
    
        results.append(fractal_feature_selection(X_std, i))

    #Interpretation of the obtained results
    print('Threshold = {}'.format(i))
    for k in results:
        ref = list(range(14))
        #Marks the attributes that survived the selection with -1
        for l in k:
            ref[l[1]] = -1
        #Prints 1 for every surviving attribute and 0 for every deleted one
        for l in ref:
            if l >= 0:
                print('0'),
            else:
                print('1'),
        print('')
print('\nElapsed time: {}'.format(time() - start_time))


EEG_Train_Filtered_0.5_30Hz_l4.csv
Whole dataset approximate fractal dimension: 7.14545607293
EEG_Train_Sorted.csv
Whole dataset approximate fractal dimension: 5.24572496911
Threshold = 0.005
0 1 1 1 1 0 1 1 1 1 0 1 1 1 
0 1 1 0 1 0 1 1 1 1 1 1 1 1 

Elapsed time: 36182.3209999

Here we see that three attributes are deleted from each dataset (filtered and unfiltered); interestingly, the first and the sixth attributes are eliminated in both cases.
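
To make the rows above easier to read, here is a small illustrative helper (not part of the original run) that turns one of the printed 0/1 rows back into the indices of the deleted attributes.

#The first output row above: 0 marks a deleted attribute
mask = '0 1 1 1 1 0 1 1 1 1 0 1 1 1'.split()
deleted = [idx for idx, bit in enumerate(mask) if bit == '0']
print(deleted)   #[0, 5, 10] -> the 1st, 6th and 11th attributes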