In [2]:
%matplotlib inline
# auto-reload modified modules so class definitions stay current
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import os
from seizures.Global import Global

In [3]:
matdict = scipy.io.loadmat(Global.get_subject_folder(os.path.join('Dog_1', 'Dog_1_ictal_segment_1.mat')))
data = matdict['data']

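Besides 'data', the .mat segments should also carry metadata. A minimal sketch that lists whatever fields are actually present; the names 'data_length_sec', 'sampling_frequency', 'channels' and 'latency' are assumed from the Kaggle competition format and not verified against this file:

In [ ]:
# list the non-internal fields of the loaded .mat dict; metadata field
# names ('sampling_frequency', 'latency', ...) are assumptions from the
# competition format
for key, value in matdict.iteritems():
    if not key.startswith('__'):
        print key, getattr(value, 'shape', value)
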
In [4]:
# plot each channel 
plt.figure(figsize=(10,4))
plt.plot(data.T)
plt.xlabel('time (samples)')
plt.ylabel('amplitude')
plt.title('%d channels' % data.shape[0])
plt.grid(True)



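Plotting all channels on one axis makes them overlap heavily; a vertically offset ("stacked") view is often easier to read. A sketch:

In [ ]:
# stacked view: offset each channel by a few standard deviations so the
# traces do not overlap
offsets = np.arange(data.shape[0]) * 4 * data.std()
plt.figure(figsize=(10, 8))
plt.plot(data.T + offsets)
plt.xlabel('time (samples)')
plt.yticks(offsets, ['ch %d' % i for i in range(data.shape[0])])
plt.title('stacked channels')
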
In [5]:
plt.figure(figsize=(10,4))
plt.plot(data[8,:])


Out[5]:
[<matplotlib.lines.Line2D at 0xad7dee0c>]

In [6]:
# flatten the (channels x samples) array into one long vector
np.hstack(data).shape


Out[6]:
(6400,)

Test DataLoader_v2


In [7]:
from seizures.data.DataLoader_v2 import DataLoader
from seizures.features.RandomFeatures import RandomFeatures
from seizures.features.ARFeatures import ARFeatures
base_folder = Global.path_map('clips_folder')
#base_folder = '/home/nuke/git/gatsby-hackathon-seizure/wj_data/'
feature_extractor = ARFeatures()
loader = DataLoader(base_folder, feature_extractor)
subject_name = 'Dog_1'

X, type_labels, early_labels = loader.training_data(subject_name, max_segments=20)


Loading train data for Dog_1
Load with extractor = AR
418 interictal segments for Dog_1
178 ictal segments for Dog_1
subsampling from 596 segments to 20
95.0  percent complete         
done

In [8]:
# feature matrix: one row per segment, one column per feature
X.shape


Out[8]:
(20, 528)

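type_labels marks ictal vs. interictal segments and early_labels marks segments near seizure onset (assuming the competition's labeling; 1 = positive class). A quick check of the class balance in this subsample:

In [ ]:
# class balance of the 20 subsampled segments (assuming 1 = ictal / early)
print 'ictal: %d / %d' % (np.sum(type_labels), len(type_labels))
print 'early: %d / %d' % (np.sum(early_labels), len(early_labels))
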
In [9]:
plt.plot(X[2, :])


Out[9]:
[<matplotlib.lines.Line2D at 0xa8d2382c>]

In [10]:
# the load_data() method fills the features_train attribute
assert(type(loader.features_train)==list)
features_train = loader.features_train
print '#ictal+interictal segments in %s = %d' % (subject_name, len(features_train))
print '#features of %s = %d' % (type(feature_extractor), len(features_train[0]))
# loader.features_train


#ictal+interictal segments in Dog_1 = 20
#features of <class 'seizures.features.ARFeatures.ARFeatures'> = 528

In [11]:
# data for cross-validation
n_fold = 3
max_segments = 200
X_list, y_seizure, y_early = loader.blocks_for_Xvalidation(subject_name, n_fold, max_segments)


Loading train data for Dog_1
Load with extractor = AR
418 interictal segments for Dog_1
178 ictal segments for Dog_1
subsampling from 596 segments to 200
99.5  percent complete         
done

In [12]:
print len(X_list)
print len(y_seizure)
print len(y_early)
sum(x.shape[0] for x in X_list)


3
3
3
Out[12]:
198

In [13]:
X_list[2].shape  # evaluated but not echoed; only the last expression is displayed
y_early[0]


Out[13]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

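The blocks from blocks_for_Xvalidation can drive a manual cross-validation loop directly. A minimal sketch; since ForestPredictor's interface is not shown in this notebook, it uses sklearn's RandomForestClassifier instead:

In [ ]:
# manual CV over the n_fold blocks, using sklearn's RandomForestClassifier
# rather than the project's ForestPredictor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

for i in range(n_fold):
    Xtr = np.vstack([X_list[j] for j in range(n_fold) if j != i])
    ytr = np.hstack([y_seizure[j] for j in range(n_fold) if j != i])
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(Xtr, ytr)
    prob = rf.predict_proba(X_list[i])[:, 1]
    print 'fold %d: seizure AUC = %.3f' % (i, roc_auc_score(y_seizure[i], prob))
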
Test gradient boosting classifier


In [14]:
def gen_gaussian_sum(n, d):
    """Toy data: X is an n x d Gaussian matrix; y indicates whether the
    noisy sum of each row's coordinates exceeds 0.5."""
    X = np.random.randn(n, d)
    y = ((np.sum(X, 1) + np.random.randn(n) * 0.1) > 0.5).astype(np.int)
    return X, y

# n x d data matrix
n = 100
d = 10
X, y = gen_gaussian_sum(n, d)

print X.shape
print y.shape


(100, 10)
(100,)

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
# fit a gradient boosting classifier
# n_estimators = number of boosting stages
gbc = GradientBoostingClassifier(n_estimators=200, verbose=True)
gbc.fit(X, y)


      Iter       Train Loss   Remaining Time 
         1           1.2402            0.50s
         2           1.1211            0.50s
         3           1.0220            0.46s
         4           0.9448            0.43s
         5           0.8751            0.39s
         6           0.8037            0.36s
         7           0.7516            0.33s
         8           0.6955            0.31s
         9           0.6477            0.30s
        10           0.5959            0.29s
        20           0.3053            0.22s
        30           0.1697            0.20s
        40           0.0980            0.18s
        50           0.0586            0.16s
        60           0.0357            0.15s
        70           0.0219            0.14s
        80           0.0136            0.13s
        90           0.0083            0.12s
       100           0.0052            0.11s
       200           0.0002            0.00s
Out[15]:
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, min_samples_leaf=1,
              min_samples_split=2, n_estimators=200, random_state=None,
              subsample=1.0, verbose=True)

In [16]:
# testing data
nte = 50
Xte, yte = gen_gaussian_sum(nte, d)

In [17]:
# test the learned gradient boosting classifier
ypred = gbc.predict(Xte)
print 'ground truth: '
print yte
print 'predicted: '
print ypred
print 'accuracy = %f' % np.mean(ypred == yte)


ground truth: 
[0 1 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0
 0 0 0 1 1 0 0 1 0 1 1 1 1]
predicted: 
[1 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1
 0 1 0 1 1 0 1 1 0 1 1 1 0]
accuracy = 0.740000

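The training loss above falls to nearly zero while held-out accuracy is only 0.74, which suggests overfitting. GradientBoostingClassifier's staged_predict lets us track test accuracy as boosting stages are added:

In [ ]:
# test accuracy as a function of the number of boosting stages
staged_acc = [np.mean(yp == yte) for yp in gbc.staged_predict(Xte)]
plt.figure(figsize=(10, 4))
plt.plot(staged_acc)
plt.xlabel('boosting stage')
plt.ylabel('test accuracy')
plt.grid(True)
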
In [18]:
# class-membership probabilities; ypred_prob has shape (nte, 2), one column per class
ypred_prob = gbc.predict_proba(Xte)

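Since the tables further below report AUC, the same threshold-free metric can be computed here from the positive-class column of ypred_prob:

In [ ]:
from sklearn.metrics import roc_auc_score
print 'test AUC = %.3f' % roc_auc_score(yte, ypred_prob[:, 1])
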
Test my AdaBoostTrees


In [19]:
import seizures.prediction.Boosting
seizures.prediction.Boosting.main()


fitting AdaBoost trees
[ 0.49947853]

Test scipy stats


In [20]:
import scipy.stats as st
import numpy as np
from seizures.features.StatsFeatures import StatsFeatures

from seizures.data.DataLoader_v2 import DataLoader
from seizures.features.RandomFeatures import RandomFeatures
from seizures.features.ARFeatures import ARFeatures

In [21]:
base_folder = Global.path_map('clips_folder')
feature_extractor = StatsFeatures()
loader = DataLoader(base_folder, feature_extractor)
subject_name = 'Dog_1'

X, type_labels, early_labels = loader.training_data(subject_name, max_segments=10)
X.shape


Loading train data for Dog_1
Load with extractor = Stats
418 interictal segments for Dog_1
178 ictal segments for Dog_1
subsampling from 596 segments to 10
90.0  percent complete         
done
Out[21]:
(10, 48)

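scipy.stats is imported above but not exercised directly. A sketch of hand-rolled per-channel summary statistics on the raw segment loaded earlier; this is a hypothetical choice of statistics, not necessarily the set StatsFeatures computes:

In [ ]:
# hypothetical per-channel stats on the raw `data` array from above;
# StatsFeatures may compute a different set
stats = np.vstack([data.mean(axis=1), data.std(axis=1),
                   st.skew(data, axis=1), st.kurtosis(data, axis=1)]).T
print stats.shape  # (n_channels, 4)
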
Test plainica


In [130]:
import scot
import numpy as np
from seizures.data.SubjectEEGData import SubjectEEGData

loader = SubjectEEGData('Patient_1', max_train_segments=10)
L = loader.get_train_data()
tup = L[9]              # one (instance, ...) tuple from the training list
ins = tup[0]
dndata = ins.eeg_data
nddata = dndata.T       # transpose so channels run along the second axis

# reducedim: a number less than 1 is interpreted as the fraction of
# variance that should remain in the data; an integer is the number
# of components to keep
keep = min(nddata.shape[1], 5)
ica = scot.plainica.plainica(nddata, reducedim=keep)


104 interictal segments for Patient_1
70 ictal segments for Patient_1
subsampling from 174 segments to 10
90.0  percent complete         
done
/home/nuke/local/lib/python2.7/site-packages/scot/binica/ica_linux found
waiting for binica to finish...

In [131]:
M = ica.mixing
U = ica.unmixing
# plot mixing matrix

plt.imshow(M)
plt.colorbar()
plt.title('mixing matrix')

# plot unmixing matrix
plt.figure()
plt.imshow(U)
plt.colorbar()
plt.title('unmixing matrix')


Out[131]:
<matplotlib.text.Text at 0xa6ab9acc>

Compare several feature extractors and predictors on one patient's dataset


In [1]:
from seizures.features.ARFeatures import *
from seizures.features.MixFeatures import StackFeatures
from seizures.features.PLVFeatures import PLVFeatures
from seizures.features.XCHUHFeatures import XCHUHFeatures
from seizures.features.SEFeatures import SEFeatures
#from seizures.features.LyapunovFeatures import LyapunovFeatures
from seizures.features.StatsFeatures import StatsFeatures
from seizures.prediction.ForestPredictor import ForestPredictor
from seizures.prediction.Boosting import AdaBoostTrees
from seizures.pipelines.FeaturePredictorTest import CVFeaturesPredictorsTester
from seizures.pipelines.FeaturePredictorTest import CachedCVFeaPredTester


feature_extractors = [
    ARFeatures(), 
    SEFeatures(),
    StatsFeatures(),
    #VarLagsARFeatures(4),
    #StackFeatures(ARFeatures(), VarLagsARFeatures(4), VarLagsARFeatures(6))
    #StackFeatures(ARFeatures(), VarLagsARFeatures(4)),
    #StackFeatures(ARFeatures(), PLVFeatures(), SEFeatures()),
    #PLVFeatures(),
    #StackFeatures(ARFeatures(), VarLagsARFeatures(5), PLVFeatures()), 
    #StackFeatures(ARFeatures(),  PLVFeatures(), XCHUHFeatures(), SEFeatures())
]
#predictors = [AdaBoostTrees(), ForestPredictor(n_estimators=200)]
predictors = [ ForestPredictor(n_estimators=100, max_features=0.2)]
patient = 'Dog_1'
#patient = 'Patient_1'
#tester = CVFeaturesPredictorsTester(feature_extractors, predictors, patient)
tester = CachedCVFeaPredTester(feature_extractors, predictors, patient)
# Randomly subsample the total segments (ictal + interictal) to make this
# faster; using the full data should give a similar result.
max_segments = 400

table = tester.test_combination(fold=2, max_segments=max_segments)


418 interictal segments for Dog_1
178 ictal segments for Dog_1
subsampling from 596 segments to 400
99.75  percent complete         
done
Extracting features with AR
Extracting features with SE
Extracting features with Stats
Evaluating feat: AR + pred: Forest on seizure task
Fitting a random forest predictor
Fitting a random forest predictor
Evaluating feat: AR + pred: Forest on early seizure task
Fitting a random forest predictor
Fitting a random forest predictor
Evaluating feat: SE + pred: Forest on seizure task
Fitting a random forest predictor
Fitting a random forest predictor
Evaluating feat: SE + pred: Forest on early seizure task
Fitting a random forest predictor
Fitting a random forest predictor
Evaluating feat: Stats + pred: Forest on seizure task
Fitting a random forest predictor
Fitting a random forest predictor
Evaluating feat: Stats + pred: Forest on early seizure task
Fitting a random forest predictor
Fitting a random forest predictor

In [2]:
# the argument to print_table(..) can be one of:
# 'seizure_mean_auc', 'seizure_std_auc', 'early_mean_auc', 'early_std_auc'
table.print_table('seizure_mean_auc')


# From FeaturesPredictsTable
Reporting seizure_mean_auc
+---------------+--------+
| feat. \ pred. | Forest |
+---------------+--------+
|       AR      | 0.988  |
|       SE      | 0.989  |
|     Stats     | 0.858  |
+---------------+--------+

In [3]:
table.print_table('early_mean_auc')


# From FeaturesPredictsTable
Reporting early_mean_auc
+---------------+--------+
| feat. \ pred. | Forest |
+---------------+--------+
|       AR      | 0.916  |
|       SE      | 0.961  |
|     Stats     | 0.721  |
+---------------+--------+

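AR and SE are the two strongest single extractors in both tables; the commented-out StackFeatures lines above suggest combining them. A sketch, assuming StackFeatures takes extractor instances as positional arguments as those lines indicate:

In [ ]:
# stack the two best single extractors and rerun the same comparison
stacked = StackFeatures(ARFeatures(), SEFeatures())
tester = CachedCVFeaPredTester([stacked], predictors, patient)
table = tester.test_combination(fold=2, max_segments=max_segments)
table.print_table('early_mean_auc')
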
In [ ]: