In [1]:
import nibabel as nib
import numpy as np
from glob import glob
import os.path as osp
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import recall_score
from time import time
In [2]:
# Root directory of the VBM-preprocessed structural scans
# (one subdirectory per diagnostic group: bvFTD / psychiatric / neurological).
PARENT_DIR = '/data/shared/bvFTD/VBM/default/data'
# Number of voxels per image volume (121 x 145 x 121 grid).
# NOTE(review): presumably the SPM/MNI template dimensions used during
# preprocessing -- confirm against the image headers.
SIZE_VOXELS = 121 * 145 * 121
In [3]:
def _group_scans(group_name):
    """Return the paths of the smoothed structural images ('s*') for one diagnostic group."""
    return glob(osp.join(PARENT_DIR, group_name, '*', 'structural', 'mri', 's*'))

ftd_files = _group_scans('bvFTD')
psych_files = _group_scans('psychiatric')
neurol_files = _group_scans('neurological')
Potential TODO: one of the subjects in the bvFTD group shows a weird, possibly broken gray-matter (GM) pattern and might need to be excluded.
In [4]:
# Report how many scans were found per diagnostic group.
print('#FTD: {}; #Psychiatry: {}; #Neurological: {}'.format(
    len(ftd_files), len(psych_files), len(neurol_files)))
In [6]:
def load_data(X, files_to_load, start_id=0):
    """Load each NIfTI image in files_to_load, flatten it, and store it as
    consecutive rows of X starting at row start_id. Mutates X in place.
    """
    row = start_id
    for path in files_to_load:
        X[row] = nib.load(path).get_data().ravel().astype(np.float64)
        row += 1
In [14]:
def accuracy(y_true, y_pred):
    """Fraction of predictions that match the true labels."""
    return np.mean(y_true == y_pred)
def balanced_accuracy(y_true, y_pred):
    """Mean of the true-positive rate and the true-negative rate
    (binary labels 0/1); robust to class imbalance.
    """
    n_pos = float(y_true.sum())
    n_neg = float(y_true.size) - n_pos
    true_pos = np.sum((y_true == 1) & (y_pred == 1))
    true_neg = np.sum((y_true == 0) & (y_pred == 0))
    return 0.5 * (true_pos / n_pos + true_neg / n_neg)
def specificity(y_true, y_pred):
    """True-negative rate: fraction of actual negatives (label 0) predicted as 0."""
    correctly_rejected = (y_true == 0) & (y_pred == 0)
    return correctly_rejected.sum() / float((y_true == 0).sum())
def sensitivity(y_true, y_pred):
    """True-positive rate (recall): fraction of actual positives (label 1)
    predicted as 1.

    Implemented directly with numpy for consistency with specificity() above
    instead of delegating to sklearn.metrics.recall_score; equivalent to
    recall_score with its default pos_label=1 in the binary case.
    NOTE(review): if y_true contains no positives this yields nan (with a
    numpy warning) rather than recall_score's 0 -- not reachable in this
    notebook's setup, where every label vector contains both classes.
    """
    return np.sum((y_true == 1) & (y_pred == 1)) / float(np.sum(y_true == 1))
In [15]:
def make_classification(first_class_files, other_files, use_pca=False, additional_files=None):
    """Run a leave-one-out linear-SVM classification on the given image files.

    Parameters
    ----------
    first_class_files : list of str
        Image paths of the first group (label 1; label 2 in the 3-class setting).
    other_files : list of str
        Image paths of the second group (label 0; label 1 in the 3-class setting).
    use_pca : bool
        If True, reduce the data with PCA (components covering 90% of the
        variance) fitted on each training fold before training the SVM.
    additional_files : list of str or None
        If given, a third group (label 0) is added and a 3-class one-vs-one
        SVM is trained instead of a binary one.

    Prints per-fold progress and final accuracy metrics; returns nothing.
    """
    if additional_files is None:
        X = np.zeros((len(first_class_files) + len(other_files), SIZE_VOXELS))
        y = np.concatenate((np.ones(len(first_class_files), dtype=int),
                            np.zeros(len(other_files), dtype=int)))
        load_data(X, first_class_files)
        load_data(X, other_files, start_id=len(first_class_files))
    else:
        X = np.zeros((len(first_class_files) + len(other_files) + len(additional_files), SIZE_VOXELS))
        # BUGFIX: this used np.ones(len(first_class_files) * 2), which made y
        # longer than X and produced only two distinct labels. For the 3-class
        # problem each group gets its own label: 2 / 1 / 0.
        y = np.concatenate((np.full(len(first_class_files), 2, dtype=int),
                            np.ones(len(other_files), dtype=int),
                            np.zeros(len(additional_files), dtype=int)))
        load_data(X, first_class_files)
        # BUGFIX: start_id used the global ftd_files instead of the
        # first_class_files argument, which is wrong for non-FTD first classes.
        load_data(X, other_files, start_id=len(first_class_files))
        load_data(X, additional_files, start_id=len(first_class_files) + len(other_files))
    # Since we don't have a mask, we create one for ourselves by checking which
    # voxels are zero across all the subjects.
    id_keep = ~np.all(X == 0, axis=0)
    X = X[:, id_keep]
    y_pred = np.zeros_like(y)
    pca = PCA(n_components=0.9)  # keep enough components for 90% explained variance
    loo = LeaveOneOut()
    t1, t2 = 0., 0.
    for id_split, (train_id, test_id) in enumerate(loo.split(X)):
        print('{}/{} {}'.format(id_split + 1, loo.get_n_splits(X), t2 - t1))
        t1 = time()
        X_train, y_train = X[train_id, :], y[train_id]
        X_test, y_test = X[test_id, :], y[test_id]
        if use_pca:
            # Fit PCA on the training fold only, to avoid leaking the test sample.
            X_train = pca.fit_transform(X_train)
            print('Number PCA Components: {}'.format(pca.components_.shape[0]))
            X_test = pca.transform(X_test)
        # For multiclass: one-vs-one classification as done in libsvm.
        svm = SVC(kernel='linear', class_weight='balanced', decision_function_shape='ovo')
        svm.fit(X_train, y_train)
        y_pred[test_id] = svm.predict(X_test)
        t2 = time()
    if np.unique(y).size == 2:
        print('Accuracy: {}, Balanced Accuracy: {}, Sensitivity: {}, Specificity: {}'.format(
            accuracy(y, y_pred), balanced_accuracy(y, y_pred),
            sensitivity(y, y_pred), specificity(y, y_pred)))
    else:
        # Sensitivity/specificity/balanced accuracy are binary metrics and are
        # not defined for the 3-class setting; report plain accuracy only.
        print('Accuracy: {}'.format(accuracy(y, y_pred)))
In [8]:
# Binary classifications on the raw voxel data (no dimensionality reduction).
print('No PCA')
print('FTD vs. Psychiatry')
make_classification(ftd_files, psych_files)
print('Ratio: #Psychiatry/#FTD {}'.format(float(len(psych_files)) / len(ftd_files)))
print('FTD vs. Neurological')
make_classification(ftd_files, neurol_files)
print('Ratio: #Neurological/#FTD {}'.format(float(len(neurol_files)) / len(ftd_files)))
print('Psychiatry vs. Neurological')
make_classification(psych_files, neurol_files)
# BUGFIX: the label says #Neurological/#Psychiatry but the ratio was computed
# as psych/neurol; compute what the label states (matching the pattern above).
print('Ratio: #Neurological/#Psychiatry {}'.format(float(len(neurol_files)) / len(psych_files)))
In [11]:
# Same binary classifications, but with per-fold PCA dimensionality reduction.
print('Use PCA')
print('FTD vs. Psychiatry')
make_classification(ftd_files, psych_files, use_pca=True)
print('Ratio: #Psychiatry/#FTD {}'.format(float(len(psych_files)) / len(ftd_files)))
print('FTD vs. Neurological')
make_classification(ftd_files, neurol_files, use_pca=True)
print('Ratio: #Neurological/#FTD {}'.format(float(len(neurol_files)) / len(ftd_files)))
print('Psychiatry vs. Neurological')
make_classification(psych_files, neurol_files, use_pca=True)
# BUGFIX: the label says #Neurological/#Psychiatry but the ratio was computed
# as psych/neurol; compute what the label states (matching the pattern above).
print('Ratio: #Neurological/#Psychiatry {}'.format(float(len(neurol_files)) / len(psych_files)))
TODO: the non-modulated analysis below does not work yet!
In [ ]:
# Non-modulated VBM data: same directory layout under a different root.
PARENT_DIR = '/data/shared/bvFTD/VBM/default_non_modulated'
ftd_non_mod_files = glob(osp.join(PARENT_DIR, 'bvFTD', '*', 'structural', 'mri', 's*'))
psych_non_mod_files = glob(osp.join(PARENT_DIR, 'psychiatric', '*', 'structural', 'mri', 's*'))
neurol_non_mod_files = glob(osp.join(PARENT_DIR, 'neurological', '*', 'structural', 'mri', 's*'))
print('No PCA, Non-Modulated')
print('FTD vs. Psychiatry')
make_classification(ftd_non_mod_files, psych_non_mod_files)
# BUGFIX: the ratios below used the modulated file lists (psych_files etc.)
# instead of the non-modulated lists actually being classified, and the last
# ratio was inverted relative to its label.
print('Ratio: #Psychiatry/#FTD {}'.format(float(len(psych_non_mod_files)) / len(ftd_non_mod_files)))
print('FTD vs. Neurological')
make_classification(ftd_non_mod_files, neurol_non_mod_files)
print('Ratio: #Neurological/#FTD {}'.format(float(len(neurol_non_mod_files)) / len(ftd_non_mod_files)))
print('Psychiatry vs. Neurological')
make_classification(psych_non_mod_files, neurol_non_mod_files)
print('Ratio: #Neurological/#Psychiatry {}'.format(float(len(neurol_non_mod_files)) / len(psych_non_mod_files)))
In [ ]:
# Three-group classification: bvFTD vs. psychiatric vs. neurological.
print('Multiclass: No PCA')
make_classification(ftd_files, psych_files, use_pca=False, additional_files=neurol_files)