This tutorial provides a quick introduction to running machine learning analyses using modern Python-based modules. We will briefly cover supervised methods using an example pain dataset from Chang et al., (In Press) and Krishnan et al., (Under Review), which is publicly available on NeuroVault. We will also provide a brief example of unsupervised methods to parcellate a region of interest into functionally homogeneous subregions using Neurosynth.
In [1]:
# IPython notebook magic commands
%load_ext autoreload
%autoreload 2
%matplotlib inline
#General modules
import os
from os.path import join, basename, isdir
from os import makedirs
import pandas as pd
import matplotlib.pyplot as plt
import time
import pickle
# Supervised Modules
from pyneurovault import api
import nibabel as nb
import numpy as np
from nltools.analysis import Predict, apply_mask, Roc
from nltools.utils import get_resource_path
# Unsupervised Modules
# from neurosynth import Dataset, Clusterer, Masker, Decoder
# from neurosynth.analysis.cluster import cluster_similarity
from nilearn import plotting, datasets
from sklearn.decomposition import RandomizedPCA
# Define output folder
out_folder = "/Users/lukechang/Downloads/nv_tmp"
In [4]:
tic = time.time() #Start Timer
# Pain Collection
collection = 504
# Will extract all collections and images in one query to work from
nv = api.NeuroVault()
# Download all images to file
standard = os.path.join(os.path.dirname(api.__file__),'data','MNI152_T1_2mm_brain.nii.gz')
nv.download_images(dest_dir = out_folder,target=standard, collection_ids=[collection],resample=True)
# Create Variables
collection_data = nv.get_images_df().ix[nv.get_images_df().collection_id == collection,:].reset_index()
img_index = sorted((e,i) for i,e in enumerate(collection_data.file))
index = [x[1] for x in img_index]
img_file = [x[0] for x in img_index]
dat = nb.funcs.concat_images([os.path.join(out_folder,'resampled','00' + str(x) + '.nii.gz') for x in collection_data.image_id[index]])
# dat = nb.funcs.concat_images([os.path.join(out_folder,'original',str(x) + '.nii.gz') for x in collection_data.image_id[index]])
holdout = [int(x.split('_')[-2]) for x in img_file]
heat_level = [x.split('_')[-1].split('.')[0] for x in img_file]
Y_dict = {'High':3,'Medium':2,'Low':1}
Y = np.array([Y_dict[x] for x in heat_level])
print Y
# Pickle for later use
# Saving the objects:
with open(os.path.join(out_folder,'Pain_Data.pkl'), 'wb') as f:
    pickle.dump([dat,holdout,Y], f)
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
In [2]:
tic = time.time() #Start Timer
# Getting back the objects:
with open(os.path.join(out_folder,'Pain_Data.pkl'), 'rb') as f:
    dat, holdout, Y = pickle.load(f)
print 'Load Pickled File - Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
This code initializes a Predict object from nltools. It requires the brain image data (dat), a vector of training labels (Y), and the name of an algorithm; here we also pass an output directory, a cross-validation dictionary, and keyword arguments for the underlying estimator.
This cell runs Linear Support Vector Regression ('svr') with 5 fold cross-validation.
In [3]:
tic = time.time() #Start Timer
# Test Prediction with kfold xVal
svr = Predict(dat,Y,algorithm='svr', output_dir=out_folder,
              cv_dict = {'type':'kfolds','n_folds':5,'subject_id':holdout},
              **{'kernel':"linear"})
svr.predict()
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
Run Linear Support Vector Regression with leave-one-subject-out cross-validation (LOSO).
In [193]:
tic = time.time() #Start Timer
# Test Prediction with LOSO xVal
svr = Predict(dat,Y,algorithm='svr', output_dir=out_folder,
              cv_dict = {'type':'loso','subject_id':holdout},
              **{'kernel':"linear"})
svr.predict()
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
Run Ridge Regression with 5 fold cross-validation and a nested cross-validation to estimate the shrinkage (regularization) parameter.
In [176]:
tic = time.time() #Start Timer
# Test Ridge with kfold xVal + grid search for regularization
ridge = Predict(dat, Y, algorithm='ridgeCV',output_dir=out_folder,
                cv_dict = {'type':'kfolds','n_folds':5,'subject_id':holdout},
                **{'alphas':np.linspace(.1, 10, 5)})
ridge.predict()
print 'Total Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
Run Principal Components Regression with 5 fold cross-validation. This pattern should be very similar to the pain pattern reported in Krishnan et al., (Under Review). Principal Components Regression is much slower than the other linear methods, but scales well when the feature set is large.
In [177]:
tic = time.time() #Start Timer
# Principal Components Regression
pcr = Predict(dat,Y,algorithm='pcr', output_dir=out_folder,
              cv_dict = {'type':'kfolds','n_folds':5,'subject_id':holdout})
pcr.predict()
print 'Total Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
You might be interested in training a pattern on only a subset of the brain using an anatomical mask. Here we use the gray matter mask included with nltools.
In [21]:
tic = time.time() #Start Timer
mask = nb.load(os.path.join(get_resource_path(),'gray_matter_mask.nii.gz'))
# Test Prediction with kfold xVal
ridge = Predict(dat,Y,algorithm='ridge', output_dir=out_folder,
                cv_dict = {'type':'kfolds','n_folds':5, 'subject_id':holdout},
                mask = mask)
ridge.predict()
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
After training a pattern, we are typically interested in testing it on new data to see how sensitive and specific it is to the construct it was trained to predict. Here we provide an example of applying it to a new dataset using the 'dot_product' method. This produces a scalar prediction per image, akin to regression (don't forget to add the intercept; see the sketch after the next cell). We can also use a standardized method that examines the overall spatial correlation between the two images (i.e., 'correlation').
In [199]:
tic = time.time() #Start Timer
# Load data using nibabel
pines = nb.load(os.path.join(out_folder, 'ridgeCV_weightmap.nii.gz'))
pexpd = apply_mask(data=dat, weight_map=pines, output_dir=out_folder, method='dot_product', save_output=True)
pexpc = apply_mask(data=dat, weight_map=pines, output_dir=out_folder, method='correlation', save_output=True)
plt.subplot(2, 1, 1)
plt.plot(pexpd)
plt.title('Pattern Expression')
plt.ylabel('Dot Product')
plt.subplot(2, 1, 2)
plt.plot(pexpc)
plt.xlabel('Subject')
plt.ylabel('Correlation')
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
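As noted above, the 'dot_product' method returns unscaled pattern expression values; to put them on the same scale as the training outcome you need to add the model's intercept. A minimal sketch, assuming the scalar intercept from the training step is available as intercept (it is not extracted anywhere in this notebook):
In [ ]:
# Hypothetical: 'intercept' stands in for the trained model's intercept term,
# which is not computed above. Adding it is simple elementwise addition.
intercept = 0.0
pexpd_scaled = np.array(pexpd) + intercept
print pexpd_scaled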
We are often interested in evaluating how well a pattern can discriminate between different classes of data. Here we apply a pattern to a new dataset and evaluate how well it can discriminate between high and low pain using single-interval classification. We use the Roc class to initialize an Roc object, and the plot() and summary() methods to run the analyses. We could also just run the calculate() method to run the analysis without plotting (see the sketch below).
In [201]:
tic = time.time() #Start Timer
# Create Variables
include = (svr.Y==3) | (svr.Y==1)
input_values = svr.yfit_xval[include]
binary_outcome = svr.Y[include]
binary_outcome = binary_outcome==3
# Single-Interval
roc = Roc(input_values=input_values, binary_outcome=binary_outcome)
roc.plot()
roc.summary()
print 'Total Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
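As mentioned above, if you only want the statistics without a figure you can call the calculate() method directly. A minimal sketch, assuming calculate() can be run with no additional arguments once the Roc object has been initialized:
In [ ]:
# Compute ROC statistics without producing a plot
roc_stats = Roc(input_values=input_values, binary_outcome=binary_outcome)
roc_stats.calculate()
roc_stats.summary()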
The above example uses single-interval classification, which attempts to determine the optimal classification interval. However, sometimes we are interested in directly comparing responses to two images within the same person. In this situation we should use forced-choice classification, which looks at the relative classification accuracy between two images.
In [202]:
tic = time.time() #Start Timer
# Forced Choice
roc_fc = Roc(input_values=input_values, binary_outcome=binary_outcome, forced_choice=True)
roc_fc.plot()
roc_fc.summary()
print 'Total Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
Sometimes we are interested in understanding how a region of interest might be functionally organized. One increasingly popular technique is to parcellate the region based on shared patterns of functional connectivity. These correlations can be derived from resting-state fMRI functional connectivity or from meta-analytic functional coactivation. See our paper on the insula for an example (images can be downloaded from NeuroVault). This tutorial will show how to perform a similar functional coactivation based parcellation of a region using Neurosynth tools.
The basic idea is to: (1) extract each ROI voxel's meta-analytic coactivation with the rest of the brain from the Neurosynth database, (2) reduce the dimensionality of this reference space (here with PCA), (3) cluster the ROI voxels based on their coactivation profiles (here with k-means over a range of k), and (4) decode the resulting parcels using Neurosynth features.
Warning: This can take a lot of RAM if you use the full neurosynth Dataset!
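The Clusterer below expects a Neurosynth Dataset object (and the Neurosynth imports at the top of the notebook to be uncommented). Here is a minimal sketch of how one might create or load it; the file names 'database.txt' and 'features.txt' and the pickle path are placeholders for wherever you keep a local copy of the Neurosynth data:
In [ ]:
# Requires uncommenting the neurosynth imports at the top of the notebook.
# Placeholder paths: point these at your local copy of the Neurosynth database/feature files.
clust_dir = os.path.join(out_folder, 'Clustering')
if not isdir(clust_dir):
    makedirs(clust_dir)
ns_pkl = os.path.join(clust_dir, 'neurosynth_dataset.pkl')
if os.path.isfile(ns_pkl):
    dataset = Dataset.load(ns_pkl)
else:
    dataset = Dataset('database.txt')       # study activation coordinates
    dataset.add_features('features.txt')    # term-based features
    dataset.save(ns_pkl)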
In [ ]:
# Initialize main clustering object: use PCA with 100 components for dimensionality reduction;
# keep all voxels with minimum of 100 studies (approx. 1% base rate).
reducer = RandomizedPCA(100)
roi_mask = os.path.join(out_folder,'Clustering/Masks/ROIs/FSL_TPJ.nii.gz')
clstr = Clusterer(dataset, 'coactivation', output_dir=os.path.join(out_folder,'Clustering'),
                  min_studies_per_voxel=100, reduce_reference=reducer, roi_mask=roi_mask)
In [ ]:
clstr.cluster(algorithm='kmeans', n_clusters=range(2,11,1),
              bundle=False, coactivation_maps=True,
              precomputed_distances=True)
In [ ]:
K = range(2,11,1)
fig, axes = plt.subplots(len(K), 1)
for i, k in enumerate(K):
    plotting.plot_roi(os.path.join(out_folder,'Clustering/kmeans_k%d/kmeans_k%dcluster_labels.nii.gz' % (k, k)),
                      title="Whole-brain k-means clustering (k = %d)" % k, display_mode='x',
                      cut_coords=range(-60,-45,5) + range(50,65,5), axes=axes[i])
fig.set_size_inches((15, 20))
fig.savefig(os.path.join(out_folder,'Clustering/Sagittal_Slice_Montage.png'))
In [ ]:
# Decoding Polar Plots
K = range(2,11,1)
dcdr = Decoder(dataset, method='roi')
for i in K:
    res = dcdr.decode(os.path.join(out_folder,'Clustering/kmeans_k' + str(i) + '/kmeans_k' + str(i) + 'cluster_labels.nii.gz'),
                      save=os.path.join(out_folder,'Clustering/kmeans_k' + str(i) + '/decoding_results_z.txt'), value='r')
    _ = dcdr.plot_polar(res, overplot=True, n_top=2)
    _.savefig(os.path.join(out_folder,'Clustering/kmeans_k' + str(i) + '/Decode_PolarPlot_k' + str(i) + '.pdf'))
In [8]:
dat.get_affine()