In [1]:
""" Code to grab a private collection from NeuroVault, and compute an SVM discriminating between negative and neutral images
"""
# Authors: Luke Chang, Chris Filo Gorgolewski, Gael Varoquaux
# License: BSD
import os
import json
import errno
import urllib
from urllib2 import Request, urlopen
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
from nipype.utils.filemanip import split_filename
import nibabel as nb
from joblib import Memory
from nilearn.input_data import NiftiMasker
from nilearn.plotting import plot_roi, plot_stat_map
from nilearn.image import resample_img, mean_img
from nilearn.masking import compute_background_mask, _extrapolate_out_mask
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt
import time
import seaborn as sns
from nltools.analysis import Predict
%matplotlib inline
# Github Project Root
basedir = os.path.abspath('..')
# Set Paths
dest_dir = "/Users/lukechang/Dropbox/NEPA/Test_Analysis"
canonical = os.path.join(basedir,"resources/MNI152_T1_2mm.nii.gz")
# Use a joblib memory, to avoid depending on an Internet connection
mem = Memory(cachedir='/tmp/neurovault_analysis/cache')
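# mem.cache(f) returns a disk-memoized wrapper around f: results are stored in
# the cache directory keyed by the arguments, so re-running a cell reuses the
# cached output instead of repeating the downloads.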
def url_get(url):
request = Request(url)
response = urlopen(request)
return response.read()
def mkdir_p(path):
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else: raise
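# Note: on Python >= 3.2, mkdir_p(path) is equivalent to
# os.makedirs(path, exist_ok=True).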
def get_collections_df(key=None):
    """Downloads metadata about collections/papers stored in NeuroVault and
    returns it as a pandas DataFrame"""
    if key is None:
        request = Request('http://neurovault.org/api/collections/?format=json')
    else:
        request = Request('http://neurovault.org/api/collections/' + key + '?format=json')
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)
    collections_df = json_normalize(data)
    # Keep collection_id as a regular column so later merges can use
    # on='collection_id'
    collections_df.rename(columns={'id': 'collection_id'}, inplace=True)
    return collections_df
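# Usage sketch ('ABCDEFGH' is a placeholder, not a real NeuroVault key):
#   all_collections = get_collections_df()
#   one_collection = get_collections_df(key='ABCDEFGH')
# A private collection is addressed by appending its secret key to the
# collections endpoint.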
def get_images_df():
"""Downloads metadata about images/statistical maps stored in NeuroVault and
return it as a pandas DataFrame"""
request = Request('http://neurovault.org/api/images/?format=json')
response = urlopen(request)
elevations = response.read()
data = json.loads(elevations)
images_df = json_normalize(data)
images_df['collection'] = images_df['collection'].apply(lambda x: int(x.split("/")[-2]))
images_df['image_id'] = images_df['url'].apply(lambda x: int(x.split("/")[-2]))
images_df.rename(columns={'collection':'collection_id'}, inplace=True)
return images_df
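# The API encodes an image's parent collection as a URL such as
# 'http://neurovault.org/api/collections/42/'; the lambdas above pull out the
# trailing integer id, e.g.
#   int('http://neurovault.org/api/collections/42/'.split('/')[-2])  # -> 42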
def get_images_with_private_collections_df(key=None):
"""Downloads metadata about images/statistical maps stored in NeuroVault and
and enriches it with metadata of the corresponding collections. The result
is returned as a pandas DataFrame"""
collections_df = get_collections_df(key=key)
images_df = get_images_df()
combined_df = pd.merge(images_df, collections_df, how='inner', on='collection_id',
suffixes=('_image', '_collection'))
return combined_df
def get_images_with_collections_df():
"""Downloads metadata about images/statistical maps stored in NeuroVault and
and enriches it with metadata of the corresponding collections. The result
is returned as a pandas DataFrame"""
collections_df = get_collections_df()
images_df = get_images_df()
combined_df = pd.merge(images_df, collections_df, how='left', on='collection_id',
suffixes=('_image', '_collection'))
return combined_df
def download_and_resample(images_df, dest_dir, target):
"""Downloads all stat maps and resamples them to a common space.
"""
target_nii = nb.load(target)
orig_path = os.path.join(dest_dir, "original")
mkdir_p(orig_path)
resampled_path = os.path.join(dest_dir, "resampled")
mkdir_p(resampled_path)
    out_df = images_df.copy()
    for row in images_df.iterrows():
# Downloading the file to the "original" subfolder
_, _, ext = split_filename(row[1]['file'])
orig_file = os.path.join(orig_path, "%04d%s" % (row[1]['image_id'], ext))
if not os.path.exists(orig_file):
print "Downloading %s" % orig_file
urllib.urlretrieve(row[1]['file'], orig_file)
# Compute the background and extrapolate outside of the mask
print "Extrapolating %s" % orig_file
niimg = nb.load(orig_file)
data = niimg.get_data().squeeze()
niimg = nb.Nifti1Image(data, niimg.affine,
header=niimg.get_header())
bg_mask = compute_background_mask(niimg).get_data()
# Test if the image has been masked:
out_of_mask = data[np.logical_not(bg_mask)]
if np.all(np.isnan(out_of_mask)) or len(np.unique(out_of_mask)) == 1:
# Need to extrapolate
data = _extrapolate_out_mask(data.astype(np.float), bg_mask,
iterations=3)[0]
niimg = nb.Nifti1Image(data, niimg.affine,
header=niimg.get_header())
del out_of_mask, bg_mask
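        # _extrapolate_out_mask is a private nilearn helper that iteratively
        # fills voxels just outside the mask from their in-mask neighbours, so
        # the resampling below does not smear zeros/NaNs into the brain.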
# Resampling the file to target and saving the output in the "resampled"
# folder
resampled_file = os.path.join(resampled_path,
"%06d%s" % (row[1]['image_id'], ext))
print "Resampling %s" % orig_file
resampled_nii = resample_img(niimg, target_nii.get_affine(),
target_nii.shape)
resampled_nii = nb.Nifti1Image(resampled_nii.get_data().squeeze(),
resampled_nii.get_affine(),
header=niimg.get_header())
if len(resampled_nii.shape) == 3:
resampled_nii.to_filename(resampled_file)
else:
# We have a 4D file
assert len(resampled_nii.shape) == 4
resampled_data = resampled_nii.get_data()
affine = resampled_nii.affine
for index in range(resampled_nii.shape[-1]):
# First save the files separately
this_nii = nb.Nifti1Image(resampled_data[..., index],
affine)
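                    # Build a synthetic negative id so split volumes cannot
                    # collide with real image ids, e.g. image_id 123, volume 0
                    # -> int("%i%i" % (-123, 0)) == -1230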
this_id = int("%i%i" % (-row[1]['image_id'], index))
this_file = os.path.join(resampled_path,
"%06d%s" % (this_id, ext))
this_nii.to_filename(this_file)
# Second, fix the dataframe
out_df = out_df[out_df.image_id != row[1]['image_id']]
this_row = row[1].copy()
this_row.image_id = this_id
out_df = out_df.append(this_row)
return out_df
def roc_plot(tpr, fpr):
mean_auc = auc(fpr, tpr)
fig = plt.plot(fpr, tpr, 'r-', label='Mean ROC (area = %0.2f)' % mean_auc, lw=4)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
return fig
def dist_from_hyperplane_plot(stats_output):
    fig = sns.factorplot("SubID", "xval_dist_from_hyperplane", hue="Y",
                         data=stats_output, kind='point')
plt.xlabel('Subject')
plt.ylabel('Distance from Hyperplane')
plt.title('SVM Classification')
return fig
In [6]:
# Test Data collection Key: /collections/NNGSIZTQ/
# Get Meta-Data associated with private key
k = 'NNGSIZTQ' # Private Key
coll_dat = get_collections_df(key=k)
combined_df = get_images_with_private_collections_df(k)
# Download and resample
combined_df = mem.cache(download_and_resample)(combined_df, dest_dir, canonical)
combined_df.to_csv('%s/metadata.csv' % dest_dir, encoding='utf8')
In [14]:
tic = time.time() #Start Timer
# Separate image types and sort
combined_df = pd.read_csv('%s/metadata.csv' % dest_dir, encoding='utf8')
neg_list = combined_df[combined_df.name_image.str.contains('Neg')].sort(columns='name_image')
neu_list = combined_df[combined_df.name_image.str.contains('Neu')].sort(columns='name_image')
# Load data using nibabel; file names follow the "%06d" pattern used above
neg_file_list = [os.path.join(dest_dir, 'resampled', '%06d.nii.gz' % x) for x in neg_list.image_id]
neu_file_list = [os.path.join(dest_dir, 'resampled', '%06d.nii.gz' % x) for x in neu_list.image_id]
dat = nb.funcs.concat_images(neg_file_list+neu_file_list)
# Mask data
mask_img = nb.load(os.path.join(basedir,'resources/MNI152_T1_2mm_brain_mask_dil.nii.gz'))
mn_img = mean_img(dat)
nifti_masker = NiftiMasker(mask_img=mask_img)
dat_masked = nifti_masker.fit_transform(dat)
# Classification
nSub = 20  # subjects per condition; must match len(neg_file_list) and len(neu_file_list)
Y = np.array([1]*nSub + [0]*nSub)  # 1 = negative, 0 = neutral
svc = SVC(kernel='linear')
svc.fit(dat_masked, Y)
# Save Weights: Write out weight map to nifti image
coef_ = svc.coef_
intercept_ = svc.intercept_
coef_img = nifti_masker.inverse_transform(coef_)
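# inverse_transform maps the 1 x n_voxels weight vector back into a 3D volume,
# placing each weight at its voxel location inside the fitted mask.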
nb.save(coef_img, dest_dir + '/Neg_vs_Neu_SVM_weights.nii')
# Cross-validation
cv = StratifiedKFold(Y, n_folds=5)
cv_scores = []
mean_tpr = 0.0
all_tpr = []
mean_fpr = np.linspace(0, 1, 100)
yfit = np.zeros(nSub*2)
# Use a float array so the decision values are not truncated to integers
xval_dist_from_hyperplane = np.zeros(nSub*2)
for train, test in cv:
svc.fit(dat_masked[train], Y[train])
    yfit[test] = svc.predict(dat_masked[test])
    # .ravel() guards against decision_function returning an (n, 1) array
    xval_dist_from_hyperplane[test] = svc.decision_function(dat_masked[test]).ravel()
    # Compute ROC curve from the continuous decision values (hard 0/1
    # predictions would give a degenerate three-point curve)
    fpr, tpr, thresholds = roc_curve(Y[test], xval_dist_from_hyperplane[test])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr /= len(cv)
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
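# The loop macro-averages the ROC across folds: each fold's curve is
# interpolated onto the common grid mean_fpr, the interpolated tpr values are
# accumulated, and dividing by the number of folds gives the mean curve;
# pinning the endpoints to 0 and 1 keeps it a valid ROC curve.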
# Save Stats Output
stats_out = pd.DataFrame({
'SubID' : range(0,nSub)*2,
'xval_dist_from_hyperplane' : xval_dist_from_hyperplane,
'Y' : Y,
'yfit' : yfit })
stats_out.to_csv(os.path.join(dest_dir, 'SVM_Stats_Output.csv'))
# Display results
print 'overall CV accuracy: %.2f' % np.mean(yfit==Y)
print 'AUC: %.2f' % auc(mean_fpr,mean_tpr)
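# Forced-choice accuracy relies on the pairing induced by the identical sort
# of the two file lists: entries i and i+nSub should be the same subject's
# negative and neutral maps, and a pair is correct when the negative map gets
# the larger decision value.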
print 'Forced Choice Accuracy: %.2f' % np.mean((xval_dist_from_hyperplane[0:nSub] - xval_dist_from_hyperplane[nSub:]) > 0)
# Plots
fig3 = roc_plot(mean_tpr, mean_fpr)
plt.savefig(dest_dir + '/Neg_vs_Neu_SVM_ROC_plot.png')
fig1 = plot_roi(nifti_masker.mask_img_, mn_img, title="Mask", cut_coords=range(-40, 40, 10), display_mode='z')
fig1.savefig(dest_dir + '/Neg_vs_Neu_SVM_mask.png')
fig2 = plot_stat_map(coef_img, mn_img, title="SVM weights", cut_coords=range(-40, 40, 10), display_mode='z')
fig2.savefig(dest_dir + '/Neg_vs_Neu_SVM_weightmap.png')
fig4 = dist_from_hyperplane_plot(stats_out)
fig4.savefig(dest_dir + '/Neg_vs_Neu_SVM_xVal_Distance_from_Hyperplane.png')
print 'Elapsed: %.2f seconds' % (time.time() - tic)
In [8]:
#PINES key /collections/TMTKHDFM/
### Download Weight Map ###
# Get Meta-Data associated with private key
k = 'TMTKHDFM' # Private Key
coll_pines = get_collections_df(key=k)
pines_df = get_images_with_private_collections_df(k)
# Download and resample
dest_dir = "/Users/lukechang/Dropbox/NEPA/Test_Analysis"
target = os.path.join(basedir,"resources/MNI152_T1_2mm.nii.gz")
pines_df = mem.cache(download_and_resample)(pines_df, dest_dir, target)
pines_df.to_csv('%s/pines_metadata.csv' % dest_dir, encoding='utf8')
In [9]:
tic = time.time() #Start Timer
# Load data using nibabel
dat = nb.funcs.concat_images(neg_file_list+neu_file_list)
# The weight map is the first image of the PINES collection (pines_df, not combined_df)
pines = nb.load(os.path.join(dest_dir, 'resampled', '%06d.nii.gz' % pines_df.image_id[0]))
# Mask data
nifti_masker = NiftiMasker(mask_img=mask_img)
dat_masked = nifti_masker.fit_transform(dat)
pines_masked = nifti_masker.transform(pines)  # masker already fitted; reuse the same mask
# Calculate pattern expression
pexp = np.dot(dat_masked, np.transpose(pines_masked)).ravel() + intercept_
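# Pattern expression projects each masked image onto the PINES weight map:
# pexp_i = <x_i, w> + b, reducing every image to one scalar that plays the
# role of an SVM decision value.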
# Calculate ROC plot
mean_fpr = np.linspace(0, 1, 100)
fpr, tpr, thresholds = roc_curve(Y, pexp)
stats_out.xval_dist_from_hyperplane = pexp
# Display results
# 1.1236 is a hardcoded offset carried over from the original analysis
print 'overall accuracy: %.2f' % np.mean(((pexp + 1.1236) > 0) == Y)
print 'AUC: %.2f' % auc(fpr,tpr)
print 'Forced Choice Accuracy: %.2f' % np.mean((pexp[0:nSub] - pexp[nSub:]) > 0)
fig1 = roc_plot(tpr, fpr)
plt.savefig(dest_dir + '/PINES_PExp_Neg_vs_Neu_SVM_ROC_plot.png')
fig2 = dist_from_hyperplane_plot(stats_out)
fig2.savefig(dest_dir + '/PINES_PExp_Neg_vs_Neu_SVM_xVal_Distance_from_Hyperplane.png')
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer
In [7]:
tic = time.time() #Start Timer
# Separate image types and sort
combined_df = pd.read_csv('%s/metadata.csv' % dest_dir, encoding='utf8')
neg_list = combined_df[combined_df.name_image.str.contains('Neg')].sort(columns='name_image')
neu_list = combined_df[combined_df.name_image.str.contains('Neu')].sort(columns='name_image')
# Load data using nibabel
neg_file_list = [os.path.join(dest_dir, 'resampled', '%06d.nii.gz' % x) for x in neg_list.image_id]
neu_file_list = [os.path.join(dest_dir, 'resampled', '%06d.nii.gz' % x) for x in neu_list.image_id]
dat = nb.funcs.concat_images(neg_file_list+neu_file_list)
Y = np.array([1]*len(neg_file_list)+[0]*len(neu_file_list))
# negvneu = Predict(dat,Y,algorithm='svm',output_dir=dest_dir, kernel="linear")
# negvneu = Predict(dat,Y,algorithm='linear',output_dir=dest_dir)
negvneu = Predict(dat,Y,algorithm='svm',output_dir=dest_dir)
negvneu.predict()
print 'Elapsed: %.2f seconds' % (time.time() - tic) #Stop timer