This notebook will walk through a comparison of different thresholding approaches for fMRI data. The example data used here are from the UCLA CNP Dataset; the specific contrast used is the comparison of go trials to baseline for the stop signal task. Data are from the first 125 subjects in the dataset; these data were used to generate datasets of smaller sizes (25,50,75, and 100 subjects) for comparison across different smaple sizes.

First we need to set up our environment.


In [1]:
%matplotlib inline
import nibabel
import numpy
import seaborn as sns
import matplotlib.pyplot as plt
from nilearn import plotting
import scipy.stats
import scipy.ndimage

Now let's load up the data files for each of the datasets, starting with the raw t statistic maps.


In [2]:
nvals=[25,50,75,100,125]
tstat={}
for n in nvals:
    tstat[n]=nibabel.load('data/tt_%dsubs_tstat1.nii.gz'%n).get_data()

In [3]:
maskimg=nibabel.load('data/mask.nii.gz')
mask=maskimg.get_data()
maskvox=numpy.where(mask>0)

In [4]:
for n in nvals:
    h,b=numpy.histogram(tstat[n][maskvox],bins=100)
    plt.plot(b[1:],h)
plt.legend(nvals)


Out[4]:
<matplotlib.legend.Legend at 0x11553bd50>

Create a function to extract the cluster size distribution for an image.


In [5]:
def get_cluster_dist(img):
    # need to give a 27-connectivity structure to match FSL cluster results
    l=scipy.ndimage.label(img,numpy.ones((3,3,3)))
    h=numpy.histogram(l[0],bins=numpy.arange(0.5,l[1]+0.5,1))
    return h[0]

First get maps using uncorrected p<0.001


In [6]:
unc_nvox={}
unc_cdist={}
unc_pval=0.005

for n in nvals:
    cutoff=scipy.stats.t.ppf(1.-unc_pval,n)
    unc_nvox[n]=numpy.sum(tstat[n]>cutoff)
    unc_cdist[n]=get_cluster_dist(tstat[n]>cutoff)
    plotting.plot_stat_map('data/tt_%dsubs_tstat1.nii.gz'%n, bg_img='data/bg_image.nii.gz',
                       threshold=cutoff, title="uncorrected: %d subs"%n,
                       cut_coords=(-36,-18,54))


Now load the GRF maps created using FSL's cluster tool.


In [7]:
grfdata={}
grf_nvox={}
grf_cdist={}
for n in nvals:
    img=nibabel.load('data/thresh_%dsubs.nii.gz'%n)
    grfdata[n]=img.get_data()
    grf_nvox[n]=numpy.sum(grfdata[n]>2.3)
    grf_cdist[n]=get_cluster_dist(grfdata[n]>2.3)
    plotting.plot_stat_map('data/thresh_%dsubs.nii.gz'%n, bg_img='data/bg_image.nii.gz',
                       threshold=2.3, title="Cluster-based GRF: %d subs"%n,
                       cut_coords=(-36,-18,54))


Get voxel-based GRF data


In [8]:
vgrfdata={}
vgrf_nvox={}
vgrf_cdist={}
for n in nvals:
    img=nibabel.load('data/voxelGRF_%dsubs.nii.gz'%n)
    vgrfdata[n]=img.get_data()
    vgrf_nvox[n]=numpy.sum(vgrfdata[n]>0)
    vgrf_cdist[n]=get_cluster_dist(vgrfdata[n]>0)
    plotting.plot_stat_map('data/voxelGRF_%dsubs.nii.gz'%n, bg_img='data/bg_image.nii.gz',
                       threshold=2.3, title="Voxel-based GRF: %d subs"%n,
                       cut_coords=(-36,-18,54))


Now do the same for FDR data


In [9]:
fdrdata={}
fdr_nvox={}
fdr_cdist={}
for n in nvals:
    img=nibabel.load('data/fdr_tstat_%dsubs.nii.gz'%n)
    fdrdata[n]=img.get_data()
    fdr_nvox[n]=numpy.sum(fdrdata[n]>0)
    fdr_cdist=get_cluster_dist(fdrdata[n]>0)
    plotting.plot_stat_map(img, bg_img='data/bg_image.nii.gz',
                       threshold=2.3, title="FDR: %d subs"%n,
                       cut_coords=(-36,-18,54))


Now load the nonparametric results (obtained using FSL's randomise)


In [10]:
randdata={'clustere':{},'clusterm':{},'tfce':{}}

rand_nvox={'clustere':{},'clusterm':{},'tfce':{}}
rand_cdist={'clustere':{},'clusterm':{},'tfce':{}}

for type in ['clustere','clusterm','tfce']:
  for n in nvals:
    imgfile='data/randomize_%dsubs/tt_%dsubs_%s_corrp_tstat1.nii.gz'%(n,n,type)
    img=nibabel.load(imgfile)
    randdata[type][n]=img.get_data()
    rand_cdist[type][n]=get_cluster_dist(randdata[type][n]>0.95)
    rand_nvox[type][n]=numpy.sum(randdata[type][n]>0.95)
    plotting.plot_stat_map(img, bg_img='data/bg_image.nii.gz',
                       threshold=0.95, title="randomize %s: %d subs"%(type,n),
                       cut_coords=(-36,-18,54))


Now get the variance-normalized versions of the randomize results


In [11]:
randvndata={'clustere':{},'clusterm':{},'tfce':{}}

randvn_nvox={'clustere':{},'clusterm':{},'tfce':{}}
randvn_cdist={'clustere':{},'clusterm':{},'tfce':{}}

for type in ['clustere','clusterm','tfce']:
  for n in nvals:
    imgfile='data/randomize_%dsubs_vn/tt_%dsubs_%s_corrp_tstat1.nii.gz'%(n,n,type)
    img=nibabel.load(imgfile)
    randvndata[type][n]=img.get_data()
    randvn_cdist[type][n]=get_cluster_dist(randvndata[type][n]>0.95)
    randvn_nvox[type][n]=numpy.sum(randvndata[type][n]>0.95)
    plotting.plot_stat_map(img, bg_img='data/bg_image.nii.gz',
                       threshold=0.95, title="randomize+VN %s: %d subs"%(type,n),
                       cut_coords=(-36,-18,54))


Now plot the number of active voxels for each method as a function of sample size


In [12]:
nvox_data=numpy.zeros((len(nvals),10))
for i in range(len(nvals)):
    nv=nvals[i]
    nvox_data[i,0]=unc_nvox[nv]
    nvox_data[i,1]=grf_nvox[nv]
    nvox_data[i,2]=vgrf_nvox[nv]
    nvox_data[i,3]=fdr_nvox[nv]
    nvox_data[i,4]=rand_nvox['clustere'][nv]
    nvox_data[i,5]=rand_nvox['clusterm'][nv]
    nvox_data[i,6]=rand_nvox['tfce'][nv]
    nvox_data[i,7]=randvn_nvox['clustere'][nv]
    nvox_data[i,8]=randvn_nvox['clusterm'][nv]
    nvox_data[i,9]=randvn_nvox['tfce'][nv]

In [13]:
plt.plot(nvals,nvox_data)
legend_entries=['unc p<%0.3f'%unc_pval,'cluster GRF','voxel GRF','FDR','clustere','clusterm','TFCE','clustere-VN','clusterm-VN','TFCE-VN']

plt.legend(legend_entries,loc=4)
plt.xlabel('Number of subjects')
plt.ylabel('Number of voxels over threshold')
print 'Number of voxels exceeding threshold:'
for i in range(nvox_data.shape[1]):
    print legend_entries[i],nvox_data[:,i]


Number of voxels exceeding threshold:
unc p<0.005 [ 10868.  35927.  35371.  41887.  48563.]
cluster GRF [ 15718.  46399.  42711.  49313.  55454.]
voxel GRF [  1284.  10445.  13495.  18451.  23587.]
FDR [  8801.  51694.  46387.  54487.  62028.]
clustere [ 11486.  46399.  42711.  49313.  55454.]
clusterm [ 11486.  46399.  42711.  49313.  55454.]
TFCE [  4018.  52304.  47977.  53322.  59919.]
clustere-VN [  9461.  42395.  39801.  47454.  53756.]
clusterm-VN [  9461.  44253.  39801.  47454.  53756.]
TFCE-VN [  4617.  53005.  47678.  53036.  61525.]

Now let's examine the reproducibility of results across separate samples of data, for each of the methods. For sample sizes of 25 and 50 we generated a second set of non-overlapping subjects and performed the same analyses on those. We will load those, and compare the results across the two subsets for each method. Please take these results with a very large grain of salt, since they are only a single subsample; doing this properly would require generating many such subsamples from a larger dataset.


In [14]:
dice={'unc':{},'clusterGRF':{},'FDR':{},'clustere':{},'clusterm':{},'tfce':{},'clustere-VS':{},'clusterm-VS':{},'tfce-VS':{}}

def get_dice(t1,t2):
    """ return dice coefficient for two binary arrays"""
    return (2.0*numpy.sum(t1*t2))/(numpy.sum(t1)+numpy.sum(t2))

nvals_overlap=[25,50]
tstat_set2={}
grfdata_set2={}
fdrdata_set2={}
for n in nvals_overlap:
    tstat_set2[n]=nibabel.load('data/tt_%dsubs_set2_tstat1.nii.gz'%n).get_data()
    cutoff=scipy.stats.t.ppf(1.-unc_pval,n)
    t1=tstat[n][maskvox]>cutoff
    t2=tstat_set2[n][maskvox]>cutoff
    dice['unc'][n]=get_dice(t1,t2)
    
    img=nibabel.load('data/thresh_%dsubs_set2.nii.gz'%n)
    grfdata_set2[n]=img.get_data()
    t1=grfdata[n][maskvox]>2.2
    t2=grfdata_set2[n][maskvox]>2.2
    dice['clusterGRF'][n]=get_dice(t1,t2)
        
    img=nibabel.load('data/fdr_tstat_%dsubs_set2.nii.gz'%n)
    fdrdata_set2[n]=img.get_data()
    t1=fdrdata[n][maskvox]>0.95
    t2=fdrdata_set2[n][maskvox]>0.95
    dice['FDR'][n]=get_dice(t1,t2)
    
    for type in ['clustere','clusterm','tfce']:
        imgfile='data/randomize_%dsubs_set2/tt_%dsubs_%s_corrp_tstat1.nii.gz'%(n,n,type)
        img=nibabel.load(imgfile)
        data=img.get_data()
        t1=randdata[type][n][maskvox]>0.95
        t2=data[maskvox]>0.95
        dice[type][n]=get_dice(t1,t2)

    for type in ['clustere','clusterm','tfce']:
        imgfile='data/randomize_%dsubs_set2_vn/tt_%dsubs_%s_corrp_tstat1.nii.gz'%(n,n,type)
        img=nibabel.load(imgfile)
        data=img.get_data()
        t1=randvndata[type][n][maskvox]>0.95
        t2=data[maskvox]>0.95
        dice[type+'-VS'][n]=get_dice(t1,t2)

In [15]:
print 'Dice coefficient across separate sets of subjects'
keys=dice.keys()
keys.sort()
for k in keys:
    print k,dice[k]

print ''
print'Number of voxels exceeding threshold:'
for i in range(nvox_data.shape[1]):
    print legend_entries[i],nvox_data[:2,i]


Dice coefficient across separate sets of subjects
FDR {25: 0.23337277188354635, 50: 0.49772769197229627}
clusterGRF {25: 0.36659144875595473, 50: 0.5382944838569567}
clustere {25: 0.28645136933507415, 50: 0.4804904214559387}
clustere-VS {25: 0.2815508535285587, 50: 0.49286925301685447}
clusterm {25: 0.28645136933507415, 50: 0.4804904214559387}
clusterm-VS {25: 0.2815508535285587, 50: 0.4781038374717833}
tfce {25: 0.12004183512589563, 50: 0.420583099790178}
tfce-VS {25: 0.1420156026110492, 50: 0.40772929809855735}
unc {25: 0.3469031948742536, 50: 0.5418841933652068}

Number of voxels exceeding threshold:
unc p<0.005 [ 10868.  35927.]
cluster GRF [ 15718.  46399.]
voxel GRF [  1284.  10445.]
FDR [  8801.  51694.]
clustere [ 11486.  46399.]
clusterm [ 11486.  46399.]
TFCE [  4018.  52304.]
clustere-VN [  9461.  42395.]
clusterm-VN [  9461.  44253.]
TFCE-VN [  4617.  53005.]

In [ ]: