Imports


In [2]:
import os
import sys

In [3]:
os.getcwd()
os.chdir('../')
os.chdir('code')

In [6]:
os.getcwd()


Out[6]:
'/Users/AriaW/Desktop/159/project-beta/project-beta-1/code'

In [7]:
from utils import vmt_utils
from utils import regression
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing 
import copy
import time
import nibabel as nib
import cPickle

In [8]:
%matplotlib inline

In [9]:
reload(regression)


Out[9]:
<module 'utils.regression' from 'utils/regression.pyc'>

Get masked data


In [10]:
# data = np.load('../data/masked_data_50k.npy')

In [11]:
data = np.load('../data/filtered_data.npy')

In [12]:
data.shape


Out[12]:
(55468, 3543)

In [13]:
data_normalized = stats.zscore(data,axis=1,ddof=1)

In [64]:
plt.plot(data_normalized[16188,:])
plt.savefig('../figure/normailzed_data.jpg')



In [14]:
vvar = np.var(data,axis=1)

In [11]:
plt.plot(vvar)
plt.savefig('../figure/data_variance_normalization.jpg')


Get feature space


In [15]:
# Get stimulus feature spaces
stim_fs_fpath = '../description_pp/design_matrix_1.npy'
stim_fs_file = np.load(stim_fs_fpath)
stim_fs_file = stim_fs_file[:3543,:]

In [16]:
stim_fs_file.shape


Out[16]:
(3543, 1155)

Separate Trainig and testing data


In [28]:
e, v = 500, 143

In [29]:
stim_fs_est = stim_fs_file[:e,:]
stim_fs_val = stim_fs_file[:v,:]

In [30]:
print('Estimation feature space shape: %s'%repr(stim_fs_est.shape))
print('Validation feature space shape: %s'%repr(stim_fs_val.shape))


Estimation feature space shape: (500, 1155)
Validation feature space shape: (143, 1155)

In [31]:
data_est = data_normalized[...,:e]
data_val = data_normalized[...,:v]

In [32]:
data_est_masked = data_est.T
data_val_masked = data_val.T
# Show size of masked data
print('Size of masked estimation data is %s'%repr(data_est_masked.shape))
print('Size of masked validation data is %s'%repr(data_val_masked.shape))


Size of masked estimation data is (500, 55468)
Size of masked validation data is (143, 55468)

Set up feature space matrix to do regression


In [33]:
# Create lagged stimulus matrix
efs = vmt_utils.add_lags(stim_fs_est,[2,3,4])
vfs = vmt_utils.add_lags(stim_fs_val,[2,3,4])
print efs.shape
# Add column of ones
efs = vmt_utils.add_constant(efs,is_first=True)
vfs = vmt_utils.add_constant(vfs,is_first=True)
print efs.shape


(500, 3465)
(500, 3466)

Subset data and stimuli


In [34]:
ts = 3000 #training subset
es = 543 #estimation subset

In [35]:
efs = efs[:ts,:]
vfs = vfs[:es,:]
print efs.shape
print vfs.shape


(500, 3466)
(143, 3466)

In [36]:
data_est_masked = data_est_masked[:ts,:]
data_val_masked = data_val_masked[:es,:]
print data_est_masked.shape
print data_val_masked.shape


(500, 55468)
(143, 55468)

Run regression


In [37]:
reload(vmt_utils)


Out[37]:
<module 'utils.vmt_utils' from 'utils/vmt_utils.pyc'>

In [38]:
alpha = np.logspace(0,6,10)
alpha


Out[38]:
array([  1.00000000e+00,   4.64158883e+00,   2.15443469e+01,
         1.00000000e+02,   4.64158883e+02,   2.15443469e+03,
         1.00000000e+04,   4.64158883e+04,   2.15443469e+05,
         1.00000000e+06])

In [39]:
# Run regression 
n_splits = 10 # number of subdivisions of validation data for cross validation of ridge parameter (alpha)
n_resamps = 10 # number of times to compute regression & prediction within training data (can be <= n_splits) #default10
chunk_sz = 10000 # number of voxels to fit at once. Memory-saving.
pthr = 0.005 # Ridge parameter is chosen based on how many voxels are predicted above a correlation threshold 
             # for each alpha value (technically it's slightly more complicated than that, see the code). 
             # This p value sets that correlation threshold.
t0 = time.time()

out = regression.ridge_cv(efs,data_est_masked,val_fs=vfs,val_data=data_val_masked,alphas=alpha,n_resamps=n_resamps,
                              n_splits=n_splits,chunk_sz=chunk_sz,pthr=pthr,is_verbose=True)

t1 = time.time()
print("Elapsed time is: %d min, %d sec"%((t1-t0)/60,(t1-t0)%60))


Running split 1/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 2/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 3/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 4/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 5/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 6/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 7/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 8/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 9/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Running split 10/10
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Finding best alpha...
Best alpha = 1.000
Computing weights and model predictions...
Running chunk 1 of 6...

Running chunk 2 of 6...

Running chunk 3 of 6...

Running chunk 4 of 6...

Running chunk 5 of 6...

Running chunk 6 of 6...

Elapsed time is: 1 min, 46 sec

Make sure estimation procedure chose a reasonable $\alpha$ value

There should be a somewhat obvious maximum in the curve plotted below


In [40]:
# # Plot number of voxels with significant prediction accuracy within the 
# # estimation data for each alpha value
# na = len(out['n_sig_vox_byalpha'])
# plt.plot(range(na),out['n_sig_vox_byalpha'],'ko-',lw=2)
# # plt.xticks(range(na),vmt.regression.DEFAULT_ALPHAS,rotation=45)
# plt.xticks(range(na),alpha,rotation=45)
# plt.xlabel('Regularization parameter')
# _ = plt.ylabel('Number of voxels\naccurately predicted')

Display prediction accuracy results on the cortical surface


In [41]:
out['cc'].shape


Out[41]:
(55468,)

In [42]:
cc = out['cc']
weights = out['weights']

In [43]:
plt.hist(np.nan_to_num(cc),21)
plt.title('correlation coefficient histogram')
plt.savefig('../figure/correlation_coefficient_histogram.jpg')



In [44]:
out['weights'].shape


Out[44]:
(3466, 55468)

In [45]:
voxel_idx = np.argsort(cc)[::-1][:100]
voxel_idx


Out[45]:
array([ 9571,   398,   109,   401, 10256, 10262, 22041, 21298, 22016,
       22841, 22840, 23714, 22813,   400, 20612, 10951, 22042, 22043,
       12332,   111,  9568,  9569, 10257,   108, 10265, 10945, 22843,
       22787, 13550, 22868,   110, 10270,   399, 22017,  9570,  9576,
       24617,   403,  9577, 10263, 13554, 23716, 20611, 12954, 12333,
       21273, 10264,  9563, 22815, 12327,   402,   112, 23659, 19985,
       10251, 23689, 53073, 53072, 23688, 52204, 23661, 23715,   117,
       22816, 12959, 11650, 23717,   404, 24586, 54353, 10259, 20027,
        8859, 51489,   125, 23687, 52220,  8866,  9567, 22067, 12323,
       13548, 10946, 22867, 22068, 13536,   833, 23474, 10260, 12328,
       10958,   839, 20588, 14063, 11654,  8850, 10954, 22018, 10261, 11655])

In [50]:
idxx = [13,1780,1014,15500] # hand picked voxels
plt.figure(4,figsize=(8,16))
for n in range(4):
    i = idxx[n]
    pred_activity = vfs.dot(weights[:,i])
    plt.subplot(4,n//4+1,n%4+1)
    l1, = plt.plot(data_val_masked[:400,i],'b')
    l2, = plt.plot(pred_activity[:400,],'r')
    plt.ylabel('BOLD response')
    plt.xlabel('TRs')
    plt.title('well predict voxel %d\n'%i)
    plt.legend([l1, l2], ['data','prediction'],loc=3)
    
plt.tight_layout()
plt.savefig('../ridge_prediciton_resutls.jpg')



In [76]:
sum(cc>0.7)


Out[76]:
46746

In [ ]: