Generate Network Stacks for the GLM

The goal of this experiment is to investigate the variance in the preprocessed data. I will look at different variance metrics, in particular the voxel-wise standard deviation across subjects and a binarized count (a sketch of the count map is at the end of the notebook).

  1. Generate 4D stacks of all subjects for a given template network so I can flip through them
  2. Generate mean and std maps across all subjects for all networks
  3. Make a 4D stack of the template networks for comparison (a sketch of this step is at the end of the notebook)

In [1]:
import os
import re
import copy
import numpy as np
import pandas as pd
import nibabel as nib
import brainbox as bb
import multiprocessing as mp
from matplotlib import pyplot as plt

In [2]:
import glob
import errno
from brainbox import tools as to
from __builtin__ import any as b_any

In [3]:
def find_files(in_path, ext, targets, template='(?<=\d{2})\d{5}', sub=False):
    """
    Find files with extension ext and return them in the order of the
    targets list. Duplicate subject IDs are skipped. If sub is given,
    only that subdirectory of in_path is searched, otherwise all
    immediate subdirectories are.

    Returns a dictionary with the keys 'sub_name', 'dir' and 'path'.
    """
    ext = '*{}'.format(ext)
    out_dict = {key: [] for key in ['sub_name', 'dir', 'path']}

    # Search either all immediate subdirectories or just the one requested
    if not sub:
        sub_dirs = os.walk(in_path).next()[1]
    else:
        sub_dirs = [sub]

    for sub_dir in sub_dirs:
        tmp_dir = os.path.join(in_path, sub_dir)
        in_files = glob.glob(os.path.join(tmp_dir, ext))
        tmp_dict = dict()

        # Keep only the targets for which a file actually exists
        matches = [x for x in targets if b_any(str(x) in t for t in in_files)]

        # Map subject ID to (name, path), dropping duplicates
        for in_file in in_files:
            sub_name = os.path.basename(in_file.split('.')[0])
            sub_id = re.search(r'{}'.format(template), sub_name).group()
            if sub_id in tmp_dict:
                # Duplicate subject ID, keep the first match only
                continue
            tmp_dict[sub_id] = (sub_name, in_file)

        # Re-sort the path info into the order of the targets list
        for target in matches:
            sub_name, in_file = tmp_dict[target]
            out_dict['sub_name'].append(sub_name)
            out_dict['dir'].append(sub_dir)
            out_dict['path'].append(in_file)

    return out_dict

In [8]:
# Paths
scale = 12
run = 1
method = 'dual_regression'
directory = '{}_sc{}'.format(method, scale)
in_path = '/data1/pierre_scores/rest_{}'.format(run)
out_path = '/data1/pierre_scores/out/scores_s{}/stack_maps_{}'.format(scale, run)
if not os.path.isdir(out_path):
    try:
        os.makedirs(out_path)
    except OSError as exc:
        # Ignore the race where another process created the directory
        if exc.errno == errno.EEXIST and os.path.isdir(out_path):
            pass
        else:
            raise

ta_path = '/data1/pierre_scores/pheno/model_preventad_20141215.csv'
ext = '.nii.gz'

pheno = pd.read_csv(ta_path)
targets = pheno['subjects'].values
file_dict = find_files(in_path, ext, targets, template='(?<=fmri_)s\d{6}\S{3}', sub=directory)
num_subs = len(file_dict['path'])
data_subs = np.array([re.search(r'(?<=fmri_)s\d{6}\S{3}', sub_id).group() for sub_id in file_dict['sub_name']])
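
As a quick sanity check of the subject-ID regex, here is what it extracts from a hypothetical file name (the name below is made up for illustration, it is not an actual file in the dataset):

In [ ]:
# Hypothetical file name, for illustration only
name = 'fmri_s123456abc_run1'
print(re.search(r'(?<=fmri_)s\d{6}\S{3}', name).group())
# prints: s123456abc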

In [9]:
def run_par(args):
    """
    Wrapper function to do the loading and saving in parallel.

    Builds the 4D subject stack for one network, saves it, and returns
    full-size mean and std arrays in which only the slice of this
    network is populated; the zero-filled slices make it possible to
    simply sum the results across workers afterwards.

    Note that scale, method and run are picked up from the notebook's
    global namespace, which works with fork-based multiprocessing.
    """
    ds, num_subs, use_dict, tmp_i, out_path, net_id = args

    mean_mat = np.zeros(ds[:3] + (scale,))
    std_mat = np.zeros(ds[:3] + (scale,))

    # Stack the current network across all subjects
    sub_stack = np.zeros(ds[:3] + (num_subs,))
    for sub_id in np.arange(num_subs):
        img = nib.load(use_dict['path'][sub_id])
        data = img.get_data()
        sub_stack[..., sub_id] = data[..., net_id]
    # Save the network stack first
    stack_out = nib.Nifti1Image(sub_stack, tmp_i.get_affine(), tmp_i.get_header())
    nib.save(stack_out, os.path.join(out_path, '{}_netstack_net{}_scale_{}_run_{}.nii.gz'.format(method, net_id + 1, scale, run)))
    # Get the mean and std across subjects for this network
    mean_mat[..., net_id] = np.mean(sub_stack, axis=3)
    std_mat[..., net_id] = np.std(sub_stack, axis=3)

    return mean_mat, std_mat

In [10]:
# Get a template
tmp_i = nib.load(file_dict['path'][0])
tmp = tmp_i.get_data()
ds = tmp.shape

# Set up the parallel processing
p_perc = 0.9
p_count = int(np.floor(mp.cpu_count() * p_perc))

# Prepare the meta mats
mean_mat = np.zeros(ds[:3] + (scale,))
std_mat = np.zeros(ds[:3] + (scale,))
arg_list = list()
for net_id in np.arange(scale):
    arg_list.append((ds, num_subs, file_dict, tmp_i, out_path, net_id))
    
# Run the stacking in parallel, one worker per network
print('Running things in parallel now - for speed!')
p = mp.Pool(p_count)
results = p.map(run_par, arg_list)
p.close()
p.join()
print('Done with that')


Running things in parallel now - for speed!
Done with that

In [11]:
# Each worker returned full-size arrays with only its own network slice
# filled in, so summing them reassembles the complete mean and std maps
for result in results:
    mean_mat += result[0]
    std_mat += result[1]

# Save the mean and std maps
print('Saving the mean and std files to {}'.format(out_path))
mean_out = nib.Nifti1Image(mean_mat, tmp_i.get_affine(), tmp_i.get_header())
nib.save(mean_out, os.path.join(out_path, '{}_mean_stack_scale{}_run_{}.nii.gz'.format(method, scale, run)))
std_out = nib.Nifti1Image(std_mat, tmp_i.get_affine(), tmp_i.get_header())
nib.save(std_out, os.path.join(out_path, '{}_std_stack_scale{}_run_{}.nii.gz'.format(method, scale, run)))


Saving the mean and std files to /data1/pierre_scores/out/scores_s12/stack_maps_1
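
The binarized count mentioned at the top is not computed by the cells above. Below is a minimal sketch of one way to get it, assuming an arbitrary threshold of 0 (any positive loading counts); the cutoff is an assumption and should be chosen to match the actual analysis:

In [ ]:
# Sketch of a binarized count map for network 1: per voxel, count how
# many subjects exceed the (assumed) threshold in the saved stack
threshold = 0
stack_file = os.path.join(out_path, '{}_netstack_net{}_scale_{}_run_{}.nii.gz'.format(method, 1, scale, run))
stack_img = nib.load(stack_file)
count_map = np.sum(stack_img.get_data() > threshold, axis=3)
count_out = nib.Nifti1Image(count_map, tmp_i.get_affine(), tmp_i.get_header())
nib.save(count_out, os.path.join(out_path, '{}_count_net{}_scale_{}_run_{}.nii.gz'.format(method, 1, scale, run)))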

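Goal 3, the 4D stack of the template networks, also still needs doing. A minimal sketch, assuming the templates ship as one 4D file (one network per volume) that is already on the same grid as the subject maps; template_path below is a hypothetical location, not the actual file:

In [ ]:
# Hypothetical template file, one network per volume; adjust the path
template_path = '/data1/pierre_scores/template/template_scale_{}.nii.gz'.format(scale)
t_img = nib.load(template_path)
t_out = nib.Nifti1Image(t_img.get_data(), tmp_i.get_affine(), tmp_i.get_header())
nib.save(t_out, os.path.join(out_path, 'template_netstack_scale_{}.nii.gz'.format(scale)))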