Data Wrangling

Explore the experimental data & convert to HDDM-ready format (CSV)

Fine-tune output for individual .mat files


In [1]:
import os

import scipy.io as scio

# Import the .mat file as a Python object. With struct_as_record=False,
# MATLAB structs come back as objects whose fields are read as attributes
# (e.g. `.rt1`) rather than as NumPy record arrays.
data = scio.loadmat('../data/data_18333.mat', struct_as_record=False)

For now, let's focus on the following:

  • Reaction time (rt)
  • Response (0/1)
  • Stimulus (1-4 for now)

This is similar to examples/hddm_simple.csv, used to play around with the HDDM library


In [2]:
# loadmat wraps the MATLAB struct in a 2-D object array, so [0,0] is needed
# to reach the struct itself.
dat_struct = data['data'][0,0]  # Actual data structure, owing to MATLAB weirdness

Before outputting to CSV, the data for each subject will go in a Python dictionary in the form of key --> array. The plan is to then create an array of dictionaries for all subjects, with each dictionary representing the data gathered for an individual.


In [3]:
"""
Conversion from the convoluted numpy arrays that scipy.io spits out to a more
pythonic data structure: a list holding one dictionary per trial.
Leverage python instead of numpy for data manipulation, since the use of
numpy isn't really necessary for this data.
"""

csv_keys = ['rt', 'response', 'stim']

# rt1 is stored as a single row while perf1/conditions1 are single columns
# (see the struct dump at the end of the notebook). ravel() flattens either
# orientation, and tolist() turns the values into plain Python numbers.
reaction_times = dat_struct.rt1.ravel().tolist()
responses = dat_struct.perf1.ravel().tolist()
stimuli = dat_struct.conditions1.ravel().tolist()

# One dictionary per trial, keyed by the CSV column names (same order as
# the zipped values: rt, response, stim).
subject = [
    dict(zip(csv_keys, exp_run))
    for exp_run in zip(reaction_times, responses, stimuli)
]

Now that the data is in a desirable format, we can dump it to a CSV file


In [4]:
import csv

# newline='' hands line-ending control to the csv module, as its
# documentation requires; without it, platforms that translate '\n'
# (e.g. Windows) end up with an empty row after every record.
with open('../data/data_18333.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, csv_keys)
    w.writeheader()      # header row: rt,response,stim
    w.writerows(subject)  # one row per trial dictionary

Convert all .mat files to .csv

Convert data from .mat files to .csv files for use by the HDDM library


In [5]:
# Column names (and column order) of the generated CSV files; shared by the
# conversion helpers defined in the following cells.
keys = ['rt', 'response', 'stim']

In [6]:
def mat2py(mat_path):
    """
    Convert one .mat file to a pythonic data structure.

    Parameters
    ----------
    mat_path : str
        Path to a .mat file whose top-level ``data`` struct provides the
        fields ``rt1``, ``perf1`` and ``conditions1``.

    Returns
    -------
    list of dict
        One dictionary per trial, in trial order, with the keys ``'rt'``,
        ``'response'`` and ``'stim'``.

    Raises
    ------
    ValueError
        If the three fields do not contain the same number of trials.
    """
    data = scio.loadmat(mat_path, struct_as_record=False)

    # loadmat wraps the MATLAB struct in a 2-D object array.
    dat_struct = data['data'][0, 0]

    # ravel() flattens row vectors (1 x N) and column vectors (N x 1) alike,
    # so the result does not depend on how a field happened to be stored;
    # tolist() converts the values to plain Python numbers.
    reaction_times = dat_struct.rt1.ravel().tolist()
    responses = dat_struct.perf1.ravel().tolist()
    stimuli = dat_struct.conditions1.ravel().tolist()

    # zip() stops at the shortest input, which would silently drop trials.
    if not (len(reaction_times) == len(responses) == len(stimuli)):
        raise ValueError(
            'Trial count mismatch in {}: rt1={}, perf1={}, conditions1={}'.format(
                mat_path, len(reaction_times), len(responses), len(stimuli)
            )
        )

    # Keys are spelled out here so the function does not rely on a
    # notebook-level variable being defined before it is called.
    return [
        {'rt': rt, 'response': response, 'stim': stim}
        for rt, response, stim in zip(reaction_times, responses, stimuli)
    ]

In [7]:
def subject2csv(subject, mat_path, fieldnames=None):
    """
    Write a subject's trials to a CSV file placed next to its .mat file.

    Parameters
    ----------
    subject : list of dict
        One dictionary per trial, as returned by ``mat2py``.
    mat_path : str
        Path of the source .mat file; the CSV path is derived from it by
        swapping the extension for ``.csv``.
    fieldnames : list of str, optional
        CSV column names, in order. Defaults to the notebook-level ``keys``.

    The CSV path is printed so batch runs show which files were written.
    """
    if fieldnames is None:
        fieldnames = keys

    # Swap only the trailing extension: str.replace would also rewrite a
    # '.mat' occurring earlier in the path, and would leave the path
    # unchanged (overwriting the input file) when no '.mat' is present.
    root, ext = os.path.splitext(mat_path)
    csv_path = (root if ext.lower() == '.mat' else mat_path) + '.csv'
    print(csv_path)

    # newline='' hands line-ending control to the csv module, preventing
    # blank rows on platforms that translate '\n'.
    with open(csv_path, 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames)
        w.writeheader()
        w.writerows(subject)

In [8]:
"""
Iterate through all .mat files in the data directory and
convert them to csv format
"""

import glob

data_dir = '../data/pilot_subjects/'

# os.path.join works whether or not data_dir ends with a slash, and sorting
# makes the conversion order deterministic (glob returns files in arbitrary
# order).
mat_files = sorted(glob.glob(os.path.join(data_dir, '*.mat')))

# Each .mat file is parsed and written back out as a .csv alongside it.
for mat in mat_files:
    subject2csv(mat2py(mat), mat)


../data/pilot_subjects/data_17991.csv
../data/pilot_subjects/data_18288.csv
../data/pilot_subjects/data_18325.csv
../data/pilot_subjects/data_18333.csv
../data/pilot_subjects/data_18334.csv
../data/pilot_subjects/data_18350.csv
../data/pilot_subjects/data_18547.csv
../data/pilot_subjects/data_18619.csv
../data/pilot_subjects/data_18864.csv
../data/pilot_subjects/data_18919.csv
../data/pilot_subjects/data_18977.csv
../data/pilot_subjects/data_18978.csv
../data/pilot_subjects/data_18979.csv
../data/pilot_subjects/data_18988.csv
../data/pilot_subjects/data_19686.csv
../data/pilot_subjects/data_19687.csv

A cleaned-up version of the above is found under utils/matparser.py


In [10]:
# Dump every field of the struct loaded from data_18333.mat near the top of
# the notebook (the `dat_struct` inside mat2py is local to that function).
# The available field names are listed under '_fieldnames'.
dat_struct.__dict__


Out[10]:
{'_fieldnames': ['practice_randomlist',
  'items_selection',
  'stim_presentation1',
  'stim_presentation2',
  'stim_presentation3',
  'stim_presentation4',
  'perf1',
  'rt1',
  'conditions1',
  'onsets1',
  'durations1',
  'perf_matrix1',
  'perf_per_condition1',
  'score1'],
 'conditions1': array([[2],
        [1],
        [3],
        [4],
        [1],
        [1],
        [4],
        [1],
        [1],
        [4],
        [2],
        [1],
        [2],
        [2],
        [3],
        [2],
        [1],
        [2],
        [4],
        [1],
        [2],
        [4],
        [3],
        [3],
        [1],
        [3],
        [2],
        [3],
        [2],
        [3],
        [3],
        [4],
        [1],
        [4],
        [4],
        [3],
        [4],
        [2],
        [4],
        [3],
        [1],
        [4],
        [1],
        [4],
        [1],
        [3],
        [1],
        [4],
        [1],
        [2],
        [1],
        [2],
        [4],
        [3],
        [2],
        [1],
        [2],
        [1],
        [2],
        [2],
        [1],
        [3],
        [3],
        [2],
        [3],
        [4],
        [2],
        [2],
        [3],
        [3],
        [4],
        [4],
        [1],
        [3],
        [3],
        [4],
        [4],
        [3],
        [2],
        [4],
        [1],
        [4],
        [4],
        [1],
        [3],
        [4],
        [1],
        [2],
        [1],
        [1],
        [2],
        [1],
        [4],
        [2],
        [1],
        [2],
        [1],
        [2],
        [2],
        [3],
        [1],
        [3],
        [2],
        [2],
        [3],
        [4],
        [3],
        [3],
        [2],
        [3],
        [4],
        [4],
        [2],
        [3],
        [4],
        [3],
        [3],
        [4],
        [1],
        [4]], dtype=uint8),
 'durations1': array([[ 2.45126336],
        [ 1.18735081],
        [ 1.25909236],
        [ 1.69911023],
        [ 1.07471444],
        [ 1.04526504],
        [ 1.16844255],
        [ 1.10433897],
        [ 1.1543611 ],
        [ 1.47429776],
        [ 1.49886578],
        [ 0.92274903],
        [ 1.29293935],
        [ 1.67733875],
        [ 1.35669857],
        [ 1.61312811],
        [ 1.62063195],
        [ 2.4685545 ],
        [ 1.58107095],
        [ 1.03661995],
        [ 2.53510765],
        [ 1.31956493],
        [ 1.15901467],
        [ 1.21533711],
        [ 1.16737841],
        [ 1.68686927],
        [ 1.35111291],
        [ 1.19876817],
        [ 1.31893915],
        [ 1.0709707 ],
        [ 1.08132388],
        [ 1.32384867],
        [ 1.09241205],
        [ 1.34244346],
        [ 1.25433791],
        [ 1.64675134],
        [ 1.33432051],
        [ 1.39078089],
        [ 1.42217958],
        [ 1.62209747],
        [ 1.03054634],
        [ 1.60060165],
        [ 1.23267668],
        [ 1.2326388 ],
        [ 0.90445907],
        [ 1.30713375],
        [ 0.97110643],
        [ 0.9015779 ],
        [ 0.89627036],
        [ 1.11533743],
        [ 1.10949112],
        [ 1.62447684],
        [ 1.10404016],
        [ 1.35462512],
        [ 1.27518102],
        [ 1.08312745],
        [ 1.08513049],
        [ 1.08789127],
        [ 1.53893128],
        [ 1.37074305],
        [ 1.0503687 ],
        [ 1.30936404],
        [ 1.10093251],
        [ 1.58367755],
        [ 1.17584   ],
        [ 1.07156708],
        [ 1.09063851],
        [ 1.1570291 ],
        [ 1.63666369],
        [ 2.18059685],
        [ 1.55661337],
        [ 1.18108046],
        [ 0.94902835],
        [ 1.15958932],
        [ 0.99904017],
        [ 1.16207775],
        [ 1.0816553 ],
        [ 1.21270853],
        [ 1.39612188],
        [ 1.60412674],
        [ 1.10000909],
        [ 1.14307696],
        [ 1.47119033],
        [ 1.2226557 ],
        [ 1.1990032 ],
        [ 1.54296442],
        [ 1.11839847],
        [ 2.00093415],
        [ 2.8733016 ],
        [ 1.4895157 ],
        [ 1.17733597],
        [ 0.96926468],
        [ 1.37956233],
        [ 1.06753496],
        [ 1.15842807],
        [ 1.18198703],
        [ 0.84629093],
        [ 1.19316261],
        [ 2.16095606],
        [ 1.11280836],
        [ 0.96361107],
        [ 0.97419361],
        [ 0.99240412],
        [ 1.33908928],
        [ 1.58693136],
        [ 1.62746215],
        [ 1.25126147],
        [ 1.11524372],
        [ 1.27739744],
        [ 1.73345977],
        [ 1.16589908],
        [ 1.44559651],
        [ 1.65359799],
        [ 1.34156514],
        [ 1.14161371],
        [ 1.05586082],
        [ 0.97845135],
        [ 1.50926096],
        [ 1.08498825],
        [ 1.1435712 ]]),
 'items_selection': array([[34, 58, 37, 21, 51, 15, 42, 40, 60, 22,  7, 27, 16,  4,  5, 35, 43,
         30, 38, 11, 31, 52,  9, 59, 12, 44, 32, 33, 20,  8, 57,  1, 36, 14,
         46, 55, 48, 23, 54, 49, 17, 28,  2, 56, 47, 13, 45, 50, 18, 10,  6,
         24,  3, 25, 19, 29, 41, 26, 39, 53]], dtype=uint8),
 'onsets1': array([[  5.52242198e-01],
        [  6.59215847e+00],
        [  1.06321114e+01],
        [  1.86720328e+01],
        [  2.27119796e+01],
        [  2.87652540e+01],
        [  3.68184876e+01],
        [  4.28584281e+01],
        [  4.89116809e+01],
        [  5.49515989e+01],
        [  6.09915385e+01],
        [  6.70314826e+01],
        [  7.10847560e+01],
        [  7.91246765e+01],
        [  8.31646119e+01],
        [  9.12045284e+01],
        [  9.72444746e+01],
        [  1.01284398e+02],
        [  1.07324348e+02],
        [  1.15364260e+02],
        [  1.23417517e+02],
        [  1.29457414e+02],
        [  1.37497354e+02],
        [  1.43537267e+02],
        [  1.51577168e+02],
        [  1.57617092e+02],
        [  1.65657033e+02],
        [  1.69696990e+02],
        [  1.77736883e+02],
        [  1.81776846e+02],
        [  1.87830109e+02],
        [  1.91883390e+02],
        [  1.99923299e+02],
        [  2.07976539e+02],
        [  2.16016463e+02],
        [  2.20056411e+02],
        [  2.24096347e+02],
        [  2.32136275e+02],
        [  2.36176205e+02],
        [  2.44216139e+02],
        [  2.52256038e+02],
        [  2.58309305e+02],
        [  2.62349254e+02],
        [  2.68389191e+02],
        [  2.74429131e+02],
        [  2.82482349e+02],
        [  2.90522281e+02],
        [  2.98575531e+02],
        [  3.04628765e+02],
        [  3.10682028e+02],
        [  3.14735320e+02],
        [  3.22788580e+02],
        [  3.28828493e+02],
        [  3.32881805e+02],
        [  3.36921738e+02],
        [  3.44961658e+02],
        [  3.51014905e+02],
        [  3.55068185e+02],
        [  3.59121501e+02],
        [  3.65161428e+02],
        [  3.71201361e+02],
        [  3.75254632e+02],
        [  3.81294550e+02],
        [  3.87347839e+02],
        [  3.95387747e+02],
        [  3.99427699e+02],
        [  4.07480951e+02],
        [  4.15534181e+02],
        [  4.19574133e+02],
        [  4.25614067e+02],
        [  4.29654009e+02],
        [  4.37693933e+02],
        [  4.41733884e+02],
        [  4.47787155e+02],
        [  4.55827100e+02],
        [  4.59880341e+02],
        [  4.67920258e+02],
        [  4.73973524e+02],
        [  4.82013421e+02],
        [  4.88053354e+02],
        [  4.92093327e+02],
        [  4.96146601e+02],
        [  5.02186532e+02],
        [  5.10226432e+02],
        [  5.14266384e+02],
        [  5.22306319e+02],
        [  5.28346226e+02],
        [  5.32399539e+02],
        [  5.36439471e+02],
        [  5.42479415e+02],
        [  5.46519364e+02],
        [  5.50559325e+02],
        [  5.56612591e+02],
        [  5.62652501e+02],
        [  5.70705754e+02],
        [  5.76745681e+02],
        [  5.80785650e+02],
        [  5.86838914e+02],
        [  5.92878823e+02],
        [  5.96918790e+02],
        [  6.00972086e+02],
        [  6.07025352e+02],
        [  6.11078627e+02],
        [  6.19131850e+02],
        [  6.23171824e+02],
        [  6.27211777e+02],
        [  6.33251701e+02],
        [  6.41291615e+02],
        [  6.45344903e+02],
        [  6.49384852e+02],
        [  6.57424770e+02],
        [  6.65464670e+02],
        [  6.69504628e+02],
        [  6.75544543e+02],
        [  6.81584489e+02],
        [  6.89637718e+02],
        [  6.97690970e+02],
        [  7.05744221e+02],
        [  7.09784141e+02],
        [  7.17837413e+02]]),
 'perf1': array([[1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]], dtype=uint8),
 'perf_matrix1': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
         0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=uint8),
 'perf_per_condition1': array([[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100,   0,   0, 100,
         100, 100, 100, 100, 100,   0, 100, 100, 100, 100, 100,   0, 100,
         100, 100, 100, 100,   0, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
         100, 100, 100]], dtype=uint8),
 'practice_randomlist': array([[ 8,  1, 10,  4,  6,  9,  5, 14, 16, 11,  3,  7, 13, 15, 12,  2]], dtype=uint8),
 'rt1': array([[ 2.45126336,  1.18735081,  1.25909236,  1.69911023,  1.07471444,
          1.04526504,  1.16844255,  1.10433897,  1.1543611 ,  1.47429776,
          1.49886578,  0.92274903,  1.29293935,  1.67733875,  1.35669857,
          1.61312811,  1.62063195,  2.4685545 ,  1.58107095,  1.03661995,
          2.53510765,  1.31956493,  1.15901467,  1.21533711,  1.16737841,
          1.68686927,  1.35111291,  1.19876817,  1.31893915,  1.0709707 ,
          1.08132388,  1.32384867,  1.09241205,  1.34244346,  1.25433791,
          1.64675134,  1.33432051,  1.39078089,  1.42217958,  1.62209747,
          1.03054634,  1.60060165,  1.23267668,  1.2326388 ,  0.90445907,
          1.30713375,  0.97110643,  0.9015779 ,  0.89627036,  1.11533743,
          1.10949112,  1.62447684,  1.10404016,  1.35462512,  1.27518102,
          1.08312745,  1.08513049,  1.08789127,  1.53893128,  1.37074305,
          1.0503687 ,  1.30936404,  1.10093251,  1.58367755,  1.17584   ,
          1.07156708,  1.09063851,  1.1570291 ,  1.63666369,  2.18059685,
          1.55661337,  1.18108046,  0.94902835,  1.15958932,  0.99904017,
          1.16207775,  1.0816553 ,  1.21270853,  1.39612188,  1.60412674,
          1.10000909,  1.14307696,  1.47119033,  1.2226557 ,  1.1990032 ,
          1.54296442,  1.11839847,  2.00093415,  2.8733016 ,  1.4895157 ,
          1.17733597,  0.96926468,  1.37956233,  1.06753496,  1.15842807,
          1.18198703,  0.84629093,  1.19316261,  2.16095606,  1.11280836,
          0.96361107,  0.97419361,  0.99240412,  1.33908928,  1.58693136,
          1.62746215,  1.25126147,  1.11524372,  1.27739744,  1.73345977,
          1.16589908,  1.44559651,  1.65359799,  1.34156514,  1.14161371,
          1.05586082,  0.97845135,  1.50926096,  1.08498825,  1.1435712 ]]),
 'score1': array([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
         1, 1, 1, 1, 1, 1, 0, 1]], dtype=uint8),
 'stim_presentation1': array([[40, 30, 15, 32, 59, 11, 42,  5, 20, 52, 16, 21, 60,  9,  8, 35, 31,
         12, 44,  7, 43, 51,  4, 22, 33, 34, 27, 58, 38, 37]], dtype=uint8),
 'stim_presentation2': array([[22,  9, 21,  8, 44, 27, 43, 38, 37, 34, 11,  5, 51, 12, 31, 35,  7,
         60, 58, 52, 30, 33, 15,  4, 42, 16, 40, 59, 20, 32]], dtype=uint8),
 'stim_presentation3': array([[22, 37, 43, 44, 20, 15,  8, 42, 34, 59, 27, 16, 21,  7, 51, 40, 52,
         11, 30,  4, 32, 12,  9, 58, 60, 38,  5, 33, 35, 31]], dtype=uint8),
 'stim_presentation4': array([[15, 32, 31, 51, 21, 30, 16,  9, 22, 35,  7, 40, 12, 44, 37, 38, 58,
         11, 43, 59,  4, 20, 33, 60,  5, 27, 34,  8, 52, 42]], dtype=uint8)}