Solution to assignment

By: Christopher Bailey

Date: 15 Sep 2017

Import statements, function definitions and default variables


In [20]:
from os.path import join, basename
import string
from fddhs import grep  # grep for a string in a file
from fddhs import find  # find files matching a pattern

logfile_dir = '../src/logs'  # adjust to fit where you placed the logs!

# take a look at this CSV-file to determine its structure
subject_codes = join(logfile_dir, 'subj_codes.csv')

output_dir = 'assignment'  # did you remember to create it? (a safeguard follows below)
output_file = join(output_dir, 'solution_Christopher_Bailey.csv')
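
If you're not sure you created the output directory, the standard library can do it for you. A small optional safeguard (not part of the assignment as handed out):


In [ ]:
import os
os.makedirs(output_dir, exist_ok=True)  # does nothing if the directory already exists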

In [39]:
# copy-paste your mean- and median-function here:
def mean(values):
    return sum(values) / len(values)

def median(values):
    # NB: for even-length input this is the upper middle value, not the average
    return sorted(values)[len(values) // 2]
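
As an optional sanity check, the standard library's statistics module provides equivalents. One difference to be aware of: for even-length input, statistics.median averages the two middle values, whereas the median above returns the upper one.


In [ ]:
from statistics import mean as stats_mean, median as stats_median

values = [3, 1, 4, 1, 5]
assert mean(values) == stats_mean(values)
assert median(values) == stats_median(values)  # identical for odd-length input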

In [40]:
def read_log_file(logfile_name, field_sep='\t'):
    r'''Read a single log file.

    The default field separator is the tab character (\t).

    Return the mean and median RT, and the accuracy, separately for
    the frequent and rare categories. This is done as a tuple of
    6 return values, in the order:
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare)
    '''

    # initialise 
    rt_freq = []
    rt_rare = []
    n_corr_freq = 0
    n_corr_rare = 0

    # open the file and read all its lines into a list
    with open(logfile_name, 'r') as fp:
        all_lines = fp.readlines()

    # the stimulus/response code is the 6th character (index 5) of the event string
    idx = 5
    
    # loop over lines from the 6th onwards (the first 5 are header lines)
    for line in all_lines[5:]:
        split_line = line.split(field_sep)

        # does the 3rd element of the list start with 'STIM'?
        if split_line[2].startswith('STIM'):
            stim_time = split_line[0]
            cur_stim = split_line[2][idx]

        else:  # nope; it starts with something other than 'STIM'
            resp_time = split_line[0]
            cur_resp = split_line[2][idx]

            # calculate RT as the difference between response and stimulus times
            RT = int(resp_time) - int(stim_time)

            # test if the current stimulus is in the `ascii_lowercase`-string
            if cur_stim in string.ascii_lowercase:
                rt_freq.append(RT)
                if cur_resp == '1':  # string comparison, as for the rare category below
                    n_corr_freq = n_corr_freq + 1

            # else test if the current stimulus is in the `digits`-string
            elif cur_stim in string.digits:
                rt_rare.append(RT)
                if cur_resp == '2':
                    n_corr_rare = n_corr_rare + 1                 
                    
    # freq: scale the raw time-stamp units by a factor of 0.1; accuracy in percent
    mean_rt_freq = 0.1 * mean(rt_freq)
    median_rt_freq = 0.1 * median(rt_freq)
    accuracy_freq = 100 * n_corr_freq / len(rt_freq)

    # rare: same scaling
    mean_rt_rare = 0.1 * mean(rt_rare)
    median_rt_rare = 0.1 * median(rt_rare)
    accuracy_rare = 100 * n_corr_rare / len(rt_rare)

    return (mean_rt_freq, median_rt_freq, accuracy_freq,
            mean_rt_rare, median_rt_rare, accuracy_rare)
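
As a quick, entirely optional smoke test, we can fabricate a tiny log in the format the parser assumes: five header lines, then tab-separated rows of a time stamp, an ignored middle field, and an event string whose 6th character encodes the stimulus or response. The file name demo.log and the placeholder 'x' field are made up for illustration.


In [ ]:
demo_lines = ['header\n'] * 5 + [
    '100\tx\tSTIM=a\n',   # frequent stimulus (lowercase letter)
    '150\tx\tRESP=1\n',   # correct response to a frequent stimulus
    '200\tx\tSTIM=7\n',   # rare stimulus (digit)
    '260\tx\tRESP=2\n',   # correct response to a rare stimulus
]
with open('demo.log', 'w') as fh:
    fh.writelines(demo_lines)
print(read_log_file('demo.log'))  # (5.0, 5.0, 100.0, 6.0, 6.0, 100.0)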

Find all the log files and place them in a list


In [41]:
all_logs = find(logfile_dir, '*.log')  # the '*.log' wildcard matches every log file
print(len(all_logs), 'logfiles found in', logfile_dir)


40 logfiles found in ../src/logs
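
For reference only: the same list can be built without the course's fddhs helpers, using the standard library's glob module (a sketch, not needed for the assignment):


In [ ]:
from glob import glob

all_logs_alt = sorted(glob(join(logfile_dir, '*.log')))
assert sorted(all_logs) == all_logs_alt  # both approaches should find the same 40 files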

In [42]:
all_logs


Out[42]:
['../src/logs/0010_BQR_2017-08-03.log',
 '../src/logs/0011_XYJ_2017-07-27.log',
 '../src/logs/0012_WCT_2017-06-26.log',
 '../src/logs/0013_OUP_2016-10-15.log',
 '../src/logs/0014_IKV_2017-03-05.log',
 '../src/logs/0015_HNI_2017-03-21.log',
 '../src/logs/0016_RZU_2016-09-24.log',
 '../src/logs/0018_SJI_2017-03-05.log',
 '../src/logs/0019_NDQ_2017-02-05.log',
 '../src/logs/0020_IFX_2017-07-19.log',
 '../src/logs/0021_WYK_2017-05-16.log',
 '../src/logs/0023_FCA_2017-03-09.log',
 '../src/logs/0024_ICI_2016-12-30.log',
 '../src/logs/0026_PBV_2017-07-07.log',
 '../src/logs/0027_NPC_2016-09-01.log',
 '../src/logs/0028_MQU_2017-05-04.log',
 '../src/logs/0030_MJC_2016-11-16.log',
 '../src/logs/0031_ALX_2017-01-04.log',
 '../src/logs/0032_JQA_2016-10-07.log',
 '../src/logs/0034_TRY_2016-12-16.log',
 '../src/logs/0036_ZXA_2016-10-12.log',
 '../src/logs/0037_FMC_2017-01-28.log',
 '../src/logs/0038_WOT_2017-04-29.log',
 '../src/logs/0039_DBX_2016-10-11.log',
 '../src/logs/0040_THX_2017-04-06.log',
 '../src/logs/0041_IJN_2016-09-17.log',
 '../src/logs/0042_TQL_2017-05-07.log',
 '../src/logs/0043_WOY_2016-12-25.log',
 '../src/logs/0044_GWQ_2017-08-19.log',
 '../src/logs/0045_DSA_2016-09-30.log',
 '../src/logs/0046_DKV_2016-11-26.log',
 '../src/logs/0048_MSB_2016-09-23.log',
 '../src/logs/0049_ESM_2017-01-22.log',
 '../src/logs/0050_KXP_2017-04-02.log',
 '../src/logs/0052_BFG_2017-01-17.log',
 '../src/logs/0053_YZV_2017-08-10.log',
 '../src/logs/0054_IDT_2017-06-02.log',
 '../src/logs/0055_XFK_2016-11-26.log',
 '../src/logs/0058_NKR_2017-03-23.log',
 '../src/logs/0059_KKL_2017-08-27.log']

Take a look at the first file name; the subject ID is the first 8 characters of the basename of the file.


In [43]:
print(all_logs[0])
print(basename(all_logs[0]))
print(basename(all_logs[0])[:8])  # the first 8 characters form the subject ID


../src/logs/0010_BQR_2017-08-03.log
0010_BQR_2017-08-03.log
0010_BQR
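
An alternative to hard-coding the 8-character slice is to split the basename on the underscore; a sketch:


In [ ]:
parts = basename(all_logs[0]).split('_')  # ['0010', 'BQR', '2017-08-03.log']
print('_'.join(parts[:2]))                # 0010_BQR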

Loop over the logs, writing out the results table as you go


In [44]:
subject_codes


Out[44]:
'../src/logs/subj_codes.csv'

In [45]:
grep(subject_codes, '0026').split(';')[1]


Out[45]:
'Control'
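
If you're curious what fddhs.grep does under the hood, a plain-Python equivalent that returns the first matching line could look like this (an illustration, not the actual course implementation):


In [ ]:
def grep_first(filename, pattern):
    '''Return the first line in filename that contains pattern.'''
    with open(filename) as fh:
        for line in fh:
            if pattern in line:
                return line.rstrip('\n')

grep_first(subject_codes, '0026').split(';')[1]  # should also give 'Control'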

In [46]:
outfile = open(output_file, 'wt')
delimiter = ','  # or whatever you like

# the opposite of 'split' is 'join', which has a slightly odd syntax
header = delimiter.join(['Subjid', 'Group', 'Cond', 'Mean RT', 'Median RT', 'Accuracy'])

# write out the header-line first
outfile.write(header + '\n')  # remember to add the newline-character!

# loop over all log files
for log in all_logs:
    subj_ID = basename(log)[:8]  # the subject ID is the first 8 characters

    # Is the subject a patient or a control?
    # first get the line that contains the subject code
    group_line = grep(subject_codes, subj_ID)
    # then we split the line on the delimiter (;) and take the second element
    group = group_line.split(';')[1]
    
    # Now we can simply call our single-logfile function and get the results!
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare) = read_log_file(log)
    
    # NB: all those variables are numbers (floats); to write them into a text file,
    # we must first convert them into string-objects (with 2 decimal precision)
    freq_results_str = delimiter.join(['{:.2f}'.format(mean_rt_freq),
                                       '{:.2f}'.format(median_rt_freq),
                                       '{:.2f}'.format(accuracy_freq)])
    rare_results_str = delimiter.join(['{:.2f}'.format(mean_rt_rare),
                                       '{:.2f}'.format(median_rt_rare),
                                       '{:.2f}'.format(accuracy_rare)])

    
    # first write a line for the frequent stimuli
    line_out = delimiter.join([subj_ID, group, 'Freq', freq_results_str])
    outfile.write(line_out + '\n')

    # then write a line for the rare stimuli
    line_out = delimiter.join([subj_ID, group, 'Rare', rare_results_str])
    outfile.write(line_out + '\n')

outfile.close()  # remember to close!
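
Aside: Python's built-in csv module takes care of delimiters and quoting for you. A sketch of how the writing could look with it; the file name solution_alt.csv is hypothetical, chosen so we don't overwrite the results written above:


In [ ]:
import csv

alt_file = join(output_dir, 'solution_alt.csv')  # hypothetical alternative output path
with open(alt_file, 'wt', newline='') as fh:
    writer = csv.writer(fh, delimiter=delimiter)
    writer.writerow(['Subjid', 'Group', 'Cond', 'Mean RT', 'Median RT', 'Accuracy'])
    # inside the loop over log files you would then write rows like:
    # writer.writerow([subj_ID, group, 'Freq', '{:.2f}'.format(mean_rt_freq),
    #                  '{:.2f}'.format(median_rt_freq), '{:.2f}'.format(accuracy_freq)])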

Optional exercise 1: summary statistics

Can we reproduce the paper's findings?

Install pandas

We'll use a Python module called pandas for this, which we forgot to include in the environment.yaml file! But fear not, conda is your friend.

  • on Windows: open 'Anaconda Prompt' & execute: activate fddhs
  • on Mac/Linux: open a Terminal app & execute: source activate fddhs
  • execute the following command in the Prompt/Terminal:

conda install pandas

and answer 'y'.

You'll also need to restart JupyterLab for the module to be found.


In [47]:
import pandas as pd

In [48]:
# yes, this is how easy reading a csv-file _really_ can be...
df = pd.read_csv(output_file, delimiter=delimiter)

In [49]:
# show the first 5 rows
df.head()


Out[49]:
     Subjid    Group  Cond  Mean RT  Median RT  Accuracy
0  0010_BQR  Patient  Freq   581.16      549.3     94.53
1  0010_BQR  Patient  Rare   658.55      639.2     85.16
2  0011_XYJ  Control  Freq   501.09      469.2     97.27
3  0011_XYJ  Control  Rare   583.50      557.1     88.67
4  0012_WCT  Patient  Freq   587.64      555.9     93.75

In [52]:
# group the numerical values by Group and Condition,
# and display the mean of each
df.groupby(by=['Group', 'Cond']).mean()


Out[52]:
                 Mean RT  Median RT  Accuracy
Group   Cond
Control Freq    498.7750    466.405   96.1080
        Rare    572.3250    542.625   89.0235
Patient Freq    580.7590    542.835   94.5265
        Rare    681.1435    649.650   84.9605
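
To also get a feel for the spread within each group and condition, agg can request several statistics at once (a sketch; reproducing the paper's means only needs the cell above):


In [ ]:
df.groupby(by=['Group', 'Cond'])['Mean RT'].agg(['mean', 'std'])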
