Solution to assignment

By: Christopher Bailey

Date: 15 Sep 2017

Import statements, function definitions and default variables


In [20]:
from os.path import join, basename
import string
from fddhs import grep  # grep for a string in a file
from fddhs import find  # find files matching a pattern

logfile_dir = '../src/logs'  # adjust to fit where you placed the logs!

# take a look at this CSV-file to determine its structure
subject_codes = join(logfile_dir, 'subj_codes.csv')

output_dir = 'assignment'  # did you remember to create it? (a safeguard follows below)
output_file = join(output_dir, 'solution_Christopher_Bailey.csv')
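
If you're not sure you created the output directory, the standard library can do it for you. A small optional safeguard (not part of the assignment as handed out):


In [ ]:
import os
os.makedirs(output_dir, exist_ok=True)  # does nothing if the directory already exists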

In [39]:
# copy-paste your mean- and median-function here:
def mean(values):
    return sum(values) / len(values)

def median(values):
    # NB: for even-length input this is the upper middle value, not the average
    return sorted(values)[len(values) // 2]
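
As an optional sanity check, the standard library's statistics module provides equivalents. One difference to be aware of: for even-length input, statistics.median averages the two middle values, whereas the median above returns the upper one.


In [ ]:
from statistics import mean as stats_mean, median as stats_median

values = [3, 1, 4, 1, 5]
assert mean(values) == stats_mean(values)
assert median(values) == stats_median(values)  # identical for odd-length input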

In [40]:
def read_log_file(logfile_name, field_sep='\t'):
    r'''Read a single log file.

    The default field separator is the tab character (\t).

    Return the mean and median RT, and the accuracy, separately for
    the frequent and rare categories. This is done as a tuple of
    6 return values, in the order:
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare)
    '''

    # initialise 
    rt_freq = []
    rt_rare = []
    n_corr_freq = 0
    n_corr_rare = 0

    # open the file and read all its lines into a list
    with open(logfile_name, 'r') as fp:
        all_lines = fp.readlines()

    # the stimulus/response code is the 6th character (index 5) of the event string
    idx = 5
    
    # loop over lines from the 6th onwards (the first 5 are header lines)
    for line in all_lines[5:]:
        split_line = line.split(field_sep)

        # does the 3rd element of the list start with 'STIM'?
        if split_line[2].startswith('STIM'):
            stim_time = split_line[0]
            cur_stim = split_line[2][idx]

        else:  # nope; it starts with something other than 'STIM'
            resp_time = split_line[0]
            cur_resp = split_line[2][idx]

            # calculate RT as the difference between response and stimulus times
            RT = int(resp_time) - int(stim_time)

            # test if the current stimulus is in the `ascii_lowercase`-string
            if cur_stim in string.ascii_lowercase:
                rt_freq.append(RT)
                if cur_resp == '1':  # string comparison, as for the rare category below
                    n_corr_freq = n_corr_freq + 1

            # else test if the current stimulus is in the `digits`-string
            elif cur_stim in string.digits:
                rt_rare.append(RT)
                if cur_resp == '2':
                    n_corr_rare = n_corr_rare + 1                 
                    
    # freq: scale the raw time-stamp units by a factor of 0.1; accuracy in percent
    mean_rt_freq = 0.1 * mean(rt_freq)
    median_rt_freq = 0.1 * median(rt_freq)
    accuracy_freq = 100 * n_corr_freq / len(rt_freq)

    # rare: same scaling
    mean_rt_rare = 0.1 * mean(rt_rare)
    median_rt_rare = 0.1 * median(rt_rare)
    accuracy_rare = 100 * n_corr_rare / len(rt_rare)

    return (mean_rt_freq, median_rt_freq, accuracy_freq,
            mean_rt_rare, median_rt_rare, accuracy_rare)
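
As a quick, entirely optional smoke test, we can fabricate a tiny log in the format the parser assumes: five header lines, then tab-separated rows of a time stamp, an ignored middle field, and an event string whose 6th character encodes the stimulus or response. The file name demo.log and the placeholder 'x' field are made up for illustration.


In [ ]:
demo_lines = ['header\n'] * 5 + [
    '100\tx\tSTIM=a\n',   # frequent stimulus (lowercase letter)
    '150\tx\tRESP=1\n',   # correct response to a frequent stimulus
    '200\tx\tSTIM=7\n',   # rare stimulus (digit)
    '260\tx\tRESP=2\n',   # correct response to a rare stimulus
]
with open('demo.log', 'w') as fh:
    fh.writelines(demo_lines)
print(read_log_file('demo.log'))  # (5.0, 5.0, 100.0, 6.0, 6.0, 100.0)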

Find all the log files and place them in a list


In [41]:
all_logs = find(logfile_dir, '*.log')  # the '*.log' wildcard matches every log file
print(len(all_logs), 'logfiles found in', logfile_dir)


40 logfiles found in ../src/logs
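
For reference only: the same list can be built without the course's fddhs helpers, using the standard library's glob module (a sketch, not needed for the assignment):


In [ ]:
from glob import glob

all_logs_alt = sorted(glob(join(logfile_dir, '*.log')))
assert sorted(all_logs) == all_logs_alt  # both approaches should find the same 40 files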

In [42]:
all_logs


Out[42]:
['../src/logs/0010_BQR_2017-08-03.log',
 '../src/logs/0011_XYJ_2017-07-27.log',
 '../src/logs/0012_WCT_2017-06-26.log',
 '../src/logs/0013_OUP_2016-10-15.log',
 '../src/logs/0014_IKV_2017-03-05.log',
 '../src/logs/0015_HNI_2017-03-21.log',
 '../src/logs/0016_RZU_2016-09-24.log',
 '../src/logs/0018_SJI_2017-03-05.log',
 '../src/logs/0019_NDQ_2017-02-05.log',
 '../src/logs/0020_IFX_2017-07-19.log',
 '../src/logs/0021_WYK_2017-05-16.log',
 '../src/logs/0023_FCA_2017-03-09.log',
 '../src/logs/0024_ICI_2016-12-30.log',
 '../src/logs/0026_PBV_2017-07-07.log',
 '../src/logs/0027_NPC_2016-09-01.log',
 '../src/logs/0028_MQU_2017-05-04.log',
 '../src/logs/0030_MJC_2016-11-16.log',
 '../src/logs/0031_ALX_2017-01-04.log',
 '../src/logs/0032_JQA_2016-10-07.log',
 '../src/logs/0034_TRY_2016-12-16.log',
 '../src/logs/0036_ZXA_2016-10-12.log',
 '../src/logs/0037_FMC_2017-01-28.log',
 '../src/logs/0038_WOT_2017-04-29.log',
 '../src/logs/0039_DBX_2016-10-11.log',
 '../src/logs/0040_THX_2017-04-06.log',
 '../src/logs/0041_IJN_2016-09-17.log',
 '../src/logs/0042_TQL_2017-05-07.log',
 '../src/logs/0043_WOY_2016-12-25.log',
 '../src/logs/0044_GWQ_2017-08-19.log',
 '../src/logs/0045_DSA_2016-09-30.log',
 '../src/logs/0046_DKV_2016-11-26.log',
 '../src/logs/0048_MSB_2016-09-23.log',
 '../src/logs/0049_ESM_2017-01-22.log',
 '../src/logs/0050_KXP_2017-04-02.log',
 '../src/logs/0052_BFG_2017-01-17.log',
 '../src/logs/0053_YZV_2017-08-10.log',
 '../src/logs/0054_IDT_2017-06-02.log',
 '../src/logs/0055_XFK_2016-11-26.log',
 '../src/logs/0058_NKR_2017-03-23.log',
 '../src/logs/0059_KKL_2017-08-27.log']

Take a look at the first file name; the subject ID is the first 8 characters of the basename of the file.


In [43]:
print(all_logs[0])
print(basename(all_logs[0]))
print(basename(all_logs[0])[:8])  # the first 8 characters form the subject ID


../src/logs/0010_BQR_2017-08-03.log
0010_BQR_2017-08-03.log
0010_BQR
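
An alternative to hard-coding the 8-character slice is to split the basename on the underscore; a sketch:


In [ ]:
parts = basename(all_logs[0]).split('_')  # ['0010', 'BQR', '2017-08-03.log']
print('_'.join(parts[:2]))                # 0010_BQR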

Loop over the logs, writing out the results table as you go


In [44]:
subject_codes


Out[44]:
'../src/logs/subj_codes.csv'

In [45]:
grep(subject_codes, '0026').split(';')[1]


Out[45]:
'Control'
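
If you're curious what fddhs.grep does under the hood, a plain-Python equivalent that returns the first matching line could look like this (an illustration, not the actual course implementation):


In [ ]:
def grep_first(filename, pattern):
    '''Return the first line in filename that contains pattern.'''
    with open(filename) as fh:
        for line in fh:
            if pattern in line:
                return line.rstrip('\n')

grep_first(subject_codes, '0026').split(';')[1]  # should also give 'Control'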

In [46]:
outfile = open(output_file, 'wt')
delimiter = ','  # or whatever you like

# the opposite of 'split' is 'join', which has a slightly odd syntax
header = delimiter.join(['Subjid', 'Group', 'Cond', 'Mean RT', 'Median RT', 'Accuracy'])

# write out the header-line first
outfile.write(header + '\n')  # remember to add the newline-character!

# loop over all log files
for log in all_logs:
    subj_ID = basename(log)[:8]  # the subject ID is the first 8 characters

    # Is the subject a patient or a control?
    # first get the line that contains the subject code
    group_line = grep(subject_codes, subj_ID)
    # then we split the line on the delimiter (;) and take the second element
    group = group_line.split(';')[1]
    
    # Now we can simply call our single-logfile function and get the results!
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare) = read_log_file(log)
    
    # NB: all those variables are numbers (floats); to write them into a text file,
    # we must first convert them into string-objects (with 2 decimal precision)
    freq_results_str = delimiter.join(['{:.2f}'.format(mean_rt_freq),
                                       '{:.2f}'.format(median_rt_freq),
                                       '{:.2f}'.format(accuracy_freq)])
    rare_results_str = delimiter.join(['{:.2f}'.format(mean_rt_rare),
                                       '{:.2f}'.format(median_rt_rare),
                                       '{:.2f}'.format(accuracy_rare)])

    
    # first write a line for the frequent stimuli
    line_out = delimiter.join([subj_ID, group, 'Freq', freq_results_str])
    outfile.write(line_out + '\n')

    # then write a line for the rare stimuli
    line_out = delimiter.join([subj_ID, group, 'Rare', rare_results_str])
    outfile.write(line_out + '\n')

outfile.close()  # remember to close!
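
Aside: Python's built-in csv module takes care of delimiters and quoting for you. A sketch of how the writing could look with it; the file name solution_alt.csv is hypothetical, chosen so we don't overwrite the results written above:


In [ ]:
import csv

alt_file = join(output_dir, 'solution_alt.csv')  # hypothetical alternative output path
with open(alt_file, 'wt', newline='') as fh:
    writer = csv.writer(fh, delimiter=delimiter)
    writer.writerow(['Subjid', 'Group', 'Cond', 'Mean RT', 'Median RT', 'Accuracy'])
    # inside the loop over log files you would then write rows like:
    # writer.writerow([subj_ID, group, 'Freq', '{:.2f}'.format(mean_rt_freq),
    #                  '{:.2f}'.format(median_rt_freq), '{:.2f}'.format(accuracy_freq)])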

Optional exercise 1: summary statistics

Can we reproduce the paper's findings?

Install pandas

We'll use a Python module called pandas for this, which we forgot to include in the environment.yaml file! But fear not, conda is your friend.

  • on Windows: open 'Anaconda Prompt' & execute: activate fddhs
  • on Mac/Linux: open a Terminal app & execute: source activate fddhs
  • execute the following command in the Prompt/Terminal:

conda install pandas

and answer 'y'.

You'll also need to restart JupyterLab for the module to be found.


In [47]:
import pandas as pd

In [48]:
# yes, this is how easy reading a csv-file _really_ can be...
df = pd.read_csv(output_file, delimiter=delimiter)

In [49]:
# show the first 5 rows
df.head()


Out[49]:
     Subjid    Group  Cond  Mean RT  Median RT  Accuracy
0  0010_BQR  Patient  Freq   581.16      549.3     94.53
1  0010_BQR  Patient  Rare   658.55      639.2     85.16
2  0011_XYJ  Control  Freq   501.09      469.2     97.27
3  0011_XYJ  Control  Rare   583.50      557.1     88.67
4  0012_WCT  Patient  Freq   587.64      555.9     93.75

In [52]:
# group the numerical values by Group and Condition,
# and display the mean of each
df.groupby(by=['Group', 'Cond']).mean()


Out[52]:
                 Mean RT  Median RT  Accuracy
Group   Cond
Control Freq    498.7750    466.405   96.1080
        Rare    572.3250    542.625   89.0235
Patient Freq    580.7590    542.835   94.5265
        Rare    681.1435    649.650   84.9605
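
To also get a feel for the spread within each group and condition, agg can request several statistics at once (a sketch; reproducing the paper's means only needs the cell above):


In [ ]:
df.groupby(by=['Group', 'Cond'])['Mean RT'].agg(['mean', 'std'])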
