In [20]:
from os.path import join, basename
import string
from fddhs import grep # grep for a string in a file
from fddhs import find # find files matching a pattern
logfile_dir = '../src/logs' # adjust to fit where you placed the logs!
# take a look at this CSV-file to determine its structure
subject_codes = join(logfile_dir, 'subj_codes.csv')
output_dir = 'assignment' # did you remember to create it??
output_file = join(output_dir, 'solution_Christopher_Bailey.csv')
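One quick way to take that look at the CSV file is to print its first few lines (a sketch; it only assumes the file is plain text):
In [ ]:
# peek at the first few lines to determine the file's structure
with open(subject_codes, 'r') as fp:
    for line in fp.readlines()[:3]:
        print(line, end='')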
In [39]:
# copy-paste your mean- and median-function here:
def mean(values):
    return sum(values) / len(values)

def median(values):
    values = sorted(values)
    mid = len(values) // 2
    if len(values) % 2 == 1:
        return values[mid]
    # for an even number of values, average the two middle ones
    return (values[mid - 1] + values[mid]) / 2
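A quick sanity check against the standard-library statistics module (not part of the assignment, just reassurance that our functions behave):
In [ ]:
import statistics
vals = [3, 1, 4, 1, 5, 9, 2, 6]
assert mean(vals) == statistics.mean(vals)
assert median(vals) == statistics.median(vals)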
In [40]:
def read_log_file(logfile_name, field_sep='\t'):
    r'''Read a single log file.

    The default field separator is the tab character (\t).

    Return the mean RT, the median RT and the accuracy, separately for
    the frequent and rare categories, as a tuple of 6 values in the order:
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare)
    '''
    # initialise
    rt_freq = []
    rt_rare = []
    n_corr_freq = 0
    n_corr_rare = 0

    # open the file and read all its lines into a list
    with open(logfile_name, 'r') as fp:
        all_lines = fp.readlines()

    # hard-code the index of the stimulus/response character
    idx = 5

    # loop over lines from the 6th onwards (the first 5 are header lines)
    for line in all_lines[5:]:
        split_line = line.split(field_sep)
        # does the 3rd element of the list start with 'STIM'?
        if split_line[2].startswith('STIM'):
            stim_time = split_line[0]
            cur_stim = split_line[2][idx]
        else:  # no, so this is a response line
            resp_time = split_line[0]
            cur_resp = split_line[2][idx]

            # calculate the reaction time
            RT = int(resp_time) - int(stim_time)

            # is the current stimulus in the `ascii_lowercase` list?
            if cur_stim in string.ascii_lowercase:
                rt_freq.append(RT)
                if cur_resp == '1':
                    n_corr_freq += 1
            # or is the current stimulus in the `digits` list?
            elif cur_stim in string.digits:
                rt_rare.append(RT)
                if cur_resp == '2':
                    n_corr_rare += 1

    # scale the raw RTs by a factor of 0.1 and express accuracy in percent
    # freq
    mean_rt_freq = 0.1 * mean(rt_freq)
    median_rt_freq = 0.1 * median(rt_freq)
    accuracy_freq = 100 * n_corr_freq / len(rt_freq)
    # rare
    mean_rt_rare = 0.1 * mean(rt_rare)
    median_rt_rare = 0.1 * median(rt_rare)
    accuracy_rare = 100 * n_corr_rare / len(rt_rare)

    return (mean_rt_freq, median_rt_freq, accuracy_freq,
            mean_rt_rare, median_rt_rare, accuracy_rare)
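To see what the function expects, here's a tiny synthetic log in the assumed layout: 5 header lines, then tab-separated lines whose first field is a timestamp and whose third field is an event token such as 'STIM=a' or 'RESP=1'. The exact tokens are an assumption for illustration; only the positions the code indexes matter.
In [ ]:
# a minimal sketch: write a tiny synthetic log and run read_log_file on it
# NB: the field layout (5 header lines; timestamp<TAB>?<TAB>EVENT=x) is assumed
demo_lines = ['header\t-\t-\n'] * 5
demo_lines += ['1000\t-\tSTIM=a\n', '1234\t-\tRESP=1\n',  # frequent, correct
               '2000\t-\tSTIM=7\n', '2345\t-\tRESP=2\n']  # rare, correct
with open('demo.log', 'w') as fp:
    fp.writelines(demo_lines)
print(read_log_file('demo.log'))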
In [41]:
all_logs = find(logfile_dir, '*.log')  # the '*.log' wildcard matches every log file
print(len(all_logs), 'logfiles found in', logfile_dir)
In [42]:
all_logs
Out[42]:
Take a look at the first file name: the subject ID is the first N characters of the basename of the file.
In [43]:
print(all_logs[0])
print(basename(all_logs[0]))
print(basename(all_logs[0])[:8]) # print the first N characters of the basename
In [44]:
subject_codes
Out[44]:
In [45]:
grep(subject_codes, '0026').split(';')[1]
Out[45]:
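An aside: if you don't have the fddhs helper module at hand, minimal stand-ins for find and grep might look like the sketch below. This is an assumption about their behaviour (find returning a sorted list of matching paths, grep returning the first matching line), not the course implementation.
In [ ]:
# minimal stand-ins for the fddhs helpers (behaviour assumed, see above)
import os
from fnmatch import fnmatch

def find(directory, pattern):
    matches = []
    for root, _, files in os.walk(directory):
        matches += [join(root, f) for f in files if fnmatch(f, pattern)]
    return sorted(matches)

def grep(filename, text):
    with open(filename, 'r') as fp:
        for line in fp:
            if text in line:
                return line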
In [46]:
outfile = open(output_file, 'wt')
delimiter = ','  # or whatever you like

# the opposite of 'split' is 'join', which has a slightly odd syntax
header = delimiter.join(['Subjid', 'Group', 'Cond',
                         'Mean RT', 'Median RT', 'Accuracy'])
# write out the header line first
outfile.write(header + '\n')  # remember to add the newline character!

# loop over all log files
for log in all_logs:
    subj_ID = basename(log)[:8]  # N=8: the subject ID is 8 characters long
    # Is the subject a patient or a control?
    # First get the line that contains the subject code,
    group_line = grep(subject_codes, subj_ID)
    # then split the line on the delimiter (;) and take the second element;
    # strip() removes any trailing newline that came along with the line
    group = group_line.split(';')[1].strip()

    # Now we can simply call our single-logfile function and get the results!
    (mean_rt_freq, median_rt_freq, accuracy_freq,
     mean_rt_rare, median_rt_rare, accuracy_rare) = read_log_file(log)

    # NB: all those variables are numbers (floats); to write them into a
    # text file, we must first convert them to strings (2-decimal precision)
    freq_results_str = delimiter.join(['{:.2f}'.format(mean_rt_freq),
                                       '{:.2f}'.format(median_rt_freq),
                                       '{:.2f}'.format(accuracy_freq)])
    rare_results_str = delimiter.join(['{:.2f}'.format(mean_rt_rare),
                                       '{:.2f}'.format(median_rt_rare),
                                       '{:.2f}'.format(accuracy_rare)])

    # first write a line for the frequent stimuli,
    line_out = delimiter.join([subj_ID, group, 'Freq', freq_results_str])
    outfile.write(line_out + '\n')
    # then write a line for the rare stimuli
    line_out = delimiter.join([subj_ID, group, 'Rare', rare_results_str])
    outfile.write(line_out + '\n')

outfile.close()  # remember to close!
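As an aside: Python's standard-library csv module can handle the delimiting and quoting for us. A minimal sketch of the same header-plus-rows pattern (not the assignment's approach, just an alternative):
In [ ]:
# alternative sketch using the standard-library csv module
import csv

with open(output_file, 'w', newline='') as fh:
    writer = csv.writer(fh, delimiter=delimiter)
    writer.writerow(['Subjid', 'Group', 'Cond',
                     'Mean RT', 'Median RT', 'Accuracy'])
    for log in all_logs:
        subj_ID = basename(log)[:8]
        group = grep(subject_codes, subj_ID).split(';')[1].strip()
        results = read_log_file(log)
        writer.writerow([subj_ID, group, 'Freq'] +
                        ['{:.2f}'.format(v) for v in results[:3]])
        writer.writerow([subj_ID, group, 'Rare'] +
                        ['{:.2f}'.format(v) for v in results[3:]])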
Can we reproduce the paper's findings?
We'll use a Python module called pandas for this, which we've forgotten to include in the environment.yaml file! But fear not, conda is your friend. In a terminal, activate the fddhs environment and install pandas:

source activate fddhs
conda install pandas

and answer 'y' when prompted. You'll also need to restart Jupyter Lab for the module to be found.
In [47]:
import pandas as pd
In [48]:
# yes, this is how easy reading a csv-file _really_ can be...
df = pd.read_csv(output_file, delimiter=delimiter)
In [49]:
# print the first 5 lines
df.head()
Out[49]:
In [52]:
# group the numerical values by Group and Condition,
# and display the mean of each
# numeric_only avoids errors on the string-valued Subjid column
df.groupby(by=['Group', 'Cond']).mean(numeric_only=True)
Out[52]:
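If you also want a measure of spread, a groupby supports several aggregations at once. A sketch using the same column names as in our CSV header:
In [ ]:
# means and standard deviations per Group/Condition in one go
cols = ['Mean RT', 'Median RT', 'Accuracy']
df.groupby(by=['Group', 'Cond'])[cols].agg(['mean', 'std'])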