In [1]:
from glob import glob
import os
import pandas as pd
import requests
import parse
In [2]:
ls
In [3]:
# Absolute paths of the per-study cleaned CSVs, in sorted (deterministic) order.
csv_files = list(map(os.path.abspath, sorted(glob('clean-csv/*-clean.csv'))))
csv_files
Out[3]:
In [4]:
def process_csv(filename):
    """Read a headerless url,id manifest CSV and pair T1 images with annots.

    For each subject id: if any 'T1.mgz' URL exists, emit one row per T1 with
    its matching 'aseg.mgz' URL as `annot`; otherwise keep the raw URLs
    (rewriting GSP raw paths into orig_bids/) with a NaN `annot`.

    Returns a DataFrame with columns ['url', 'annot', 'id'].
    """
    df = pd.read_csv(filename, header=None, names=['url', 'id'])
    df['id'] = df.id.apply(str)
    print(filename, df.shape)
    info = []
    for subj_id, grp in df.groupby('id'):
        t1_urls = [val for val in grp.url.values if 'T1.mgz' in val]
        if t1_urls:
            for t1 in t1_urls:
                # FreeSurfer keeps aseg.mgz next to T1.mgz in the mri/ dir.
                annot = t1.replace('T1.mgz', 'aseg.mgz')
                info.append(dict(url=t1, annot=annot, id=subj_id))
        else:
            for val in grp.url.values:
                if 'BrainGenomicsSuperstructProject/sub' in val:
                    # GSP raw subjects live under orig_bids/ on the bucket.
                    val = val.replace('BrainGenomicsSuperstructProject/sub',
                                      'BrainGenomicsSuperstructProject/orig_bids/sub')
                # pd.np was removed in pandas 2.0 — use a plain float NaN.
                info.append(dict(url=val, annot=float('nan'), id=subj_id))
    return pd.DataFrame(info)
In [5]:
# Clean every per-study CSV and concatenate into a single manifest,
# then persist only the (url, id) columns.
all_df = []
for filename in csv_files:
    cleaned = process_csv(filename)
    print(filename, cleaned.shape)
    all_df.append(cleaned)
all_df = pd.concat(all_df)
all_df.shape
all_df.to_csv('clean-csv/all.csv', columns=['url', 'id'], header=False, index=False)
In [6]:
# Reload the consolidated manifest; force str dtype so ids keep any
# leading zeros / non-numeric formatting.
csv = pd.read_csv('clean-csv/all.csv', names=['T1url', 'id'], dtype=str)
csv.head()
Out[6]:
In [7]:
# parse() templates for every known T1 location layout on the bucket;
# {session_id} is captured only where the path encodes it.
patterns = ['data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/mprage.nii.gz',
'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{site_suffix}_{subj_id}/mri/T1.mgz',
'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{subj_id}/mri/T1.mgz',
'data/Projects/{study}/surfaces/freesurfer/5.3/{subj_id}/mri/T1.mgz',
'data/Projects/{study}/Outputs/IBA_TRT/freesurfer_gpu/{subj_id}-session_{session_id}/mri/T1.mgz',
'data/Projects/{study}/Outputs/IBA_TRT/freesurfer/{subj_id}-session_{session_id}/mri/T1.mgz',
'data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/anat.nii.gz',
'data/Projects/{study:w}/sub-{subj_id}/ses-{session_id}/anat/sub-{subj_id}_ses-{session_id}_T1w.nii.gz',
'data/Projects/{study:w}/RawData/{subj_id}/{session_id}/{deface_id}/{deface_id2}_defaced.nii.gz',
'data/Projects/INDI/{study}/RawData/sub-{subj_id}/anat/sub-{subj_id}_T1w.nii.gz',
]
In [8]:
# Template capturing the project name segment of an fcp-indi S3 URL.
project_id = "https://s3.amazonaws.com/fcp-indi/data/Projects/{project_id}/{frag}"

def get_project(string):
    """Return the lower-cased project name parsed from an fcp-indi URL.

    OSF-hosted URLs containing '5h7sv' map to 'ixi'; anything else
    unrecognized becomes NaN.
    """
    parsed = parse.parse(project_id, string)
    if parsed:
        return parsed.named.get('project_id').lower()
    if 'osf' in string and '5h7sv' in string:
        return 'ixi'
    # pd.np was removed in pandas 2.0 — use a plain float NaN.
    return float('nan')

csv['project'] = csv.T1url.apply(get_project)
csv.head()
Out[8]:
In [9]:
url_patterns = ["https://s3.amazonaws.com/fcp-indi/" + pattern for pattern in patterns]
In [10]:
def get_session(url):
    """Session id extracted from `url` via the known url_patterns.

    Falls back to '1' when no pattern matches or no session is captured,
    and normalizes the literal '01' to '1'.

    NOTE(review): reconstructed from an indentation-stripped export —
    confirm the '1' fallback sits after the pattern loop as written here.
    """
    session = None
    for pattern in url_patterns:
        match = parse.parse(pattern, url)
        if match:
            session = match.named.get('session_id')
    if not session:
        return '1'
    return '1' if session == '01' else session
In [11]:
# Map every T1 URL to a session id and inspect the distinct values found.
sessions = csv.T1url.apply(get_session)
sessions.unique()
Out[11]:
In [12]:
csv['session_id'] = sessions
In [13]:
ids = csv.groupby(by=['project', 'id'])
In [14]:
# Number of session rows per (project, id) pair.
count = ids.session_id.count()
# A bare pd.Series() now defaults to object dtype (after years of a
# FutureWarning); pin float64, which is what the element-wise .loc
# insertions of integer counts (and possible missing values) produce.
count_series = pd.Series(dtype='float64')
In [15]:
# Broadcast each (project, id) session count onto every manifest row.
# NOTE(review): growing a Series element-by-element via .loc is quadratic;
# groupby(...).session_id.transform('count') would do this in one pass —
# confirm identical missing-key/NaN handling before switching.
for idx, row in csv.iterrows():
    count_series.loc[idx] = count.get((row.project, row.id))
In [16]:
csv['session_count'] = count_series
In [17]:
csv.columns = ['t1_url', 'participant_id', 'project', 'session_id', 'session_count']
In [18]:
# Joining GSP w/pheno...
def update_participant_id(x):
    """Zero-pad a numeric GSP subject id: '12' -> 'Sub0012_Ses1'.

    Ids longer than 4 characters (or empty) yield None — same as the
    original if/elif ladder, whose final `else` branch was a dead bare
    `pid` expression statement.
    """
    if 1 <= len(x) <= 4:
        return 'Sub{}_Ses1'.format(x.zfill(4))
    return None
# Work on an explicit copy: assigning into a boolean-mask slice is chained
# assignment on a view (SettingWithCopyWarning; a silent no-op under
# pandas copy-on-write). The .loc writeback below propagates the edits.
gsp = csv[csv.project == 'braingenomicssuperstructproject'].copy()
# rename genomesuperstruct to be the acronym
gsp['project'] = 'gsp'
gsp['participant_id'] = gsp.participant_id.apply(update_participant_id)
csv.loc[gsp.index] = gsp
In [19]:
# Joining indi w/pheno...
def update_participant_id(x):
    """Format an INDI subject id as 'sub-' plus a 7-digit zero-padded int."""
    return 'sub-' + str(int(x)).zfill(7)
# Work on an explicit copy so the participant_id rewrite is not a chained
# assignment on a view (SettingWithCopyWarning / silent no-op under
# copy-on-write); the .loc writeback propagates the edits.
indi = csv[csv.project == 'indi'].copy()
indi['participant_id'] = indi.participant_id.apply(update_participant_id)
csv.loc[indi.index] = indi
In [20]:
# Joining ixi w/pheno...
def update_participant_id(x):
    """Strip the 'IXI' prefix and any leading zeros: 'IXI012' -> '12'."""
    return '{:d}'.format(int(x.replace('IXI', '')))
# Work on an explicit copy so the participant_id rewrite is not a chained
# assignment on a view (SettingWithCopyWarning / silent no-op under
# copy-on-write); the .loc writeback propagates the edits.
ixi = csv[csv.project == 'ixi'].copy()
ixi['participant_id'] = ixi.participant_id.apply(update_participant_id)
csv.loc[ixi.index] = ixi
In [21]:
csv.to_csv('clean-csv/all-session.csv', index=None)
In [ ]: