In [1]:
from glob import glob
import os

import numpy as np
import pandas as pd
import parse
import requests

In [2]:
ls


Load.ipynb               example_repository/      metadata/
brain-development/       fcp-indi/                process-clean-csv.ipynb
brainbox-csv/            fcp-indi.gz              process-csv.ipynb
clean-csv/               fcp-info.ipynb           transform.ipynb
dataverse/               ixi-crawl.ipynb

In [3]:
# Collect absolute paths of every per-study cleaned CSV; sorted for a stable,
# reproducible processing order.
csv_files = [os.path.abspath(path) for path in sorted(glob('clean-csv/*-clean.csv'))]
csv_files


Out[3]:
['/software/data/brainbox/metasearch/crawler/clean-csv/ABIDE_Initiative-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/ACPI-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/ADHD200-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/BrainGenomicsSuperstructProject-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/CORR-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/HypnosisBarrios-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/IXI-clean.csv',
 '/software/data/brainbox/metasearch/crawler/clean-csv/RocklandSample-clean.csv']

In [4]:
def process_csv(filename):
    """Normalize one crawled CSV of (url, id) rows into a (url, annot, id) frame.

    For each participant id: if the group contains FreeSurfer ``T1.mgz`` urls,
    keep only those and derive the matching ``aseg.mgz`` annotation url;
    otherwise keep the raw scan urls with no annotation, rewriting GSP paths
    to the ``orig_bids/`` layout.

    Parameters
    ----------
    filename : str
        Path to a headerless two-column CSV (url, id).

    Returns
    -------
    pd.DataFrame
        Columns ``url``, ``annot``, ``id``; ``annot`` is NaN when no
        FreeSurfer output exists for the participant.
    """
    df = pd.read_csv(filename, header=None, names=['url', 'id'])
    df['id'] = df.id.apply(str)  # force string ids for stable grouping/joins
    print(filename, df.shape)
    info = []
    for subj_id, grp in df.groupby('id'):
        t1_urls = [url for url in grp.url.values if 'T1.mgz' in url]
        if t1_urls:
            for t1_url in t1_urls:
                # The aseg segmentation sits next to T1.mgz in FreeSurfer output.
                info.append(dict(url=t1_url,
                                 annot=t1_url.replace('T1.mgz', 'aseg.mgz'),
                                 id=subj_id))
        else:
            for url in grp.url.values:
                if 'BrainGenomicsSuperstructProject/sub' in url:
                    # GSP raw scans were moved under the orig_bids/ prefix.
                    url = url.replace('BrainGenomicsSuperstructProject/sub',
                                      'BrainGenomicsSuperstructProject/orig_bids/sub')
                # np.nan replaces the removed pd.np alias (dropped in pandas 1.0)
                info.append(dict(url=url, annot=np.nan, id=subj_id))
    return pd.DataFrame(info)

In [5]:
# Re-process every per-study CSV and merge into a single manifest.
all_df = []
for filename in csv_files:
    new_df = process_csv(filename)
    print(filename, new_df.shape)
    all_df.append(new_df)
all_df = pd.concat(all_df)
# Persist the combined url/id manifest, headerless to match the per-study files.
all_df.to_csv('clean-csv/all.csv', columns=['url', 'id'], header=False, index=False)


/software/data/brainbox/metasearch/crawler/clean-csv/ABIDE_Initiative-clean.csv (1112, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/ABIDE_Initiative-clean.csv (1112, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/ACPI-clean.csv (129, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/ACPI-clean.csv (129, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/ADHD200-clean.csv (973, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/ADHD200-clean.csv (973, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/BrainGenomicsSuperstructProject-clean.csv (1570, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/BrainGenomicsSuperstructProject-clean.csv (1570, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/CORR-clean.csv (3153, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/CORR-clean.csv (3153, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/HypnosisBarrios-clean.csv (10, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/HypnosisBarrios-clean.csv (10, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/IXI-clean.csv (581, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/IXI-clean.csv (581, 3)
/software/data/brainbox/metasearch/crawler/clean-csv/RocklandSample-clean.csv (532, 2)
/software/data/brainbox/metasearch/crawler/clean-csv/RocklandSample-clean.csv (532, 3)

In [6]:
# Reload the merged manifest; force string dtype so ids keep leading zeros.
csv = pd.read_csv('clean-csv/all.csv', names=['T1url', 'id'], dtype=str)
csv.head()


Out[6]:
T1url id
0 https://s3.amazonaws.com/fcp-indi/data/Project... 50002
1 https://s3.amazonaws.com/fcp-indi/data/Project... 50003
2 https://s3.amazonaws.com/fcp-indi/data/Project... 50004
3 https://s3.amazonaws.com/fcp-indi/data/Project... 50005
4 https://s3.amazonaws.com/fcp-indi/data/Project... 50006

In [7]:
# URL path templates (parse-library format) covering the known layouts of
# scans in the fcp-indi S3 bucket; used below to extract session ids.
# NOTE(review): order matters — get_session keeps the LAST matching pattern.
patterns = ['data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/mprage.nii.gz',
            'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{site_suffix}_{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/freesurfer/5.1/{site}_{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/surfaces/freesurfer/5.3/{subj_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/IBA_TRT/freesurfer_gpu/{subj_id}-session_{session_id}/mri/T1.mgz',
            'data/Projects/{study}/Outputs/IBA_TRT/freesurfer/{subj_id}-session_{session_id}/mri/T1.mgz',
            'data/Projects/{study}/RawData/{site}/{subj_id}/session_{session_id}/anat_{anat_id}/anat.nii.gz',
            'data/Projects/{study:w}/sub-{subj_id}/ses-{session_id}/anat/sub-{subj_id}_ses-{session_id}_T1w.nii.gz',
            'data/Projects/{study:w}/RawData/{subj_id}/{session_id}/{deface_id}/{deface_id2}_defaced.nii.gz',
            'data/Projects/INDI/{study}/RawData/sub-{subj_id}/anat/sub-{subj_id}_T1w.nii.gz',
           ]

In [8]:
# Template matching any fcp-indi object URL; {project_id} captures the study.
project_id = "https://s3.amazonaws.com/fcp-indi/data/Projects/{project_id}/{frag}"


def get_project(string):
    """Return the lower-cased project name for a data URL, or NaN if unknown.

    IXI files are hosted on OSF (project 5h7sv) rather than on fcp-indi,
    so they are special-cased.
    """
    parsed = parse.parse(project_id, string)
    if parsed:
        return parsed.named.get('project_id').lower()
    if 'osf' in string and '5h7sv' in string:
        return 'ixi'
    # np.nan replaces the removed pd.np alias (dropped in pandas 1.0)
    return np.nan


csv['project'] = csv.T1url.apply(get_project)
csv.head()


Out[8]:
T1url id project
0 https://s3.amazonaws.com/fcp-indi/data/Project... 50002 abide_initiative
1 https://s3.amazonaws.com/fcp-indi/data/Project... 50003 abide_initiative
2 https://s3.amazonaws.com/fcp-indi/data/Project... 50004 abide_initiative
3 https://s3.amazonaws.com/fcp-indi/data/Project... 50005 abide_initiative
4 https://s3.amazonaws.com/fcp-indi/data/Project... 50006 abide_initiative

In [9]:
url_patterns = ["https://s3.amazonaws.com/fcp-indi/" + pattern for pattern in patterns]

In [10]:
def get_session(url):
    """Extract a session id from a scan URL via the template list.

    Every template is tried in order and the LAST match wins.  URLs with no
    session component default to '1', and the zero-padded '01' is normalized
    to '1' so both spellings collapse to one session label.
    """
    session = None
    for template in url_patterns:
        match = parse.parse(template, url)
        if match:
            session = match.named.get('session_id')
    if not session or session == '01':
        return '1'
    return session

In [11]:
# Derive a session id per scan; inspect distinct values as a sanity check.
sessions = csv.T1url.apply(get_session)
sessions.unique()


Out[11]:
array(['1', '2', '3', '10', '4', '5', '6', '7', '8', '9', 'clg_4',
       'clg_4R', 'dsc_2', 'nfb_3', 'nfb_2', 'clg_2R', 'clg_2'], dtype=object)

In [12]:
csv['session_id'] = sessions

In [13]:
ids = csv.groupby(by=['project', 'id'])

In [14]:
# Number of session rows observed per (project, participant) group.
count = ids.session_id.count()
# Empty Series to be filled with one per-row count below.
count_series = pd.Series()

In [15]:
# Look up the (project, id) session count for every manifest row at once;
# rows whose key is absent from `count` map to None.  Vectorized replacement
# for a row-by-row `.loc` append loop, which grows the Series quadratically.
count_series = pd.Series(
    [count.get(key) for key in zip(csv.project, csv.id)],
    index=csv.index,
)

In [16]:
csv['session_count'] = count_series

In [17]:
csv.columns = ['t1_url', 'participant_id', 'project', 'session_id', 'session_count']

In [18]:
# Joining GSP w/pheno...
def update_participant_id(x):
    pid = None
    if len(x) == 1:
        pid = 'Sub000{}_Ses1'.format(x)
    elif len(x) == 2:
        pid = 'Sub00{}_Ses1'.format(x)
    elif len(x) == 3:
        pid = 'Sub0{}_Ses1'.format(x)
    elif len(x) == 4:
        pid = 'Sub{}_Ses1'.format(x)
    else:
        pid
    return pid

gsp = csv[csv.project == 'braingenomicssuperstructproject']
# rename genomesuperstruct to be accronym
gsp['project'] = 'gsp'
gsp['participant_id'] = gsp.participant_id.apply(update_participant_id)

csv.loc[gsp.index] = gsp


/software/miniconda3/envs/dev3pype/lib/python3.5/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/software/miniconda3/envs/dev3pype/lib/python3.5/site-packages/ipykernel/__main__.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [19]:
# Joining indi w/pheno...
def update_participant_id(x):
    return 'sub-{:07d}'.format(int(x))

indi = csv[csv.project == 'indi']
# rename genomesuperstruct to be accronym
indi['participant_id'] = indi.participant_id.apply(update_participant_id)

csv.loc[indi.index] = indi


/software/miniconda3/envs/dev3pype/lib/python3.5/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [20]:
# Joining ixi w/pheno...
def update_participant_id(x):
    pid = x.replace('IXI', '')
    return str(int(pid))

ixi = csv[csv.project == 'ixi']
ixi['participant_id'] = ixi.participant_id.apply(update_participant_id)

csv.loc[ixi.index] = ixi


/software/miniconda3/envs/dev3pype/lib/python3.5/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [21]:
csv.to_csv('clean-csv/all-session.csv', index=None)

In [ ]: