In [142]:
import os

import pandas as pd

# Show the notebook's working directory so the relative paths below make sense.
os.getcwd()


Out[142]:
'/Users/nicholsn/Repos/metasearch/crawler/load'

In [143]:
# Collect the paths of every per-project transformed CSV so they can be
# merged into a single phenotype table.
load_path = os.path.abspath(os.curdir)
xfm_path = os.path.abspath('../transform')
# Project directories have no dot in their name; skip stray files like .DS_Store.
project_names = [project for project in os.listdir(xfm_path) if '.' not in project]
csv_files = []
for project_name in project_names:
    xfm_dir = os.path.join(xfm_path, project_name)
    # Suffix match (not substring) so e.g. "foo.csv.bak" is excluded.
    csv_files.extend(os.path.join(xfm_dir, name)
                     for name in os.listdir(xfm_dir)
                     if name.endswith('.csv'))

In [144]:
# Column order for the final merged phenotype table.
columns = [
    'project',
    'site_id',
    'participant_id',
    'diagnosis',
    'sex',
    'age',
    'handedness',
    'full iq',
    'performance iq',
    'verbal iq',
]

In [145]:
# Concatenate every project's transformed CSV into one table, write it out
# with the agreed column order, and preview the result.
frames = [pd.read_csv(path) for path in csv_files]
df = pd.concat(frames)
df.to_csv(os.path.join(load_path, 'phenotype.csv'), columns=columns, index=False)
df.head()


Out[145]:
age diagnosis dsm_iv_tr full iq handedness participant_id performance iq project sex site_id verbal iq
0 16.77 autism autism 103.0 ambidextrous 50002 89.0 abide_initiative Male PITT 116.0
1 24.45 autism autism 124.0 right handed 50003 115.0 abide_initiative Male PITT 128.0
2 19.09 autism autism 113.0 right handed 50004 117.0 abide_initiative Male PITT 108.0
3 13.73 autism autism 119.0 right handed 50005 118.0 abide_initiative Female PITT 117.0
4 13.37 autism autism 109.0 left handed 50006 119.0 abide_initiative Male PITT 99.0

In [193]:
# Read the scraped T1 image-URL table and index it by subject id
# (drop=False keeps 'id' available as a regular column too).
img_path = os.path.abspath('../clean-csv/all.csv')
img = pd.read_csv(img_path).set_index('id', drop=False)
img.head()


Out[193]:
T1url id
id
0050002 https://s3.amazonaws.com/fcp-indi/data/Project... 0050002
0050003 https://s3.amazonaws.com/fcp-indi/data/Project... 0050003
0050004 https://s3.amazonaws.com/fcp-indi/data/Project... 0050004
0050005 https://s3.amazonaws.com/fcp-indi/data/Project... 0050005
0050006 https://s3.amazonaws.com/fcp-indi/data/Project... 0050006

In [199]:
# Count T1 URLs per subject; since img is indexed by 'id', the per-id counts
# align back onto every row when assigned as a new column.
counts = img.groupby(by='id').count()
img['t1_count'] = counts['T1url']
img.head()


Out[199]:
T1url id t1_count
id
0050002 https://s3.amazonaws.com/fcp-indi/data/Project... 0050002 1
0050003 https://s3.amazonaws.com/fcp-indi/data/Project... 0050003 1
0050004 https://s3.amazonaws.com/fcp-indi/data/Project... 0050004 1
0050005 https://s3.amazonaws.com/fcp-indi/data/Project... 0050005 1
0050006 https://s3.amazonaws.com/fcp-indi/data/Project... 0050006 1

In [206]:
for idx, df in img[img.t1_count > 1].iterrows():
    print idx, df.T1url, df.t1_count
    break


0021002 https://s3.amazonaws.com/fcp-indi/data/Projects/ADHD200/surfaces/freesurfer/5.3/0021002/mri/T1.mgz 2

In [209]:
for idx, df in img.loc['0021002'].iterrows():
    print idx, df.T1url, df.t1_count


0021002 https://s3.amazonaws.com/fcp-indi/data/Projects/ADHD200/surfaces/freesurfer/5.3/0021002/mri/T1.mgz 2
0021002 https://s3.amazonaws.com/fcp-indi/data/Projects/CORR/RawData/NKI_TRT/0021002/session_1/anat_1/anat.nii.gz 2

In [ ]: