Extract Metadata From AWS S3

This notebook downloads phenotypic data for a number of projects organized under the International Neuroimaging Data-sharing Initiative (INDI). The data are stored in the `fcp-indi` bucket on AWS S3, which is accessed anonymously.


In [53]:
import os
import boto
import gzip
import pandas as pd
# URL prefix for objects in the fcp-indi bucket.
bucket_url_prefix = 'https://s3.amazonaws.com/fcp-indi/'

# Anonymous S3 connection (anon=True), then a handle on the fcp-indi bucket
# that the cells below list and download from.
conn = boto.connect_s3(anon=True)
fcp = conn.get_bucket('fcp-indi')

In [19]:
# Get the keys for all files in the project directory from S3 and save them,
# one key per line, to a gzip-compressed text file.
# Listing with a prefix requests only the keys under data/Projects/ instead of
# walking the whole bucket and filtering by substring on the client; the
# original full-bucket scan took about 1.5 hours.
with gzip.open('fcp-indi-keys.csv.gz', 'wb') as fi:
    for i in fcp.list(prefix='data/Projects/'):
        # The archive is opened in binary mode, so encode text key names
        # explicitly rather than relying on an implicit ASCII conversion.
        line = i.name + '\n'
        if not isinstance(line, bytes):
            line = line.encode('utf-8')
        fi.write(line)


1 loop, best of 3: 1h 27min 6s per loop

In [24]:
# Load the saved key listing into a single-column frame named 'key';
# the file has no header row.
# NOTE(review): the file is parsed as CSV, so a key containing a comma would be
# split into extra fields -- confirm no such keys exist.
df = pd.read_csv(
    'fcp-indi-keys.csv.gz',
    header=None,
    names=['key'],
)
df.head()


Out[24]:
key
0 data/Projects/ABIDE_BIDS/CMU_a/sub-0050642/ana...
1 data/Projects/ABIDE_BIDS/CMU_a/sub-0050642/fun...
2 data/Projects/ABIDE_BIDS/CMU_a/sub-0050646/ana...
3 data/Projects/ABIDE_BIDS/CMU_a/sub-0050646/fun...
4 data/Projects/ABIDE_BIDS/CMU_a/sub-0050647/ana...

In [36]:
# Curated list of phenotypic data files on AWS, keyed by project name.
# Each value is a list of S3 key names; a lone None is a placeholder for a
# project with no file listed. Most files are csv, the INDI ones are tsv.
# Only takes a few seconds compared to full listing above w/mri and other data.
projects = {
    'ABIDE_Initiative': [
        'data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv',
    ],
    'ACPI': [
        'data/Projects/ACPI/PhenotypicData/acpi_aggregated_phenotypic_data.csv',
    ],
    'ADHD200': [
        'data/Projects/ADHD200/RawData/Brown_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/KKI_phenotypic.csv',
        'data/Projects/ADHD200/RawData/NYU_phenotypic.csv',
        'data/Projects/ADHD200/RawData/OHSU_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/OHSU_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Peking_1_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Peking_1_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Pittsburgh_phenotypic.csv',
    ],
    # no phenotypic data
    'CC_ME': [None],
    'CORR': [
        'data/Projects/CORR/RawData/CoRR_AggregatedPhenotypicData.csv',
    ],
    'CPAC_Regression_Test': [None],
    'HBNSS': [
        'data/Projects/HBNSS/PhenotypicData/cmi_hbnssi_pheno_data.csv',
    ],
    # phenotypic data is tsv
    'INDI': [
        'data/Projects/INDI/HypnosisBarrios/RawData/participants.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time1.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time2.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time3.tsv',
    ],
    'RocklandSample': [
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r4_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r6_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r7_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r8_phenotypic_v1.csv',
    ],
}

In [3]:
# Download every curated phenotypic file into a directory named after its
# project, relative to the current working directory.
# dict.items() replaces iteritems() so the cell runs on Python 2 and Python 3.
for k, v in projects.items():
    # Create the project directory under the same name the download path uses.
    # The earlier version tested for `k` but created `k.lower()`, which only
    # lines up on a case-insensitive filesystem.
    project_dir = os.path.abspath(k)
    if not os.path.isdir(project_dir):
        os.makedirs(project_dir)
    for key_name in v:
        # None is the placeholder for projects without a phenotypic file;
        # skip it instead of asking S3 for a key with no name.
        if key_name is None:
            continue
        key = fcp.get_key(key_name)
        # Nothing to download if the lookup came back empty.
        if not key:
            continue
        # Save under the file's base name (last path segment of the S3 key).
        fname = key.name.split('/')[-1]
        key.get_contents_to_filename(os.path.join(project_dir, fname))


Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r4_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r6_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r7_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r8_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/Brown_TestRelease_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/KKI_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/NYU_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/OHSU_TestRelease_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/OHSU_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/Peking_1_TestRelease_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/Peking_1_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/Pittsburgh_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/CORR/CoRR_AggregatedPhenotypicData.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/INDI/participants.tsv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/INDI/swu_slim_phenodata_time1.tsv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/INDI/swu_slim_phenodata_time2.tsv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/INDI/swu_slim_phenodata_time3.tsv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ACPI/acpi_aggregated_phenotypic_data.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/HBNSS/cmi_hbnssi_pheno_data.csv

In [55]:
# List the keys under the RocklandSample PhenotypicData prefix; the prefix
# limits the listing to that directory instead of the whole bucket.
list(fcp.list(prefix='data/Projects/RocklandSample/PhenotypicData/'))


Out[55]:
[<Key: fcp-indi,data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r4_phenotypic_v1.csv>,
 <Key: fcp-indi,data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r6_phenotypic_v1.csv>,
 <Key: fcp-indi,data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r7_phenotypic_v1.csv>,
 <Key: fcp-indi,data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r8_phenotypic_v1.csv>]

In [ ]: