In [53]:
import os
import boto
import gzip
import pandas as pd
conn = boto.connect_s3(anon=True)
fcp = conn.get_bucket('fcp-indi')
bucket_url_prefix = 'https://s3.amazonaws.com/fcp-indi/'
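As a quick sanity check on the anonymous connection, the top-level project "directories" can be listed with a delimiter instead of walking the whole bucket (a sketch using boto 2's standard prefix/delimiter listing; much faster than the full listing below):
In [ ]:
# List the common prefixes directly under data/Projects/ (one entry per project).
projects_found = [p.name for p in fcp.list(prefix='data/Projects/', delimiter='/')]
projects_found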
In [19]:
# Get the keys for all files under the Projects directory from S3.
# Takes about 1.5 hours to run...
with gzip.open('fcp-indi-keys.csv.gz', 'wb') as fi:
    for i in fcp.list():
        if 'Projects' in i.name:
            fi.write(i.name + '\n')
In [24]:
# Read in all the keys to process.
df = pd.read_csv('fcp-indi-keys.csv.gz', names=['key'], header=None)
df.head()
Out[24]:
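The key paths follow the pattern data/Projects/&lt;project&gt;/..., so a rough per-project file count can be pulled out of the listing (a sketch assuming that layout holds for every key):
In [ ]:
# Third path component is the project name; count keys per project.
df['project'] = df['key'].str.split('/').str[2]
df['project'].value_counts()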
In [36]:
# Curated list of CSV/TSV files containing phenotypic data on S3.
# Fetching just these takes a few seconds, compared to the full listing above,
# which also covers MRI and other data.
projects = {'ABIDE_Initiative': ['data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv'],
'ACPI': ['data/Projects/ACPI/PhenotypicData/acpi_aggregated_phenotypic_data.csv'],
'ADHD200': ['data/Projects/ADHD200/RawData/Brown_TestRelease_phenotypic.csv',
'data/Projects/ADHD200/RawData/KKI_phenotypic.csv',
'data/Projects/ADHD200/RawData/NYU_phenotypic.csv',
'data/Projects/ADHD200/RawData/OHSU_TestRelease_phenotypic.csv',
'data/Projects/ADHD200/RawData/OHSU_phenotypic.csv',
'data/Projects/ADHD200/RawData/Peking_1_TestRelease_phenotypic.csv',
'data/Projects/ADHD200/RawData/Peking_1_phenotypic.csv',
'data/Projects/ADHD200/RawData/Pittsburgh_phenotypic.csv'],
'CC_ME': [None], # no phenotypic data
'CORR': ['data/Projects/CORR/RawData/CoRR_AggregatedPhenotypicData.csv'],
'CPAC_Regression_Test': [None],
'HBNSS': ['data/Projects/HBNSS/PhenotypicData/cmi_hbnssi_pheno_data.csv'],
'INDI': ['data/Projects/INDI/HypnosisBarrios/RawData/participants.tsv',
'data/Projects/INDI/SLIM/swu_slim_phenodata_time1.tsv',
'data/Projects/INDI/SLIM/swu_slim_phenodata_time2.tsv',
'data/Projects/INDI/SLIM/swu_slim_phenodata_time3.tsv'], # phenotypic data is tsv
'RocklandSample': ['data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r4_phenotypic_v1.csv',
'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r6_phenotypic_v1.csv',
'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r7_phenotypic_v1.csv',
'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r8_phenotypic_v1.csv']}
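The list above was assembled by hand; a rough cross-check against the full key listing looks like this (a sketch; the name filter is an assumption about how these files are named and will miss files like participants.tsv):
In [ ]:
# Keys that mention phenotypic data and end in .csv or .tsv.
mask = (df['key'].str.contains('phenotypic', case=False) &
        (df['key'].str.endswith('.csv') | df['key'].str.endswith('.tsv')))
df.loc[mask, 'key'].head(20)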
In [3]:
# Download all the csv/tsv files.
for k, v in projects.iteritems():
    # Get the key objects, skipping projects with no phenotypic data.
    keys = [fcp.get_key(i) for i in v if i]
    # Create the project directory (lower-cased).
    proj_dir = k.lower()
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    # Download each key into the project directory.
    for key in keys:
        if key:
            fname = key.name.split('/')[-1]
            fpath = os.path.join(os.path.abspath(proj_dir), fname)
            key.get_contents_to_filename(fpath)
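Once downloaded, the files can be read back into pandas (a sketch assuming the lower-cased project directories created above; the TSV files need a tab separator):
In [ ]:
# Read each downloaded phenotypic file into a DataFrame, keyed by filename.
pheno = {}
for proj, paths in projects.items():
    for p in paths:
        if p is None:
            continue
        fname = p.split('/')[-1]
        sep = '\t' if fname.endswith('.tsv') else ','
        pheno[fname] = pd.read_csv(os.path.join(proj.lower(), fname), sep=sep)
sorted(pheno.keys())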
In [55]:
list(fcp.list(prefix='data/Projects/RocklandSample/PhenotypicData/'))
Out[55]:
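Since the bucket is public, individual files can also be read straight over HTTPS using the bucket_url_prefix defined at the top, without downloading first (a sketch; the key below is one of the curated Rockland paths above):
In [ ]:
# Read one phenotypic file directly from its public S3 URL.
rockland = pd.read_csv(bucket_url_prefix +
                       'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r4_phenotypic_v1.csv')
rockland.head()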