In [1]:
import os
import json
import glob
import numpy as np
import pandas as pd
In [2]:
# Path to repositories to pull raw data out of.
curdir = os.path.abspath(os.path.curdir)
repos = ['fcp-indi', 'brain-development', 'dataverse']
extract_paths = [os.path.join(curdir, repo) for repo in repos]
# Path to where the transformed projects will go.
xfm_dir = os.path.join(curdir, 'metadata')
if not os.path.exists(xfm_dir):
    os.mkdir(xfm_dir)
# Project dirs (skip anything with a dot, e.g. loose files or hidden entries).
extract_dirs = list()
for extract_path in extract_paths:
    project_paths = [os.path.join(extract_path, project) for project in os.listdir(extract_path) if '.' not in project]
    extract_dirs.extend(project_paths)
In [3]:
def apply_mapping(mapping, df_list):
    """
    Reads a dictionary mapping and a list of dataframes, then merges the
    dataframes and converts the column names and values into a common
    format. Note: the dataframes should share a common structure.

    Example Mapping Structure
    =========================
    {
        "DX_GROUP":
        {
            "element": "diagnosis",
            "type": "str",
            "1": "autism",
            "2": "control"
        }
    }
    """
    results = list()
    # Concat all csv files from an extract dir into one dataframe.
    csv = pd.concat(df_list)
    csv.reset_index(drop=True, inplace=True)
    for col, elem in mapping.items():
        # Use get in case the col is missing/static.
        series = csv.get(col)
        # Categories are mapped to common values.
        if elem.get('type') == 'str':
            result = series.apply(lambda x: elem.get(str(x)))
        # Values parsed as numbers are checked for any mappings (e.g., -999 == NaN).
        elif elem.get('type') == 'float':
            result = series.apply(lambda x: np.nan if str(x) in elem.keys() else x)
        # Used to create a column of all the same value.
        elif elem.get('type') == 'static':
            val = [elem.get('value')] * csv.shape[0]
            result = pd.Series(val, index=csv.index)
        else:
            try:
                # Handle IDs being read as float.
                result = series.apply(lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError:
                result = series
        # Wrap the mapped column in a single-column dataframe named by its common element.
        df = pd.DataFrame()
        df[elem.get('element')] = result
        results.append(df)
    concat = pd.concat(results, axis=1)
    # Merge any columns with duplicate names to fill in NaNs from the concat step.
    return concat.groupby(level=0, axis=1).first()
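As a quick sanity check, the mapping format above can be exercised on a small synthetic frame; the column names and values here are invented for illustration, not taken from the real data dictionaries.
# Hypothetical example: map a raw "DX_GROUP"/"SUB_ID" table onto the common elements.
example_mapping = {
    "DX_GROUP": {"element": "diagnosis", "type": "str", "1": "autism", "2": "control"},
    "SUB_ID": {"element": "participant_id", "type": "identifier"},
    "SITE": {"element": "site_id", "type": "static", "value": "example_site"},
}
example_df = pd.DataFrame({"DX_GROUP": ["1", "2"], "SUB_ID": [50001.0, 50002.0]})
apply_mapping(example_mapping, [example_df])
# Expected: columns diagnosis / participant_id / site_id with values
# ['autism', 'control'], ['50001', '50002'], ['example_site', 'example_site'].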
In [4]:
# Read a mapping.json file from each sub directory to process.
for extract_dir in extract_dirs:
    df_list = list()
    mapping = dict()
    project_name = extract_dir.split('/')[-1]
    # All files must use the same data dictionary in a given directory.
    extract_files = [os.path.join(extract_dir, i) for i in os.listdir(extract_dir) if i not in ['mapping.json']]
    mapping_file = os.path.join(extract_dir, 'mapping.json')
    if os.path.exists(mapping_file):
        with open(mapping_file, 'rt') as fi:
            mapping.update(json.load(fi))
    ext_type = dict(csv=',', tsv='\t')
    # Grab dtypes for parsing the csv/tsv files.
    dtype = {k: v.get('type') for k, v in mapping.items() if v.get('type') in ['str', 'int', 'float']}
    dtype.update({k: 'str' for k, v in mapping.items() if v.get('type') in ['static', 'identifier']})
    # Dealing with NaNs.
    na_values = ['NoPhenotypicData', '#']
    for extract_file in extract_files:
        # Process each file.
        ext = extract_file.split('.')[-1]
        sep_type = ext_type.get(ext)
        # Some source files may need encoding="ISO-8859-1".
        df = pd.read_csv(extract_file, sep=sep_type, dtype=dtype, na_values=na_values)
        df_list.append(df.drop_duplicates())
    xfm = apply_mapping(mapping, df_list)
    # Default missing sessions to 1.
    xfm.loc[pd.isnull(xfm.session_id), 'session_id'] = 1
    pheno_file = ''.join([project_name, '_', 'phenotype.csv'])
    pheno_path = os.path.join(xfm_dir, pheno_file)
    xfm.to_csv(pheno_path, index=False)
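For reference, each project directory is expected to carry its own mapping.json alongside the raw phenotype files. A minimal one for a hypothetical new project could be written like this (all column names, values, and the project path are illustrative, not taken from the real dictionaries):
# Hypothetical mapping.json for a new project directory.
new_mapping = {
    "SUB_ID": {"element": "participant_id", "type": "identifier"},
    "AGE_AT_SCAN": {"element": "age", "type": "float", "-999": "NaN"},
    "SEX": {"element": "sex", "type": "str", "1": "male", "2": "female"},
    "PROJECT": {"element": "project", "type": "static", "value": "new_project"},
}
with open(os.path.join(curdir, 'fcp-indi', 'new_project', 'mapping.json'), 'wt') as fo:
    json.dump(new_mapping, fo, indent=2)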
In [5]:
# Merge all the transformed files into a single frame
meta_files = glob.glob(xfm_dir + '/*.csv')
meta_list = list()
for meta_file in meta_files:
    df = pd.read_csv(meta_file, dtype={'session_id': 'str',
                                       'participant_id': 'str'})
    meta_list.append(df.drop_duplicates())
meta_df = pd.concat(meta_list).set_index(['project', 'session_id', 'participant_id'])
meta = meta_df.copy()
meta.head()
Out[5]:
In [6]:
# Load the CSV with MRI URLs
mri_path = os.path.abspath('clean-csv/all-session.csv')
mri = pd.read_csv(mri_path).set_index(['project', 'session_id', 'participant_id'])
mri.head()
Out[6]:
In [7]:
# Put records with unique indices first (NKI-RS has duplicate records and still needs to be cleaned).
dups = meta.loc[meta.index.duplicated()]
only_unique = meta.loc[~meta.index.duplicated()]
unique = pd.concat([only_unique, dups])
unique.shape
Out[7]:
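If the duplicated NKI-RS indices should eventually be collapsed rather than just reordered, one possible approach (a sketch, not what the pipeline currently does) is to group on the index and keep the first non-null value per column:
# Sketch: collapse rows sharing (project, session_id, participant_id),
# preferring the first non-null value in each column.
deduped = meta.groupby(level=['project', 'session_id', 'participant_id']).first()
deduped.shape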
In [8]:
# Merge Datasets
joined = unique.join(mri, how='outer', sort=True)
In [9]:
joined['MRI'] = joined.t1_url.apply(lambda x: 'no' if pd.isnull(x) else 'yes')
print(joined.shape)
joined.site_id = joined.site_id.apply(lambda x: x.lower() if isinstance(x, str) else np.nan)
final = joined.reset_index()
In [10]:
final.columns = ['project',
'session_id',
'participant_id',
'age',
'diagnosis',
'dsm_iv_tr',
'education',
'ethnicity',
'full_iq',
'handedness',
'marital_status',
'occupation',
'performance_iq',
'sex',
'site_id',
'species',
'verbal_iq',
'MRIs',
'session_count',
'MRI']
final.to_csv('../docs/data/phenotype_mri.csv', index=False)
In [11]:
for project in final.project.unique():
    filename = 'brainbox-csv/{}.csv'.format(project)
    part = final.loc[final.project == project]
    part[['MRIs', 'participant_id']].dropna().drop_duplicates().to_csv(filename, index=False, header=False)
In [12]:
final[['MRIs', 'participant_id', 'project']].dropna().drop_duplicates().to_csv('brainbox-csv/all-mris.csv', index=False, header=False)