Transform Phenotypic Data into a Common Format

This script reads the extracted data in their original formats and applies a mapping to harmonize them into a common tabular format.
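Each project directory is expected to carry a mapping.json that describes how its raw column names and coded values translate into the common elements. A hypothetical sketch of such a file (the raw column names here are illustrative, not taken from any particular project):

    {
        "SUB_ID": {"element": "participant_id", "type": "identifier"},
        "DX_GROUP": {"element": "diagnosis", "type": "str", "1": "autism", "2": "control"},
        "AGE_AT_SCAN": {"element": "age", "type": "float", "-999": "NaN"},
        "PROJECT": {"element": "project", "type": "static", "value": "abide_initiative"}
    }

Keys other than "element" and "type" enumerate raw values to recode: for "str" columns they map codes to labels, and for "float" columns they flag sentinel values to convert to NaN.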


In [1]:
import os
import json
import glob

import numpy as np
import pandas as pd

In [2]:
# Paths to the repositories that hold the extracted raw data.
curdir = os.path.abspath(os.path.curdir)
repos = ['fcp-indi', 'brain-development', 'dataverse']
extract_paths = [os.path.join(curdir, repo) for repo in repos]
# Path to where the transformed projects will go.
xfm_dir = os.path.join(curdir, 'metadata')
if not os.path.exists(xfm_dir):
    os.mkdir(xfm_dir)
# Project dirs.
extract_dirs = list()
for extract_path in extract_paths:
    project_paths = [os.path.join(extract_path, project) for project in os.listdir(extract_path) if '.' not in project]
    extract_dirs.extend(project_paths)

In [3]:
def apply_mapping(mapping, df_list):
    """
    Reads a mapping dictionary and a list of dataframes, merges the
    dataframes, and converts the column names and values into a common
    format. Note: the dataframes should share a common structure.

    Example Mapping Structure
    =========================
    {
        "DX_GROUP": {
            "element": "diagnosis",
            "type": "str",
            "1": "autism",
            "2": "control"
        }
    }
    """
    results = list()
    csv = pd.concat(df_list)
    csv.reset_index(drop=True, inplace=True)
    for col, elem in mapping.items():
        # Use get in case the col is missing/static.
        series = csv.get(col)
        # Categories are mapped to common values.
        if elem.get('type') == 'str':
            result = series.apply(lambda x: elem.get(str(x)))
        # Values parsed as numbers are checked for any mappings (e.g., -999 == NaN).
        elif elem.get('type') == 'float':
            result = series.apply(lambda x: np.nan if str(x) in elem.keys() else x)
        # Used to create a column of all the same value.
        elif elem.get('type') == 'static':
            val = [elem.get('value')] * csv.shape[0]
            result = pd.Series(val, index=csv.index)
        else:
            try:
                # Handle IDs being read as float.
                result = series.apply(lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError as e:
                result = series
        # Store the result under its common element name.
        df = pd.DataFrame()
        df[elem.get('element')] = result
        results.append(df)
    # Combine the per-element columns into one dataframe.
    concat = pd.concat(results, axis=1)
    # Merge any columns with duplicate names to fill in NaNs from the concat step.
    return concat.groupby(level=0, axis=1).first()
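As a quick sanity check, apply_mapping can be exercised on a toy mapping and dataframe (a minimal sketch; the raw column names below are made up):

    toy_mapping = {
        'DX_GROUP': {'element': 'diagnosis', 'type': 'str', '1': 'autism', '2': 'control'},
        'AGE': {'element': 'age', 'type': 'float', '-999': 'NaN'},
        'PROJ': {'element': 'project', 'type': 'static', 'value': 'toy_project'},
    }
    toy_df = pd.DataFrame({'DX_GROUP': [1, 2], 'AGE': [25, -999]})
    apply_mapping(toy_mapping, [toy_df])
    # diagnosis -> ['autism', 'control']; age -> [25.0, NaN]; project -> 'toy_project'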

In [4]:
# Read a mapping.json file from each sub directory to process.
for extract_dir in extract_dirs:
    df_list = list()
    mapping = dict()
    project_name = os.path.basename(extract_dir)
    # All files must use the same data dictionary in a given directory.
    extract_files = [os.path.join(extract_dir, i) for i in os.listdir(extract_dir) if i not in ['mapping.json']]
    mapping_file = os.path.join(extract_dir, 'mapping.json')
    if os.path.exists(mapping_file):
        with open(mapping_file, 'rt') as fi:
            mapping.update(json.load(fi))
        ext_type = dict(csv=',', tsv='\t')
        # Grab dtype for parsing csv
        dtype = {k: v.get('type') for k, v in mapping.items() if v.get('type') in ['str', 'int', 'float']}
        dtype.update({k: 'str' for k, v in mapping.items() if v.get('type') in ['static', 'identifier']})
        # Dealing with NaNs.
        na_values = ['NoPhenotypicData', '#']
        for extract_file in extract_files:
            # Process each file.
            ext = extract_file.split('.')[-1]
            sep_type = ext_type.get(ext)
            df = pd.read_csv(extract_file, sep=sep_type, dtype=dtype, na_values=na_values)
            # Some files may need encoding="ISO-8859-1".
            df_list.append(df.drop_duplicates())
        xfm = apply_mapping(mapping, df_list)
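        # Sessions without an explicit ID default to session 1.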
        xfm.loc[pd.isnull(xfm.session_id), 'session_id'] = 1
        pheno_file = ''.join([project_name, '_', 'phenotype.csv'])
        pheno_path = os.path.join(xfm_dir, pheno_file)
        xfm.to_csv(pheno_path, index=False)
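The dtype and na_values arguments matter here: identifier columns read as floats would mangle IDs (e.g., 50002 becomes 50002.0), and sentinel strings such as '#' need to become NaN. A minimal illustration of the same read_csv call on an in-memory file (hypothetical data):

    import io
    sample = 'SUB_ID,AGE\n50002,16.77\n50003,#\n'
    pd.read_csv(io.StringIO(sample), dtype={'SUB_ID': 'str'}, na_values=['#'])
    # SUB_ID stays the string '50002'; the '#' in AGE parses as NaN.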

In [5]:
# Merge all the transformed files into a single dataframe
meta_files = glob.glob(xfm_dir + '/*.csv')
meta_list = list()
for meta_file in meta_files:
    df = pd.read_csv(meta_file, dtype={'session_id': 'str',
                                       'participant_id': 'str'})
    meta_list.append(df.drop_duplicates())
meta_df = pd.concat(meta_list).set_index(['project', 'session_id', 'participant_id'])
meta = meta_df.copy()
meta.head()


Out[5]:
project           session_id  participant_id    age  diagnosis  dsm_iv_tr  education  ethnicity  full iq  handedness    marital_status  occupation  performance iq  sex     site_id  species       verbal iq
abide_initiative  1           50002           16.77  autism     autism     NaN        NaN        103.0    ambidextrous  NaN             NaN         89.0            Male    PITT     homo-sapiens  116.0
abide_initiative  1           50003           24.45  autism     autism     NaN        NaN        124.0    right handed  NaN             NaN         115.0           Male    PITT     homo-sapiens  128.0
abide_initiative  1           50004           19.09  autism     autism     NaN        NaN        113.0    right handed  NaN             NaN         117.0           Male    PITT     homo-sapiens  108.0
abide_initiative  1           50005           13.73  autism     autism     NaN        NaN        119.0    right handed  NaN             NaN         118.0           Female  PITT     homo-sapiens  117.0
abide_initiative  1           50006           13.37  autism     autism     NaN        NaN        109.0    left handed   NaN             NaN         119.0           Male    PITT     homo-sapiens  99.0

In [6]:
# Load the CSV with MRI URLs
mri_path = os.path.abspath('clean-csv/all-session.csv')
mri = pd.read_csv(mri_path).set_index(['project', 'session_id', 'participant_id'])
mri.head()


Out[6]:
project           session_id  participant_id  t1_url                                              session_count
abide_initiative  1           50002           https://s3.amazonaws.com/fcp-indi/data/Project...  1
abide_initiative  1           50003           https://s3.amazonaws.com/fcp-indi/data/Project...  1
abide_initiative  1           50004           https://s3.amazonaws.com/fcp-indi/data/Project...  1
abide_initiative  1           50005           https://s3.amazonaws.com/fcp-indi/data/Project...  1
abide_initiative  1           50006           https://s3.amazonaws.com/fcp-indi/data/Project...  1

In [7]:
# Separate duplicated-index records from unique ones, then recombine with the unique records first (NKI-RS has duplicate records that need cleaning)
dups = meta.loc[meta.index.duplicated()]
only_unique = meta.loc[~meta.index.duplicated()]
unique = pd.concat([only_unique, dups])
unique.shape


Out[7]:
(9859, 14)
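Index.duplicated marks every occurrence after the first as a duplicate, so only_unique keeps one record per index and dups holds the repeats; concatenating them back preserves all 9859 rows with the unique records first. A toy illustration (made-up index):

    toy = pd.DataFrame({'x': [1, 2, 3]}, index=['a', 'a', 'b'])
    toy.index.duplicated()
    # array([False,  True, False]) -- only the second 'a' is flagged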

In [8]:
# Merge Datasets
joined = unique.join(mri, how='outer', sort=True)

In [9]:
joined['MRI'] = joined.t1_url.apply(lambda x: 'no' if pd.isnull(x) else 'yes')
print(joined.shape)
joined.site_id = joined.site_id.apply(lambda x: x.lower() if isinstance(x, str) else np.nan)
final = joined.reset_index()


(11010, 17)
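The outer join keeps phenotype-only and MRI-only records alike, which is why the row count grows from 9859 to 11010; the new MRI column flags which records actually have a T1 URL. A quick way to inspect the split:

    joined.MRI.value_counts()
    # Counts of records with ('yes') and without ('no') an MRI.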

In [10]:
final.columns = ['project',
                 'session_id',
                 'participant_id',
                 'age',
                 'diagnosis',
                 'dsm_iv_tr',
                 'education',
                 'ethnicity',
                 'full_iq',
                 'handedness',
                 'marital_status',
                 'occupation',
                 'performance_iq',
                 'sex',
                 'site_id',
                 'species',
                 'verbal_iq',
                 'MRIs',
                 'session_count',
                 'MRI']
final.to_csv('../docs/data/phenotype_mri.csv', index=False)

In [11]:
for project in final.project.unique():
    filename = 'brainbox-csv/{}.csv'.format(project)
    part = final.loc[final.project == project]
    part[['MRIs', 'participant_id']].dropna().drop_duplicates().to_csv(filename, index=False, header=False)

In [12]:
final[['MRIs', 'participant_id', 'project']].dropna().drop_duplicates().to_csv('brainbox-csv/all-mris.csv', index=False, header=False)
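A quick way to confirm the combined export lines up with the per-project files (a sketch; paths assume the directories created above):

    check = pd.read_csv('brainbox-csv/all-mris.csv', header=None,
                        names=['MRIs', 'participant_id', 'project'])
    check.project.nunique()  # should match final.project.nunique() for projects with MRIs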
