In [1]:
import os
import json
import glob
import numpy as np
import pandas as pd
In [2]:
# Path to repositories to pull raw data out of.
curdir = os.path.abspath(os.path.curdir)
repos = ['fcp-indi', 'brain-development', 'dataverse']
extract_paths = [os.path.join(curdir, repo) for repo in repos]
# Path to where the transformed projects will go.
xfm_dir = os.path.join(curdir, 'metadata')
if not os.path.exists(xfm_dir):
    os.mkdir(xfm_dir)
# Project dirs (skip anything with a dot, e.g. loose files or hidden entries).
extract_dirs = list()
for extract_path in extract_paths:
    project_paths = [os.path.join(extract_path, project) for project in os.listdir(extract_path) if '.' not in project]
    extract_dirs.extend(project_paths)
In [3]:
def apply_mapping(mapping, df_list):
    """
    Reads a dictionary mapping and a list of dataframes, then merges the
    dataframes and converts the column names and values into a common
    format. Note: the dataframes should share a common structure.

    Example Mapping Structure
    =========================
    {
        "DX_GROUP":
        {
            "element": "diagnosis",
            "type": "str",
            "1": "autism",
            "2": "control"
        }
    }
    """
    results = list()
    # Concat all csv files from an extract dir into one dataframe.
    csv = pd.concat(df_list)
    csv.reset_index(drop=True, inplace=True)
    for col, elem in mapping.items():
        # Use get in case the col is missing/static.
        series = csv.get(col)
        # Categories are mapped to common values.
        if elem.get('type') == 'str':
            result = series.apply(lambda x: elem.get(str(x)))
        # Values parsed as numbers are checked for any mappings (e.g., -999 == NaN).
        elif elem.get('type') == 'float':
            result = series.apply(lambda x: np.nan if str(x) in elem.keys() else x)
        # Used to create a column of all the same value.
        elif elem.get('type') == 'static':
            val = [elem.get('value')] * csv.shape[0]
            result = pd.Series(val, index=csv.index)
        else:
            try:
                # Handle IDs being read as float.
                result = series.apply(lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError:
                result = series
        # Wrap the mapped column in a single-column dataframe named by its common element.
        df = pd.DataFrame()
        df[elem.get('element')] = result
        results.append(df)
    concat = pd.concat(results, axis=1)
    # Merge any columns with duplicate names to fill in NaNs from the concat step.
    return concat.groupby(level=0, axis=1).first()
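As a quick sanity check, the mapping format above can be exercised on a small synthetic frame; the column names and values here are invented for illustration, not taken from the real data dictionaries.
# Hypothetical example: map a raw "DX_GROUP"/"SUB_ID" table onto the common elements.
example_mapping = {
    "DX_GROUP": {"element": "diagnosis", "type": "str", "1": "autism", "2": "control"},
    "SUB_ID": {"element": "participant_id", "type": "identifier"},
    "SITE": {"element": "site_id", "type": "static", "value": "example_site"},
}
example_df = pd.DataFrame({"DX_GROUP": ["1", "2"], "SUB_ID": [50001.0, 50002.0]})
apply_mapping(example_mapping, [example_df])
# Expected: columns diagnosis / participant_id / site_id with values
# ['autism', 'control'], ['50001', '50002'], ['example_site', 'example_site'].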
In [4]:
# Read a mapping.json file from each sub directory to process.
for extract_dir in extract_dirs:
    df_list = list()
    mapping = dict()
    project_name = extract_dir.split('/')[-1]
    # All files must use the same data dictionary in a given directory.
    extract_files = [os.path.join(extract_dir, i) for i in os.listdir(extract_dir) if i not in ['mapping.json']]
    mapping_file = os.path.join(extract_dir, 'mapping.json')
    if os.path.exists(mapping_file):
        with open(mapping_file, 'rt') as fi:
            mapping.update(json.load(fi))
    ext_type = dict(csv=',', tsv='\t')
    # Grab dtypes for parsing the csv/tsv files.
    dtype = {k: v.get('type') for k, v in mapping.items() if v.get('type') in ['str', 'int', 'float']}
    dtype.update({k: 'str' for k, v in mapping.items() if v.get('type') in ['static', 'identifier']})
    # Dealing with NaNs.
    na_values = ['NoPhenotypicData', '#']
    for extract_file in extract_files:
        # Process each file.
        ext = extract_file.split('.')[-1]
        sep_type = ext_type.get(ext)
        # Some source files may need encoding="ISO-8859-1".
        df = pd.read_csv(extract_file, sep=sep_type, dtype=dtype, na_values=na_values)
        df_list.append(df.drop_duplicates())
    xfm = apply_mapping(mapping, df_list)
    # Default missing sessions to 1.
    xfm.loc[pd.isnull(xfm.session_id), 'session_id'] = 1
    pheno_file = ''.join([project_name, '_', 'phenotype.csv'])
    pheno_path = os.path.join(xfm_dir, pheno_file)
    xfm.to_csv(pheno_path, index=False)
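For reference, each project directory is expected to carry its own mapping.json alongside the raw phenotype files. A minimal one for a hypothetical new project could be written like this (all column names, values, and the project path are illustrative, not taken from the real dictionaries):
# Hypothetical mapping.json for a new project directory.
new_mapping = {
    "SUB_ID": {"element": "participant_id", "type": "identifier"},
    "AGE_AT_SCAN": {"element": "age", "type": "float", "-999": "NaN"},
    "SEX": {"element": "sex", "type": "str", "1": "male", "2": "female"},
    "PROJECT": {"element": "project", "type": "static", "value": "new_project"},
}
with open(os.path.join(curdir, 'fcp-indi', 'new_project', 'mapping.json'), 'wt') as fo:
    json.dump(new_mapping, fo, indent=2)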
In [5]:
# Merge all the transformed files into a single frame
meta_files = glob.glob(xfm_dir + '/*.csv')
meta_list = list()
for meta_file in meta_files:
    df = pd.read_csv(meta_file, dtype={'session_id': 'str',
                                       'participant_id': 'str'})
    meta_list.append(df.drop_duplicates())
meta_df = pd.concat(meta_list).set_index(['project', 'session_id', 'participant_id'])
meta = meta_df.copy()
meta.head()
Out[5]:
In [6]:
# Load the CSV with MRI URLs
mri_path = os.path.abspath('clean-csv/all-session.csv')
mri = pd.read_csv(mri_path).set_index(['project', 'session_id', 'participant_id'])
mri.head()
Out[6]:
In [7]:
# Put records with unique indices first (NKI-RS has duplicate records and still needs to be cleaned).
dups = meta.loc[meta.index.duplicated()]
only_unique = meta.loc[~meta.index.duplicated()]
unique = pd.concat([only_unique, dups])
unique.shape
Out[7]:
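If the duplicated NKI-RS indices should eventually be collapsed rather than just reordered, one possible approach (a sketch, not what the pipeline currently does) is to group on the index and keep the first non-null value per column:
# Sketch: collapse rows sharing (project, session_id, participant_id),
# preferring the first non-null value in each column.
deduped = meta.groupby(level=['project', 'session_id', 'participant_id']).first()
deduped.shape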
In [8]:
# Merge Datasets
joined = unique.join(mri, how='outer', sort=True)
In [9]:
joined['MRI'] = joined.t1_url.apply(lambda x: 'no' if pd.isnull(x) else 'yes')
print(joined.shape)
joined.site_id = joined.site_id.apply(lambda x: x.lower() if isinstance(x, str) else np.nan)
final = joined.reset_index()
In [10]:
final.columns = ['project',
'session_id',
'participant_id',
'age',
'diagnosis',
'dsm_iv_tr',
'education',
'ethnicity',
'full_iq',
'handedness',
'marital_status',
'occupation',
'performance_iq',
'sex',
'site_id',
'species',
'verbal_iq',
'MRIs',
'session_count',
'MRI']
final.to_csv('../docs/data/phenotype_mri.csv', index=False)
In [11]:
for project in final.project.unique():
    filename = 'brainbox-csv/{}.csv'.format(project)
    part = final.loc[final.project == project]
    part[['MRIs', 'participant_id']].dropna().drop_duplicates().to_csv(filename, index=False, header=False)
In [12]:
final[['MRIs', 'participant_id', 'project']].dropna().drop_duplicates().to_csv('brainbox-csv/all-mris.csv', index=False, header=False)