In [1]:
# imports
from __future__ import print_function
import numpy as np
import os
import nibabel as nib
from os import listdir
from os.path import isfile, join
import os.path
In [2]:
# little helper functions that return the file list with full paths, skipping hidden files
def listdir_nohidden(path):
    """Yield the names of the entries in ``path`` that do not start with a dot.

    Names are yielded in sorted order. ``os.listdir`` returns entries in
    arbitrary order, so sorting keeps the resulting file list (and anything
    indexed against it) reproducible across machines and runs.

    Parameters
    ----------
    path : str
        Directory to list.

    Yields
    ------
    str
        Entry name (without the directory part) for each non-hidden entry.
    """
    for f in sorted(os.listdir(path)):
        # names beginning with '.' are treated as hidden and skipped
        if not f.startswith('.'):
            yield f
def listdir_fullpath(d):
    """Return a list with every non-hidden entry of ``d`` joined onto ``d``."""
    full_paths = []
    for entry in listdir_nohidden(d):
        full_paths.append(os.path.join(d, entry))
    return full_paths
# build the list of input files (full paths, hidden entries skipped)
input_dir = "./data/Input/"
onlyfiles = listdir_fullpath(input_dir)
# report how many files were found (displayed as the cell output)
"There are " + str(len(onlyfiles)) + " files to be processed."
Out[2]:
Check for missing data
In [3]:
# For every input file, record the indices of the columns that are zero in
# every row (these are treated as nodes with missing information).
missingarray = []
for filename in onlyfiles:
    # load timeseries
    ts_raw = np.loadtxt(filename)
    # a column counts as missing when none of its entries is non-zero
    all_zero_columns = ~ts_raw.any(axis=0)
    missingarray.append(np.where(all_zero_columns)[0])

# positions (within onlyfiles) of the files that have no missing columns
ids = np.flatnonzero([len(missing_cols) == 0 for missing_cols in missingarray])

# NOTE: despite its name, selected_filename_only holds the FULL paths of the
# selected files ...
selected_filename_only = [onlyfiles[k] for k in ids]
# ... while selected_full_path holds only their base names (no directory part).
# The names are kept as they are because the following cell uses them.
selected_full_path = [os.path.basename(p) for p in selected_filename_only]

"There are " + str(len(selected_filename_only)) + " files that are selected."
Out[3]:
Now load the phenotype file and keep only the subjects whose filenames match the selected ones and that have SUB_IN_SMP set to 1
In [4]:
import pandas as pd

# read in csv
df_phen = pd.read_csv('./data/Phenotypic_V1_0b_preprocessed1_filt.csv')

# Add columns holding the filenames derived from each subject's FILE_ID.
# These are vectorized whole-column assignments, so no loop is needed:
# iterating over a DataFrame yields its column labels (not its rows), and
# the previous loop simply repeated the same assignments once per column.
df_phen['filename_1D'] = df_phen['FILE_ID'] + "_rois_cc400.1D"
df_phen['filename_npy'] = df_phen['FILE_ID'] + "_rois_cc400.1D.npy"

# flag (1/0) the subjects whose .1D file is among the selected base names
df_phen['selected'] = np.where(df_phen['filename_1D'].isin(selected_full_path), 1, 0)

# keep only the rows that have SUB_IN_SMP == 1 and were flagged as selected
keep_mask = (df_phen["SUB_IN_SMP"] == 1) & (df_phen["selected"] == 1)
df_phen = df_phen.loc[keep_mask]

# save the final selection and report its size (displayed as the cell output)
df_phen.to_csv('./data/SelectedSubjects.csv')
"There are " + str(len(df_phen.index)) + " in the final selection."
Out[4]: