In [ ]:
import pandas as pd
import numpy as np
import joblib
import os

In [ ]:
# ---------------------------------------------------------------------------
# User configuration: edit the values in this cell, then run the notebook.
# ---------------------------------------------------------------------------

# Spreadsheet holding the raw behavioural data (read with pandas.read_excel).
behavData_xlsx = 'U:\\Projects\\Project_CCA\\Behavioural\\mwq.raw_sessionMean.xlsx'

# Optional variable selection.
# Set selectvar to True to keep only the columns whose names contain one of the
# keywords in selectedKeys (each keyword is matched as a substring of the
# column names in the file). Leave it False to keep every column.
# The first keyword must select the ID column: the cleaning step treats the
# first retained column as the participant ID.
selectvar = False
selectedKeys = ['SCAN_ID',
                'foo'
                ]  # ignored while selectvar is False

# Optional: base names of the .npy output files; can be left unchanged.
keysfn = 'MWQ_keys'
datafn = 'data_MWQ_session_preprocessed'

imp_s = 'mean'  # imputation strategy handed to imputedata

# Participant exclusion. The cleaning cell reads excludeNaN, so it has to be
# defined here or a fresh "Restart & Run All" stops with a NameError.
# True  -> drop participants with more than drop_c missing values
# False -> keep every participant
excludeNaN = True
drop_c = 10  # a participant is dropped when the number of missing variables exceeds this

# If this output is going to be used for SCCA, set to True so that missing
# values are imputed (with the variable mean when imp_s is 'mean').
impute_miss = True
# Run the rest of the notebook after changing the settings above.

In [ ]:
# Load the raw behavioural spreadsheet into a DataFrame.
data_raw = pd.read_excel(behavData_xlsx)

# Column names of the raw data.
keys = data_raw.columns
if selectvar:
    # Keep every column whose name contains one of the selected keywords
    # (substring match), in keyword order. A column that matches more than one
    # keyword is listed once only, so it cannot end up duplicated in the data.
    includeKeys = []
    for s in selectedKeys:
        for k in keys:
            # str(k) keeps the membership test valid for non-string headers
            if s in str(k) and k not in includeKeys:
                includeKeys.append(k)
    # Values of the retained columns; the first retained column is the ID.
    cs_include = data_raw[includeKeys].values
    keys = includeKeys
else:
    cs_include = data_raw.values

# excludeNaN switches participant exclusion on or off. If it was never
# defined, default to applying the drop_c criterion rather than failing with
# a NameError.
try:
    exclude_nan = bool(excludeNaN)
except NameError:
    exclude_nan = True

# Integer dtype on purpose: np.delete rejects a non-integer index array, and a
# bare np.array([]) would be float64 whenever nobody is excluded.
excludeIdx = np.array([], dtype=int)
if exclude_nan:
    # Number of missing entries per row, computed once for the whole table.
    # pd.isnull also copes with object-dtype arrays (e.g. string IDs), where
    # np.isnan raises a TypeError.
    n_missing = pd.isnull(cs_include).sum(axis=1)

    # Rows (participants) with more than drop_c missing values are excluded.
    excludeIdx = np.flatnonzero(n_missing > drop_c)
    x = np.delete(cs_include, excludeIdx, 0)
    data = x[:, 1:]
    IDNO = x[:, 0]

else:
    # No exclusion: first column is the ID, the remaining columns are the data.
    data = cs_include[:, 1:]
    IDNO = cs_include[:, 0]

In [ ]:
from funlib_scca import *

# Impute the behavioural data with the configured strategy (imp_s).
# imputedata comes from funlib_scca; its implementation is not visible in this
# notebook, so see that module for how `missing=impute_miss` is interpreted.
data_imp = imputedata(data, imp_s, missing=impute_miss)

# Centre every column on its own mean (in place).
S = data_imp.sum(axis=0) / len(data_imp)
data_imp -= S[np.newaxis, :]

# Scale every centred column by its sum of squares (in place). Columns whose
# sum of squares is exactly zero are divided by 1 instead, which avoids a
# division by zero for constant variables.
# NOTE(review): the divisor is the raw sum of squares, not its square root, so
# the result is neither unit-variance nor unit-norm -- confirm this is intended.
var = np.sum(data_imp ** 2, axis=0)
var[var == 0] = 1
data_imp /= var

In [ ]:
# Assemble the output table: participant IDs as the first column, followed by
# the preprocessed variables.
output = np.concatenate((np.reshape(IDNO, (-1, 1)), data_imp), axis=1)

# Write the table to disk; np.save appends '.npy' to datafn when it is missing.
np.save(datafn, output)
# Saving the variable names is currently disabled:
# np.save(keysfn, keys)