Validation


In [1]:
%matplotlib inline

import time
import os
import sys

import sqlite3

import pandas as pd
import numpy as np
import scipy as sp

import statsmodels.formula.api as smf

import scipy.io
import scipy.stats # Needed for sp.stats.pearsonr and sp.stats.sem below

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import seaborn as sns

import plotly
plotly.offline.init_notebook_mode()


Discriminability

Calculate mean synchrony timeseries across the middle frequency bands for each pipeline.


In [2]:
def disco_matimport(indir,outdir,startlev,endlev):
    ''' Import Matlab synchrony variables into a Pandas table
    indir = Input directory of .mat files
    outdir = Output data directory
    startlev,endlev = Slice of frequency-band indices to average over
    Returns: sync = Mean of synchrony values across frequency bands for each pipeline '''

    function = 'disco_matimport'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Running',function,indir)
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Number of files =',len(os.listdir(os.path.join('.',indir))))
    
    i = 0
    sync = pd.DataFrame({})
    for file in os.listdir(os.path.join('.',indir)):
        
        i += 1
        sys.stdout.write(str(i)+', ')
        sys.stdout.flush()
        
        syncvar = file.split('_')[1].upper() # Matlab variable name, e.g. 'CPM' or 'IPS'
        matvar = scipy.io.loadmat(os.path.join('.',indir,file),variable_names=syncvar)
        avgvar = np.mean(matvar[syncvar][startlev:endlev],0) # Mean of synchrony timeseries across frequency bands

        # Store information in columns; the column name is the file name
        # minus its leading prefix token and the '.mat' extension
        temp = pd.DataFrame({file[len(file.split('_')[0])+len('_'):-4]:avgvar})
        sync = pd.concat([sync,temp],axis=1)        
        
        # Store information in rows
        # avgvar = pd.DataFrame.transpose(pd.DataFrame(avgvar))
        # pipeline = pd.DataFrame({'pipeline':[file[:-4]]})
        # temp = pd.concat([pipeline,avgvar],axis=1)        
        # sync = pd.concat([sync,temp],ignore_index=True)

    print(' ')
        
    outfile = function+'_'+indir+'.pkl'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Saving',outfile)
    sync.to_pickle(os.path.join('.',outdir,outfile))
    
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Done')        
    return sync

In [18]:
VAL = disco_matimport('validity','',5,10)


11/09/2016 15:49:17 Running disco_matimport validity
11/09/2016 15:49:17 Number of files = 432
1, 2, 3, ..., 430, 431, 432,
11/09/2016 15:55:28 Saving disco_matimport_validity.pkl
11/09/2016 15:55:37 Done

In [19]:
VAL.head()


Out[19]:
cpm_coif1_decimate_cubic_vector cpm_coif1_decimate_cubic_xalign cpm_coif1_decimate_cubic_yalign cpm_coif1_decimate_cubic_zalign cpm_coif1_decimate_linear_vector cpm_coif1_decimate_linear_xalign cpm_coif1_decimate_linear_yalign cpm_coif1_decimate_linear_zalign cpm_coif1_decimate_nearest_vector cpm_coif1_decimate_nearest_xalign ... ips_sym6_mean_cubic_yalign ips_sym6_mean_cubic_zalign ips_sym6_mean_linear_vector ips_sym6_mean_linear_xalign ips_sym6_mean_linear_yalign ips_sym6_mean_linear_zalign ips_sym6_mean_nearest_vector ips_sym6_mean_nearest_xalign ips_sym6_mean_nearest_yalign ips_sym6_mean_nearest_zalign
0 0.411086 0.449412 0.483949 0.526640 0.406081 0.451630 0.484344 0.527490 0.410913 0.450989 ... 0.346234 0.517742 0.360442 0.477407 0.362911 0.528230 0.363596 0.449737 0.355297 0.530667
1 0.406029 0.449216 0.483830 0.529568 0.401735 0.451696 0.483964 0.530395 0.407795 0.450598 ... 0.362532 0.515705 0.361382 0.478330 0.358006 0.526258 0.364685 0.450874 0.350665 0.529080
2 0.395510 0.447991 0.483617 0.532305 0.395438 0.450906 0.483510 0.533129 0.401583 0.449556 ... 0.357903 0.513587 0.362428 0.479330 0.353370 0.524315 0.365736 0.452098 0.346359 0.527423
3 0.394070 0.446268 0.483077 0.535114 0.393310 0.449722 0.482721 0.535948 0.395880 0.448236 ... 0.353117 0.511502 0.363594 0.480408 0.348911 0.522390 0.366878 0.453408 0.342222 0.525601
4 0.401674 0.443921 0.482273 0.537661 0.397049 0.448084 0.481633 0.538565 0.396605 0.446561 ... 0.347529 0.509458 0.364846 0.481153 0.344580 0.520324 0.368071 0.454589 0.338178 0.523747

5 rows × 432 columns

Create database with the averaged synchrony timeseries.


In [26]:
analysis = 'validity'
columns = list(VAL)
dframe = VAL

schema = 'DROP TABLE IF EXISTS "' + analysis + '"; CREATE TABLE "' + analysis + '" ('
for i in range(len(columns)):
    schema += '"' + columns[i] + '" FLOAT'
    if i < len(columns)-1:
        schema += ', '
schema += ');'
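# Note: to_sql with if_exists='replace' below drops and recreates the table
# from the DataFrame dtypes, so this handcrafted schema mainly documents
# the intended column types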

dbase = sqlite3.connect('disco_matimport_' + analysis + '.db')
dbase.cursor().executescript(schema); dbase.commit()
dframe.to_sql(analysis,dbase,if_exists='replace',index=False); dbase.commit()
dbase.close()

Load pipeline synchrony database.


In [2]:
new_db = sqlite3.connect('disco_matimport_validity.db')

Load condition times database.


In [3]:
time_db = sqlite3.connect('disco_millisecond_conditions.db')

Compute correlations of the synchrony timecourse from each pipeline with the condition timecourse.


In [41]:
def disco_validity(indata,outdir,timedata,samplerate):
    ''' Correlation test of synchrony timecourse from different 
    pipelines with condition timecourse from the pilot experiment
    indata = Database of synchrony measures files
    outdir = Output data directory
    timedata = Database containing condition times in milliseconds
    samplerate = Sampling rate in ms 
    (e.g., ceil(mean of actual rate across Ss) = ceil(15.4120) = 16 ms)    
    Returns: RHO = Correlations with condition timecourse '''
    
    function = 'disco_validity'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Running',function)           
    
    # Condition times in milliseconds
    query = "SELECT msec FROM conditions;"
    timeMSEC = pd.read_sql_query(query,timedata)
    timeMSEC = timeMSEC['msec'].values
    
    # List of preprocessing pipelines (columns)
    query = "PRAGMA table_info(validity);"
    preproc = [elem[1] for elem in indata.cursor().execute(query)]
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Number of pipelines =',len(preproc))
    
    i = 0
    RHO = pd.DataFrame(columns=('preproc','R','P'))
    for P in preproc:
        
        i += 1
        sys.stdout.write(str(i)+', ')
        sys.stdout.flush()
        
        query = "SELECT " + P + " FROM validity;"
        meanSYNC = pd.read_sql_query(query,indata)
        meanSYNC = meanSYNC[pd.notnull(meanSYNC[P])] # Remove NaN value at last element (CPM)
        meanSYNC = meanSYNC[P].values
        
        # Timecourse defining synchrony (1) and non-synchrony (0) conditions
        condSYNC = np.zeros(len(meanSYNC))
        for t in range(1,len(timeMSEC),2): # For each synchrony condition (2 timepoints)
            first = int(np.ceil((timeMSEC[t]-timeMSEC[0])/samplerate)) # First timepoint
            last = int(np.ceil((timeMSEC[t+1]-timeMSEC[0])/samplerate)) # Last timepoint
            if last > len(condSYNC):
                last = len(condSYNC)     
            condSYNC[first:last] = np.ones(last-first)
        
        # Correlation test of condition and synchrony timecourses
        result = sp.stats.pearsonr(condSYNC,meanSYNC)
        RHO.loc[RHO.shape[0]] = [P,result[0],result[1]]    
        
    print(' ')
        
    outfile = function+'.pkl'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Saving',outfile)
    RHO.to_pickle(os.path.join('.',outdir,outfile))
    
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Done')        
    return RHO
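
The boxcar construction above can be sanity-checked in isolation. A minimal sketch with toy condition times (purely illustrative; not the experiment's actual times):


In [ ]:
# Toy condition times in ms: blocks alternate non-synchrony / synchrony
toyMSEC = np.array([0,1000,2000,3000,4000])
toyrate = 16 # ms per sample
nsamp = int(np.ceil((toyMSEC[-1]-toyMSEC[0])/toyrate))

toySYNC = np.zeros(nsamp)
for t in range(1,len(toyMSEC),2): # Each synchrony block spans 2 timepoints
    first = int(np.ceil((toyMSEC[t]-toyMSEC[0])/toyrate))
    last = min(int(np.ceil((toyMSEC[t+1]-toyMSEC[0])/toyrate)),nsamp)
    toySYNC[first:last] = 1

print(toySYNC[60:66]) # [ 0.  0.  0.  1.  1.  1.] -- first block starts at sample ceil(1000/16) = 63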

In [42]:
RHO = disco_validity(new_db,'',time_db,int(np.ceil(15.4120)))


11/09/2016 20:22:28 Running disco_validity
11/09/2016 20:22:28 Number of pipelines = 432
1, 2, 3, ..., 430, 431, 432,
11/09/2016 20:39:29 Saving disco_validity.pkl
11/09/2016 20:39:29 Done

In [44]:
RHO.head()


Out[44]:
preproc R P
0 cpm_coif1_decimate_cubic_vector -0.443024 0.000000e+00
1 cpm_coif1_decimate_cubic_xalign -0.073459 1.962499e-162
2 cpm_coif1_decimate_cubic_yalign -0.052611 3.636690e-84
3 cpm_coif1_decimate_cubic_zalign 0.252733 0.000000e+00
4 cpm_coif1_decimate_linear_vector -0.444704 0.000000e+00

Save correlations in database.


In [45]:
dframe = RHO
schema = """
DROP TABLE IF EXISTS "RHO"; 
CREATE TABLE "RHO" (
    "preproc" VARCHAR,
    "R" FLOAT,
    "P" FLOAT
);
"""
dbase = sqlite3.connect('disco_validity.db')
dbase.cursor().executescript(schema); dbase.commit()
dframe.to_sql("RHO",dbase,if_exists='replace',index=False); dbase.commit()
dbase.close()

Group correlations by factor levels (i.e., pipeline parameters) using Pandas.


In [4]:
# Load discriminability database
db = sqlite3.connect('disco_validity.db')
query = "SELECT * FROM RHO;"
df = pd.read_sql_query(query,db)
db.close()
df.head()


Out[4]:
preproc R P
0 cpm_coif1_decimate_cubic_vector -0.443024 0.000000e+00
1 cpm_coif1_decimate_cubic_xalign -0.073459 1.962499e-162
2 cpm_coif1_decimate_cubic_yalign -0.052611 3.636690e-84
3 cpm_coif1_decimate_cubic_zalign 0.252733 0.000000e+00
4 cpm_coif1_decimate_linear_vector -0.444704 0.000000e+00

In [5]:
# Replace vanishing moments with filter lengths
# ('1|2' etc. are regex alternations; regex=True made explicit for newer pandas)
df['preproc'] = df['preproc'].str.replace('1|2','_short',regex=True)
df['preproc'] = df['preproc'].str.replace('3|4','_medium',regex=True)
df['preproc'] = df['preproc'].str.replace('5|6','_long',regex=True)

In [6]:
df.head()


Out[6]:
preproc R P
0 cpm_coif_short_decimate_cubic_vector -0.443024 0.000000e+00
1 cpm_coif_short_decimate_cubic_xalign -0.073459 1.962499e-162
2 cpm_coif_short_decimate_cubic_yalign -0.052611 3.636690e-84
3 cpm_coif_short_decimate_cubic_zalign 0.252733 0.000000e+00
4 cpm_coif_short_decimate_linear_vector -0.444704 0.000000e+00

In [7]:
df.tail()


Out[7]:
preproc R P
427 ips_sym_long_mean_linear_zalign 0.478266 0.0
428 ips_sym_long_mean_nearest_vector 0.388934 0.0
429 ips_sym_long_mean_nearest_xalign 0.302790 0.0
430 ips_sym_long_mean_nearest_yalign -0.393082 0.0
431 ips_sym_long_mean_nearest_zalign 0.476917 0.0

In [8]:
# Retrieve correlations for each factor level

paramstr = ['cpm','ips', # Synchrony measure
            'coif','db','sym', # Wavelet family
            'short','medium','long', # Filter length
            'decimate','mean', # Downsampling
            'cubic','linear','nearest', # Interpolation
            'vector','xalign','yalign','zalign'] # Combination

paramdf = pd.DataFrame(columns=['parameter','correlations'])
for p in paramstr:
    c = df[df['preproc'].str.contains(p)].R.values # Substring match; relies on each token appearing once per pipeline name
    paramdf.loc[paramdf.shape[0]] = [p,c]

In [9]:
paramdf


Out[9]:
parameter correlations
0 cpm [-0.443023934066, -0.0734588191007, -0.0526109...
1 ips [0.419601904344, 0.298482899634, -0.4459811573...
2 coif [-0.443023934066, -0.0734588191007, -0.0526109...
3 db [-0.401646998013, -0.0663185161795, -0.0501602...
4 sym [-0.401646998013, -0.0663185161795, -0.0501602...
5 short [-0.443023934066, -0.0734588191007, -0.0526109...
6 medium [-0.366862499275, -0.0555178888118, -0.0589029...
7 long [-0.295433564528, -0.0546943131961, -0.0723451...
8 decimate [-0.443023934066, -0.0734588191007, -0.0526109...
9 mean [-0.443069044138, -0.0734711210756, -0.0526133...
10 cubic [-0.443023934066, -0.0734588191007, -0.0526109...
11 linear [-0.44470389173, -0.0647630371939, -0.05051637...
12 nearest [-0.443457036385, -0.0661041183197, -0.0508083...
13 vector [-0.443023934066, -0.44470389173, -0.443457036...
14 xalign [-0.0734588191007, -0.0647630371939, -0.066104...
15 yalign [-0.0526109918102, -0.0505163771805, -0.050808...
16 zalign [0.252732857003, 0.267245346902, 0.26694937003...
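
For a quick numeric summary of the grouped values, the mean correlation per parameter can be printed directly (a simple arithmetic mean of r; the regression analysis later uses Fisher-z values instead):


In [ ]:
# Mean correlation for each parameter level
for row in paramdf.itertuples(index=False):
    print(row.parameter, np.mean(row.correlations))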

Group correlations by factor levels (i.e., pipeline parameters) using SQLite3.


In [2]:
# Load discriminability database
db = sqlite3.connect('disco_validity.db')
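# Note: in SQL LIKE patterns, '_' is a single-character wildcard, so it also
# matches the literal underscores in the pipeline names below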
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_coif%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_db%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_sym%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())


[('cpm_coif1_decimate_cubic_vector', -0.4430239340659711, 0.0), ('cpm_coif3_decimate_cubic_vector', -0.36686249927528225, 0.0), ('cpm_coif5_decimate_cubic_vector', -0.2954335645276906, 0.0)]
[('cpm_db2_decimate_cubic_vector', -0.4016469980134443, 0.0), ('cpm_db4_decimate_cubic_vector', -0.41140742561604376, 0.0), ('cpm_db6_decimate_cubic_vector', -0.41315003238599723, 0.0)]
[('cpm_sym2_decimate_cubic_vector', -0.4016469980134443, 0.0), ('cpm_sym4_decimate_cubic_vector', -0.41094072529979264, 0.0), ('cpm_sym6_decimate_cubic_vector', -0.3601261034255285, 0.0)]

In [3]:
# Replace vanishing moments with filter lengths
query = "UPDATE RHO SET preproc = REPLACE(preproc,'1','_short');"; db.cursor().execute(query)
query = "UPDATE RHO SET preproc = REPLACE(preproc,'2','_short');"; db.cursor().execute(query)
query = "UPDATE RHO SET preproc = REPLACE(preproc,'3','_medium');"; db.cursor().execute(query)
query = "UPDATE RHO SET preproc = REPLACE(preproc,'4','_medium');"; db.cursor().execute(query)
query = "UPDATE RHO SET preproc = REPLACE(preproc,'5','_long');"; db.cursor().execute(query)
query = "UPDATE RHO SET preproc = REPLACE(preproc,'6','_long');"; db.cursor().execute(query)
db.commit()

In [4]:
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_coif%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_db%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())
query = "SELECT * FROM RHO WHERE preproc LIKE 'cpm_sym%decimate_cubic_vector';"
out = db.cursor().execute(query); print(out.fetchall())


[('cpm_coif_short_decimate_cubic_vector', -0.4430239340659711, 0.0), ('cpm_coif_medium_decimate_cubic_vector', -0.36686249927528225, 0.0), ('cpm_coif_long_decimate_cubic_vector', -0.2954335645276906, 0.0)]
[('cpm_db_short_decimate_cubic_vector', -0.4016469980134443, 0.0), ('cpm_db_medium_decimate_cubic_vector', -0.41140742561604376, 0.0), ('cpm_db_long_decimate_cubic_vector', -0.41315003238599723, 0.0)]
[('cpm_sym_short_decimate_cubic_vector', -0.4016469980134443, 0.0), ('cpm_sym_medium_decimate_cubic_vector', -0.41094072529979264, 0.0), ('cpm_sym_long_decimate_cubic_vector', -0.3601261034255285, 0.0)]

In [5]:
# Retrieve correlations for each factor level

paramstr = ['cpm','ips', # Synchrony measure
            'coif','db','sym', # Wavelet family
            'short','medium','long', # Filter length
            'decimate','mean', # Downsampling
            'cubic','linear','nearest', # Interpolation
            'vector','xalign','yalign','zalign'] # Combination

paramdf = pd.DataFrame(columns=['parameter','correlations'])
for p in paramstr:
    query = "SELECT R FROM RHO WHERE preproc LIKE '%" + p +"%';"  
    c = pd.read_sql_query(query,db).R.values
    paramdf.loc[paramdf.shape[0]] = [p,c]

In [6]:
paramdf


Out[6]:
parameter correlations
0 cpm [-0.443023934066, -0.0734588191007, -0.0526109...
1 ips [0.419601904344, 0.298482899634, -0.4459811573...
2 coif [-0.443023934066, -0.0734588191007, -0.0526109...
3 db [-0.401646998013, -0.0663185161795, -0.0501602...
4 sym [-0.401646998013, -0.0663185161795, -0.0501602...
5 short [-0.443023934066, -0.0734588191007, -0.0526109...
6 medium [-0.366862499275, -0.0555178888118, -0.0589029...
7 long [-0.295433564528, -0.0546943131961, -0.0723451...
8 decimate [-0.443023934066, -0.0734588191007, -0.0526109...
9 mean [-0.443069044138, -0.0734711210756, -0.0526133...
10 cubic [-0.443023934066, -0.0734588191007, -0.0526109...
11 linear [-0.44470389173, -0.0647630371939, -0.05051637...
12 nearest [-0.443457036385, -0.0661041183197, -0.0508083...
13 vector [-0.443023934066, -0.44470389173, -0.443457036...
14 xalign [-0.0734588191007, -0.0647630371939, -0.066104...
15 yalign [-0.0526109918102, -0.0505163771805, -0.050808...
16 zalign [0.252732857003, 0.267245346902, 0.26694937003...

In [7]:
# Save table and close database
paramdf.to_pickle('disco_validity_params.pkl')
db.close()

Compare correlations across the different preprocessing parameters.


In [2]:
# Load pipeline correlations
db = sqlite3.connect('disco_validity.db')
query = "SELECT * FROM RHO;"
df = pd.read_sql_query(query,db)
db.close()

In [3]:
# Split pipeline names into factor levels or parameters
params = pd.DataFrame(df['preproc'].str.split('_').tolist(),columns=['sync','wave','filt','down','interp','comb'])
df = pd.concat([df,params],axis=1)
df.head()


Out[3]:
preproc R P sync wave filt down interp comb
0 cpm_coif_short_decimate_cubic_vector -0.443024 0.000000e+00 cpm coif short decimate cubic vector
1 cpm_coif_short_decimate_cubic_xalign -0.073459 1.962499e-162 cpm coif short decimate cubic xalign
2 cpm_coif_short_decimate_cubic_yalign -0.052611 3.636690e-84 cpm coif short decimate cubic yalign
3 cpm_coif_short_decimate_cubic_zalign 0.252733 0.000000e+00 cpm coif short decimate cubic zalign
4 cpm_coif_short_decimate_linear_vector -0.444704 0.000000e+00 cpm coif short decimate linear vector

In [4]:
# Convert correlations to Fisher's z
Fz = pd.DataFrame(dict(Fz=np.arctanh(df['R']).values))
df = pd.concat([df,Fz],axis=1)
df.head()


Out[4]:
preproc R P sync wave filt down interp comb Fz
0 cpm_coif_short_decimate_cubic_vector -0.443024 0.000000e+00 cpm coif short decimate cubic vector -0.475987
1 cpm_coif_short_decimate_cubic_xalign -0.073459 1.962499e-162 cpm coif short decimate cubic xalign -0.073591
2 cpm_coif_short_decimate_cubic_yalign -0.052611 3.636690e-84 cpm coif short decimate cubic yalign -0.052660
3 cpm_coif_short_decimate_cubic_zalign 0.252733 0.000000e+00 cpm coif short decimate cubic zalign 0.258330
4 cpm_coif_short_decimate_linear_vector -0.444704 0.000000e+00 cpm coif short decimate linear vector -0.478079
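
Here np.arctanh is exactly Fisher's r-to-z transform, arctanh(r) = 0.5*log((1+r)/(1-r)); a quick check:


In [ ]:
r = df['R'].values
print(np.allclose(np.arctanh(r),0.5*np.log((1+r)/(1-r)))) # True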

In [5]:
# Multiple regression of transformed correlations on categorical preprocessing parameters
est = smf.ols(formula='Fz~C(sync)+C(wave)+C(filt)+C(down)+C(interp)+C(comb)',data=df).fit()
est.summary().tables[1]


Out[5]:
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -0.1414 0.036 -3.899 0.000 -0.213 -0.070
C(sync)[T.ips] 0.2743 0.021 13.101 0.000 0.233 0.315
C(wave)[T.db] -8.578e-05 0.026 -0.003 0.997 -0.050 0.050
C(wave)[T.sym] 0.0019 0.026 0.075 0.940 -0.048 0.052
C(filt)[T.medium] -0.0001 0.026 -0.005 0.996 -0.051 0.050
C(filt)[T.short] -0.0076 0.026 -0.295 0.768 -0.058 0.043
C(down)[T.mean] -2.747e-05 0.021 -0.001 0.999 -0.041 0.041
C(interp)[T.linear] 0.0040 0.026 0.158 0.875 -0.046 0.054
C(interp)[T.nearest] 0.0040 0.026 0.155 0.877 -0.046 0.054
C(comb)[T.xalign] 0.1258 0.030 4.249 0.000 0.068 0.184
C(comb)[T.yalign] -0.2404 0.030 -8.120 0.000 -0.299 -0.182
C(comb)[T.zalign] 0.4011 0.030 13.550 0.000 0.343 0.459

In [6]:
# Interaction between significant factors: synchrony measure and axes combination
est = smf.ols(formula='Fz~C(sync)*C(comb)',data=df).fit()
est.summary().tables[1]


Out[6]:
coef std err t P>|t| [95.0% Conf. Int.]
Intercept -0.4126 0.004 -96.858 0.000 -0.421 -0.404
C(sync)[T.ips] 0.8181 0.006 135.803 0.000 0.806 0.830
C(comb)[T.xalign] 0.3535 0.006 58.681 0.000 0.342 0.365
C(comb)[T.yalign] 0.3545 0.006 58.851 0.000 0.343 0.366
C(comb)[T.zalign] 0.6662 0.006 110.583 0.000 0.654 0.678
C(sync)[T.ips]:C(comb)[T.xalign] -0.4554 0.009 -53.455 0.000 -0.472 -0.439
C(sync)[T.ips]:C(comb)[T.yalign] -1.1899 0.009 -139.663 0.000 -1.207 -1.173
C(sync)[T.ips]:C(comb)[T.zalign] -0.5301 0.009 -62.219 0.000 -0.547 -0.513
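
As a sanity check on the treatment coding, any cell mean can be reconstructed from the coefficients above; e.g., the predicted Fisher z for the IPS/yalign cell is the intercept plus the three relevant terms (coefficients copied from the table):


In [ ]:
# Predicted cell mean for IPS/yalign from the interaction model
fz = -0.4126 + 0.8181 + 0.3545 + (-1.1899)
print(fz, np.tanh(fz)) # about -0.43 in Fisher z, about -0.41 as r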

In [10]:
# Bar chart of interaction

combolist = ['vector','xalign','yalign','zalign']
comboaxis = np.arange(len(combolist))+1
cpm = [np.tanh(np.mean(df[(df.sync=='cpm')&(df.comb==combo)]['Fz'].values)) for combo in combolist]
ips = [np.tanh(np.mean(df[(df.sync=='ips')&(df.comb==combo)]['Fz'].values)) for combo in combolist]
cpm_e = [np.tanh(sp.stats.sem(df[(df.sync=='cpm')&(df.comb==combo)]['Fz'].values)) for combo in combolist]
ips_e = [np.tanh(sp.stats.sem(df[(df.sync=='ips')&(df.comb==combo)]['Fz'].values)) for combo in combolist]

with sns.axes_style("whitegrid"):
    
    plt.figure(); plt.clf()
    
    ax1 = plt.subplot(121); ax1.bar(comboaxis,cpm,yerr=cpm_e)
    ax2 = plt.subplot(122); ax2.bar(comboaxis,ips,yerr=ips_e)
    
    val="Discriminability (Pearson's r)"; ax1.set_ylabel(val); # ax2.set_ylabel(val)
    val="Axes Combination"; ax1.set_xlabel(val); ax2.set_xlabel(val)
    val="Synchrony Measure: CPM"; ax1.set_title(val)
    val="Synchrony Measure: IPS"; ax2.set_title(val)
    val=comboaxis+.5; ax1.set_xticks(val); ax2.set_xticks(val)
    val=('vector','xalign','yalign','zalign'); ax1.set_xticklabels(val); ax2.set_xticklabels(val)
    val=[.75,5,-.6,.6]; ax1.axis(val); ax2.axis(val)


Box and whisker plots of parameter correlations.


In [ ]:
paramname = {'cpm':'Synchrony: CPM','ips':'Synchrony: IPS',
             'coif':'Wavelet: Coiflet','db':'Wavelet: Daubechies','sym':'Wavelet: Symlet',
             'short':'Filter Length: Short','medium':'Filter Length: Medium','long':'Filter Length: Long',
             'decimate':'Downsampling: Decimate','mean':'Downsampling: Average',
             'cubic':'Interpolation: Cubic','linear':'Interpolation: Linear','nearest':'Interpolation: Nearest',
             'vector':'Combination: Standard','xalign':'Combination: X-align',
             'yalign':'Combination: Y-align','zalign':'Combination: Z-align'}

# Number of boxes (parameters)
N = paramdf.shape[0] 

# Generate an array of rainbow colors
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0,360,num=N)] 

# Each box is a dictionary containing the data, type, and color; Shows the mean for each box
data = [{'name':paramname[paramdf.loc[i][0]],
         'y':paramdf.loc[i][1],
         'type':'box',
         'marker':{'color':c[i]},
         'boxmean':True} 
        for i in range(N)]

# Format the layout
layout = {'xaxis':{'showticklabels':False},
          'yaxis':{'title':"Discriminability (Pearson's r)",
                   'range':[-0.6,0.6],'tickformat':'.3f',
                   'zeroline':False},
          'title':'Preprocessing and Analysis Pipelines: Parameter Discriminability'}

# Plot the data
fig = {'data':data,'layout':layout}
plotly.offline.iplot(fig)

Sort pipeline correlations from highest to lowest.


In [17]:
db = sqlite3.connect('disco_validity.db')
query = "SELECT * FROM RHO ORDER BY R DESC;"
df = pd.read_sql_query(query,db)
db.close()
df[0:25]


Out[17]:
preproc R P
0 ips_db_long_decimate_linear_zalign 0.538547 0.0
1 ips_db_long_mean_linear_zalign 0.538441 0.0
2 ips_db_long_decimate_nearest_zalign 0.537953 0.0
3 ips_db_long_mean_nearest_zalign 0.537908 0.0
4 ips_coif_short_decimate_linear_zalign 0.533696 0.0
5 ips_coif_short_mean_linear_zalign 0.533524 0.0
6 ips_coif_short_decimate_nearest_zalign 0.533006 0.0
7 ips_coif_short_mean_nearest_zalign 0.532938 0.0
8 ips_db_long_mean_cubic_zalign 0.532047 0.0
9 ips_db_long_decimate_cubic_zalign 0.532046 0.0
10 ips_coif_short_decimate_cubic_zalign 0.528236 0.0
11 ips_db_medium_decimate_linear_zalign 0.528211 0.0
12 ips_coif_short_mean_cubic_zalign 0.528182 0.0
13 ips_db_medium_mean_linear_zalign 0.528130 0.0
14 ips_db_medium_decimate_nearest_zalign 0.527213 0.0
15 ips_db_medium_mean_nearest_zalign 0.527109 0.0
16 ips_db_medium_mean_cubic_zalign 0.520981 0.0
17 ips_db_medium_decimate_cubic_zalign 0.520934 0.0
18 ips_sym_medium_decimate_linear_zalign 0.506138 0.0
19 ips_sym_medium_mean_linear_zalign 0.506107 0.0
20 ips_sym_medium_mean_nearest_zalign 0.505032 0.0
21 ips_sym_medium_decimate_nearest_zalign 0.504978 0.0
22 ips_sym_medium_mean_cubic_zalign 0.502726 0.0
23 ips_sym_medium_decimate_cubic_zalign 0.502665 0.0
24 ips_db_short_mean_linear_zalign 0.501880 0.0

Bar plots of highest pipeline correlations.


In [ ]:
N = 25 # Top 25 pipelines

pipename = list(df[0:N].preproc)
pipename = [p.replace('_',', ') for p in pipename]
pipename = [p.replace('mean','average') for p in pipename]

pipetext = ['']*N
# Annotate the pipeline that also shows the highest reliability (see the Reliability section below)
pipetext[pipename.index('ips, coif, short, average, linear, zalign')] = 'Highest reliability'

pipecolor = ['rgb(158,202,225)']*N
pipecolor[pipename.index('ips, coif, short, average, linear, zalign')] = 'rgb(8,48,107)'

data = [plotly.graph_objs.Bar(
        x = pipename,
        y = list(df[0:N].R),
        text = pipetext,
        marker = dict(
            color = pipecolor,
            line = dict(
                color = 'rgb(8,48,107)',
                width = 1.5
                )
            ),
        opacity = 0.6
        )]

# Format the layout
layout = {'xaxis':{'tickangle':315},
          'yaxis':{'title':"Discriminability (Pearson's r)",
                   'range':[0,1],'tickformat':'.3f'},
          'title':'Preprocessing and Analysis Pipelines: Highest Discriminability',
          'showlegend':False,
          'margin':{'b':225,'l':150}}

# Plot the data
fig = plotly.graph_objs.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

Reliability

Calculate mean synchrony timeseries across the middle frequency bands, for each pipeline, for each pair of devices.


In [3]:
REL = disco_matimport('reliability','',5,10)


11/09/2016 21:40:15 Running disco_matimport reliability
11/09/2016 21:40:15 Number of files = 862
1, 2, 3, ..., 860, 861, 862,
11/09/2016 22:02:19 Saving disco_matimport_reliability.pkl
11/09/2016 22:02:37 Done

Create database with the averaged synchrony timeseries for each pair of devices.


In [4]:
analysis = 'reliability'
columns = list(REL)
dframe = REL

schema = 'DROP TABLE IF EXISTS "' + analysis + '"; CREATE TABLE "' + analysis + '" ('
for i in range(len(columns)):
    schema += '"' + columns[i] + '" FLOAT'
    if i < len(columns)-1:
        schema += ', '
schema += ');'

dbase = sqlite3.connect('disco_matimport_' + analysis + '.db')
dbase.cursor().executescript(schema); dbase.commit()
dframe.to_sql(analysis,dbase,if_exists='replace',index=False); dbase.commit()
dbase.close()

In [5]:
REL.head()


Out[5]:
cpm_groupA_coif1_decimate_cubic_vector cpm_groupA_coif1_decimate_cubic_xalign cpm_groupA_coif1_decimate_cubic_yalign cpm_groupA_coif1_decimate_cubic_zalign cpm_groupA_coif1_decimate_linear_vector cpm_groupA_coif1_decimate_linear_xalign cpm_groupA_coif1_decimate_linear_yalign cpm_groupA_coif1_decimate_linear_zalign cpm_groupA_coif1_decimate_nearest_vector cpm_groupA_coif1_decimate_nearest_xalign ... ips_groupB_sym6_mean_cubic_yalign ips_groupB_sym6_mean_cubic_zalign ips_groupB_sym6_mean_linear_vector ips_groupB_sym6_mean_linear_xalign ips_groupB_sym6_mean_linear_yalign ips_groupB_sym6_mean_linear_zalign ips_groupB_sym6_mean_nearest_vector ips_groupB_sym6_mean_nearest_xalign ips_groupB_sym6_mean_nearest_yalign ips_groupB_sym6_mean_nearest_zalign
0 0.418215 0.472479 0.419658 0.611465 0.414580 0.474915 0.398093 0.613531 0.414284 0.476677 ... 0.386682 0.561145 0.442758 0.523072 0.407694 0.572926 0.444657 0.481366 0.401957 0.574316
1 0.414860 0.472894 0.422960 0.610134 0.412113 0.475309 0.400779 0.611845 0.412794 0.477215 ... 0.429352 0.559734 0.443273 0.523784 0.402313 0.571635 0.445445 0.482433 0.396942 0.573601
2 0.413279 0.472751 0.425838 0.608792 0.411136 0.475225 0.403097 0.609999 0.412433 0.477244 ... 0.424363 0.558158 0.443884 0.524597 0.397315 0.570337 0.446124 0.483616 0.392379 0.572728
3 0.411246 0.472116 0.428376 0.607484 0.409613 0.474663 0.405214 0.608146 0.411439 0.476744 ... 0.419097 0.556589 0.444621 0.525509 0.392586 0.569016 0.446895 0.484911 0.388077 0.571552
4 0.408903 0.471527 0.429978 0.606505 0.407853 0.474100 0.406376 0.606601 0.410308 0.476248 ... 0.412547 0.555034 0.445462 0.525904 0.388058 0.567430 0.447740 0.485997 0.383935 0.570273

5 rows × 862 columns

Load pipeline synchrony database.


In [3]:
new_db = sqlite3.connect('disco_matimport_reliability.db')

Compute intraclass correlations of synchrony timecourses from the paired devices.


In [4]:
def disco_reliability(indata,outdir):
    ''' Reliability test of synchrony timecourse from different pipelines
    indata = Database of synchrony measures files
    outdir = Output data directory
    Returns: REL = Intraclass correlations between two devices '''
    
    function = 'disco_reliability'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Running',function)
    
    # List of preprocessing pipelines (columns) from Group A
    query = "PRAGMA table_info(reliability);"
    preproc = [elem[1] for elem in indata.cursor().execute(query)]
    preproc = [elem for elem in preproc if 'groupA' in elem]
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Number of pipelines =',len(preproc))
    
    # Type of ICC: A-1, Case 3 [i.e., A-ICC(3,1)]
    # Absolute agreement among measurements; Two-way mixed effects single measures
    # Model 3 = Each subject is assessed by each rater, but the raters are the only raters of interest
    # Form 1 = Reliability calculated from a single measurement
    # Two-way random effects [A-1, Case 2 or A-ICC(2,1)] and A-1, Case 3 
    # have equivalent calculation and only differ in interpretation
     
    i = 0
    REL = pd.DataFrame(columns=('preproc','R')) # No p-value is computed for the ICC here (p-values come from the Matlab results loaded below)
    for P in preproc:
        
        i += 1
        sys.stdout.write(str(i)+', ')
        sys.stdout.flush()
        
        query = "SELECT " + P + " FROM reliability;" # Group A       
        A = pd.read_sql_query(query,indata)
        A = A[pd.notnull(A[P])] # Remove NaN value at last element (CPM)
        x = A[P].values
        
        query = "SELECT " + P.replace("groupA","groupB") + " FROM reliability;" # Group B
        B = pd.read_sql_query(query,indata)
        B = B[pd.notnull(B[P.replace("groupA","groupB")])] # Remove NaN value at last element (CPM)
        y = B[P.replace("groupA","groupB")].values
        
        # Reliability test of synchrony timecourses
        # http://stats.stackexchange.com/questions/63368/intra-class-correlation-and-experimental-design

        Sx = sum(x); Sy = sum(y)
        Sxx = sum(x*x); Sxy = sum((x+y)**2)/2; Syy = sum(y*y) # Sxy here feeds the between-subject SS (not sum(x*y))

        n = len(x)
        fact = ((Sx + Sy)**2)/(n*2)
        SS_tot = Sxx + Syy - fact
        SS_among = Sxy - fact
        SS_error = SS_tot - SS_among

        MS_error = SS_error/n
        MS_among = SS_among/(n-1)

        ICC = (MS_among - MS_error) / (MS_among + MS_error)
        REL.loc[REL.shape[0]] = [P,ICC]    
        
    print(' ')
        
    outfile = function+'.pkl'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Saving',outfile)
    REL.to_pickle(os.path.join('.',outdir,outfile))
    
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Done')        
    return REL
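
The shortcut sums above can be cross-checked against an explicit two-way decomposition on toy data; a minimal sketch (using the same divisors, n-1 and n, as the function above):


In [ ]:
# Cross-check the ICC shortcut against explicit mean squares (toy data)
rng = np.random.RandomState(0)
x = rng.rand(100); y = x + 0.1*rng.randn(100) # Two "devices" measuring one series

n = len(x)
Sx = sum(x); Sy = sum(y)
Sxx = sum(x*x); Sxy = sum((x+y)**2)/2; Syy = sum(y*y)
fact = ((Sx + Sy)**2)/(n*2)
MS_among = (Sxy - fact)/(n-1); MS_error = (Sxx + Syy - Sxy)/n
icc_shortcut = (MS_among - MS_error)/(MS_among + MS_error)

pairmean = (x + y)/2; grand = (Sx + Sy)/(2*n)
ms_among = 2*sum((pairmean - grand)**2)/(n-1) # Between-subject mean square
ms_error = sum((x - pairmean)**2 + (y - pairmean)**2)/n # Within-subject mean square
icc_direct = (ms_among - ms_error)/(ms_among + ms_error)

print(np.isclose(icc_shortcut,icc_direct)) # True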

In [5]:
REL = disco_reliability(new_db,'')


11/13/2016 10:51:21 Running disco_reliability
11/13/2016 10:51:21 Number of pipelines = 431
1, 2, 3, ..., 429, 430, 431,
11/13/2016 11:34:52 Saving disco_reliability.pkl
11/13/2016 11:34:52 Done

Save correlations in database.


In [9]:
dframe = REL
schema = """
DROP TABLE IF EXISTS "REL"; 
CREATE TABLE "REL" (
    "preproc" VARCHAR,
    "R" FLOAT,
    "P" FLOAT
);
"""
dbase = sqlite3.connect('disco_reliability.db')
dbase.cursor().executescript(schema); dbase.commit()
dframe.to_sql("REL",dbase,if_exists='replace',index=False); dbase.commit()
dbase.close()

Group correlations by factor levels (i.e., pipeline parameters) using Pandas.


In [8]:
# Pipeline names
valdb = sqlite3.connect('disco_validity.db')
query = "SELECT preproc FROM RHO;"
relpip = pd.read_sql_query(query,valdb)
valdb.close()

# ICCs (column 0) and p-values (column 6) computed in Matlab
relmat = scipy.io.loadmat('matlabfiles/disco_reliability.mat',variable_names=['REL','preproc'])
relcor = pd.DataFrame({'R':pd.DataFrame(relmat['REL'])[0]})
relpvl = pd.DataFrame({'P':pd.DataFrame(relmat['REL'])[6]})

# Reliability database
dframe = pd.concat([relpip,relcor,relpvl],axis=1)
schema = """
DROP TABLE IF EXISTS "ICC"; 
CREATE TABLE "ICC" (
    "preproc" VARCHAR,
    "R" FLOAT,
    "P" FLOAT
);
"""
dbase = sqlite3.connect('disco_reliability.db')
dbase.cursor().executescript(schema); dbase.commit()
dframe.to_sql("ICC",dbase,if_exists='replace',index=False); dbase.commit()
dbase.close()

In [9]:
dframe.head()


Out[9]:
preproc R P
0 cpm_coif_short_decimate_cubic_vector 0.331900 0.0
1 cpm_coif_short_decimate_cubic_xalign 0.579359 0.0
2 cpm_coif_short_decimate_cubic_yalign 0.180107 0.0
3 cpm_coif_short_decimate_cubic_zalign 0.578860 0.0
4 cpm_coif_short_decimate_linear_vector 0.332233 0.0

In [10]:
dframe.tail()


Out[10]:
preproc R P
427 ips_sym_long_mean_linear_zalign 0.725738 0.0
428 ips_sym_long_mean_nearest_vector 0.365295 0.0
429 ips_sym_long_mean_nearest_xalign 0.706074 0.0
430 ips_sym_long_mean_nearest_yalign -0.435204 1.0
431 ips_sym_long_mean_nearest_zalign 0.724398 0.0

In [11]:
# Retrieve reliabilities for each factor level

paramstr = ['cpm','ips', # Synchrony measure
            'coif','db','sym', # Wavelet family
            'short','medium','long', # Filter length
            'decimate','mean', # Downsampling
            'cubic','linear','nearest', # Interpolation
            'vector','xalign','yalign','zalign'] # Combination

paramdf = pd.DataFrame(columns=['parameter','correlations'])
for p in paramstr:
    c = dframe[dframe['preproc'].str.contains(p)].R.values
    paramdf.loc[paramdf.shape[0]] = [p,c]

In [12]:
paramdf


Out[12]:
parameter correlations
0 cpm [0.331900174013, 0.579358714027, 0.18010708272...
1 ips [0.34461322736, 0.679160557675, -0.41354417363...
2 coif [0.331900174013, 0.579358714027, 0.18010708272...
3 db [0.241233743194, 0.481703836298, 0.15899816692...
4 sym [0.241233743194, 0.481703836298, 0.15899816692...
5 short [0.331900174013, 0.579358714027, 0.18010708272...
6 medium [0.300646309148, 0.586527673733, 0.20503517260...
7 long [0.243963752708, 0.561886034352, 0.20177121670...
8 decimate [0.331900174013, 0.579358714027, 0.18010708272...
9 mean [0.331884810745, 0.579366087792, 0.18010289161...
10 cubic [0.331900174013, 0.579358714027, 0.18010708272...
11 linear [0.332232603231, 0.580365155504, 0.18460311829...
12 nearest [0.33123645765, 0.580777150017, 0.185454189469...
13 vector [0.331900174013, 0.332232603231, 0.33123645765...
14 xalign [0.579358714027, 0.580365155504, 0.58077715001...
15 yalign [0.180107082727, 0.184603118292, 0.18545418946...
16 zalign [0.578860137678, 0.579984756026, 0.57940980535...

In [13]:
# Save table
paramdf.to_pickle('disco_reliability_params.pkl')

Compare reliabilities across the different preprocessing parameters.


In [3]:
# Load pipeline reliabilities
db = sqlite3.connect('disco_reliability.db')
query = "SELECT * FROM ICC;"
df = pd.read_sql_query(query,db)
db.close()

In [4]:
# Split pipeline names into factor levels or parameters
params = pd.DataFrame(df['preproc'].str.split('_').tolist(),columns=['sync','wave','filt','down','interp','comb'])
df = pd.concat([df,params],axis=1)
df.head()


Out[4]:
preproc R P sync wave filt down interp comb
0 cpm_coif_short_decimate_cubic_vector 0.331900 0.0 cpm coif short decimate cubic vector
1 cpm_coif_short_decimate_cubic_xalign 0.579359 0.0 cpm coif short decimate cubic xalign
2 cpm_coif_short_decimate_cubic_yalign 0.180107 0.0 cpm coif short decimate cubic yalign
3 cpm_coif_short_decimate_cubic_zalign 0.578860 0.0 cpm coif short decimate cubic zalign
4 cpm_coif_short_decimate_linear_vector 0.332233 0.0 cpm coif short decimate linear vector

In [5]:
# Convert correlations to Fisher's z
Fz = pd.DataFrame(dict(Fz=np.arctanh(df['R']).values))
df = pd.concat([df,Fz],axis=1)
df.head()


Out[5]:
preproc R P sync wave filt down interp comb Fz
0 cpm_coif_short_decimate_cubic_vector 0.331900 0.0 cpm coif short decimate cubic vector 0.344962
1 cpm_coif_short_decimate_cubic_xalign 0.579359 0.0 cpm coif short decimate cubic xalign 0.661497
2 cpm_coif_short_decimate_cubic_yalign 0.180107 0.0 cpm coif short decimate cubic yalign 0.182093
3 cpm_coif_short_decimate_cubic_zalign 0.578860 0.0 cpm coif short decimate cubic zalign 0.660747
4 cpm_coif_short_decimate_linear_vector 0.332233 0.0 cpm coif short decimate linear vector 0.345336

In [6]:
# Multiple regression of transformed correlations on categorical preprocessing parameters
est = smf.ols(formula='Fz~C(sync)+C(wave)+C(filt)+C(down)+C(interp)+C(comb)',data=df).fit()
est.summary().tables[1]


Out[6]:
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 0.3505 0.030 11.512 0.000 0.291 0.410
C(sync)[T.ips] -0.0226 0.018 -1.285 0.200 -0.057 0.012
C(wave)[T.db] -0.0381 0.022 -1.769 0.078 -0.080 0.004
C(wave)[T.sym] -0.0170 0.022 -0.792 0.429 -0.059 0.025
C(filt)[T.medium] 0.0051 0.022 0.237 0.813 -0.037 0.047
C(filt)[T.short] -0.0330 0.022 -1.533 0.126 -0.075 0.009
C(down)[T.mean] 8.232e-06 0.018 0.000 1.000 -0.035 0.035
C(interp)[T.linear] 0.0027 0.022 0.125 0.901 -0.040 0.045
C(interp)[T.nearest] 0.0024 0.022 0.111 0.912 -0.040 0.045
C(comb)[T.xalign] 0.3986 0.025 16.033 0.000 0.350 0.447
C(comb)[T.yalign] -0.4283 0.025 -17.228 0.000 -0.477 -0.379
C(comb)[T.zalign] 0.4520 0.025 18.180 0.000 0.403 0.501

Box and whisker plots of parameter reliabilities.


In [ ]:
paramname = {'cpm':'Synchrony: CPM','ips':'Synchrony: IPS',
             'coif':'Wavelet: Coiflet','db':'Wavelet: Daubechies','sym':'Wavelet: Symlet',
             'short':'Filter Length: Short','medium':'Filter Length: Medium','long':'Filter Length: Long',
             'decimate':'Downsampling: Decimate','mean':'Downsampling: Average',
             'cubic':'Interpolation: Cubic','linear':'Interpolation: Linear','nearest':'Interpolation: Nearest',
             'vector':'Combination: Standard','xalign':'Combination: X-align',
             'yalign':'Combination: Y-align','zalign':'Combination: Z-align'}

# Number of boxes (parameters)
N = paramdf.shape[0] 

# Generate an array of rainbow colors
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0,360,num=N)] 

# Each box is a dictionary containing the data, type, and color; Shows the mean for each box
data = [{'name':paramname[paramdf.loc[i][0]],
         'y':paramdf.loc[i][1],
         'type':'box',
         'marker':{'color':c[i]},
         'boxmean':True} 
        for i in range(N)]

# Format the layout
layout = {'xaxis':{'showticklabels':False},
          'yaxis':{'title':"Reliability (Intraclass correlation)",
                   'range':[-0.6,0.8],'tickformat':'.3f',
                   'zeroline':False},
          'title':'Preprocessing and Analysis Pipelines: Parameter Reliability'}

# Plot the data
fig = {'data':data,'layout':layout}
plotly.offline.iplot(fig)

Sort pipeline reliabilities from highest to lowest.


In [26]:
relsort = dframe.sort_values('R',ascending=False)
relsort[0:25]


Out[26]:
preproc R P
235 ips_coif_short_mean_linear_zalign 0.752083 0.0
223 ips_coif_short_decimate_linear_zalign 0.752035 0.0
239 ips_coif_short_mean_nearest_zalign 0.750913 0.0
227 ips_coif_short_decimate_nearest_zalign 0.750883 0.0
231 ips_coif_short_mean_cubic_zalign 0.749176 0.0
219 ips_coif_short_decimate_cubic_zalign 0.749164 0.0
403 ips_sym_medium_mean_linear_zalign 0.739974 0.0
391 ips_sym_medium_decimate_linear_zalign 0.739923 0.0
395 ips_sym_medium_decimate_nearest_zalign 0.739172 0.0
407 ips_sym_medium_mean_nearest_zalign 0.739135 0.0
399 ips_sym_medium_mean_cubic_zalign 0.736799 0.0
355 ips_db_long_mean_linear_zalign 0.736787 0.0
343 ips_db_long_decimate_linear_zalign 0.736706 0.0
387 ips_sym_medium_decimate_cubic_zalign 0.736646 0.0
359 ips_db_long_mean_nearest_zalign 0.736059 0.0
347 ips_db_long_decimate_nearest_zalign 0.736026 0.0
247 ips_coif_medium_decimate_linear_zalign 0.731954 0.0
351 ips_db_long_mean_cubic_zalign 0.731879 0.0
339 ips_db_long_decimate_cubic_zalign 0.731867 0.0
259 ips_coif_medium_mean_linear_zalign 0.731774 0.0
251 ips_coif_medium_decimate_nearest_zalign 0.730686 0.0
263 ips_coif_medium_mean_nearest_zalign 0.730540 0.0
243 ips_coif_medium_decimate_cubic_zalign 0.728627 0.0
255 ips_coif_medium_mean_cubic_zalign 0.728409 0.0
427 ips_sym_long_mean_linear_zalign 0.725738 0.0

Bar plots of highest pipeline reliabilities.


In [ ]:
N = 25 # Top 25 pipelines

pipename = list(relsort[0:N].preproc)
pipename = [p.replace('_',', ') for p in pipename]
pipename = [p.replace('mean','average') for p in pipename]

pipetext = ['']*N
# Annotate the pipeline that also shows the highest discriminability (see the Discriminability section above)
pipetext[pipename.index('ips, db, long, decimate, linear, zalign')] = 'Highest discriminability'

pipecolor = ['rgb(158,202,225)']*N
pipecolor[pipename.index('ips, db, long, decimate, linear, zalign')] = 'rgb(8,48,107)'

data = [plotly.graph_objs.Bar(
        x = pipename,
        y = list(relsort[0:N].R),
        text = pipetext,
        marker = dict(
            color = pipecolor,
            line = dict(
                color = 'rgb(8,48,107)',
                width = 1.5
                )
            ),
        opacity = 0.6
        )]

# Format the layout
layout = {'xaxis':{'tickangle':315},
          'yaxis':{'title':"Reliability (Intraclass correlation)",
                   'range':[0,1],'tickformat':'.3f'},
          'title':'Preprocessing and Analysis Pipelines: Highest Reliability',
          'showlegend':False,
          'margin':{'b':225,'l':150}}

# Plot the data
fig = plotly.graph_objs.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)