notebook.community



In [1]:

    
import sys
import glob
import re
import fnmatch
import math
import os
from os import listdir
from os.path import join, isfile, basename

import itertools

import numpy as np
from numpy import float32, int32, uint8, dtype, genfromtxt

from scipy.stats import ttest_ind

import pandas as pd

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

import colorsys



In [2]:

    
# VFB labels and label names
labels = [16,64,8,32,2,4,65,66,33,67,34,17,69,70,35,71,9,18,72,36,73,74,37,75,19,76,38,77,39,78,79,20,5,40,80,10,81,82,83,84,85,86,11,22,23,24,12,3,6,49,50,25,51,13,52,26,53,27,54,55,56,28,7,14,57,58,29,59,30,60,15,61,31,62,63]
label_names_file = '/groups/saalfeld/home/bogovicj/vfb/DrosAdultBRAINdomains/refData/Original_Index.tsv'

label_names = pd.read_csv( label_names_file, delimiter='\t', header=0 )
# print label_names[ label_names['Stack id'] == 11 ]['JFRCtempate2010.mask130819' ].iloc[0]
# print label_names[ label_names['Stack id'] == 70 ]['JFRCtempate2010.mask130819' ].iloc[0]

def get_label_name( label_id ):
    return label_names[ label_names['Stack id'] == label_id ]['JFRCtempate2010.mask130819' ].iloc[0]

 # label_names['JFRCtempate2010.mask130819']

label_shorthand_col ='JFRCtempate2010.mask130819'
label_id_col ='Stack id'


# Find left-right matching labels
rnames = label_names[ label_names.apply( lambda x : x[label_shorthand_col].endswith('_R'), axis=1 )]

lr_pair_list = []
for rn in rnames.loc[:,label_shorthand_col]:
    ln = rn.replace('_R','_L')
    id_R = label_names[ label_names[label_shorthand_col]==rn ].loc[:,label_id_col]
    id_L = label_names[ label_names[label_shorthand_col]==ln ].loc[:,label_id_col]
    lr_pair_list += [[id_R.values[0], id_L.values[0]]]
lr_pair_list = np.array( lr_pair_list )



In [3]:

    
path='/nrs/saalfeld/john/projects/flyChemStainAtlas/all_evals/F-antsFlip/cmtkCow/evalComp/stats0.txt'

def readlines( f ):
    f = open( f, 'r' )
    lines = f.readlines()
    f.close()
    return lines

def labelstat( lines, label, stat='MEDIAN'):
    statupper = stat.upper()
    for l in lines:
        if l.startswith(str(label)):
            if l.find(statupper) >= 0:
                return float( l.split()[2] )



In [4]:

    
base_dir = '/nrs/saalfeld/john/projects/flyChemStainAtlas/all_evals'

# templates = ['JFRCtemplate2010']
# reg_methods = [ 'cmtkCow', 'cmtkCOG', 'antsRegYang' ]

templates = ['JFRCtemplate2010', 'JFRC2013_lo', 'F-antsFlip_lo', 'F-cmtkFlip_lof', 'TeforBrain_f']
# reg_methods = [ 'cmtkCow', 'cmtkCOG', 'cmtkHideo' ]
# reg_methods = [ 'antsRegOwl', 'antsRegDog', 'antsRegYang' ]
reg_methods = [ 'cmtkCow', 'cmtkCOG', 'cmtkHideo', 'antsRegOwl', 'antsRegDog', 'antsRegYang' ]



In [5]:

    
# parse all data into a data frame

expnamelist=[]
algnamelist=[]
templatelist=[]
statlist=[]
datalist=[]
labellist=[]
labelnamelist=[]
linelist=[]
linelabellist=[]

for template,reg in itertools.product( templates, reg_methods ):

    exp_dir = join( base_dir, template, reg )
#     print exp_dir
    name_f = join(exp_dir,'name')
    if os.path.isfile( name_f ):
        expname = readlines( join(exp_dir,'name'))[0]
    else:
        expname = '{},{}'.format(template,reg)

#     print expname 
    for line in [0,1,2,3]:
        eval_dir=join(exp_dir,'evalComp')
#         print eval_dir
        # Read label stats
        statFile = '{}/stats{}.txt'.format(eval_dir,line)
        stat_lines = readlines( statFile )
        for sl in stat_lines:
            dat = sl.split()
            label = int(dat[0])
            line_label = str(line) + '_' + str(label)
           
            expnamelist += [ expname.replace('\n','') ]
            algnamelist += [ reg ]
            templatelist += [ template ]
            linelist += [ line ]
            labellist += [ label ]
            linelabellist += [ line_label ]
            labelnamelist += [ get_label_name(label) ]
            statlist += [ dat[1] ]
            datalist += [ float(dat[2]) ]
            
        # Read total brain mask stats
        allStatFile = '{}/all_stats{}.txt'.format(eval_dir,line)
        all_stat_lines = readlines( allStatFile )
        label = -1
        for sl in all_stat_lines:
            if( sl.startswith('all ')):
                line_label = str(line) + '_' + str(label)
                dat = sl.split()
                expnamelist += [ expname.replace('\n','') ]
                algnamelist += [ reg ]
                templatelist += [ template ]
                linelist += [ line ]
                labellist += [ label ] # -1 indicates total mask label
                linelabellist += [ line_label ]
                labelnamelist += [ 'ALL' ]
                statlist += [ dat[1] ]
                datalist += [ float(dat[2]) ]


dist_df = pd.DataFrame( {'EXP':expnamelist, 
                         'ALG':algnamelist, 
                         'TEMPLATE':templatelist, 
                         'STAT':statlist,
                         'LINE':linelist,
                         'LABEL':labellist,
                         'LINELABEL':linelabellist,
                         'LABELNAME':labelnamelist,
                         'VALUE':datalist })

Determine the best algorithm

Below is average over median and mean distances per line-label, though it may be worth it to also filter by "useful" line-labels, but this should not matter.



In [6]:

    
# Group by algorithm, then average over mean distances
avg_avg_dists = dist_df.loc[(dist_df.STAT == 'MEAN'),['ALG','VALUE']].groupby(['ALG'],as_index=False).mean()
aads = avg_avg_dists.sort_values('VALUE', ascending=True)
print( 'best by avg_avg is: ',aads.iloc[0]['ALG'])


# Group by algorithm, then average over median distances
avg_med_dists = dist_df.loc[(dist_df.STAT == 'MEDIAN'),['ALG','VALUE']].groupby(['ALG'],as_index=False).mean()
amds = avg_med_dists.sort_values('VALUE', ascending=True)
print( 'best by avg_med is: ', amds.iloc[0]['ALG'])

print( ' ' )
print( ' ' )
print( aads )
print( ' ' )
print ( ' ' )
print( amds )









    



best by avg_avg is:  cmtkCOG
best by avg_med is:  cmtkCOG
 
 
           ALG     VALUE
3      cmtkCOG  6.565228
5    cmtkHideo  6.602061
0   antsRegDog  6.733418
1   antsRegOwl  6.806076
4      cmtkCow  6.846528
2  antsRegYang  6.980998
 
 
           ALG     VALUE
3      cmtkCOG  5.320492
5    cmtkHideo  5.379651
0   antsRegDog  5.499060
4      cmtkCow  5.695244
1   antsRegOwl  5.711444
2  antsRegYang  5.896575

Significance tests

and other tests



In [5]:

    
# Load combined data
# dist_samples_f = '/nrs/saalfeld/john/projects/flyChemStainAtlas/all_evals/label_data_line3.csv.gz'
# dist_samples_df = pd.read_csv( dist_samples_f, header=None, names=['TEMPLATE','ALG','LINE','LABEL','DISTANCE'] )



In [8]:

    
# dist_0_70 = dist_samples_df[ ((dist_samples_df.ALG == 'cmtkCOG') | (dist_samples_df.ALG == 'antsRegDog')) & 
#                (dist_samples_df.LINE == 0) & (dist_samples_df.LABEL == 70)]



In [9]:

    
# # dist_0_70.sample(500)
# print( dist_0_70.shape )
# dist_0_70_fantsgrp = dist_0_70[(dist_0_70.TEMPLATE == 'F-antsFlip_lo')]
# print( dist_0_70_fantsgrp.shape )









    



(15165270, 5)
(3141130, 5)



In [10]:

    
# d_cog = dist_0_70_fantsgrp[ dist_0_70_fantsgrp.ALG == 'cmtkCOG']
# print( d_cog.shape )

# d_yang = dist_0_70_fantsgrp[ dist_0_70_fantsgrp.ALG == 'antsRegDog']
# print( d_yang.shape )

# t,p = ttest_ind( d_cog['DISTANCE'], d_yang['DISTANCE'])
# print( 't:', t )
# print( 'p:', p )









    



(1584440, 5)
(1556690, 5)
t: 4.56048161292
p: 5.10383544402e-06



In [11]:

    
# print( 'cog: ', d_cog.median())
# print( 'dog: ', d_yang.median())









    



cog:  LINE         0.000000
LABEL       70.000000
DISTANCE     2.915476
dtype: float64
dog:  LINE         0.000000
LABEL       70.000000
DISTANCE     2.915476
dtype: float64



In [12]:

    
# print( 'cog: ', d_cog.DISTANCE.mean())
# print( 'dog: ', d_yang.DISTANCE.mean())









    



cog:  3.19841645534
dog:  3.18909376434



In [13]:

    
# d_cog_yang = dist_0_70_fantsgrp[ (dist_0_70_fantsgrp.ALG == 'cmtkCOG') | (dist_0_70_fantsgrp.ALG == 'antsRegDog') ]
# sns.violinplot( y=d_cog_yang.DISTANCE, x=d_cog_yang.ALG )









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fcaa54afa90>

Determine the best algorithm

Without averaging across labels



In [44]:

    
# dist_samples_df.head()

# tmp = dist_samples_df.sample(100000)
# tmp

for line in [0,1,2,3]:
# for line in [3]:
    print( 'line: ', line )
    
    dist_samples_f = '/nrs/saalfeld/john/projects/flyChemStainAtlas/all_evals/label_data_line{}.csv.gz'.format( line )
    dist_samples_df = pd.read_csv( dist_samples_f, header=None, names=['TEMPLATE','ALG','LINE','LABEL','DISTANCE'] )

    line_templAlg_sorted = dist_samples_df.groupby( ['ALG','TEMPLATE'], as_index=False ).mean().sort_values( 'DISTANCE', ascending=True )
    line_templAlg_sorted = line_templAlg_sorted.reset_index(drop=True).reset_index()

    # Rank the algs and templates above

    # print( line3_templAlg_sorted.index[ line3_templAlg_sorted.ALG == 'antsRegDog' ].tolist())
    # print( line3_templAlg_sorted.index[ line3_templAlg_sorted.ALG == 'cmtkCOG' ].tolist())

    # line3_templAlg_sorted[['ALG','index']]
    print( 'rank by alg' )
    print( line_templAlg_sorted[['ALG','index']].groupby(['ALG'], as_index=False ).mean().sort_values('index', ascending=True ))
    print()
    print( 'rank by template' )
    print( line_templAlg_sorted[['TEMPLATE','index']].groupby(['TEMPLATE'], as_index=False ).mean().sort_values('index', ascending=True ))
    print( ' ' )
    print( 'avg by alg' )
    print( line_templAlg_sorted.groupby( ['ALG'], as_index=False ).mean().sort_values('DISTANCE', ascending=True ))
    print( ' ' )
    print( 'avg by template' )
    print( line_templAlg_sorted.groupby( ['TEMPLATE'], as_index=False ).mean().sort_values('DISTANCE', ascending=True ))
    print( ' ' )
    print( ' ' )
    print( '##################################################' )
    print( ' ' )
    print( ' ' )









    



line:  0
rank by alg
           ALG  index
0   antsRegDog    8.4
1   antsRegOwl   10.5
2  antsRegYang   14.6
3      cmtkCOG   15.8
5    cmtkHideo   16.0
4      cmtkCow   18.0

rank by template
           TEMPLATE      index
0     F-antsFlip_lo   3.500000
1    F-cmtkFlip_lof  12.333333
2       JFRC2013_lo  13.666667
4      TeforBrain_f  20.800000
3  JFRCtemplate2010  20.833333
 
avg by alg
           ALG  index  LINE      LABEL  DISTANCE
0   antsRegDog    8.4   0.0  44.888686  4.705870
1   antsRegOwl   10.5   0.0  44.203547  4.738695
2  antsRegYang   14.6   0.0  44.402953  4.951090
5    cmtkHideo   16.0   0.0  44.966749  5.189394
3      cmtkCOG   15.8   0.0  44.909168  5.234582
4      cmtkCow   18.0   0.0  44.741441  5.294709
 
avg by template
           TEMPLATE      index  LINE      LABEL  DISTANCE
0     F-antsFlip_lo   3.500000   0.0  44.488716  4.336102
1    F-cmtkFlip_lof  12.333333   0.0  45.119346  4.831834
2       JFRC2013_lo  13.666667   0.0  44.539307  4.871741
4      TeforBrain_f  20.800000   0.0  44.443634  5.558863
3  JFRCtemplate2010  20.833333   0.0  44.876132  5.633436
 
 
##################################################
 
 
line:  1
rank by alg
           ALG  index
3      cmtkCOG    8.8
0   antsRegDog   10.2
5    cmtkHideo   11.2
4      cmtkCow   16.8
1   antsRegOwl   18.0
2  antsRegYang   19.8

rank by template
           TEMPLATE      index
0     F-antsFlip_lo   7.000000
1    F-cmtkFlip_lof   9.333333
4      TeforBrain_f  13.000000
3  JFRCtemplate2010  19.000000
2       JFRC2013_lo  21.500000
 
avg by alg
           ALG  index  LINE      LABEL  DISTANCE
3      cmtkCOG    8.8   1.0  47.921568  2.960315
5    cmtkHideo   11.2   1.0  47.925397  3.002793
1   antsRegOwl   18.0   1.0  48.286257  3.081214
2  antsRegYang   19.8   1.0  48.269764  3.139502
4      cmtkCow   16.8   1.0  47.957299  3.301496
0   antsRegDog   10.2   1.0  48.021897  3.319318
 
avg by template
           TEMPLATE      index  LINE      LABEL  DISTANCE
0     F-antsFlip_lo   7.000000   1.0  48.215206  2.729950
1    F-cmtkFlip_lof   9.333333   1.0  48.255277  2.767243
4      TeforBrain_f  13.000000   1.0  48.292440  2.874880
3  JFRCtemplate2010  19.000000   1.0  48.150091  3.496489
2       JFRC2013_lo  21.500000   1.0  47.406501  3.767580
 
 
##################################################
 
 
line:  2
rank by alg
           ALG  index
3      cmtkCOG    7.6
5    cmtkHideo    9.2
0   antsRegDog   10.2
4      cmtkCow   16.2
1   antsRegOwl   19.0
2  antsRegYang   22.8

rank by template
           TEMPLATE      index
0     F-antsFlip_lo   9.000000
1    F-cmtkFlip_lof  10.000000
2       JFRC2013_lo  12.333333
4      TeforBrain_f  17.600000
3  JFRCtemplate2010  21.666667
 
avg by alg
           ALG  index  LINE      LABEL  DISTANCE
3      cmtkCOG    7.6   2.0  47.619700  4.108975
5    cmtkHideo    9.2   2.0  47.570820  4.147486
4      cmtkCow   16.2   2.0  47.647695  4.505606
1   antsRegOwl   19.0   2.0  47.256087  4.748986
0   antsRegDog   10.2   2.0  46.398187  5.015768
2  antsRegYang   22.8   2.0  47.361893  5.063077
 
avg by template
           TEMPLATE      index  LINE      LABEL  DISTANCE
0     F-antsFlip_lo   9.000000   2.0  47.134169  4.153830
1    F-cmtkFlip_lof  10.000000   2.0  47.397519  4.173474
2       JFRC2013_lo  12.333333   2.0  47.923393  4.436108
4      TeforBrain_f  17.600000   2.0  47.803323  4.753719
3  JFRCtemplate2010  21.666667   2.0  46.378120  5.475240
 
 
##################################################
 
 
line:  3
rank by alg
           ALG  index
0   antsRegDog    7.0
3      cmtkCOG    7.6
5    cmtkHideo   10.6
4      cmtkCow   18.2
1   antsRegOwl   18.5
2  antsRegYang   23.0

rank by template
           TEMPLATE      index
0     F-antsFlip_lo   7.166667
1    F-cmtkFlip_lof   9.333333
4      TeforBrain_f  13.600000
2       JFRC2013_lo  19.333333
3  JFRCtemplate2010  20.500000
 
avg by alg
           ALG  index  LINE      LABEL  DISTANCE
0   antsRegDog    7.0   3.0  47.245202  4.410520
3      cmtkCOG    7.6   3.0  47.537041  4.425244
5    cmtkHideo   10.6   3.0  47.531707  4.508683
1   antsRegOwl   18.5   3.0  47.206720  4.960564
4      cmtkCow   18.2   3.0  47.581324  4.981830
2  antsRegYang   23.0   3.0  46.949806  5.299180
 
avg by template
           TEMPLATE      index  LINE      LABEL  DISTANCE
0     F-antsFlip_lo   7.166667   3.0  47.219908  4.399196
1    F-cmtkFlip_lof   9.333333   3.0  47.410344  4.461236
4      TeforBrain_f  13.600000   3.0  47.594491  4.768142
3  JFRCtemplate2010  20.500000   3.0  47.124348  5.059643
2       JFRC2013_lo  19.333333   3.0  47.425371  5.101397
 
 
##################################################



In [46]:

    
dist_samples_f = '/nrs/saalfeld/john/projects/flyChemStainAtlas/all_evals/label_data.csv.gz'
dist_samples_df = pd.read_csv( dist_samples_f, header=None, names=['TEMPLATE','ALG','LINE','LABEL','DISTANCE'] )

line_templAlg_sorted = dist_samples_df.groupby( ['ALG','TEMPLATE'], as_index=False ).mean().sort_values( 'DISTANCE', ascending=True )
line_templAlg_sorted = line_templAlg_sorted.reset_index(drop=True).reset_index()

# Rank the algs and templates above

print( 'rank by alg' )
print( line_templAlg_sorted[['ALG','index']].groupby(['ALG'], as_index=False ).mean().sort_values('index', ascending=True ))
print()
print( 'rank by template' )
print( line_templAlg_sorted[['TEMPLATE','index']].groupby(['TEMPLATE'], as_index=False ).mean().sort_values('index', ascending=True ))
print( ' ' )
print( 'avg by alg' )
print( line_templAlg_sorted.groupby( ['ALG'], as_index=False ).mean().sort_values('DISTANCE', ascending=True ))
print( ' ' )
print( 'avg by template' )
print( line_templAlg_sorted.groupby( ['TEMPLATE'], as_index=False ).mean().sort_values('DISTANCE', ascending=True ))
print( ' ' )
print( ' ' )
print( '##################################################' )
print( ' ' )
print( ' ' )









    



rank by alg
           ALG  index
0   antsRegDog   8.80
1   antsRegOwl  11.25
3      cmtkCOG  15.00
2  antsRegYang  15.20
5    cmtkHideo  15.60
4      cmtkCow  17.60

rank by template
           TEMPLATE      index
0     F-antsFlip_lo   3.666667
1    F-cmtkFlip_lof  10.666667
2       JFRC2013_lo  14.666667
4      TeforBrain_f  20.400000
3  JFRCtemplate2010  21.666667
 
avg by alg
           ALG  index      LINE      LABEL  DISTANCE
1   antsRegOwl  11.25  0.302796  44.870926  4.588780
0   antsRegDog   8.80  0.300578  45.333607  4.596240
2  antsRegYang  15.20  0.300400  45.033537  4.794298
5    cmtkHideo  15.60  0.306425  45.494156  4.890668
3      cmtkCOG  15.00  0.304509  45.446431  4.921243
4      cmtkCow  17.60  0.303615  45.315075  5.044734
 
avg by template
           TEMPLATE      index      LINE      LABEL  DISTANCE
0     F-antsFlip_lo   3.666667  0.304225  45.088382  4.173977
1    F-cmtkFlip_lof  10.666667  0.302361  45.623666  4.585219
2       JFRC2013_lo  14.666667  0.306336  45.120316  4.739927
4      TeforBrain_f  20.400000  0.302914  45.116535  5.229292
3  JFRCtemplate2010  21.666667  0.299452  45.336811  5.408308
 
 
##################################################

Determine the best template (using the best algorithm)



In [14]:

    
# Statistics for the best algorithm
best_alg_dists = dist_df[dist_df.ALG == 'cmtkCOG']


best_alg_avg_avg_dists = dist_df.loc[(dist_df.STAT == 'MEAN'),['TEMPLATE','VALUE']].groupby(['TEMPLATE'],as_index=False).mean()
sorted_best_by_avg = best_alg_avg_avg_dists.sort_values('VALUE', ascending=True)
print( 'best by avg_avg is: ', sorted_best_by_avg.iloc[0]['TEMPLATE'])
print(' ')
print('ordering:')
print(sorted_best_by_avg)
print(' ')
print(' ')

best_alg_avg_med_dists = dist_df.loc[(dist_df.STAT == 'MEDIAN'),['TEMPLATE','VALUE']].groupby(['TEMPLATE'],as_index=False).mean()
sorted_best_by_med = best_alg_avg_med_dists.sort_values('VALUE', ascending=True)
print( 'best by avg_med is: ', sorted_best_by_med.iloc[0]['TEMPLATE'] )
print(' ')
print('ordering:')
print(sorted_best_by_med)









    



best by avg_avg is:  F-antsFlip_lo
 
ordering:
           TEMPLATE     VALUE
0     F-antsFlip_lo  6.268178
1    F-cmtkFlip_lof  6.518722
2       JFRC2013_lo  6.863953
4      TeforBrain_f  6.918917
3  JFRCtemplate2010  7.207540
 
 
best by avg_med is:  F-antsFlip_lo
 
ordering:
           TEMPLATE     VALUE
0     F-antsFlip_lo  5.132273
1    F-cmtkFlip_lof  5.320864
4      TeforBrain_f  5.706866
2       JFRC2013_lo  5.769926
3  JFRCtemplate2010  5.987649

Determining the line-labels with the most sample



In [24]:

    
dist_by_linelabel = (dist_df.loc[ (dist_df.LABEL > 0) & (dist_df.STAT == 'COUNT'), ['LINELABEL','VALUE'] ]).sort_values('VALUE', ascending=False)
# dist_by_linelabel.tail(50)

Cluster line-labels



In [25]:

    
# get line-label wise statistics for the best algorithm and template
best_exp_df = dist_df[ (dist_df.ALG == 'cmtkCOG') & (dist_df.TEMPLATE == 'F-antsFlip_lo') & (dist_df.LABEL > 0 )]

# Reorganize
best_exp_df_4cluster = best_exp_df.pivot(index='LINELABEL', columns='STAT', values='VALUE')
# print( best_exp_df_4cluster.head() )

log_counts = best_exp_df_4cluster.COUNT.map( lambda x: math.log(x))

# sns.kdeplot( best_exp_df_4cluster.MEAN, best_exp_df_4cluster.STDDEV )

sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

# points = plt.scatter( best_exp_df_4cluster['MEAN'],  best_exp_df_4cluster['STDDEV'],
#            c=log_counts, alpha=0.4, cmap="viridis")
points = plt.scatter( log_counts, best_exp_df_4cluster['MEAN'], 
           c=best_exp_df_4cluster['STDDEV'], alpha=0.4, cmap="viridis")
plt.xlabel('COUNT')
plt.ylabel('MEAN')
plt.title('Colored by STDDEV')
plt.colorbar( points )

fig = plt.gcf()
a = fig.set_size_inches( 16, 12 )

The above isn't so clearly "clustered" to my eye, but I do wonder about those line-labels with very high mean distance



In [26]:

    
# What line-labels have such a high mean?

best_exp_df_4cluster[best_exp_df_4cluster.MEAN > 20 ]

Okay, so some of these have small or very small sample size, but what of the two with > 10k samples..?



In [54]:

    
# For each line, Sort labels by average size
gb_line_label = dist_df[ (dist_df.STAT == 'COUNT') ].groupby(['LINE','LABEL','LABELNAME','LINELABEL'], as_index=False )
line_label_avgs = gb_line_label.mean().sort_values('VALUE', ascending=False)



In [45]:

    
# Figure out colors
numTemplates = 4
numRegalgs = 6

plot_colors = []

for ti,ai in itertools.product( range(numTemplates), range(numRegalgs)):
#     print ti,ai
    plot_colors += [ colorsys.hsv_to_rgb( float(ti)/numTemplates, 0.25 + 0.75*float(ai)/numRegalgs, 1.0  )]



In [47]:

    
# For each line, plot the median distance over templates/regMethod

# Only plot for the top N labels
N = 10
line = 0

plt.rc('legend',fontsize=7) # using a size in points

for line in [0,1,2,3]:
    #     print line
    # Get the top N labels for this line
#     l = line_label_avgs[ (line_label_avgs.LINE == line ) & (line_label_avgs.LABEL > 0) ].head( N ).LABEL
    ln = line_label_avgs[ (line_label_avgs.LINE == line ) & (line_label_avgs.LABEL > 0) ].head( N ).LABELNAME
    med_df = dist_df[(dist_df.STAT == 'MEDIAN') & (dist_df.LINE == line) & (dist_df.LABEL > 0) ]
#     print med_df.head()
    
#     med_df_piv = med_df[['EXP','LABEL','VALUE']].pivot( index='LABEL', columns='EXP', values='VALUE' )
#     med_df_piv.loc[ l ].plot.bar( color=plot_colors )
    med_df_piv = med_df[['EXP','LABELNAME','VALUE']].pivot( index='LABELNAME', columns='EXP', values='VALUE' )
    med_df_piv.loc[ ln ].plot.bar( color=plot_colors )
    
    fig = plt.gcf()
    a = fig.set_size_inches( 18, 8 )



In [48]:

    
# For each line, plot the median distance over all labels for templates/regMethod 

for line in [0,1,2,3]:
    med_df = dist_df[(dist_df.STAT == 'MEDIAN') & (dist_df.LINE == line) & (dist_df.LABEL == -1)]
    med_df[['EXP','VALUE']].set_index('EXP').plot.bar()



In [112]:

    
#sz_templates = [ 'F-antsFlip', 'F-antsFlip_lo', 'F-antsFlip_1p52', 'F-antsFlip_2p4']
sz_templates = [  'F-antsFlip_2p4iso', 'F-antsFlip_1p2iso', 'F-antsFlip_lo', 'F-antsFlip' ]
sz_reg_methods = ['cmtkCOG']

exp_order = []

# parse all data into a data frame
expnamelist=[]
statlist=[]
datalist=[]
labellist=[]
linelist=[]

for template,reg in itertools.product( sz_templates, sz_reg_methods ):

    exp_dir = join( base_dir, template, reg )
#     print exp_dir
    name_f = join(exp_dir,'name')
    if os.path.isfile( name_f ):
        expname = readlines( join(exp_dir,'name'))[0].rstrip()
    else:
        expname = '{},{}'.format(template,reg)

    exp_order += [expname]
    
    for line in [0,1,2,3]:
        eval_dir=join(exp_dir,'evalComp')
        # Read label stats
        statFile = '{}/stats{}.txt'.format(eval_dir,line)
        stat_lines = readlines( statFile )
        for sl in stat_lines:
            dat = sl.split()
            expnamelist += [ expname.replace('\n','') ]
            linelist += [ line ]
            labellist += [ int(dat[0]) ]
            statlist += [ dat[1] ]
            datalist += [ float(dat[2]) ]
            
        # Read total brain mask stats
        allStatFile = '{}/all_stats{}.txt'.format(eval_dir,line)
        all_stat_lines = readlines( allStatFile )
        for sl in all_stat_lines:
            if( sl.startswith('all ')):
                dat = sl.split()
                expnamelist += [ expname.replace('\n','') ]
                linelist += [ line ]
                labellist += [ -1 ] # -1 indicates total mask label
                statlist += [ dat[1] ]
                datalist += [ float(dat[2]) ]


sz_dist_df = pd.DataFrame( {'EXP':expnamelist, 
                         'STAT':statlist,
                         'LINE':linelist,
                         'LABEL':labellist,
                         'VALUE':datalist })



In [51]:

    
# For each line, Sort labels by average size
sz_gb_line_label = sz_dist_df[ (sz_dist_df.STAT == 'COUNT') ].groupby(['LINE','LABEL'], as_index=False )
sz_line_label_avgs = sz_gb_line_label.mean().sort_values('VALUE', ascending=False)

# Figure out colors
numTemplates = len(exp_order)
numRegalgs = 1

plot_colors = []

for ti,ai in itertools.product( range(numTemplates), range(numRegalgs)):
#     print ti,ai
    plot_colors += [ colorsys.hsv_to_rgb( float(ti)/numTemplates, 0.7 + float(ai)/numRegalgs, 1.0  )]


# For each line, plot the median distance over templates/regMethod
# Only plot for the top N labels
N = 10
line = 0

plt.rc('legend',fontsize=7) # using a size in points

for line in [0,1,2,3]:
    #     print line
    # Get the top N labels for this line
    l = sz_line_label_avgs[ (sz_line_label_avgs.LINE == line ) & (sz_line_label_avgs.LABEL > 0) ].head( N ).LABEL
    med_df = sz_dist_df[(sz_dist_df.STAT == 'MEDIAN') & (sz_dist_df.LINE == line) & (sz_dist_df.LABEL > 0) ]
    med_df_piv = med_df[['EXP','LABEL','VALUE']].pivot( index='LABEL', columns='EXP', values='VALUE' )
    med_df_piv[exp_order].loc[ l ].plot.bar( color=plot_colors )
    
    fig = plt.gcf()
    a = fig.set_size_inches( 18, 8 )

STAT	COUNT	MAX	MEAN	MEDIAN	MIN	STDDEV
LINELABEL
1_4	326.0	39.648457	23.095683	21.942920	8.746428	8.313019
2_25	18024.0	96.044258	34.493334	21.696773	0.000000	28.713701
2_71	28792.0	89.218834	31.513696	24.372116	0.000000	20.716863
3_22	3214.0	80.541916	25.355005	14.662878	0.000000	22.657337
3_24	212.0	50.690235	21.606401	19.176799	1.581139	11.460998
3_35	10.0	21.610184	20.124840	20.309914	18.234583	1.044434
3_69	3578.0	82.416626	23.197841	20.772564	0.000000	17.468560
3_75	334.0	68.996376	24.691066	21.103878	3.000000	16.330725
3_8	2.0	42.243343	22.779984	22.779984	3.316625	19.463359