In [28]:
import sys
import glob
import re
import fnmatch
from os import listdir
from os.path import join, isfile, basename
import numpy as np
from numpy import float32, int32, uint8, dtype, genfromtxt
import pandas
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
# VFB labels and label names
labels = [16,64,8,32,2,4,65,66,33,67,34,17,69,70,35,71,9,18,72,36,73,74,37,75,19,76,38,77,39,78,79,20,5,40,80,10,81,82,83,84,85,86,11,22,23,24,12,3,6,49,50,25,51,13,52,26,53,27,54,55,56,28,7,14,57,58,29,59,30,60,15,61,31,62,63]
label_names_file = '/groups/saalfeld/home/bogovicj/vfb/DrosAdultBRAINdomains/refData/Original_Index.tsv'
label_names = pandas.read_csv( label_names_file, delimiter='\t', header=0 )
# print label_names[ label_names['Stack id'] == 11 ]['JFRCtempate2010.mask130819' ].iloc[0]
# print label_names[ label_names['Stack id'] == 70 ]['JFRCtempate2010.mask130819' ].iloc[0]
def get_label_name( label_id ):
return label_names[ label_names['Stack id'] == label_id ]['JFRCtempate2010.mask130819' ].iloc[0]
# label_names['JFRCtempate2010.mask130819']
label_shorthand_col ='JFRCtempate2010.mask130819'
label_id_col ='Stack id'
# Find left-right matching labels
rnames = label_names[ label_names.apply( lambda x : x[label_shorthand_col].endswith('_R'), axis=1 )]
lr_pair_list = []
for rn in rnames.loc[:,label_shorthand_col]:
ln = rn.replace('_R','_L')
id_R = label_names[ label_names[label_shorthand_col]==rn ].loc[:,label_id_col]
id_L = label_names[ label_names[label_shorthand_col]==ln ].loc[:,label_id_col]
lr_pair_list += [[id_R.values[0], id_L.values[0]]]
lr_pair_list = np.array( lr_pair_list )
# print lr_pair_list
In [3]:
ids = np.unique( label_names.as_matrix( columns=['Stack id']))
print ids
In [4]:
# Relevant directories
neuron_list_file="/nrs/saalfeld/john/projects/flyChemStainAtlas/eval/prefix_list_by_neuron.txt"
f=open(neuron_list_file)
tmp=f.read().splitlines()
f.close()
neurons = [ n.split(' ') for n in tmp ]
print ' '
name1 = 'ants_jfrc2'
jfrcDir = '/nrs/saalfeld/john/projects/flyChemStainAtlas/eval/ants_jfrc2/evalComp'
print jfrcDir
name2 = 'ants-el-ag_owl'
grpDir = '/nrs/saalfeld/john/projects/flyChemStainAtlas/eval_f/ants-el-ag_owl/evalComp'
print grpDir
names = [ name1, name2 ]
print names
In [5]:
# jfrc_hist_sizes = pandas.read_csv( '/nrs/saalfeld/john/projects/flyChemStainAtlas/eval/ants_jfrc2/evalComp/totals.csv', header=None, names=['pair','count'] )
# grp_hist_sizes = pandas.read_csv( '/nrs/saalfeld/john/projects/flyChemStainAtlas/eval_f/ants-el-ag_owl/evalComp/totals.csv', header=None, names=['pair','count'] )
In [15]:
# Functions
def flatten( list_of_lists ):
return [ item for sublist in list_of_lists for item in sublist ]
def getCount( df_counts, pair_name ):
res = df_counts[ df_counts.pair == pair_name ]['count']
def normalizeHistogram( hist ):
hist[:,1] = hist[:,1]/np.sum( hist[:,1])
return hist
def plotHistBar( hist, title ):
x = hist[1:-1,0]
y = hist[1:-1,1]
plt.bar( x, y )
plt.xlabel('Distance')
plt.ylabel('Freq')
plt.title( title )
def plotHistsTogether( hists, names ):
for i in range( len( hists )):
hist = hists[ i ]
x = hist[1:-1,0]
y = hist[1:-1,1]
plt.plot( x, y )
plt.xlabel('Distance')
plt.ylabel('Freq')
plt.legend( names )
def findHistCsv( folder, label, line ):
testf = join( folder, 'combined__labelHist_{}_line{}.csv'.format( label, line ))
if isfile( testf ):
return testf
else:
print'could not find file for label {} and line {} in folder {}'.format( label, line, folder )
return None
def findHistStatCsv( line_names, folder, label, line ):
out = []
for prefix in line_names[ line ]:
pattern = '{}/{}*_labelHist_stats_{}.csv'.format(folder,prefix,label)
# print pattern
possibles = glob.glob( pattern )
#print 'possibles ', possibles
out += possibles
return out
def findCombinedDataCsv( folder, label, line ):
out = []
pattern = '{}/combined__labelData_{}_line{}.csv'.format(folder,label,line)
possibles = glob.glob( pattern )
out += possibles
return out
def concatenateCentiles( centile_files, cent=0.5 ):
out = np.array([[]])
for f in centile_files:
try:
stats = genfromtxt( f, delimiter=',' )
a = stats[1, np.argwhere(stats[0,:]==cent )]
out = np.concatenate( (out,a), axis=1)
except ValueError:
continue
return out
def getHistogramCounts( centile_files ):
out = np.array([[]])
def cheap_hist_percentile( hist_in, percentile ):
hist = hist_in[1:-1,:]
total=np.sum(hist[:,1])
cumsum=np.cumsum(hist[:,1])
return hist[ cumsum > ( percentile * total ), 0 ][ 0 ]
def counts( df_pair_label_counts, names ):
results = []
for n in names:
pair=basename( n.replace('_stats_','_'))
# print pair
thiscount = df_pair_label_counts[ df_pair_label_counts[ 'pair' ] == pair ]['count']
if thiscount.size != 1:
continue
results += [ thiscount.values ]
# print results
return np.mean( np.array( results ))
In [259]:
# Load the data
line = 1
sample_counts = {}
grp_all_data = {}
jrc_all_data = {}
sz = None
for l in labels:
jfrcDataFileList = findCombinedDataCsv( jfrcDir, l, line )
grpDataFileList = findCombinedDataCsv( grpDir, l, line )
# print grpDataFileList
# print jfrcDataFileList
grp_dat = genfromtxt( grpDataFileList[0], delimiter=',' )
jrc_dat = genfromtxt( jfrcDataFileList[0], delimiter=',' )
ng = len( grp_dat )
nj = len( jrc_dat )
grp_all_data[ l ] = grp_dat
jrc_all_data[ l ] = jrc_dat
sample_counts[ l ] = ( nj + ng )
# sample_counts = np.array( sample_counts )
# sample_counts[ np.isnan( sample_counts ) ] = 0
# # print sample_counts
# sort_idxs = ( np.argsort( sample_counts ) )[::-1] # Reverse so its in descending order
# # print sort_idxs
# sorted_counts = np.array( sample_counts )[ sort_idxs ]
# labels_sorted_by_n_samples = np.array( labels )[ sort_idxs ]
# labels_of_interest = labels_sorted_by_n_samples[ sorted_counts > 0 ]
# sorted_counts_of_interest = sorted_counts[ sorted_counts > 0 ]
# # print labels_of_interest
# # print sorted_counts_of_interest
# xlabels = flatten( [ [ str(get_label_name(l)), ''] for l in labels_sorted_by_n_samples ] )
In [258]:
# Decide on label ordering and compile x-labels
sorted_idxs = np.argsort( np.array( sample_counts.values() ))
# print 'sorted_idxs:'
# print sorted_idxs
# print ' '
sorted_labels = np.array( sample_counts.keys())[ sorted_idxs ]
sorted_labels = list(reversed(sorted_labels))
# print 'sorted_labels:'
# print sorted_labels
# print ' '
# print lr_pair_list[ np.where( lr_pair_list == 86 )[0], : ]
ordered_labels = []
xlabels = []
# Pick xlabels based on size, but make sure left and right are adjacent
for l in sorted_labels:
# print l
if l not in ordered_labels:
if l in lr_pair_list:
i,j = np.where( lr_pair_list == l )
jother = 0 if j==1 else 1
alsoaddme = lr_pair_list[i,jother][0]
if alsoaddme < l:
ordered_labels += [alsoaddme, l ]
else:
ordered_labels += [ l, alsoaddme ]
else:
ordered_labels += [l]
# print ' '
# print ordered_labels
xlabels = flatten( [ [ str(get_label_name(l)), ''] for l in ordered_labels ] )
# print xlabels
In [228]:
# make the sorted data set
print grp_all_data[27].shape
print jrc_all_data[27].shape
xpositions = []
small_space = 3
big_space = 6
p = 1
all_data = []
for l in ordered_labels:
all_data += [ jrc_all_data[ l ] ]
all_data += [ grp_all_data[ l ] ]
xpositions += [ p, p + small_space ]
p += ( big_space + small_space )
print xpositions
bp = plt.boxplot( all_data, positions=xpositions, whis=2.5, showfliers=False )
c1 = '#226078'
c2 = '#910033'
even = True
for box in bp['boxes']:
if even:
box.set( color=c1, linewidth=1)
else:
box.set( color=c2, linewidth=1)
even = not even
i = 0
for w in bp['whiskers']:
if i == 0 or i == 1:
w.set( color=c1, linewidth=0.5, linestyle='-')
else:
w.set( color=c2, linewidth=0.5, linestyle='-')
i += 1
if i == 4:
i = 0
even = True
for m in bp['medians']:
if even:
m.set( color=c1, linewidth=1)
else:
m.set( color=c2, linewidth=1)
even = not even
for c in bp['caps']:
c.set_visible( False )
even = True
for flier in bp['fliers']:
# flier.set_visible( False )
if even:
flier.set(marker='.', markeredgecolor=c1, markerfacecolor=c1, markersize=1, alpha=0.2 )
else:
flier.set(marker='.', markeredgecolor=c2, markerfacecolor=c2, markersize=1, alpha=0.2 )
even = not even
plt.legend( names )
ax = plt.gca()
ax.set_xticklabels( xlabels )
a = plt.setp( ax.get_xticklabels(), rotation=45, fontsize=8 )
fig = plt.gcf()
a = fig.set_size_inches( 16, 5 )
plt.savefig('/nrs/saalfeld/john/projects/flyChemStainAtlas/comparisons/all_data_byLabel.svg')
In [255]:
def color_boxplot_by_group( bp, groups, colors ):
for box,i in zip( bp['boxes'], np.arange(len( bp['boxes']))):
box.set( color=colors[groups[i]])
for med,i in zip( bp['medians'], np.arange(len( bp['medians']))):
# med.set( color=colors[groups[i]])
med.set( color='k')
# repeat because there are two whiskers for each box
for wskr,i in zip( bp['whiskers'], np.repeat( np.array(groups), 2, axis = 0 ) ):
wskr.set( color=colors[groups[i]])
In [256]:
some_data = all_data[0:6]
some_xpositions = xpositions[0:6]
print len( some_data )
print some_xpositions
groups = [0,1,0,1,0,1]
colors = ['#226078','#910033']
whisker_props = { 'linewidth' : 1.0, 'linestyle' : '-'}
box_props = { 'linewidth' : 2.0 }
med_props = { 'linewidth' : 2.0 }
max_scatter_pts = 100
scatter_radius = 0.5
print np.repeat( some_xpositions[0], 20 )
bp = plt.boxplot( some_data, positions=some_xpositions, whis=2, showcaps=False, showfliers=False,
boxprops=box_props, whiskerprops=whisker_props, medianprops=med_props )
for i in range( len( some_data )):
if( len(some_data[ i ]) <= max_scatter_pts ):
sample = some_data[ i ]
else:
sample = some_data[ i ][ np.random.choice( len(some_data[i]), max_scatter_pts)]
x = np.repeat( (1 + some_xpositions[i]), len( sample ))
x = x + ( scatter_radius * np.random.rand( *x.shape ) - (scatter_radius / 2) )
plt.plot( x, sample, linestyle='None', marker='o', markersize=8, markeredgewidth=0,
markerfacecolor=colors[groups[i]], alpha=0.1 )
plt.gca().set_xlim([0,25])
plt.gca().set_ylim([-0.5,12])
color_boxplot_by_group( bp, groups, colors )
fig = plt.gcf()
_ = fig.set_size_inches( 12, 9 )
plt.savefig('/groups/saalfeld/home/bogovicj/box-scatter-plot-style2.svg')