In [1]:

    
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')









    Out[1]:



In [2]:

    
#%connect_info
#%qtconsole
#%matplotlib inline

Main lib



In [3]:

    
import bokeh

Load basic, non-plotting libs



In [4]:

    
import pandas as pd
import numpy as np
import math as m

Load data



In [5]:

    
df = pd.read_csv('photoz.csv',header=0)
df.describe()









    Out[5]:






  
    
      
      z_spec
      z_photo
      z_photo_wgt
      z_photo_err
    
  
  
    
      count
      6071.000000
      6071.000000
      6071
      6071.000000
    
    
      mean
      0.490558
      0.489772
      1
      0.067040
    
    
      std
      0.158013
      0.110906
      0
      0.021950
    
    
      min
      0.005954
      0.061783
      1
      0.017339
    
    
      25%
      0.415064
      0.458071
      1
      0.055957
    
    
      50%
      0.512590
      0.528519
      1
      0.066252
    
    
      75%
      0.584309
      0.560872
      1
      0.076828
    
    
      max
      5.810223
      0.663229
      1
      0.198344



In [6]:

    
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure, hplot, vplot, gridplot
from bokeh.charts import Scatter, Histogram, defaults
from bokeh.models import BoxSelectTool, LassoSelectTool, Paragraph, Range1d



In [7]:

    
output_notebook()
#output_file('yuka.html')









    






    
    
        
        BokehJS successfully loaded.

z-spec vs photo-z

This is about analyse the quality of our photo-z estimation. A series of plots are to be explored here. That's all about it.



In [8]:

    
xs_label = 'z_spec';    xs = df[xs_label]
xp_label = 'z_photo';   xp = df[xp_label]

lim_scatter = [0,1]

nbins = 20

xs_min = xs.min()
xs_max = xs.max()
xp_min = xp.min()
xp_max = xp.max()

xs_bins = np.linspace(lim_scatter[0], lim_scatter[1], nbins)
xp_bins = np.linspace(lim_scatter[0], lim_scatter[1], nbins)

Scatter plot: z-spec vs photo-z



In [9]:

    
title = "z-spec .vs. photo-z"

TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"

# create the scatter plot
p_scatter = figure(title = title,
                   min_border = 10,
                   min_border_left = 50,
                   tools=TOOLS)

p_scatter.xaxis.axis_label = xs_label
p_scatter.x_range = Range1d(start = lim_scatter[0],
                            end = lim_scatter[1])

p_scatter.yaxis.axis_label = xp_label
p_scatter.y_range = Range1d(start = lim_scatter[0],
                            end = lim_scatter[1])

p_scatter.line(lim_scatter, lim_scatter, color='black', line_width=2)

p_scatter.scatter(xs, xp, size=3, color="#3A5785", alpha=0.5)


# Kernel Density Estimates
#
def kde(values,vmin=None,vmax=None):
    from scipy.stats import gaussian_kde
    kern = gaussian_kde(xs)
    vmin = vmin if vmin is not None else values.min()
    vmax = vmax if vmax is not None else values.max()
    _cov = kern.covariance[0][0]
    pnts = np.arange(vmin, vmax, 10*_cov)
    kde = kern(pnts)
    return kde,pnts

xs_kde,xs_grid = kde(xs,xs_min,xs_max)
hmax = xs_kde.max()*1.1

xp_kde,xp_grid = kde(xp,xp_min,xp_max)
vmax = xp_kde.max()*1.1


# Create the HORIZONTAL plot (former histogram)
#
#LINE_ARGS = dict(color="#3A5785", line_color=None)
#
p_kde_spec = figure(title = None,
                    plot_width = p_scatter.plot_width,
                    plot_height = p_scatter.plot_height/3,
                    x_range = p_scatter.x_range,
                    y_range = (0, hmax),
                    min_border = 10,
                    min_border_left = 50,
                    toolbar_location = None,
                    tools=TOOLS)
p_kde_spec.xgrid.grid_line_color = None
#
p_kde_spec.line(xs_grid, xs_kde)#,**LINE_ARGS)


# Create the VERTICAL plot (former histogram)
#
th = 42 # need to adjust for toolbar height, unfortunately
#
p_kde_photo = figure(title = None,
                    plot_width = p_scatter.plot_width/3,
                    plot_height = p_scatter.plot_height+th-10,
                    x_range = (0, vmax),
                    y_range = p_scatter.y_range,
                    min_border = 10,
                    min_border_top = th,
                    toolbar_location = None,
                    tools=TOOLS)
p_kde_photo.ygrid.grid_line_color = None
#
p_kde_photo.line(xp_kde, xp_grid)


# Let's adjust the borders
#
p_kde_photo.min_border_top = 80
p_kde_photo.min_border_left = 0
p_kde_photo.min_border_bottom = 50

p_kde_spec.min_border_top = 10
p_kde_spec.min_border_right = 10
p_kde_spec.min_border_left = 80

p_scatter.min_border_right = 10
p_scatter.min_border_left = 80
p_scatter.min_border_bottom = 50


# Arrange them (the plots) to a regular grid
#
layout = gridplot([[p_scatter,p_kde_photo],[p_kde_spec,None]])

show(layout)









    






    







    Out[9]:





<bokeh.io._CommsHandle at 0x7f3d94a1ca10>

Histogram 2D: z-spec vs photo-z



In [10]:

    
title = 'z-spec .vs. photo-z'

# create the scatter plot
p_hist2d = figure(title=title,
                  x_range = lim_scatter,
                  y_range = lim_scatter,
                  plot_width=600,
                  plot_height=600,
                  min_border=10,
                  min_border_left=50,
                  tools=TOOLS)
p_hist2d.select(BoxSelectTool).select_every_mousemove = False
p_hist2d.select(LassoSelectTool).select_every_mousemove = False

p_hist2d.xaxis.axis_label = xs_label
p_hist2d.yaxis.axis_label = xp_label
#p_hist2d.x_range = Range1d(start=0,end=1)
#p_hist2d.y_range = Range1d(start=0,end=1)

p_hist2d.line(lim_scatter, lim_scatter, color='light_gray', line_width=1)

# We will not plot the usual colorful/heatmap-like histogram, but a size-scaled one
# So the next steps are to compute the histogram-2d itself, then clean it (no zero-counte)
#  and (re)define the (x,y) grid to plot the points (scaled by the histogram bins counts).
#
hist2d, xs_edges, xp_edges = np.histogram2d(xp, xs, bins=(xp_bins,xs_bins))
assert np.array_equal(xs_edges,xs_bins)
assert np.array_equal(xp_edges,xp_bins)

# remove null bins
hist2d_xp_ind, hist2d_xs_ind = np.where(hist2d>0)

xp_edges = xp_edges[hist2d_xp_ind]
xs_edges = xs_edges[hist2d_xs_ind]
hist2d = hist2d[hist2d_xp_ind, hist2d_xs_ind]

# give them a size; no science here, just the old "looks-good"...
_hmin = hist2d.min()
_hmax = hist2d.max()
hist2d_scale = 2 + 10 * np.log2( (hist2d-_hmin) / (_hmax-_hmin) +1 )

# a DataFrame makes life easier when using Bokeh
#
_df = pd.DataFrame({'spec' :xs_edges,
                    'photo':xp_edges,
                    'scale':hist2d_scale})

r_hist2d = p_hist2d.square(x =    _df['spec'],
                           y =    _df['photo'],
                           size = _df['scale'],
                           color = "#3A5785",
                           alpha = 0.5)
#s = p.scatter(x=xs, y=xp, size=1, color="black")


# create the horizontal histogram
xs_hist, hedges = np.histogram(xs, bins=xs_bins,
                             range=lim_scatter)
hzeros = np.zeros(len(xs_hist)-1)
hmax = max(xs_hist)*1.1

LINE_ARGS = dict(color="#3A5785", line_color=None)

p_hist_spec = figure(toolbar_location=None,
                     plot_width=p_hist2d.plot_width,
                     plot_height=200,
                     x_range=p_hist2d.x_range,
                     y_range=(-hmax, hmax),
                     title=None,
                     min_border=10,
                     min_border_left=50,
                     tools=TOOLS)
p_hist_spec.xgrid.grid_line_color = None

p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=xs_hist, color="white", line_color="#3A5785")
hh1 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
hh2 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)


# create the vertical histogram
xp_hist, vedges = np.histogram(xp, bins=xp_bins,
                               range=lim_scatter)
vzeros = np.zeros(len(xp_hist)-1)
vmax = max(xp_hist)*1.1

th = 42 # need to adjust for toolbar height, unfortunately
p_hist_photo = figure(toolbar_location=None,
                      plot_width=200,
                      plot_height=p_hist2d.plot_height+th-10,
                      x_range=(-vmax, vmax),
                      y_range=p_hist2d.y_range,
                      title=None,
                      min_border=10,
                      min_border_top=th,
                      tools=TOOLS)
p_hist_photo.ygrid.grid_line_color = None
p_hist_photo.xaxis.major_label_orientation = -3.14/2

p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=xp_hist, color="white", line_color="#3A5785")
vh1 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
vh2 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)

p_hist_photo.min_border_top = 80
p_hist_photo.min_border_left = 0

p_hist_spec.min_border_top = 10
p_hist_spec.min_border_right = 10

p_hist2d.min_border_right = 10

layout = gridplot([[p_hist2d,p_hist_photo],
                   [p_hist_spec,None]])

def update(attr, old, new):
    inds = np.array(new['1d']['indices'])
    if len(inds) == 0 or len(inds) == len(xs_edges):
        hhist1, hhist2 = hzeros, hzeros
        vhist1, vhist2 = vzeros, vzeros
    else:
        #xs_edges[inds]
        #inds_orig_xs = 
        neg_inds = np.ones_like(xs, dtype=np.bool)
        neg_inds[inds] = False
        hhist1, _ = np.histogram(xs[inds_xs], bins=hedges)
        vhist1, _ = np.histogram(xp[inds_xp], bins=vedges)
        hhist2, _ = np.histogram(xs[neg_inds_xs], bins=hedges)
        vhist2, _ = np.histogram(xp[neg_inds_xp], bins=vedges)

    hh1.data_source.data["top"]   =  hhist1
    hh2.data_source.data["top"]   = -hhist2
    vh1.data_source.data["right"] =  vhist1
    vh2.data_source.data["right"] = -vhist2

r_hist2d.data_source.on_change('selected', update)
show(layout)









    






    







    Out[10]:





<bokeh.io._CommsHandle at 0x7f3d91128c90>



In [11]:

    
from bokeh.models import CrosshairTool,HoverTool
#TOOLS = 'box_zoom,box_select,crosshair,resize,reset'
TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'

p_hists = figure(tools=TOOLS)
p_hists.xgrid.grid_line_color = None
p_hists.ygrid.minor_grid_line_color = 'gray'
p_hists.ygrid.minor_grid_line_alpha = 0.5
p_hists.ygrid.minor_grid_line_dash = 'dashed'

p_hists.select(CrosshairTool).dimensions = ['height']

p_hists.select(HoverTool).mode = 'hline'
p_hists.select(HoverTool).tooltips = [("z: ","$x")]

x_label = 'z_spec'
y_label = 'z_photo'

x = df[x_label]
y = df[y_label]

#nbins = 50
xmin = 0#x.min()
xmax = 1#x.max()
ymin = 0#y.min()
ymax = 1#y.max()


bins = np.linspace(xmin,xmax,nbins)
hs,b = np.histogram(x,bins=bins,normed=False)
p_hists.quad(top=hs,
               bottom=0,
               left=bins[:-1],
               right=bins[1:],
               fill_color="#036564",fill_alpha=0.5,
               legend="z-spec")

def histogram2stepfunction(hist,bins):
    h = hist
    b = bins
    h = h.tolist()
    hh = h+h
    hh[::2] = h
    hh[1::2] = h
    b = b.tolist()
    bb = b[:-1]+b[1:]
    bb.sort()
    assert len(hh)==len(bb)
    return bb,hh

hp,b = np.histogram(y,bins=bins,normed=False)
bb,hh = histogram2stepfunction(hp,b)
p_hists.line(x=bb,
       y=hh,
       line_color="#D95B43",line_width=2,
       legend="photo-z")

_b = np.diff(bins)/2+bins[:-1]
p_hists.circle(x=_b,
         y=hp,
         size=9,line_color="#D95B43",line_width=2,fill_color="white",fill_alpha=1,
         legend='photo-z')

p_hists.yaxis.axis_label = 'Counts'

show(p_hists)









    






    







    Out[11]:





<bokeh.io._CommsHandle at 0x7f3d91056750>

bins_x_i = np.digitize(x,bins)-1 x_in_bins = x[bins_x_i] x_factor = pd.qcut(x,bins)

bins_y_i = np.digitize(y,bins)-1 y_in_bins = y[bins_y_i] y_factor = pd.qcut(y,bins)



In [12]:

    
df_mod = df.copy()
#quantils = [0,0.05,0.25,0.5,0.75,0.95,1]
df_mod['quantils_spec'], bins_xs_quantil = pd.qcut(df_mod['z_spec'],nbins,retbins=True)
df_mod['quantils_photo'], bins_xp_quantil = pd.qcut(df_mod['z_photo'],nbins,retbins=True)

df_mod['bins_spec'], bins_xs_bins = pd.cut(df_mod['z_spec'],xs_bins,retbins=True)
df_mod['bins_photo'],bins_xp_bins = pd.cut(df_mod['z_photo'],xp_bins,retbins=True)



In [13]:

    
def boxplot(df,column,by,mean=True):
    if mean:
        df['mean'] = df.groupby(by)[column].transform(lambda x:x-x.mean())
    groups = df.groupby(by)
    if mean:
        groups = groups['mean']
    else:
        groups = groups[column]

    import re
    _bins = set([ re.sub(r'[^\d.]+','',s) for c in df[by].values.categories for s in c.split(',') ])
    _bins = list(_bins)
    _bins.sort()
    _bins = np.asarray(_bins,dtype=np.float)
    _diff = np.diff(_bins)
    _center = _bins[:-1] + _diff/2
    
    # Generate some synthetic time series for six different categories
    cats = [ s for s,g in groups ]

    # Find the quartiles and IQR foor each category
    q1 = groups.quantile(q=0.25)
    q2 = groups.quantile(q=0.5)
    q3 = groups.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr

    # find the outliers for each category
    def outliers(group):
        cat = group.name
        return group[(group > upper.loc[cat][0]) | (group < lower.loc[cat][0])]
    out = groups.apply(outliers).dropna()

    # Prepare outlier data for plotting, we need coordinate for every outlier.
    outx = []
    outy = []
    for i,cat in enumerate(cats):
        # only add outliers if they exist
        if not out.loc[cat].empty:
            for value in out[cat]:
                outx.append(_center[i])
                outy.append(value)

    p = figure(title="")

    from bokeh.models import FixedTicker
    p.x_range = Range1d(_bins.min(),_bins.max())
    p.xaxis.ticker = FixedTicker(ticks=_center)
    
    # If no outliers, shrink lengths of stems to be no longer than the minimums or maximums
    qmin = groups.quantile(q=0.00)
    qmax = groups.quantile(q=1.00)
    upper = [ min([x,y]) for (x,y) in zip(qmax,upper) ]
    lower = [ max([x,y]) for (x,y) in zip(qmin,lower) ]

    # stems
    #p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
    #p.segment(cats, lower, cats, q1, line_width=2, line_color="black")
    p.segment(_center, upper, _center, q3, line_width=2, line_color="black")
    p.segment(_center, lower, _center, q1, line_width=2, line_color="black")

    # boxes
    p.rect(_center, (q3+q2)/2, _diff/2, q3-q2,    fill_color="#E08E79", line_width=2, line_color="black")
    p.rect(_center, (q2+q1)/2, _diff/2, q2-q1,    fill_color="#3B8686", line_width=2, line_color="black")
    
    # whiskers (almost-0 height rects simpler than segments)
    p.rect(_center, lower, _diff/4, 0.002, line_color="black")
    p.rect(_center, upper, _diff/4, 0.002, line_color="black")

    # outliers
    p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

    p.xgrid.grid_line_color = None
    p.ygrid.minor_grid_line_color = 'gray'
    p.ygrid.minor_grid_line_alpha = 0.5
    p.ygrid.minor_grid_line_dash = 'dashed'

    p.xaxis.major_label_text_font_size="12pt"
    p.xaxis.major_label_orientation = -3.14/2
    
    p.xaxis.axis_label = by
    p.yaxis.axis_label = column if not mean else column + ' (0-mean)'
    
    return p

p_boxplot_spec_my = boxplot(df_mod,column='z_spec',by='bins_spec')
show(p_boxplot_spec_my)









    



/home/chbrandt/.conda/envs/bokeh/lib/python2.7/site-packages/pandas/core/generic.py:1517: FutureWarning: slice indexers when using iloc should be integers and not floating point
  result = self.iloc[loc]






    






    







    Out[13]:





<bokeh.io._CommsHandle at 0x7f3d90f81450>



In [14]:

    
p_boxplot_photo_my = boxplot(df_mod,column='z_photo',by='bins_photo')
show(p_boxplot_photo_my)









    






    







    Out[14]:





<bokeh.io._CommsHandle at 0x7f3d90fe1550>



In [15]:

    
p_hists.plot_height = 400

p_boxplot_photo_my.tools = p_hists.tools
p_boxplot_photo_my.x_range = p_hists.x_range
p_boxplot_photo_my.plot_height = p_hists.plot_height/2
p_boxplot_photo_my.xaxis.axis_label = None
p_boxplot_photo_my.ygrid.minor_grid_line_color = None

p_boxplot_spec_my.tools = p_hists.tools
p_boxplot_spec_my.x_range = p_hists.x_range
p_boxplot_spec_my.plot_height = p_hists.plot_height/2
p_boxplot_spec_my.xaxis.axis_label = None
p_boxplot_spec_my.ygrid.minor_grid_line_color = None

p = gridplot([[p_hists],[p_boxplot_spec_my],[p_boxplot_photo_my]])
show(p)









    






    







    Out[15]:





<bokeh.io._CommsHandle at 0x7f3d90d2fb90>



In [ ]:

	z_spec	z_photo	z_photo_wgt	z_photo_err
count	6071.000000	6071.000000	6071	6071.000000
mean	0.490558	0.489772	1	0.067040
std	0.158013	0.110906	0	0.021950
min	0.005954	0.061783	1	0.017339
25%	0.415064	0.458071	1	0.055957
50%	0.512590	0.528519	1	0.066252
75%	0.584309	0.560872	1	0.076828
max	5.810223	0.663229	1	0.198344