In [1]:
from IPython.display import HTML
# Emit an HTML snippet with a script and a button that toggle the visibility of
# every notebook input cell (jQuery selector 'div.input').  `code_toggle` is also
# registered on document-ready, so the inputs are hidden once the page has loaded.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:
In [2]:
#%connect_info
#%qtconsole
#%matplotlib inline
In [3]:
import bokeh
In [4]:
import pandas as pd
import numpy as np
import math as m
In [5]:
# Read the catalogue from 'photoz.csv'; header=0 takes the column names from the first row.
df = pd.read_csv('photoz.csv',header=0)
# Display summary statistics of the loaded table as the cell output.
df.describe()
Out[5]:
In [6]:
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure, hplot, vplot, gridplot
from bokeh.charts import Scatter, Histogram, defaults
from bokeh.models import BoxSelectTool, LassoSelectTool, Paragraph, Range1d
In [7]:
# Send Bokeh output to the notebook; the commented alternative would write a
# standalone HTML file instead.
output_notebook()
#output_file('yuka.html')
In [8]:
# Column names of the two redshift estimates and the matching data columns.
xs_label = 'z_spec'
xs = df[xs_label]
xp_label = 'z_photo'
xp = df[xp_label]

# Axis limits shared by the comparison plots, and the number of bin *edges*
# (np.linspace with num=nbins yields nbins points, i.e. nbins-1 bins).
lim_scatter = [0,1]
nbins = 20

# Observed extent of each column.
xs_min, xs_max = xs.min(), xs.max()
xp_min, xp_max = xp.min(), xp.max()

# Evenly spaced bin edges over the plot limits, one array per axis.
xs_bins = np.linspace(*lim_scatter, num=nbins)
xp_bins = np.linspace(*lim_scatter, num=nbins)
In [9]:
title = "z-spec .vs. photo-z"
TOOLS = "pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"

# Main panel: z_photo against z_spec on identical, fixed axis ranges.
p_scatter = figure(title=title, tools=TOOLS, min_border=10, min_border_left=50)
p_scatter.x_range = Range1d(start=lim_scatter[0], end=lim_scatter[1])
p_scatter.y_range = Range1d(start=lim_scatter[0], end=lim_scatter[1])
p_scatter.xaxis.axis_label = xs_label
p_scatter.yaxis.axis_label = xp_label

# Diagonal from corner to corner of the plotted range (the 1:1 relation),
# followed by the individual catalogue points.
p_scatter.line(lim_scatter, lim_scatter, color='black', line_width=2)
p_scatter.scatter(xs, xp, size=3, color="#3A5785", alpha=0.5)
# Kernel Density Estimates
#
def kde(values,vmin=None,vmax=None):
    """Evaluate a Gaussian kernel density estimate of ``values`` on a regular grid.

    Parameters
    ----------
    values : array-like with ``.min()`` / ``.max()``
        Sample the density is estimated from.
    vmin, vmax : float, optional
        Start and (exclusive) end of the evaluation grid; default to the
        minimum and maximum of ``values``.

    Returns
    -------
    (density, points)
        ``density[i]`` is the estimate at ``points[i]``.
    """
    from scipy.stats import gaussian_kde
    # Fit on the argument, not on the notebook-level `xs`: the previous version
    # always used `xs`, so every call returned the z_spec density.
    kern = gaussian_kde(values)
    vmin = vmin if vmin is not None else values.min()
    vmax = vmax if vmax is not None else values.max()
    # Grid step is tied to the kernel covariance (kept from the original code).
    _cov = kern.covariance[0][0]
    pnts = np.arange(vmin, vmax, 10*_cov)
    density = kern(pnts)
    return density,pnts
# Density curves of both columns; the axis maxima get 10% head-room.
xs_kde,xs_grid = kde(xs,xs_min,xs_max)
hmax = xs_kde.max()*1.1
xp_kde,xp_grid = kde(xp,xp_min,xp_max)
vmax = xp_kde.max()*1.1
# Create the HORIZONTAL plot (former histogram)
#
#LINE_ARGS = dict(color="#3A5785", line_color=None)
#
# Pixel sizes use floor division so they stay integers under Python 3 as well.
p_kde_spec = figure(title = None,
                    plot_width = p_scatter.plot_width,
                    plot_height = p_scatter.plot_height//3,
                    x_range = p_scatter.x_range,
                    y_range = (0, hmax),
                    min_border = 10,
                    min_border_left = 50,
                    toolbar_location = None,
                    tools=TOOLS)
p_kde_spec.xgrid.grid_line_color = None
#
p_kde_spec.line(xs_grid, xs_kde)#,**LINE_ARGS)
# Create the VERTICAL plot (former histogram)
#
th = 42 # need to adjust for toolbar height, unfortunately
#
p_kde_photo = figure(title = None,
                     plot_width = p_scatter.plot_width//3,
                     plot_height = p_scatter.plot_height+th-10,
                     x_range = (0, vmax),
                     y_range = p_scatter.y_range,
                     min_border = 10,
                     min_border_top = th,
                     toolbar_location = None,
                     tools=TOOLS)
p_kde_photo.ygrid.grid_line_color = None
#
# Density on the x axis, redshift on the y axis: the curve is drawn sideways.
p_kde_photo.line(xp_kde, xp_grid)
# Let's adjust the borders
#
p_kde_photo.min_border_top = 80
p_kde_photo.min_border_left = 0
p_kde_photo.min_border_bottom = 50
p_kde_spec.min_border_top = 10
p_kde_spec.min_border_right = 10
p_kde_spec.min_border_left = 80
p_scatter.min_border_right = 10
p_scatter.min_border_left = 80
p_scatter.min_border_bottom = 50
# Arrange them (the plots) to a regular grid
#
layout = gridplot([[p_scatter,p_kde_photo],[p_kde_spec,None]])
show(layout)
Out[9]:
In [10]:
title = 'z-spec .vs. photo-z'
# create the scatter plot
p_hist2d = figure(title=title,
                  x_range = lim_scatter,
                  y_range = lim_scatter,
                  plot_width=600,
                  plot_height=600,
                  min_border=10,
                  min_border_left=50,
                  tools=TOOLS)
p_hist2d.select(BoxSelectTool).select_every_mousemove = False
p_hist2d.select(LassoSelectTool).select_every_mousemove = False
p_hist2d.xaxis.axis_label = xs_label
p_hist2d.yaxis.axis_label = xp_label
#p_hist2d.x_range = Range1d(start=0,end=1)
#p_hist2d.y_range = Range1d(start=0,end=1)
# 'lightgray' is the CSS colour name ('light_gray' is not a named colour).
p_hist2d.line(lim_scatter, lim_scatter, color='lightgray', line_width=1)
# We will not plot the usual colorful/heatmap-like histogram, but a size-scaled one.
# So the next steps are to compute the 2D histogram itself, then clean it (drop the
# zero-count bins) and (re)define the (x,y) grid to plot the points (scaled by the
# histogram bin counts).
#
# np.histogram2d returns the edges in argument order: first those of `xp`
# (axis 0 of `hist2d`), then those of `xs` (axis 1).
hist2d, xp_edges, xs_edges = np.histogram2d(xp, xs, bins=(xp_bins,xs_bins))
assert np.array_equal(xs_edges,xs_bins)
assert np.array_equal(xp_edges,xp_bins)
# remove null bins
hist2d_xp_ind, hist2d_xs_ind = np.where(hist2d>0)
# keep, for every occupied bin, its lower edge along each axis
xp_edges = xp_edges[hist2d_xp_ind]
xs_edges = xs_edges[hist2d_xs_ind]
hist2d = hist2d[hist2d_xp_ind, hist2d_xs_ind]
# give them a size; no science here, just the old "looks-good"...
_hmin = hist2d.min()
_hmax = hist2d.max()
# Counts are whole numbers, so the span is either 0 or >= 1; max(..., 1) only
# replaces a zero span and so avoids 0/0 when every occupied bin has the same count.
hist2d_scale = 2 + 10 * np.log2( (hist2d-_hmin) / max(_hmax-_hmin, 1) +1 )
# a DataFrame makes life easier when using Bokeh
#
_df = pd.DataFrame({'spec' :xs_edges,
                    'photo':xp_edges,
                    'scale':hist2d_scale})
r_hist2d = p_hist2d.square(x = _df['spec'],
                           y = _df['photo'],
                           size = _df['scale'],
                           color = "#3A5785",
                           alpha = 0.5)
#s = p.scatter(x=xs, y=xp, size=1, color="black")
# create the horizontal histogram
xs_hist, hedges = np.histogram(xs, bins=xs_bins,
                               range=lim_scatter)
# one zero per bin, so the placeholder quads have as many tops as edges pairs
hzeros = np.zeros(len(xs_hist))
hmax = max(xs_hist)*1.1
LINE_ARGS = dict(color="#3A5785", line_color=None)
p_hist_spec = figure(toolbar_location=None,
                     plot_width=p_hist2d.plot_width,
                     plot_height=200,
                     x_range=p_hist2d.x_range,
                     y_range=(-hmax, hmax),
                     title=None,
                     min_border=10,
                     min_border_left=50,
                     tools=TOOLS)
p_hist_spec.xgrid.grid_line_color = None
p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=xs_hist, color="white", line_color="#3A5785")
# hh1 / hh2 start flat; their 'top' column is overwritten by `update`
hh1 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
hh2 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)
# create the vertical histogram
xp_hist, vedges = np.histogram(xp, bins=xp_bins,
                               range=lim_scatter)
vzeros = np.zeros(len(xp_hist))
vmax = max(xp_hist)*1.1
th = 42 # need to adjust for toolbar height, unfortunately
p_hist_photo = figure(toolbar_location=None,
                      plot_width=200,
                      plot_height=p_hist2d.plot_height+th-10,
                      x_range=(-vmax, vmax),
                      y_range=p_hist2d.y_range,
                      title=None,
                      min_border=10,
                      min_border_top=th,
                      tools=TOOLS)
p_hist_photo.ygrid.grid_line_color = None
p_hist_photo.xaxis.major_label_orientation = -3.14/2
p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=xp_hist, color="white", line_color="#3A5785")
# vh1 / vh2 start flat; their 'right' column is overwritten by `update`
vh1 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
vh2 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)
p_hist_photo.min_border_top = 80
p_hist_photo.min_border_left = 0
p_hist_spec.min_border_top = 10
p_hist_spec.min_border_right = 10
p_hist2d.min_border_right = 10
layout = gridplot([[p_hist2d,p_hist_photo],
                   [p_hist_spec,None]])
def update(attr, old, new):
    """Refresh the marginal histograms for the squares selected in the 2D histogram.

    The signature is the ``(attr, old, new)`` form of a property-change callback;
    only ``new['1d']['indices']`` is read.  Those indices number the plotted
    squares (the occupied 2D bins), not the catalogue rows, so they are mapped
    back to rows through ``hist2d_xp_ind`` / ``hist2d_xs_ind`` before counting.

    With no selection, or with every square selected, the overlay histograms are
    reset to zero.  Otherwise ``hh1`` / ``vh1`` receive the counts of the selected
    rows and ``hh2`` / ``vh2`` the negated counts of all remaining rows.

    NOTE(review): a Python callback presumably needs a running Bokeh server to
    fire -- confirm how this notebook is meant to be served.
    """
    inds = np.asarray(new['1d']['indices'], dtype=int)
    if len(inds) == 0 or len(inds) == len(xs_edges):
        hhist1, hhist2 = hzeros, hzeros
        vhist1, vhist2 = vzeros, vzeros
    else:
        xs_vals = np.asarray(xs)
        xp_vals = np.asarray(xp)
        n_xs = len(hedges) - 1
        n_xp = len(vedges) - 1
        # bin index of every row along each axis (-1 or n means out of range)
        ixs = np.searchsorted(hedges, xs_vals, side='right') - 1
        ixp = np.searchsorted(vedges, xp_vals, side='right') - 1
        # the histograms close their last bin on the right edge
        ixs[xs_vals == hedges[-1]] = n_xs - 1
        ixp[xp_vals == vedges[-1]] = n_xp - 1
        inside = (ixs >= 0) & (ixs < n_xs) & (ixp >= 0) & (ixp < n_xp)
        # flag the selected 2D bins; axis 0 is z_photo, axis 1 is z_spec
        picked = np.zeros((n_xp, n_xs), dtype=bool)
        picked[hist2d_xp_ind[inds], hist2d_xs_ind[inds]] = True
        # a row is selected when it falls into one of the flagged bins
        sel = np.zeros(len(xs_vals), dtype=bool)
        sel[inside] = picked[ixp[inside], ixs[inside]]
        hhist1, _ = np.histogram(xs_vals[sel], bins=hedges)
        vhist1, _ = np.histogram(xp_vals[sel], bins=vedges)
        hhist2, _ = np.histogram(xs_vals[~sel], bins=hedges)
        vhist2, _ = np.histogram(xp_vals[~sel], bins=vedges)
    hh1.data_source.data["top"] = hhist1
    hh2.data_source.data["top"] = -hhist2
    vh1.data_source.data["right"] = vhist1
    vh2.data_source.data["right"] = -vhist2
# Call `update` whenever the 'selected' property of the squares' data source changes,
# then display the grid of panels.
r_hist2d.data_source.on_change('selected', update)
show(layout)
Out[10]:
In [11]:
from bokeh.models import CrosshairTool,HoverTool
#TOOLS = 'box_zoom,box_select,crosshair,resize,reset'
# NOTE: this rebinds the notebook-level TOOLS used by the earlier cells.
TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'
p_hists = figure(tools=TOOLS)
p_hists.xgrid.grid_line_color = None
p_hists.ygrid.minor_grid_line_color = 'gray'
p_hists.ygrid.minor_grid_line_alpha = 0.5
p_hists.ygrid.minor_grid_line_dash = 'dashed'
p_hists.select(CrosshairTool).dimensions = ['height']
p_hists.select(HoverTool).mode = 'hline'
p_hists.select(HoverTool).tooltips = [("z: ","$x")]
x_label = 'z_spec'
y_label = 'z_photo'
x = df[x_label]
y = df[y_label]
#nbins = 50
# fixed histogram limits; the data extrema are kept as commented alternatives
xmin = 0#x.min()
xmax = 1#x.max()
ymin = 0#y.min()
ymax = 1#y.max()
bins = np.linspace(xmin,xmax,nbins)
# Plain counts are np.histogram's default; the `normed` keyword passed here
# before no longer exists in current NumPy.
hs,b = np.histogram(x,bins=bins)
# filled bars: the z_spec counts per bin
p_hists.quad(top=hs,
             bottom=0,
             left=bins[:-1],
             right=bins[1:],
             fill_color="#036564",fill_alpha=0.5,
             legend="z-spec")
def histogram2stepfunction(hist,bins):
    """Turn histogram counts and bin edges into the vertices of a step outline.

    Every count is emitted twice (once per side of its bin) and every edge
    except the outermost two is emitted twice, so the returned lists have
    equal length and can be fed directly to a line glyph.

    Parameters
    ----------
    hist, bins : objects with ``.tolist()``
        Counts per bin and the bin edges (one more edge than counts).

    Returns
    -------
    (x, y) : tuple of lists
        Sorted, doubled edges and the doubled counts.

    Raises
    ------
    AssertionError
        If the two resulting lists differ in length.
    """
    heights = [count for count in hist.tolist() for _ in range(2)]
    edges = bins.tolist()
    corners = sorted(edges[:-1] + edges[1:])
    assert len(heights)==len(corners)
    return corners,heights
# Plain counts are np.histogram's default; the `normed` keyword passed here
# before no longer exists in current NumPy.
hp,b = np.histogram(y,bins=bins)
# step outline of the z_photo counts
bb,hh = histogram2stepfunction(hp,b)
p_hists.line(x=bb,
             y=hh,
             line_color="#D95B43",line_width=2,
             legend="photo-z")
# bin centres: lower edge plus half the bin width
_b = np.diff(bins)/2+bins[:-1]
p_hists.circle(x=_b,
               y=hp,
               size=9,line_color="#D95B43",line_width=2,fill_color="white",fill_alpha=1,
               legend='photo-z')
p_hists.yaxis.axis_label = 'Counts'
show(p_hists)
Out[11]:
bins_x_i = np.digitize(x,bins)-1
x_in_bins = x[bins_x_i]
x_factor = pd.qcut(x,bins)
bins_y_i = np.digitize(y,bins)-1
y_in_bins = y[bins_y_i]
y_factor = pd.qcut(y,bins)
In [12]:
# Work on a copy so the extra binning columns are not added to `df`.
df_mod = df.copy()
#quantils = [0,0.05,0.25,0.5,0.75,0.95,1]

# Quantile-based bins (`nbins` quantiles) of each redshift column; the bin
# edges are kept alongside.
df_mod["quantils_spec"], bins_xs_quantil = pd.qcut(df_mod["z_spec"], q=nbins, retbins=True)
df_mod["quantils_photo"], bins_xp_quantil = pd.qcut(df_mod["z_photo"], q=nbins, retbins=True)

# Bins defined by the fixed edges used for the plots.
df_mod["bins_spec"], bins_xs_bins = pd.cut(df_mod["z_spec"], bins=xs_bins, retbins=True)
df_mod["bins_photo"], bins_xp_bins = pd.cut(df_mod["z_photo"], bins=xp_bins, retbins=True)
In [13]:
def boxplot(df,column,by,mean=True):
    """Draw a box-and-whisker plot of ``df[column]`` for every bin of ``df[by]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``column`` and ``by``.  When ``mean`` is true a column
        named ``'mean'`` is written into ``df`` (side effect kept from the
        original implementation).
    column : str
        Name of the numeric column to summarise.
    by : str
        Name of a categorical column whose categories are bins, either interval
        objects (``.left`` / ``.right``) or text labels such as ``'(0, 0.05]'``.
    mean : bool, optional
        If true, the mean of each bin is subtracted from its values first.

    Returns
    -------
    The Bokeh figure; boxes sit at the bin centres on a numeric x axis.

    Raises
    ------
    ValueError
        If the limits of a bin cannot be read from its category.
    """
    import re
    from bokeh.models import FixedTicker

    if mean:
        df['mean'] = df.groupby(by)[column].transform(lambda x:x-x.mean())
        values = df['mean']
    else:
        values = df[column]

    # signed decimal number with optional exponent
    number = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

    def bin_limits(category):
        """Return ``(left, right)`` of one bin as floats."""
        if hasattr(category, 'left') and hasattr(category, 'right'):
            return float(category.left), float(category.right)
        found = [float(s) for s in number.findall(str(category))]
        if len(found) < 2:
            raise ValueError("cannot read bin limits from category %r" % (category,))
        return min(found), max(found)

    categorical = df[by].values
    categories = categorical.categories
    # integer code of the bin each row belongs to (-1 marks rows in no bin)
    codes = np.asarray(categorical.codes)
    limits = np.asarray([bin_limits(c) for c in categories], dtype=float).reshape(-1, 2)
    _left, _right = limits[:, 0], limits[:, 1]
    _center = (_left + _right)/2
    _diff = _right - _left

    # One row of statistics per non-empty bin, taken in category order so that
    # every box is paired with the centre of its own bin.
    rows = []
    outx = []
    outy = []
    for i in range(len(categories)):
        sample = values[codes == i].dropna()
        if sample.empty:
            continue
        q1, q2, q3 = [sample.quantile(q) for q in (0.25, 0.5, 0.75)]
        iqr = q3 - q1
        fence_up = q3 + 1.5*iqr
        fence_lo = q1 - 1.5*iqr
        # outliers lie beyond the 1.5*IQR fences
        beyond = sample[(sample > fence_up) | (sample < fence_lo)]
        outx.extend([_center[i]]*len(beyond))
        outy.extend(beyond.tolist())
        # stems never reach further than the extreme values of the bin
        rows.append((_center[i], _diff[i], q1, q2, q3,
                     max(sample.min(), fence_lo), min(sample.max(), fence_up)))
    stats = np.asarray(rows, dtype=float).reshape(-1, 7)
    center, width, q1, q2, q3, lower, upper = stats.T

    p = figure(title="")
    p.x_range = Range1d(_left.min(), _right.max())
    # ticks at every bin centre, including bins without data
    p.xaxis.ticker = FixedTicker(ticks=_center.tolist())
    # stems
    p.segment(center, upper, center, q3, line_width=2, line_color="black")
    p.segment(center, lower, center, q1, line_width=2, line_color="black")
    # boxes
    p.rect(center, (q3+q2)/2, width/2, q3-q2, fill_color="#E08E79", line_width=2, line_color="black")
    p.rect(center, (q2+q1)/2, width/2, q2-q1, fill_color="#3B8686", line_width=2, line_color="black")
    # whiskers (almost-0 height rects simpler than segments)
    p.rect(center, lower, width/4, 0.002, line_color="black")
    p.rect(center, upper, width/4, 0.002, line_color="black")
    # outliers
    p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)
    p.xgrid.grid_line_color = None
    p.ygrid.minor_grid_line_color = 'gray'
    p.ygrid.minor_grid_line_alpha = 0.5
    p.ygrid.minor_grid_line_dash = 'dashed'
    p.xaxis.major_label_text_font_size="12pt"
    p.xaxis.major_label_orientation = -3.14/2
    p.xaxis.axis_label = by
    p.yaxis.axis_label = column if not mean else column + ' (0-mean)'
    return p
# Box plot of z_spec within each 'bins_spec' bin (mean=True by default), shown inline.
p_boxplot_spec_my = boxplot(df_mod,column='z_spec',by='bins_spec')
show(p_boxplot_spec_my)
Out[13]:
In [14]:
# Box plot of z_photo within each 'bins_photo' bin (mean=True by default), shown inline.
p_boxplot_photo_my = boxplot(df_mod,column='z_photo',by='bins_photo')
show(p_boxplot_photo_my)
Out[14]:
In [15]:
# Stack the histogram figure and both box plots, sharing tools and x range.
p_hists.plot_height = 400
p_boxplot_photo_my.tools = p_hists.tools
p_boxplot_photo_my.x_range = p_hists.x_range
# floor division keeps the pixel height an integer under Python 3 as well
p_boxplot_photo_my.plot_height = p_hists.plot_height//2
p_boxplot_photo_my.xaxis.axis_label = None
p_boxplot_photo_my.ygrid.minor_grid_line_color = None
p_boxplot_spec_my.tools = p_hists.tools
p_boxplot_spec_my.x_range = p_hists.x_range
p_boxplot_spec_my.plot_height = p_hists.plot_height//2
p_boxplot_spec_my.xaxis.axis_label = None
p_boxplot_spec_my.ygrid.minor_grid_line_color = None
p = gridplot([[p_hists],[p_boxplot_spec_my],[p_boxplot_photo_my]])
show(p)
Out[15]:
In [ ]: