<img src="images/continuum_analytics_logo.png" alt="Continuum Logo" align="right" width="30%">
In this tutorial we'll learn how to use Bokeh to build interactive visualizations viewable in a browser. Generally this tutorial will have the following format:
charting
- High level interface to go from data to plot
plotting
- Intermediate interface allowing control to all parts of a plot
This tutorial uses many different libraries that are all available with the Anaconda Distribution. Once you have Anaconda installed, please run these commands from a terminal:
$ conda install -y blaze
$ conda install -y bokeh
$ conda install -y odo
Provide a first-class visualization library for web-aware applications, without requiring web-level programming.
Write a visualization in Python. Bokeh creates data descriptors and a scenegraph consumed by BokehJS. This works in the ipython notebook, when creating static files, and when interacting with dynamic data sources.
Bokeh includes pre-built schemas in bokeh.charts, a low-level composition interface (similar to matplotlib), a server for large and/or dynamic datasources and widgets for providing client-side realtime interaction.
The non-JS framework also has prototypes in other languages (Scala, Julia...maybe R).
Note: There are example notebooks in bokeh/examples/plotting/notebooks. Start an ipython notebook server there to get more examples.
Gallery -- tutorial -- Documentation -- Repo
In [1]:
import pandas as pd
import numpy as np
In [2]:
from bokeh.plotting import output_notebook
# Tell Bokeh to send its output to the ipython notebook (other output options come later).
output_notebook()
In [3]:
import numpy as np
from bokeh.plotting import *
# Sample one shared x grid of N points over [0, 4*pi] and evaluate both
# curves on it; the later cells plot these arrays in different ways.
N = 102
lin_arr = np.linspace(start=0, stop=4 * np.pi, num=N)
sin_arr, cos_arr = np.sin(lin_arr), np.cos(lin_arr)
In [4]:
# Overlay both curves on a single figure: one scatter call per curve, each
# with its own colour (a hex string and a named colour both work).
p1 = figure()
for curve, shade in ((sin_arr, "#FF00FF"), (cos_arr, "green")):
    p1.scatter(lin_arr, curve, color=shade)
show(p1)
In [5]:
# The sine samples alone on a fresh figure, with the coordinates passed by keyword.
p2 = figure()
p2.scatter(x=lin_arr, y=sin_arr, color="red")
show(p2)
In [6]:
# The cosine samples drawn as green squares: `marker` selects the glyph shape.
p3 = figure()
p3.scatter(x=lin_arr, y=cos_arr , marker="square", color="green")
show(p3)
There are lots of glyph types and lots of properties...here is just a sample
In [7]:
# `size` may be an array: each marker is sized by cos(x)**2 * 10, so markers
# shrink towards zero wherever the cosine crosses zero.
p4 = figure()
p4.scatter(x=lin_arr, y=sin_arr, size=cos_arr**2*10)
show(p4)
Let's play with colors now. Brewer is a popular set of palettes. Here we pick one and then build a vector of colors for the plot.
In [8]:
from bokeh.palettes import brewer
# Show the available Brewer palette families, then the sizes offered for
# "Greys". Each message is formatted into ONE string so the line prints the
# same text under the Python 2 print statement and the Python 3 print function.
print("Brewer Palettes: %s" % (brewer.keys(),))
print("Brewer Grey Palettes: %s" % (brewer["Greys"].keys(),))

# The 9-colour Greys ramp followed by its mirror image, so that tiling the
# palette produces a gradient that runs back and forth. list() keeps the
# concatenation working whether the palette is exposed as a list or a tuple.
greys = list(brewer["Greys"][9])
palette = greys + greys[::-1]

# One colour per sample: as many whole copies of the palette as fit, then a
# partial copy for the remainder. divmod uses floor division, so the repeat
# count stays an int regardless of the interpreter's `/` semantics.
whole_copies, leftover = divmod(len(lin_arr), len(palette))
colors = palette * whole_copies + palette[:leftover]
In [9]:
# Combine per-point sizes with per-point fill colours. The +5 offset keeps
# every marker at size 5 or larger, and `colors` supplies one grey per sample.
p5 = figure()
p5.scatter(x=lin_arr, y=sin_arr, size=cos_arr**2*10 + 5, fill_color=colors)
show(p5)
In [10]:
# Gather every per-point column into one data source so that glyphs (and
# tools such as hover) can refer to the columns by name.
point_columns = {
    "x": lin_arr,
    "y": sin_arr,
    "size": cos_arr**2*10 + 5,
    "colors": colors,
}
source = ColumnDataSource(data=point_columns)
In [11]:
from bokeh.models import HoverTool
from collections import OrderedDict
# Toolbar for the hover demo. "save" is the tool name the other toolbars in
# this notebook use, so it is used here too instead of "previewsave".
TOOLS = "crosshair,pan,wheel_zoom,box_zoom,reset,hover,save"
p6 = figure(title="Hoverful Scatter", tools=TOOLS)

# Every visual property below is a column NAME that is looked up in `source`.
p6.circle(x="x", y="y", size="size", source=source,
          fill_color="colors", fill_alpha=0.6, line_color=None)

# Fetch the hover tool created by the "hover" entry in TOOLS and describe
# what each tooltip row shows: "$name" fields are special variables,
# "@name" fields are columns of the data source.
hover = p6.select(dict(type=HoverTool))
hover.tooltips = OrderedDict([
    ("index", "$index"),
    ("(x,y)", "(@x, @y)"),
    ("size", "@size"),
    # The colour swatch must name a column of the data source; the fill
    # colours are stored under "colors" (there is no "fill_color" column).
    ("fill color", "$color[hex, swatch]:colors"),
])
show(p6)
In [12]:
# Denser sampling for the linked-brushing demo.
N = 300
x = np.linspace(0, 4*np.pi, N)
y1 = np.sin(x)
y2 = np.cos(x)

# Start from an empty data source and register one named column per array.
source = ColumnDataSource()
for column_name, column_values in (('x', x), ('y1', y1), ('y2', y2)):
    source.add(data=column_values, name=column_name)

TOOLS = "pan,wheel_zoom,box_zoom,reset,save,box_select,lasso_select"
panel_size = dict(plot_width=350, plot_height=350)

s1 = figure(tools=TOOLS, **panel_size)
s1.scatter('x', 'y1', source=source)

# Linked brushing in Bokeh is expressed by sharing data sources between
# renderers: s2.scatter receives the very same `source` object as s1.scatter,
# so a selection made in one panel is reflected in the other. Reusing
# s1.x_range additionally ties the two x axes together.
s2 = figure(tools=TOOLS, x_range=s1.x_range, **panel_size)
s2.scatter('x', 'y2', source=source)

# Lay the two panels out side by side in a single row.
p = gridplot([[s1, s2]])
show(p)
In [13]:
# Data for the interactive line: one period of a sine wave, kept in a data
# source so the widget callback can later replace the "y" column in place.
x = np.linspace(0, 2*np.pi, 2000)
y = np.sin(x)
source = ColumnDataSource(data=dict(x=x, y=y))
p = figure(title="simple line example", plot_height=300, plot_width=600)
# Refer to the columns by NAME rather than passing the arrays again: the
# glyph is then unambiguously bound to `source`, matching how the other
# source-backed glyphs in this notebook are declared.
p.line("x", "y", color="#2222aa", line_width=3, source=source, name="foo")
Out[13]:
In [14]:
from IPython.html.widgets import interact
@interact(f=["sin", "cos", "tan"], w=(0,100), A=(1,10), phi=(0, 10, 0.1))
def update(f, w=1, A=1, phi=0):
    """Redraw the line as ``A * f(w * x + phi)`` and push it to the notebook.

    f    -- name of the waveform: "sin", "cos" or "tan"
    w    -- frequency multiplier applied to x
    A    -- amplitude
    phi  -- phase offset

    Raises ValueError when *f* is not one of the supported names (the
    original if/elif chain left the function unbound in that case).
    """
    waveforms = {"sin": np.sin, "cos": np.cos, "tan": np.tan}
    if f not in waveforms:
        raise ValueError("unsupported function name: %r" % (f,))
    # Replace only the "y" column; the x column of the data source is unchanged.
    source.data['y'] = A * waveforms[f](w * x + phi)
    source.push_notebook()
In [15]:
# Display the line plot; the `update` callback above rewrites the "y" column
# of its data source and pushes the change to the notebook.
show(p)
Common Schemas for common tasks (and parameters).
Expects data to be formatted as either an OrderedDict or a pandas dataframe.
Supported Schemas: Bar, Boxplot, Categorical Heatmap, Histogram, Scatter, Timeseries
In [16]:
from collections import OrderedDict
from bokeh.charts import Histogram
# Draw 1000 samples from a normal distribution and hand them to the
# high-level Histogram chart as a single named series.
mu, sigma = 0, 0.5
samples = np.random.normal(loc=mu, scale=sigma, size=1000)
normal_dist = OrderedDict([("normal", samples)])

hist = Histogram(
    normal_dist,
    bins=50,
    title="kwargs, dict_input",
    ylabel="frequency",
    legend="top_left",
    width=400,
    height=350,
    notebook=True,
)
hist.show()
In [17]:
import blaze as bz
import pandas as pd
import numpy as np
from odo import odo
from bokeh.plotting import *
# Direct Bokeh output to the notebook again for the Blaze-based examples below.
output_notebook()
In [18]:
# Wrap the lahman2013 SQLite database in a Blaze data expression.
db = bz.Data('sqlite:///lahman2013.sqlite')
# Display the datashape Blaze reports for the database.
db.dshape
Out[18]:
In [19]:
# Distinct team IDs appearing in the Salaries table, materialised by
# iterating the Blaze expression into a list.
list(db.Salaries.teamID.distinct())
Out[19]:
In [20]:
# The same query, evaluated explicitly with bz.compute ...
r = bz.compute(db.Salaries["teamID"].distinct())
# ... and converted to a pandas DataFrame with odo.
odo(r, pd.DataFrame)
Out[20]:
In [21]:
# Per-team salary summary: mean, maximum and max/min ratio, with the teams
# having the largest ratio listed first.
salary = db.Salaries.salary
result = bz.by(
    db.Salaries.teamID,
    avg=salary.mean(),
    max=salary.max(),
    ratio=salary.max() / salary.min(),
).sort('ratio', ascending=False)

# Materialise the Blaze expression as a pandas DataFrame for plotting.
df = odo(result, pd.DataFrame)
In [22]:
# Preview the first rows of the per-team salary summary.
df.head()
Out[22]:
In [23]:
# Order the rows by average salary; the x range below follows this row order.
df = df.sort('avg')
source = ColumnDataSource(df)
# The team IDs, in sorted order, define the x-axis range.
p = figure(x_range=list(df["teamID"]))
p.scatter(x="teamID", y="avg", source=source)
show(p)
Hmm, can't read the x axis labels very well...
In [24]:
# Same plot as before: rows ordered by average salary, team IDs on the x axis.
df = df.sort('avg')
source = ColumnDataSource(df)
p = figure(x_range=list(df["teamID"]))
p.scatter(x="teamID", y="avg", source=source)
# Tilt the x-axis tick labels by pi/3 radians to make the team IDs readable.
p.xaxis.major_label_orientation = np.pi/3
show(p)
Let's view the average salary versus the max/min salary ratio
In [25]:
TOOLS = "pan,wheel_zoom,box_zoom,reset,save,lasso_select"
df = df.sort('avg')
source = ColumnDataSource(df)

# Left panel: average salary per team. The categories are passed as a plain
# list, as the other figures in this notebook do, rather than handing the
# figure the data source's own column object.
s1 = figure(title="Pay Avg", x_range=list(source.data["teamID"]), tools=TOOLS, width=500)
s1.scatter(x="teamID", y="avg", source=source)
s1.xaxis.major_label_orientation = np.pi/3

# Right panel: max/min salary ratio per team. Sharing s1's x_range keeps the
# team axes aligned, and sharing `source` links selections between the panels.
s2 = figure(title="Pay Ratio", x_range=s1.x_range, tools=TOOLS, width=500)
s2.scatter(x="teamID", y="ratio", source=source)
s2.xaxis.major_label_orientation = np.pi/3

p = gridplot([[s1, s2]])
show(p)
Now let's join on the AllStars table to see how max salaries and all star count correlate.
In [26]:
# Count all-star appearances per team, largest count first.
result = bz.by(db.AllstarFull.teamID, all_stars=db.AllstarFull.playerID.count()
               ).sort('all_stars', ascending=False)
r = bz.Data(odo(result, pd.DataFrame))
m = odo(r, pd.DataFrame)["all_stars"].max()

# Each message is formatted into ONE string so the output is the same under
# the Python 2 print statement and the Python 3 print function.
print("max number of all stars from a single team: %s" % (m,))
print("normalized list of all_stars:\n%s" % (bz.compute((r.all_stars / m).head()),))

# Now let's use this as the size of the circles in the scatter plot
df1 = odo(r, pd.DataFrame)
# Rescale the counts to marker sizes between 10 and 20: count / max * 10,
# plus a base size of 10. The 10.0 forces float arithmetic; with integer
# counts, `max() / 10` would be truncated under Python 2 integer division.
df1['all_stars'] = df1['all_stars'] * 10.0 / df1['all_stars'].max() + 10
Now let's join the data to all_star sizes
In [27]:
# Join the all-star marker sizes (df1) with the per-team salary summary (df)
# on their shared teamID column.
r = bz.join(bz.Data(df1), bz.Data(df), 'teamID')
# Preview the first joined rows.
r.head()
Out[27]:
In [28]:
# Materialise the join and order the teams by their maximum salary.
df_j = odo(r, pd.DataFrame)
df_j = df_j.sort("max")
# Parenthesised single-argument print: same output under Python 2, and valid
# under the Python 3 print function.
print(df_j.head())

source = odo(df_j, ColumnDataSource)
p = figure(x_range=list(df_j["teamID"]))
# Marker size comes from the rescaled all-star count; semi-transparent fill
# keeps overlapping markers visible.
p.scatter(x="teamID", y="max", size="all_stars", source=source, fill_alpha=0.5)
p.xaxis.major_label_orientation = np.pi/3
show(p)
Now let's make this an interactive plot!
In [29]:
def compute_df(year=2012):
    """Return one row per team for *year*, sorted by maximum salary.

    The returned pandas DataFrame is the join, on ``teamID``, of

    * ``max``       -- the team's highest salary in *year*, and
    * ``all_stars`` -- the team's all-star count in *year*, rescaled to a
                       marker size between 10 and 20.

    Both inputs are read from the module-level Blaze database ``db``.
    """
    # Highest salary per team for the requested year. The filtered rows are
    # materialised as a DataFrame before grouping, as in the original cell.
    salaries = db.Salaries[db.Salaries.yearID == year]
    salaries = bz.Data(odo(salaries, pd.DataFrame))
    top_pay = bz.by(salaries.teamID, max=salaries.salary.max()).sort('max', ascending=False)
    df = odo(top_pay, pd.DataFrame)

    # All-star count per team for the SAME year. The count must be taken on
    # the filtered expression (asf_year); counting on the unfiltered
    # db.AllstarFull while grouping by asf_year mixes two different tables.
    asf_year = db.AllstarFull[db.AllstarFull.yearID == year]
    counts = bz.by(asf_year.teamID, all_stars=asf_year.playerID.count()
                   ).sort('all_stars', ascending=False)
    df1 = odo(counts, pd.DataFrame)

    # Rescale counts to marker sizes: count / max * 10, plus a base of 10.
    # The 10.0 forces float arithmetic: with integer counts, `max() / 10`
    # is 0 under Python 2 integer division whenever the yearly maximum is
    # below 10, which turned the rescale into a division by zero.
    df1['all_stars'] = df1['all_stars'] * 10.0 / df1['all_stars'].max() + 10

    joined = bz.join(bz.Data(df1), bz.Data(df), 'teamID')
    df_j = odo(joined, pd.DataFrame)
    return df_j.sort("max")
# Build the initial plot from compute_df()'s default year; the widget
# callback defined in the next cell replaces columns of this same `source`.
source = odo(compute_df(), ColumnDataSource)
p = figure(x_range=list(source.data["teamID"]))
p.scatter(x="teamID", y="max", size="all_stars", source=source, fill_alpha=0.5, )
# Tilt the team labels by pi/3 radians.
p.xaxis.major_label_orientation = np.pi/3
In [30]:
from IPython.html.widgets import interact, IntSliderWidget
def update(year):
    """Recompute the per-team data for *year* and push it to the live plot."""
    df = compute_df(year)
    # Replace ALL plotted columns together. compute_df re-sorts (and may
    # return a different set of) teams for every year, so updating only
    # 'all_stars' and 'max' would pair the new values with the stale teamID
    # column and draw them above the wrong team.
    for column in ('teamID', 'all_stars', 'max'):
        source.data[column] = df[column].tolist()
    source.push_notebook()
#interact(update, year=(1980, 2013))
interact(update, year=IntSliderWidget(min=1985, max=2013, value=2013))
Out[30]:
In [31]:
# Display the plot; moving the year slider above calls `update`, which pushes
# new column values into the plot's data source.
show(p)
In [ ]: