Overview

This notebook demonstrates how to use Datashader to display large datasets inside a plotly FigureWidget. Change callbacks are used to recompute the Datashader image whenever the axis range or the figure size changes.

Imports


In [1]:
from plotly.graph_objs import FigureWidget

# core
import io
import base64 
import time

# pandas
import pandas as pd

# numpy
import numpy as np

# scikit learn
from sklearn import datasets

# datashader
import datashader as ds
import datashader.transfer_functions as tf
from datashader.colors import inferno


/usr/local/lib/python3.7/site-packages/sklearn/utils/__init__.py:4: DeprecationWarning:

Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working

Generate dataset

We will create a large dataset by duplicating the Iris dataset many times and adding random Gaussian noise to every copy.


In [2]:
num_copies = 7000 # 1,050,000 rows
random_seed = 0   # fixed seed so Restart & Run All reproduces the same dataset

iris_data = datasets.load_iris()
feature_names = [name.replace(' (cm)', '').replace(' ', '_') for name in iris_data.feature_names]
iris_df_orig = pd.DataFrame(iris_data.data, columns=feature_names)
target_orig = iris_data.target + 1

# frame of features: stack `num_copies` copies of the original rows and add
# independent Gaussian noise (std 0.2) to every value. A single tiled array
# replaces concatenating thousands of small DataFrames, and a local seeded
# generator is used so the global numpy random state is left untouched.
rng = np.random.RandomState(random_seed)
n_rows = num_copies * len(iris_df_orig)
iris_df = pd.DataFrame(
    np.tile(iris_df_orig.values, (num_copies, 1))
    + rng.normal(scale=0.2, size=(n_rows, len(feature_names))),
    columns=feature_names,
)

# array of targets, repeated in the same block order as the tiled features
target = np.tile(target_orig, num_copies)

# dataframe that includes target as categorical
iris_target_df = pd.concat([iris_df, pd.Series(target, name='target', dtype='category')], axis=1)

# Sanity check: one target label per feature row
assert len(iris_target_df) == n_rows == len(target)

iris_df.describe()


Out[2]:
sepal_length sepal_width petal_length petal_width
count 1.050000e+06 1.050000e+06 1.050000e+06 1.050000e+06
mean 5.843156e+00 3.057130e+00 3.757689e+00 1.198856e+00
std 8.493071e-01 4.779875e-01 1.770393e+00 7.858779e-01
min 3.544670e+00 1.272155e+00 1.841896e-01 -6.895429e-01
25% 5.156799e+00 2.740750e+00 1.635134e+00 3.940583e-01
50% 5.801061e+00 3.035911e+00 4.307234e+00 1.315450e+00
75% 6.443381e+00 3.354392e+00 5.142524e+00 1.827952e+00
max 8.714116e+00 5.190893e+00 7.798239e+00 3.194495e+00

Define DataShader image generation function

Define a function that takes the x/y ranges and the plot width/height and generates a Datashader image of the dataset. The image is returned as a PIL image object.


In [3]:
def gen_ds_image(x_range, y_range, plot_width, plot_height):
    """Render the noisy iris points as a Datashader image for one viewport.

    Points from the module-level ``iris_target_df`` are plotted as
    ``sepal_length`` against ``sepal_width`` and counted per ``target``
    category, then shaded and spread.

    Parameters
    ----------
    x_range, y_range
        Axis extents, passed unchanged to ``ds.Canvas``.
    plot_width, plot_height
        Canvas size, passed unchanged to ``ds.Canvas``.

    Returns
    -------
    The result of ``to_pil()`` on the spread image, or ``None`` when any of
    the four arguments is ``None``.
    """
    # Nothing can be rendered until the full viewport is known
    viewport = (x_range, y_range, plot_width, plot_height)
    if any(value is None for value in viewport):
        return None

    canvas = ds.Canvas(
        x_range=x_range,
        y_range=y_range,
        plot_height=plot_height,
        plot_width=plot_width,
    )

    # Per-pixel point counts, kept separately for each target category
    category_counts = canvas.points(
        iris_target_df,
        'sepal_length',
        'sepal_width',
        ds.count_cat('target'),
    )

    shaded = tf.shade(category_counts)
    spread = tf.dynspread(shaded, threshold=0.95, max_px=5, shape='circle')
    return spread.to_pil()

Define initial ranges and plot size


In [4]:
# Initial viewport in data coordinates, as [min, max] per axis
x_range = [3, 10]
y_range = [0, 6]

# Initial figure size, shared by the Datashader canvas and the plotly layout
plot_width = 700
plot_height = 500

In [5]:
# Try out the image generation function on the initial viewport
initial_img = gen_ds_image(
    x_range=x_range,
    y_range=y_range,
    plot_width=plot_width,
    plot_height=plot_height,
)

In [6]:
# Bare expression as the last line of the cell, so the notebook renders the image
initial_img


Out[6]:

Create FigureWidget with background image


In [7]:
# The single trace is an invisible (opacity 0) pair of markers at the corners
# of the initial ranges: it initialises the axes and supports autoresize.
f = FigureWidget(
    data=[dict(
        x=x_range,
        y=y_range,
        mode='markers',
        marker=dict(opacity=0),
    )],
    layout=dict(width=plot_width, height=plot_height),
)
f



In [8]:
# Set background image: position it at (x min, y max) in axis coordinates and
# stretch it over the full initial x/y extent, drawn below the traces.
f.layout.images = [{
    # plotly now performs auto conversion of PIL image to png data URI
    'source': initial_img,
    'xref': "x",
    'yref': "y",
    'x': x_range[0],
    'y': y_range[1],
    'sizex': x_range[1] - x_range[0],
    'sizey': y_range[1] - y_range[0],
    'sizing': "stretch",
    'layer': "below",
}]

Install change callback to update image on zoom/resize


In [9]:
def update_ds_image(layout, x_range, y_range, plot_width, plot_height):
    """Re-render the background image for a new viewport or figure size.

    Intended as a change callback for the layout of the module-level
    FigureWidget ``f``; it updates the first entry of ``f.layout.images``.

    Parameters
    ----------
    layout
        The layout object supplied by the callback machinery. It is not read;
        the figure is reached through the module-level ``f``.
    x_range, y_range
        New axis extents, indexed as ``[0]`` (min) and ``[1]`` (max).
    plot_width, plot_height
        New figure size, forwarded to ``gen_ds_image``.

    Returns
    -------
    None. When either range is ``None`` the figure is left untouched.
    """
    # Without both ranges there is no extent to index or render. Skip the
    # update instead of raising a TypeError from inside the callback.
    if x_range is None or y_range is None:
        return

    img = f.layout.images[0]

    # Render before opening the batch, so a rendering failure cannot leave the
    # image geometry updated while its source is still the old picture.
    new_source = gen_ds_image(x_range, y_range, plot_width, plot_height)

    # Update with batch_update so all updates happen simultaneously
    with f.batch_update():
        img.x = x_range[0]
        img.y = y_range[1]
        img.sizex = x_range[1] - x_range[0]
        img.sizey = y_range[1] - y_range[0]
        img.source = new_source

# Install callback to run exactly once if one or more of the following
# layout properties changes: xaxis range, yaxis range, figure width,
# figure height. The property paths are listed in the same order as the
# callback's parameters that follow `layout`.
f.layout.on_change(
    update_ds_image,
    'xaxis.range',
    'yaxis.range',
    'width',
    'height',
)

Image updates on drag zoom


In [10]:
# Make dragging on the plot zoom the axes, then show the widget again so the
# image update on zoom can be tried out.
f.layout.dragmode = 'zoom'
f



In [ ]: