In [2]:
import altair as alt
# Select Altair's 'notebook' renderer for displaying charts.
# Remove the next line when running on Colab or JupyterLab.
alt.renderers.enable('notebook')
Out[2]:
In [6]:
from vega_datasets import data

# Load the cars example dataset as a DataFrame.
cars = data.cars()

# Preview only the first rows instead of dumping the whole frame as output.
cars.head()
Out[6]:
In [10]:
# Scatter plot of horsepower against fuel economy, colored by origin.
# .interactive() enables zooming and panning.
alt.Chart(cars).mark_point().encode(
    alt.X('Horsepower'),
    alt.Y('Miles_per_Gallon'),
    alt.Color('Origin'),
).interactive()
Out[10]:
In [23]:
# Choose one of the two following data sets, the larger one gives better results, but might clutter the visualization depending on resolution
!curl -O https://raw.githubusercontent.com/DJCordhose/ai/master/notebooks/scipy/data/insurance-customers-1500.csv
# !curl -O https://raw.githubusercontent.com/DJCordhose/ai/master/notebooks/scipy/data/insurance-customers-300.csv
In [25]:
import pandas as pd

# Read the semicolon-separated customer file from the working directory.
# Swap the two lines below to use the smaller 300-row file instead.
# customers = pd.read_csv('./insurance-customers-300.csv', sep=';')
customers = pd.read_csv(
    './insurance-customers-1500.csv',
    sep=';',
)
In [26]:
import altair as alt

# Interval brush restricted to the x axis. With resolve='intersect' the
# brushes of the repeated panels are combined by intersection, and
# empty='none' means an empty brush selects nothing.
brush = alt.selection_interval(
    encodings=['x'], resolve='intersect', empty='none'
)

# Base chart with the parts shared by the background and highlight layers:
# a count histogram over whichever column the repeat supplies.
base = alt.Chart().mark_bar().encode(
    x=alt.X(alt.repeat('column'), type='quantitative'),
    y='count()'
).properties(
    width=250,
    height=150
)

# Blue background layer that carries the brush. add_selection is used
# instead of the older properties(selection=...) form, matching the
# add_selection call elsewhere in this notebook.
background = base.add_selection(brush)

# Goldenrod highlight layer drawn only from the rows inside the brush.
highlight = base.encode(
    color=alt.value('goldenrod')
).transform_filter(
    brush
)

# Layer the two charts on the customer data and repeat them once per column.
alt.layer(
    background, highlight,
    data=customers
).repeat(
    column=["age", "max speed", "thousand km per year"]
)
Out[26]:
In [18]:
# References:
#   https://beta.observablehq.com/@djcordhose/vega-lite
#   https://altair-viz.github.io/user_guide/selections.html
#   https://altair-viz.github.io/gallery/interactive_layered_crossfilter.html

# One x-axis interval brush shared by all three histograms.
brush = alt.selection_interval(encodings=['x'], resolve='intersect')

# Alternatives that were tried for the encodings below:
#   x=alt.X('age', bin=True)
#   color=alt.condition(brush, alt.value('green'), alt.value('lightgray'))
#   color=alt.condition(brush, 'count()', alt.value('lightgray'))

# Age histogram: carries the brush and is also filtered by it.
# NOTE(review): only this chart applies transform_filter(brush); the other
# two just host the brush. Confirm that this asymmetry is intended.
age = alt.Chart(customers).mark_bar().encode(
    x='age',
    y='count()',
).transform_filter(brush).add_selection(brush)

# Max-speed histogram. The brush is attached with add_selection, the same
# way as on the other two charts, instead of properties(selection=...).
speed = alt.Chart(customers).mark_bar().encode(
    x='max speed',
    y='count()',
).add_selection(brush)

# Yearly-distance histogram.
distance = alt.Chart(customers).mark_bar().encode(
    x='thousand km per year',
    y='count()',
).add_selection(brush)

# Show the three histograms side by side at the same size.
(
    age.properties(width=200, height=200)
    | speed.properties(width=200, height=200)
    | distance.properties(width=200, height=200)
)
Out[18]:
In [1]:
import dask.dataframe as dd

# '...' is kept verbatim from the original cell.
# NOTE(review): it looks like a placeholder path; confirm the real location.
df = dd.read_parquet('...')

# Single column bound to `labels`.
labels = df['outcome']

# Three columns bound to `data`.
# NOTE(review): `data` is also the name imported from vega_datasets in an
# earlier cell; re-running that cell after this one would see this frame.
data = df[['age', 'income', 'married']]
In [ ]: