In [2]:
import altair as alt

# Select the Altair renderer registered under the name 'notebook'
# (presumably the one for the classic Jupyter Notebook front end -- confirm
# against the installed Altair version).
# Remove this call when running on Colab or JupyterLab.
alt.renderers.enable('notebook')


Out[2]:
RendererRegistry.enable('notebook')

In [6]:
# `data` is the vega_datasets loader object; `data.cars()` returns the cars table.
from vega_datasets import data
# Load the cars data set (406 rows x 9 columns in the recorded output below).
cars = data.cars()
# Bare last expression: the notebook renders the table as this cell's output.
cars


Out[6]:
Acceleration Cylinders Displacement Horsepower Miles_per_Gallon Name Origin Weight_in_lbs Year
0 12.0 8 307.0 130.0 18.0 chevrolet chevelle malibu USA 3504 1970-01-01
1 11.5 8 350.0 165.0 15.0 buick skylark 320 USA 3693 1970-01-01
2 11.0 8 318.0 150.0 18.0 plymouth satellite USA 3436 1970-01-01
3 12.0 8 304.0 150.0 16.0 amc rebel sst USA 3433 1970-01-01
4 10.5 8 302.0 140.0 17.0 ford torino USA 3449 1970-01-01
5 10.0 8 429.0 198.0 15.0 ford galaxie 500 USA 4341 1970-01-01
6 9.0 8 454.0 220.0 14.0 chevrolet impala USA 4354 1970-01-01
7 8.5 8 440.0 215.0 14.0 plymouth fury iii USA 4312 1970-01-01
8 10.0 8 455.0 225.0 14.0 pontiac catalina USA 4425 1970-01-01
9 8.5 8 390.0 190.0 15.0 amc ambassador dpl USA 3850 1970-01-01
10 17.5 4 133.0 115.0 NaN citroen ds-21 pallas Europe 3090 1970-01-01
11 11.5 8 350.0 165.0 NaN chevrolet chevelle concours (sw) USA 4142 1970-01-01
12 11.0 8 351.0 153.0 NaN ford torino (sw) USA 4034 1970-01-01
13 10.5 8 383.0 175.0 NaN plymouth satellite (sw) USA 4166 1970-01-01
14 11.0 8 360.0 175.0 NaN amc rebel sst (sw) USA 3850 1970-01-01
15 10.0 8 383.0 170.0 15.0 dodge challenger se USA 3563 1970-01-01
16 8.0 8 340.0 160.0 14.0 plymouth 'cuda 340 USA 3609 1970-01-01
17 8.0 8 302.0 140.0 NaN ford mustang boss 302 USA 3353 1970-01-01
18 9.5 8 400.0 150.0 15.0 chevrolet monte carlo USA 3761 1970-01-01
19 10.0 8 455.0 225.0 14.0 buick estate wagon (sw) USA 3086 1970-01-01
20 15.0 4 113.0 95.0 24.0 toyota corona mark ii Japan 2372 1970-01-01
21 15.5 6 198.0 95.0 22.0 plymouth duster USA 2833 1970-01-01
22 15.5 6 199.0 97.0 18.0 amc hornet USA 2774 1970-01-01
23 16.0 6 200.0 85.0 21.0 ford maverick USA 2587 1970-01-01
24 14.5 4 97.0 88.0 27.0 datsun pl510 Japan 2130 1970-01-01
25 20.5 4 97.0 46.0 26.0 volkswagen 1131 deluxe sedan Europe 1835 1970-01-01
26 17.5 4 110.0 87.0 25.0 peugeot 504 Europe 2672 1970-01-01
27 14.5 4 107.0 90.0 24.0 audi 100 ls Europe 2430 1970-01-01
28 17.5 4 104.0 95.0 25.0 saab 99e Europe 2375 1970-01-01
29 12.5 4 121.0 113.0 26.0 bmw 2002 Europe 2234 1970-01-01
... ... ... ... ... ... ... ... ... ...
376 18.6 4 112.0 88.0 27.0 chevrolet cavalier wagon USA 2640 1982-01-01
377 18.0 4 112.0 88.0 34.0 chevrolet cavalier 2-door USA 2395 1982-01-01
378 16.2 4 112.0 85.0 31.0 pontiac j2000 se hatchback USA 2575 1982-01-01
379 16.0 4 135.0 84.0 29.0 dodge aries se USA 2525 1982-01-01
380 18.0 4 151.0 90.0 27.0 pontiac phoenix USA 2735 1982-01-01
381 16.4 4 140.0 92.0 24.0 ford fairmont futura USA 2865 1982-01-01
382 20.5 4 151.0 NaN 23.0 amc concord dl USA 3035 1982-01-01
383 15.3 4 105.0 74.0 36.0 volkswagen rabbit l Europe 1980 1982-01-01
384 18.2 4 91.0 68.0 37.0 mazda glc custom l Japan 2025 1982-01-01
385 17.6 4 91.0 68.0 31.0 mazda glc custom Japan 1970 1982-01-01
386 14.7 4 105.0 63.0 38.0 plymouth horizon miser USA 2125 1982-01-01
387 17.3 4 98.0 70.0 36.0 mercury lynx l USA 2125 1982-01-01
388 14.5 4 120.0 88.0 36.0 nissan stanza xe Japan 2160 1982-01-01
389 14.5 4 107.0 75.0 36.0 honda Accelerationord Japan 2205 1982-01-01
390 16.9 4 108.0 70.0 34.0 toyota corolla Japan 2245 1982-01-01
391 15.0 4 91.0 67.0 38.0 honda civic Japan 1965 1982-01-01
392 15.7 4 91.0 67.0 32.0 honda civic (auto) Japan 1965 1982-01-01
393 16.2 4 91.0 67.0 38.0 datsun 310 gx Japan 1995 1982-01-01
394 16.4 6 181.0 110.0 25.0 buick century limited USA 2945 1982-01-01
395 17.0 6 262.0 85.0 38.0 oldsmobile cutlass ciera (diesel) USA 3015 1982-01-01
396 14.5 4 156.0 92.0 26.0 chrysler lebaron medallion USA 2585 1982-01-01
397 14.7 6 232.0 112.0 22.0 ford granada l USA 2835 1982-01-01
398 13.9 4 144.0 96.0 32.0 toyota celica gt Japan 2665 1982-01-01
399 13.0 4 135.0 84.0 36.0 dodge charger 2.2 USA 2370 1982-01-01
400 17.3 4 151.0 90.0 27.0 chevrolet camaro USA 2950 1982-01-01
401 15.6 4 140.0 86.0 27.0 ford mustang gl USA 2790 1982-01-01
402 24.6 4 97.0 52.0 44.0 vw pickup Europe 2130 1982-01-01
403 11.6 4 135.0 84.0 32.0 dodge rampage USA 2295 1982-01-01
404 18.6 4 120.0 79.0 28.0 ford ranger USA 2625 1982-01-01
405 19.4 4 119.0 82.0 31.0 chevy s-10 USA 2720 1982-01-01

406 rows × 9 columns


In [10]:
# Point chart of the cars table: Horsepower on x, Miles_per_Gallon on y,
# points coloured by Origin. The trailing .interactive() call is what
# makes the chart zoomable and pannable.
alt.Chart(cars).mark_point().encode(
    alt.X('Horsepower'),
    alt.Y('Miles_per_Gallon'),
    alt.Color('Origin'),
).interactive()


Out[10]:


In [23]:
# Download the insurance-customers CSV with curl (-O keeps the remote file name).
# Two variants are available; enable exactly one of the curl lines below.
# The larger file gives better results, but may clutter the visualization
# depending on the screen resolution.
# The read_csv cell further down must point at the same file.

!curl -O https://raw.githubusercontent.com/DJCordhose/ai/master/notebooks/scipy/data/insurance-customers-1500.csv
# !curl -O https://raw.githubusercontent.com/DJCordhose/ai/master/notebooks/scipy/data/insurance-customers-300.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
 27 26935   27  7434    0     0   3605      0  0:00:07  0:00:02  0:00:05  3605
100 26935  100 26935    0     0  12961      0  0:00:02  0:00:02 --:--:-- 12961

In [25]:
import pandas as pd
# Read the customer file fetched by the curl cell; fields are separated by ';'.
# Keep the active line in sync with whichever file that cell downloaded.
# customers = pd.read_csv('./insurance-customers-300.csv', sep=';')
customers = pd.read_csv('./insurance-customers-1500.csv', sep=';')

In [26]:
import altair as alt

# Interval brush along the x axis only. `selection_interval` is the
# convenience form of `alt.selection(type='interval', ...)` and matches the
# call style used elsewhere in this notebook; `resolve` and `empty` are
# passed through to the selection definition unchanged.
brush = alt.selection_interval(encodings=['x'], resolve='intersect', empty='none')

# Base chart shared by both layers: bars counting records over whichever
# column the enclosing repeat() supplies. No data is bound here; it is
# attached once on the layer below.
base = alt.Chart().mark_bar().encode(
    x=alt.X(alt.repeat('column'), type='quantitative'),
    y='count()'
).properties(
    width=250,
    height=150
)

# Background layer: every record, drawn in the default mark colour.
# This is the layer the brush is attached to (add_selection replaces the
# legacy `.properties(selection=brush)` spelling).
background = base.add_selection(brush)

# Highlight layer: the same bars in goldenrod, restricted to the records
# that pass the brush filter.
highlight = base.encode(
    color=alt.value('goldenrod')
).transform_filter(
    brush
)

# Stack highlight on top of background, bind the data once, then repeat the
# layered chart across one column per customer attribute.
alt.layer(
    background, highlight,
    data=customers
).repeat(
    column=["age", "max speed", "thousand km per year"]
)


Out[26]:


In [18]:
# https://beta.observablehq.com/@djcordhose/vega-lite
# https://altair-viz.github.io/user_guide/selections.html
# https://altair-viz.github.io/gallery/interactive_layered_crossfilter.html

# Interval brush along the x axis, shared by all three charts below.
brush = alt.selection_interval(encodings=['x'], resolve='intersect')


def _count_bars(column):
    """Return a bar chart counting `customers` records per value of `column`.

    Parameters
    ----------
    column : str
        Name of the `customers` column to put on the x axis.

    Returns
    -------
    alt.Chart
        Unsized bar chart with `column` on x and `count()` on y. The brush
        is not attached here so callers can add transforms first.
    """
    return alt.Chart(customers).mark_bar().encode(
        x=column,
        y='count()',
    )


# Every chart registers the brush through add_selection (the legacy
# `.properties(selection=brush)` spelling is no longer mixed in).
# Only the age chart is additionally filtered by the brush.
age = _count_bars('age').transform_filter(brush).add_selection(brush)
speed = _count_bars('max speed').add_selection(brush)
distance = _count_bars('thousand km per year').add_selection(brush)

# Place the three charts side by side, each 200 x 200; hconcat(a, b, c) is
# the function form of `a | b | c`.
alt.hconcat(*(
    chart.properties(width=200, height=200)
    for chart in (age, speed, distance)
))


Out[18]:


In [1]:
import dask.dataframe as dd
# NOTE(review): '...' is a placeholder path, not a real data set location.
# The recorded run below fails inside read_parquet with
# "RuntimeError: Please install either fastparquet or pyarrow".
df = dd.read_parquet('...')
# NOTE(review): this rebinds `data`, which an earlier cell imports from
# vega_datasets (`from vega_datasets import data`); after this line runs,
# `data.cars()` no longer refers to the dataset loader.
data = df[['age', 'income', 'married']]
# Presumably the target column to pair with `data` -- confirm against the
# intended data set.
labels = df['outcome']


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-795a592f5176> in <module>()
      1 import dask.dataframe as dd
----> 2 df = dd.read_parquet('...')
      3 data = df[['age', 'income', 'married']]
      4 labels = df['outcome']

C:\ProgramData\Anaconda3\lib\site-packages\dask\dataframe\io\parquet.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, infer_divisions)
    979         )
    980     else:
--> 981         read = get_engine(engine)['read']
    982         fs, fs_token, paths = get_fs_token_paths(
    983             path, mode='rb',

C:\ProgramData\Anaconda3\lib\site-packages\dask\dataframe\io\parquet.py in get_engine(engine)
    865                 pass
    866         else:
--> 867             raise RuntimeError("Please install either fastparquet or pyarrow")
    868 
    869     elif engine == 'fastparquet':

RuntimeError: Please install either fastparquet or pyarrow

In [ ]: