Advanced Visualization

This notebook tours several visualization libraries: it starts with Altair, runs through some matplotlib (with seaborn and pandas plotting helpers), moves on to bqplot, and ends with ipyvolume.

Altair


In [ ]:


In [ ]:


In [2]:
# %load ../scripts/altair-example.py
#!/usr/bin/env python
"""
Altair Tooltip Example
======================

An example script, lightly modified by Jonathan so that it runs inside a notebook.

It renders two charts:

* a scatter plot of the iris data set (petal length vs. petal width, coloured
  by species);
* an interactive scatter plot of the cars data set (horsepower vs. miles per
  gallon, coloured by origin) with hover tooltips.
"""
import sys

import altair as alt
from vega_datasets import data

alt.renderers.enable("notebook")
# alt.renderers.enable('default')

iris = data.iris()

# Only the last expression of a cell is rendered automatically, so the first
# chart must be displayed explicitly or it is silently discarded.
iris_chart = (
    alt.Chart(iris)
    .mark_point()
    .encode(x="petalLength", y="petalWidth", color="species")
)
iris_chart.display()


cars = data.cars()

LITERS_PER_GALLON = 3.78541

# (miles / gallon) / (liters / gallon) = miles / liter, so divide.
cars["Miles_per_Liter"] = cars["Miles_per_Gallon"] / LITERS_PER_GALLON

# Last expression of the cell: rendered as the cell output.
alt.Chart(cars).mark_circle(size=60).encode(
    x="Horsepower",
    y="Miles_per_Gallon",
    color="Origin",
    tooltip=["Name", "Origin", "Horsepower", "Miles_per_Gallon"],
).interactive()


Out[2]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_context('poster')
sns.set_style('whitegrid') 
# sns.set_style('darkgrid') 
plt.rcParams['figure.figsize'] = 12, 8  # plotsize 

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.datasets import load_boston

import warnings
warnings.filterwarnings('ignore')

import matplotlib as mpl

Note

This section uses the cleaned coal-production data produced by the Data Cleaning notebook; see that notebook for details of the cleaning steps.


In [4]:
# Load the cleaned coal-production CSV (path is relative to this notebook).
df = pd.read_csv(filepath_or_buffer="../data/coal_prod_cleaned.csv")

In [5]:
# Preview the first rows of the coal-production table.
df.head()


Out[5]:
MSHA_ID Average_Employees Company_Type Labor_Hours Mine_Basin Mine_County Mine_Name Mine_State Mine_Status Mine_Type Operating_Company Operating_Company_Address Operation_Type Production_short_tons Union_Code Year
0 103295 18.0 Independent Producer Operator 39175.0 Appalachia Southern Bibb Seymour Mine Alabama Active Surface Hope Coal Company Inc P.O. Box 249, Maylene, AL 35114 Mine only 105082.0 NaN 2008
1 103117 19.0 Operating Subsidiary 29926.0 Appalachia Southern Cullman Mine #2, #3, #4 Alabama Active, men working, not producing Surface Twin Pines Coal Company Inc 1874 County Road 15, Bremen, AL 35033 Mine only 10419.0 NaN 2008
2 103361 20.0 Operating Subsidiary 42542.0 Appalachia Southern Cullman Cold Springs West Mine Alabama Active Surface Twin Pines Coal Company 74 Industrial Parkway, Jasper, AL 35502 Mine only 143208.0 NaN 2008
3 100759 395.0 Operating Subsidiary 890710.0 Appalachia Southern Fayette North River # 1 Underground Mi Alabama Active Underground Chevron Mining Inc 3114 County Road 63 S, Berry, AL 35546 Mine and Preparation Plant 2923261.0 United Mine Workers of America 2008
4 103246 22.0 Independent Producer Operator 55403.0 Appalachia Southern Franklin Bear Creek Alabama Active Surface Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Mine only 183137.0 NaN 2008

In [6]:
# Ask seaborn for one palette colour per distinct year in the data.
colors = sns.color_palette(n_colors=df["Year"].nunique())

In [7]:
# Pair each year (in ascending order) with its palette colour, then show the mapping.
color_dict = {
    year: color for year, color in zip(sorted(df["Year"].unique()), colors)
}
color_dict


Out[7]:
{2008: (0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
 2009: (1.0, 0.4980392156862745, 0.054901960784313725),
 2010: (0.17254901960784313, 0.6274509803921569, 0.17254901960784313),
 2011: (0.8392156862745098, 0.15294117647058825, 0.1568627450980392),
 2012: (0.5803921568627451, 0.403921568627451, 0.7411764705882353)}

In [8]:
# Scatter labour hours against production for the first, third and last year
# of the data set, one colour per year, and save the figure to ../ex1.png.
#
# The years are sorted *before* indexing so the selection is chronological
# (matching how color_dict is keyed) rather than dependent on row order.
selected_years = np.sort(df.Year.unique())[[0, 2, -1]]
for year in selected_years:
    year_rows = df[df.Year == year]  # filter once, reuse for both axes
    plt.scatter(
        year_rows.Labor_Hours,
        year_rows.Production_short_tons,
        color=color_dict[year],  # one RGB tuple applied to every point
        s=500,
        label=year,
    )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
plt.savefig("../ex1.png")



In [9]:
# List the names of the matplotlib style sheets available in this environment.
plt.style.available


Out[9]:
['seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-ticks',
 'fivethirtyeight',
 'seaborn-whitegrid',
 'classic',
 '_classic_test',
 'fast',
 'seaborn-talk',
 'seaborn-dark-palette',
 'seaborn-bright',
 'seaborn-pastel',
 'grayscale',
 'seaborn-notebook',
 'ggplot',
 'seaborn-colorblind',
 'seaborn-muted',
 'seaborn',
 'Solarize_Light2',
 'seaborn-paper',
 'bmh',
 'tableau-colorblind10',
 'seaborn-white',
 'dark_background',
 'seaborn-poster',
 'seaborn-deep']

In [10]:
# Apply the 'seaborn-colorblind' style sheet to the matplotlib figures that follow.
mpl.style.use('seaborn-colorblind')

In [ ]:


In [11]:
# Scatter labour hours against production for the first, third and last year
# of the data set. No explicit colour is passed, so the colours come from the
# currently active matplotlib style; markers are small (s=50).
#
# The years are sorted *before* indexing so the selection is chronological
# rather than dependent on row order.
selected_years = np.sort(df.Year.unique())[[0, 2, -1]]
for year in selected_years:
    year_rows = df[df.Year == year]  # filter once, reuse for both axes
    plt.scatter(
        year_rows.Labor_Hours,
        year_rows.Production_short_tons,
        s=50,
        label=year,
    )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend();



In [12]:
# Build one DataFrame from the Boston data set: feature columns plus the
# target as a final 'MEDV' column.
# NOTE(review): this rebinds `df`, which held the coal-production table above.
df_dict = load_boston()
features = pd.DataFrame(df_dict["data"], columns=df_dict["feature_names"])
target = pd.DataFrame({"MEDV": df_dict["target"]})
df = features.join(target)

In [13]:
# Add 'Zone', a categorical copy of the ZN column, then preview the frame.
df = df.assign(Zone=df["ZN"].astype("category"))
df.head()


Out[13]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV Zone
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0 18.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6 0.0
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7 0.0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4 0.0
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2 0.0

In [14]:
# Distribution of the target variable (MEDV): density curve plus rug marks,
# without the histogram bars.
fig, ax = plt.subplots(figsize=(6, 4))
sns.distplot(df["MEDV"], hist=False, rug=True, ax=ax);



In [15]:
# Bivariate kernel-density plot of LSTAT against MEDV.
fig, ax = plt.subplots(figsize=(10, 7))
sns.kdeplot(df["LSTAT"], df["MEDV"], ax=ax);



In [16]:
# Pairwise scatter plots (histograms on the diagonal) for the target and five
# features. scatter_matrix draws a whole grid of subplots, so it is given the
# desired figure size directly instead of a single pre-made Axes, which pandas
# would have to clear and replace (emitting a warning).
scatter_matrix(
    df[['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']],
    alpha=0.2,
    diagonal='hist',
    figsize=(10, 10),
);



In [17]:
# Seaborn's version of the pairwise plot for the same six columns, with
# small, half-transparent markers. pairplot creates its own figure.
sns.pairplot(
    df,
    vars=["MEDV", "LSTAT", "CRIM", "RM", "NOX", "DIS"],
    plot_kws=dict(s=20, alpha=0.5),
);


Quantile cuts


In [18]:
# Load the gzip-compressed raw players table.
players = pd.read_csv(
    filepath_or_buffer="../data/raw_players.csv.gz",
    compression="gzip",
)

In [19]:
# Preview the first rows of the raw players table.
players.head()


Out[19]:
playerShort birthday height weight position photoID rater1 rater2
0 aaron-hughes 08.11.1979 182.0 71.0 Center Back 3868.jpg 0.25 0.00
1 aaron-hunt 04.09.1986 183.0 73.0 Attacking Midfielder 20136.jpg 0.00 0.25
2 aaron-lennon 16.04.1987 165.0 63.0 Right Midfielder 13515.jpg 0.25 0.25
3 aaron-ramsey 26.12.1990 178.0 76.0 Center Midfielder 94953.jpg 0.00 0.00
4 abdelhamid-el-kaoutari 17.03.1990 180.0 73.0 Center Back 124913.jpg 0.25 0.25

In [20]:
# Labels for the weight bins, ordered from lightest to heaviest.
weight_categories = [
    "vlow_weight",
    "low_weight",
    "mid_weight",
    "high_weight",
    "vhigh_weight",
]

# Cut 'weight' into as many quantile bins as there are labels and store the
# resulting label for each player in a new 'weightclass' column.
players["weightclass"] = pd.qcut(
    players["weight"],
    q=len(weight_categories),
    labels=weight_categories,
)

In [21]:
# Preview the table again to check the new 'weightclass' column.
players.head()


Out[21]:
playerShort birthday height weight position photoID rater1 rater2 weightclass
0 aaron-hughes 08.11.1979 182.0 71.0 Center Back 3868.jpg 0.25 0.00 low_weight
1 aaron-hunt 04.09.1986 183.0 73.0 Attacking Midfielder 20136.jpg 0.00 0.25 low_weight
2 aaron-lennon 16.04.1987 165.0 63.0 Right Midfielder 13515.jpg 0.25 0.25 vlow_weight
3 aaron-ramsey 26.12.1990 178.0 76.0 Center Midfielder 94953.jpg 0.00 0.00 mid_weight
4 abdelhamid-el-kaoutari 17.03.1990 180.0 73.0 Center Back 124913.jpg 0.25 0.25 low_weight

BQPlot

The examples here are shamelessly borrowed from Maarten Breddels' excellent JupyterCon 2017 widgets notebook: https://github.com/maartenbreddels/jupytercon-2017/blob/master/jupytercon2017-widgets.ipynb


In [22]:
from IPython.display import YouTubeVideo

In [22]:
# Embed the YouTube video with this id as a 560 x 315 player.
YouTubeVideo("uHPcshgTotE", width=560, height=315)


Out[22]:

In [23]:
# mixed feelings about this import
import bqplot.pyplot as plt
import numpy as np

In [24]:
x = np.linspace(0, 2, 50)
y = x**2

In [25]:
fig = plt.figure()
scatter = plt.scatter(x, y)
plt.show()



In [27]:
fig.animation_duration = 500
scatter.y = x**.5

In [28]:
scatter.selected_style = {'stroke':'red', 'fill': 'orange'}
plt.brush_selector();

In [30]:
# Read back the scatter mark's current selection (see the recorded output below).
scatter.selected


Out[30]:
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]

In [34]:
# Set the selection from code instead of with the brush.
scatter.selected = [1,2,10,42,45]

ipyvolume


In [35]:
import ipyvolume as ipv

In [ ]:


In [36]:
# Number of points, and three coordinate arrays of N uniform samples from [0, 1).
N = 1000
x, y, z = np.random.random(size=(3, N))

In [37]:
# New ipyvolume figure with the random points drawn as box markers; keep
# handles to the figure and the scatter for the cells that follow.
fig = ipv.figure()
scatter = ipv.scatter(x=x, y=y, z=z, marker="box")
ipv.show()



In [54]:
# Shift every point along x by the same random offset: np.random.rand() with
# no arguments returns a single float in [0, 1), so the offset is in [-0.5, 0.5).
scatter.x = x + np.random.rand() - 0.5

In [58]:
# Colour all markers green and set the marker size to 5.
scatter.color = "green"
scatter.size = 5

In [56]:
# Assign an (N, 3) array of random values in [0, 1) as the marker colours
# (presumably one RGB triple per point -- confirm against the ipyvolume docs).
scatter.color = np.random.random((N,3))

In [57]:
# Set the marker size to 2.
scatter.size = 2

In [59]:
# Fetch ipyvolume's animated_stream example data set and keep its data array.
ex = ipv.datasets.animated_stream.fetch().data

In [60]:
# Inspect the dimensions of the fetched array.
ex.shape


Out[60]:
(6, 200, 1250)

In [61]:
# Shape after keeping every 4th entry along the last axis.
ex[:, :, ::4].shape


Out[61]:
(6, 200, 313)

In [62]:
# Animated quiver plot of the stream data on a dark-styled figure.
ipv.figure()
ipv.style.use('dark')
# Reuse the array already loaded into `ex` (every 4th entry along the last
# axis) instead of fetching the data set a second time. The array's first
# axis is unpacked into the six positional arguments of ipv.quiver.
quiver = ipv.quiver(*ex[:, :, ::4], size=5)
ipv.animation_control(quiver, interval=200)
ipv.show()
ipv.style.use('light')



In [63]:
# Change the glyph drawn for each quiver marker to the "cat" geometry.
quiver.geo = "cat"

In [68]:
# Point count for the export example (a larger option is N = 1000*1000).
N = 1000

# Three coordinate arrays of N uniform samples in [0, 1), stored as 32-bit floats.
x, y, z = np.random.random(size=(3, N)).astype(np.float32)

In [69]:
# Fresh ipyvolume figure showing the points with small (0.2) markers.
ipv.figure()
s = ipv.scatter(x, y, z, size=0.2)
ipv.show()



In [70]:
# Export the current ipyvolume view to an HTML file in the working directory.
ipv.save("3d-example-plot.html")

In [71]:
# Open the exported HTML file in the default browser. The standard-library
# `webbrowser` module replaces the macOS-only `open` shell command so this
# cell also works on Linux and Windows.
import webbrowser
from pathlib import Path

webbrowser.open(Path("3d-example-plot.html").resolve().as_uri());

In [72]:
# NOTE(review): `name` is not used by any cell visible in this notebook -- confirm whether it is still needed.
name = "Billy"

In [ ]: