Python Visualization Landscape


In [26]:
from IPython.lib.display import YouTubeVideo

In [27]:
# Embed YouTube video "FytuB8nFHPQ" as a 400x300 player in the cell output.
YouTubeVideo("FytuB8nFHPQ", width=400, height=300)


Out[27]:

In [1]:
from __future__ import absolute_import, division, print_function

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import seaborn as sns

# Global plot appearance: "poster" scaling on a white grid background.
# ('darkgrid' is the alternative style that was tried here.)
sns.set_context(context='poster')
sns.set_style(style='whitegrid')

# Default (width, height) for matplotlib figures.
plt.rcParams['figure.figsize'] = (12, 8)

In [4]:
import numpy as np
import pandas as pd

# `pandas.tools.plotting` was deprecated in pandas 0.20 and later removed;
# prefer the public `pandas.plotting` module and fall back for old installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
from sklearn.datasets import load_boston

import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

Note

This notebook uses the cleaned data produced by the Data Cleaning Notebook. See that notebook for details.


In [5]:
# Load the cleaned coal production data from a relative path.
df = pd.read_csv("../data/coal_prod_cleaned.csv")

In [6]:
# Preview the first rows of the coal data.
df.head()


Out[6]:
MSHA_ID Average_Employees Company_Type Labor_Hours Mine_Basin Mine_County Mine_Name Mine_State Mine_Status Mine_Type Operating_Company Operating_Company_Address Operation_Type Production_short_tons Union_Code Year
0 103295 18.0 Independent Producer Operator 39175.0 Appalachia Southern Bibb Seymour Mine Alabama Active Surface Hope Coal Company Inc P.O. Box 249, Maylene, AL 35114 Mine only 105082.0 NaN 2008
1 103117 19.0 Operating Subsidiary 29926.0 Appalachia Southern Cullman Mine #2, #3, #4 Alabama Active, men working, not producing Surface Twin Pines Coal Company Inc 1874 County Road 15, Bremen, AL 35033 Mine only 10419.0 NaN 2008
2 103361 20.0 Operating Subsidiary 42542.0 Appalachia Southern Cullman Cold Springs West Mine Alabama Active Surface Twin Pines Coal Company 74 Industrial Parkway, Jasper, AL 35502 Mine only 143208.0 NaN 2008
3 100759 395.0 Operating Subsidiary 890710.0 Appalachia Southern Fayette North River # 1 Underground Mi Alabama Active Underground Chevron Mining Inc 3114 County Road 63 S, Berry, AL 35546 Mine and Preparation Plant 2923261.0 United Mine Workers of America 2008
4 103246 22.0 Independent Producer Operator 55403.0 Appalachia Southern Franklin Bear Creek Alabama Active Surface Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Mine only 183137.0 NaN 2008

In [7]:
# Employees vs. labor hours for every row of the coal data.
plt.scatter(x=df['Average_Employees'], y=df['Labor_Hours'])
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked");


BQPlot

Examples here are shamelessly stolen from the amazing JupyterCon 2017 widgets notebook: https://github.com/maartenbreddels/jupytercon-2017/blob/master/jupytercon2017-widgets.ipynb


In [54]:
from IPython.display import YouTubeVideo

In [56]:
# Embed YouTube video "uHPcshgTotE" as a 560x315 player in the cell output.
YouTubeVideo("uHPcshgTotE", width=560, height=315)


Out[56]:

In [8]:
import bqplot as bq

In [16]:
# Work with a 100-row random subset of the coal data in the bqplot example.
# A fixed random_state makes the sample identical on every run.
sample_df = df.sample(n=100, random_state=0)

In [17]:
# Linear scales shared by the axes and the mark.
x_sc = bq.LinearScale()
y_sc = bq.LinearScale()

# One axis per scale, both with solid grid lines.
ax_x = bq.Axis(
    label='Number of Employees',
    scale=x_sc,
    grid_lines='solid',
)
ax_y = bq.Axis(
    label='Total Hours Worked',
    scale=y_sc,
    orientation='vertical',
    grid_lines='solid',
)

# Scatter mark (bound to the name `line`, which later cells read back).
# Clicking a point selects it; selected points are drawn opaque orange with
# a red stroke, unselected ones at half opacity.
line = bq.Scatter(
    x=sample_df.Average_Employees,
    y=sample_df.Labor_Hours,
    scales={'x': x_sc, 'y': y_sc},
    interactions={'click': 'select'},
    selected_style={'opacity': 1.0, 'fill': 'DarkOrange', 'stroke': 'Red'},
    unselected_style={'opacity': 0.5},
)

fig = bq.Figure(axes=[ax_x, ax_y], marks=[line], title='BQPlot Example')
fig



In [18]:
# Indices of the points currently selected on the scatter mark.
line.selected


Out[18]:
[63, 53, 85, 61]

In [19]:
# The selection can also be set from Python: select points 23 and 3.
line.selected = [23, 3]

In [20]:
import bqplot.pyplot as plt
import numpy as np

x = np.linspace(0, 2, 50)
y = x**2

fig = plt.figure()
scatter = plt.scatter(x, y)
plt.show()



In [26]:
fig.animation_duration = 5000
scatter.y = x**.5

In [27]:
scatter.selected_style = {'stroke':'red', 'fill': 'orange'}
plt.brush_selector();

In [28]:
# Indices of the points currently selected on the scatter mark.
scatter.selected


Out[28]:
[13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36]

In [29]:
# Set the selection from Python instead of with the brush.
scatter.selected = [1, 2, 10, 40]

In [ ]:


In [31]:
import ipyvolume as ipv
import numpy as np

In [32]:
# Run ipyvolume's bundled `example_ylm` demo.
ipv.example_ylm()



In [33]:
# 1000 random points; each coordinate array is drawn uniformly from [0, 1).
N = 1000
x, y, z = np.random.rand(3, N)

In [34]:
# New ipyvolume figure with the random points drawn as box markers.
# `fig` and `scatter` now refer to ipyvolume objects (earlier cells bound
# these names to bqplot ones).
fig = ipv.figure()
scatter = ipv.scatter(x, y, z, marker='box')
ipv.show()



In [35]:
# Shift every point by 0.1 along x by assigning a new array to the trait.
# Note: re-running this cell shifts the points again.
scatter.x = scatter.x + 0.1

In [36]:
# Recolor all points green and set the marker size to 5.
scatter.color = "green"
scatter.size = 5

In [37]:
# One random 3-component color per point (array of shape (N, 3)).
scatter.color = np.random.random((N,3))

In [38]:
# Set the marker size to 2.
scatter.size = 2

In [50]:
# Fetch ipyvolume's `animated_stream` example data set and keep its `.data`.
ex = ipv.datasets.animated_stream.fetch().data

In [52]:
# Dimensions of the fetched data.
ex.shape


Out[52]:
(6, 200, 1250)

In [53]:
# Shape after keeping only every 4th element along the last axis.
ex[:, :, ::4].shape


Out[53]:
(6, 200, 313)

In [ ]:


In [47]:
# Animated quiver plot, drawn with the dark style.
ipv.figure()
ipv.style.use('dark')
# Keep every 4th element along the last axis and unpack the leading axis
# into quiver's positional arguments.
stream = ipv.datasets.animated_stream.fetch().data
quiver = ipv.quiver(*stream[:, :, ::4], size=5)
ipv.animation_control(quiver, interval=200)
ipv.show()
# Switch back to the light style.
ipv.style.use('light')



In [40]:
# Select ipyvolume's light style.
ipv.style.use('light')

In [41]:
# Set the quiver's `geo` trait to "cat" (presumably swaps the glyph drawn for
# each element; confirm against the ipyvolume docs).
quiver.geo = "cat"

In [42]:
# One million random points, converted to 32-bit floats.
N = 10 ** 6
x, y, z = np.random.random((3, N)).astype(np.float32)

In [43]:
# Render the large point cloud with small (size=0.2) markers.
ipv.figure()
s = ipv.scatter(x, y, z, size=0.2)
ipv.show()



In [48]:
# Save via ipyvolume to an HTML file.
# NOTE(review): the file is named after bqplot although ipyvolume writes it;
# confirm the name is intended.
ipv.save("bqplot.html")

In [49]:
# Open the saved file with the shell's `open` command.
# NOTE(review): `open` is the macOS launcher; other platforms need a different command.
!open bqplot.html

In [12]:
# One seaborn palette color per distinct year in the data.
colors = sns.color_palette(n_colors=df.Year.nunique())

In [13]:
# Pair each year (in ascending order) with one palette color.
color_dict = dict(zip(sorted(df.Year.unique()), colors))

In [9]:
# Inspect the year -> RGB mapping.
color_dict


Out[9]:
{2008: (0.29803921568627451, 0.44705882352941179, 0.69019607843137254),
 2009: (0.33333333333333331, 0.6588235294117647, 0.40784313725490196),
 2010: (0.7686274509803922, 0.30588235294117649, 0.32156862745098042),
 2011: (0.50588235294117645, 0.44705882352941179, 0.69803921568627447),
 2012: (0.80000000000000004, 0.72549019607843135, 0.45490196078431372)}

In [10]:
# Labor hours vs. production for the years at positions 0, 2 and -1 of
# `df.Year.unique()`, one scatter series per year colored from `color_dict`.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    year_rows = df[df.Year == year]  # filter once instead of once per column
    plt.scatter(year_rows.Labor_Hours,
                year_rows.Production_short_tons,
                # `color=` rather than `c=`: a bare RGB tuple passed as `c`
                # is ambiguous with a sequence of values to be colormapped.
                color=color_dict[year],
                s=50,
                label=year,
               )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend()
# Write the current figure to disk.
plt.savefig("ex1.png")



In [11]:
import matplotlib as mpl

In [12]:
# List the style sheets available to matplotlib.
plt.style.available


Out[12]:
['_classic_test',
 'bmh',
 'classic',
 'dark_background',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark-palette',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'seaborn']

In [13]:
# Apply the 'seaborn-colorblind' style sheet (one of the names listed by
# `plt.style.available`).
mpl.style.use('seaborn-colorblind')

In [15]:
# Labor hours vs. production for three of the years, without explicit
# colors so each series takes its color from the active matplotlib style.
for year in sorted(df.Year.unique()[[0, 2, -1]]):
    in_year = df.Year == year
    plt.scatter(
        df.loc[in_year, 'Labor_Hours'],
        df.loc[in_year, 'Production_short_tons'],
        s=50,
        label=year,
    )
plt.xlabel("Total Hours Worked")
plt.ylabel("Total Amount Produced")
plt.legend();



In [16]:
# Switch data sets: from here on `df` holds the Boston housing data
# (feature columns plus the MEDV target) instead of the coal data.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2; confirm the
# installed scikit-learn version still provides it.
df_dict = load_boston()
features = pd.DataFrame(df_dict.data, columns=df_dict.feature_names)
target = pd.DataFrame(df_dict.target, columns=['MEDV'])
# Both frames share the default index, so joining appends MEDV column-wise.
df = features.join(target)
df.head()


Out[16]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2

In [17]:
# Target variable
fig, ax = plt.subplots(figsize=(6, 4))
sns.distplot(df.MEDV, ax=ax, rug=True, hist=False)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b76c160>

In [18]:
# Bivariate KDE of LSTAT against MEDV.
# NOTE(review): newer seaborn releases expect the two variables as x=/y=
# keywords; confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 7))
sns.kdeplot(df['LSTAT'], df['MEDV'], ax=ax)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f80dc18>

In [19]:
# Pairwise scatter plots (histograms on the diagonal) for the target and
# five features, using the pandas plotting helper.
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(
    df[['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS']],
    alpha=0.2,
    diagonal='hist',
    ax=ax,
);



In [20]:
# Seaborn pair plot for the target and five features, with small,
# semi-transparent markers.
sns.pairplot(
    df,
    vars=['MEDV', 'LSTAT', 'CRIM', 'RM', 'NOX', 'DIS'],
    plot_kws=dict(s=20, alpha=0.5),
);


Quantile cuts


In [21]:
# Load the raw players table from a gzip-compressed CSV (relative path).
players = pd.read_csv("../data/raw_players.csv.gz", compression='gzip')

In [22]:
# Preview the first rows of the players table.
players.head()


Out[22]:
playerShort birthday height weight position photoID rater1 rater2
0 aaron-hughes 08.11.1979 182.0 71.0 Center Back 3868.jpg 0.25 0.00
1 aaron-hunt 04.09.1986 183.0 73.0 Attacking Midfielder 20136.jpg 0.00 0.25
2 aaron-lennon 16.04.1987 165.0 63.0 Right Midfielder 13515.jpg 0.25 0.25
3 aaron-ramsey 26.12.1990 178.0 76.0 Center Midfielder 94953.jpg 0.00 0.00
4 abdelhamid-el-kaoutari 17.03.1990 180.0 73.0 Center Back 124913.jpg 0.25 0.25

In [23]:
# Labels for the weight bins, in ascending bin order.
weight_categories = [
    "vlow_weight",
    "low_weight",
    "mid_weight",
    "high_weight",
    "vhigh_weight",
]

# Cut `weight` into as many quantile bins as there are labels and store the
# resulting label for each player in a new `weightclass` column.
players['weightclass'] = pd.qcut(
    players['weight'],
    q=len(weight_categories),
    labels=weight_categories,
)

In [24]:
# Preview again; the table now includes the `weightclass` column.
players.head()


Out[24]:
playerShort birthday height weight position photoID rater1 rater2 weightclass
0 aaron-hughes 08.11.1979 182.0 71.0 Center Back 3868.jpg 0.25 0.00 low_weight
1 aaron-hunt 04.09.1986 183.0 73.0 Attacking Midfielder 20136.jpg 0.00 0.25 low_weight
2 aaron-lennon 16.04.1987 165.0 63.0 Right Midfielder 13515.jpg 0.25 0.25 vlow_weight
3 aaron-ramsey 26.12.1990 178.0 76.0 Center Midfielder 94953.jpg 0.00 0.00 mid_weight
4 abdelhamid-el-kaoutari 17.03.1990 180.0 73.0 Center Back 124913.jpg 0.25 0.25 low_weight

In [ ]: