Overview. We introduce and apply a new and exciting graphics package plotly. We show how we can leverage our knowledge of Matplotlib to jumpstart our usage of plotly. We then show how to access some of plotly's unique features to do things that are difficult or impossible with our knowledge of matplotlib.
Outline
pd.read_html
example to get latitude and longitude coordinates for country capitals
Note: requires internet access to run.
This Jupyter notebook was created by Dave Backus, Chase Coleman, and Spencer Lyon for the NYU Stern course Data Bootcamp.
import
statements. conda
and pip
: package managers for python. Install new packages using conda install package_name
or pip3 install package_name
.
We will need to have the plotly python package installed. To do this, enter the following from the command line (command prompt on Windows, terminal on Mac):
pip install plotly --upgrade
pip install cufflinks
conda install -c anaconda pandas-datareader=0.2.1
conda install -c anaconda html5lib=0.999
conda install -c anaconda lxml=3.7.3
Once you've done that, come back to this notebook and run the following cell to make sure plotly is installed properly.
In [ ]:
import numpy as np # foundation for Pandas
import pandas as pd # data package
from pandas_datareader import wb, data as web # worldbank data
import html5lib
import matplotlib.pyplot as plt # graphics module
import datetime as dt # date and time module
import seaborn.apionly as sns # fancy matplotlib graphics (no styling)
# plotly imports
import plotly # just to print version and init notebook
from plotly.offline import iplot, iplot_mpl # plotting functions
import plotly.graph_objs as go # ditto
# these lines make our graphics show up in the notebook
%matplotlib inline
plotly.offline.init_notebook_mode(connected=True)
import cufflinks as cf # gives us df.iplot that feels like df.plot
cf.set_config_file(offline=True, offline_show_link=False)
# check versions (overkill, but why not?)
print('Pandas version: ', pd.__version__)
print('Plotly version: ', plotly.__version__)
print('Today: ', dt.date.today())
Before we get too far, we'll need some data. Let's get some now.
First we will download national data from the World Bank for all countries in Europe. In order to put this data on a map we will need to have either the 3 letter ISO code for the country or latitude and longitude coordinates. We'll grab both here. This will be a little sophisticated, so bear with us.
We show these steps here so that you can re-use this code if you want to do something similar in the future.
Latitude and longitude coordinates: http://www.csgnetwork.com/llinfotable.html
use pd.read_html
: Read HTML tables into a list
of DataFrame
objects.
attrs = {'id': 'table'}
is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for any HTML tag
In [ ]:
# Scrape latitude/longitude coordinates for country capitals.
# pd.read_html returns a list of DataFrames; keep the first table whose
# HTML attributes match `table_attrs`.
table_attrs = {"align": "center", "cellpadding": 5, "bgcolor": "#FFFFFF"}
lat_lon_tables = pd.read_html(
    "http://www.csgnetwork.com/llinfotable.html",
    header=0,
    attrs=table_attrs,
)
lat_lon = lat_lon_tables[0]
In [ ]:
lat_lon.head()
In [ ]:
lat_lon.dtypes
In [ ]:
lat_lon['Latitude'].str.split("°")
In [ ]:
lat_lon['Latitude'].str.split("°").str.get(1)
#lat_lon['Latitude'].str.split("°").str.get(1).str.split("'")
#lat_lon['Latitude'].str.split("°").str.get(1).str.split("'").str[0].astype(float)
#lat_lon['Latitude'].str.split("°").str.get(1).str.split("'").str[0].astype(float)/60
#(lat_lon['Latitude'].str.split("°").str.get(1).str.split("'").str[0].astype(float)/60).astype(str).str.lstrip('0')
In [ ]:
# If the RHS is S, set the sign negative
#lat_lon['Latitude'].str.split("°").str.get(1).str.split("'").str[1] == 'S'
(lat_lon['Latitude'].str.split("°").str.get(1).str.split("'").str[1] == 'S').replace({True: '-', False: ""})
In [ ]:
# clean up so lat and long are numeric in degrees east and degrees north
def clean_latlon(series, to_negate):
    """Convert degree/minute coordinate strings to signed decimal degrees.

    Each entry is split on "°" into a degrees part and a remainder, and the
    remainder is split on "'" into a minutes part and a hemisphere letter
    (for example ``34°30'N``). The value is ``degrees + minutes / 60``,
    negated when the hemisphere letter equals ``to_negate``.

    The conversion is done numerically rather than by gluing string pieces
    together, and surrounding whitespace in each piece is ignored so that a
    stray space cannot silently prevent the sign flip.

    Parameters
    ----------
    series : pandas.Series
        Coordinate strings to convert.
    to_negate : str
        Hemisphere letter that marks a negative coordinate
        (the notebook uses "S" for latitude and "W" for longitude).

    Returns
    -------
    pandas.Series
        Float coordinates in decimal degrees, same index as ``series``.
    """
    # "34°30'N" -> ["34", "30'N"]
    deg_and_rest = series.str.split("°")
    degrees = deg_and_rest.str.get(0).str.strip().astype(float)

    # "30'N" -> ["30", "N"]
    min_and_hemi = deg_and_rest.str.get(1).str.split("'")
    minutes = min_and_hemi.str.get(0).str.strip().astype(float)
    hemisphere = min_and_hemi.str.get(1).str.strip()

    # -1 for the hemisphere to negate, +1 for everything else
    sign = (hemisphere == to_negate).map({True: -1.0, False: 1.0})
    return sign * (degrees + minutes / 60)
# Convert both coordinate columns in place; "S" latitudes and "W" longitudes
# are the ones that become negative.
for column, negative_hemisphere in (("Latitude", "S"), ("Longitude", "W")):
    lat_lon[column] = clean_latlon(lat_lon[column], negative_hemisphere)

# drop the "Capital" column
lat_lon = lat_lon.drop("Capital", axis="columns")
#lat_lon = lat_lon.set_index("Country")
lat_lon.head()
In [ ]:
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
In [ ]:
import requests
from bs4 import BeautifulSoup
In [ ]:
# Download the UN M49 page that lists country names and ISO codes, then parse
# the HTML so the table can be pulled out in the next cells.
url = 'https://unstats.un.org/unsd/methodology/m49/'
# a timeout keeps the notebook from hanging forever if the server stalls
iso_raw = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of parsing an error page
iso_raw.raise_for_status()
iso_soup = BeautifulSoup(iso_raw.content, 'html.parser')
In [ ]:
iso_soup.find_all('table')[0]
Efficient way of scraping: if the html contains 'table' tags, we can pass it straight to 'pd.read_html' (sometimes it doesn't work with the url).
In [ ]:
#pd.read_html?
In [ ]:
from io import StringIO

# Parse the first <table> on the page once, show the raw list that
# pd.read_html returns, then keep its first DataFrame as `iso`.
# The HTML is wrapped in StringIO because read_html takes a file-like object;
# newer pandas releases warn when literal HTML is passed as a plain string.
first_table_html = str(iso_soup.find_all('table')[0])
iso_tables = pd.read_html(StringIO(first_table_html), header=0)
print(iso_tables)
iso = iso_tables[0]
In [ ]:
iso.shape
In [ ]:
# shorter column names, and drop the "M49 code" column
column_names = {"ISO-alpha3 code": "ISO", "Country or Area": "Country"}
iso = iso.rename(columns=column_names).drop("M49 code", axis=1)
#iso = iso.set_index("Country")
iso.head()
In [ ]:
iso.tail()
In [ ]:
# Names of European countries; used below with iso['Country'].isin(europe)
# to pick the matching rows of `iso`.
# NOTE(review): this list has 51 entries, not 44 -- presumably 44 was the
# number of names that actually match a "Country" value in `iso`; confirm.
europe = ["Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus",
"Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus",
"Czech Republic", "Denmark", "Estonia", "Finland", "France", "Georgia",
"Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy",
"Kazakhstan", "Kosovo", "Latvia", "Liechtenstein", "Lithuania",
"Luxembourg", "Macedonia", "Malta", "Moldova", "Monaco", "Montenegro",
"Netherlands", "Norway", "Poland", "Portugal", "Romania", "Russia",
"San Marino", "Serbia", "Slovakia", "Slovenia", "Spain", "Sweden",
"Switzerland", "Turkey", "Ukraine", "United Kingdom", "Vatican City"]
In [ ]:
iso[iso['Country'].isin(europe)]
In [ ]:
def euro_wb_data(indicators, year=2013):
    """Download World Bank indicators for the countries listed in ``europe``.

    The ISO codes are taken from the rows of the module-level ``iso`` table
    whose "Country" appears in ``europe``. Data is requested for the single
    year ``year`` (both start and end).

    Parameters
    ----------
    indicators : list of str
        World Bank indicator codes, e.g. "IP.JRN.ARTC.SC"
        ("scientific and technical journal articles").
        Other indicators: http://data.worldbank.org/indicator
    year : int, optional
        Year to download (default 2013).

    Returns
    -------
    pandas.DataFrame
        Indexed by "Country", with "year" as a regular column; rows
        containing missing values are dropped.
    """
    in_europe = iso['Country'].isin(europe)
    iso_codes = iso.loc[in_europe, "ISO"]

    result = wb.download(country=iso_codes, indicator=indicators,
                         start=year, end=year)

    # move "year" out of the index and label what is left
    result = result.reset_index(level="year")
    result.index.name = "Country"

    # some countries didn't have data, so leave them out
    return result.dropna()
# journal-article counts per country, with "Country" as an ordinary column
papers = (
    euro_wb_data(["IP.JRN.ARTC.SC"])
    .rename(columns={"IP.JRN.ARTC.SC": "publications"})
    .reset_index()
)
papers.head()
In [ ]:
# attach the ISO columns; a left join keeps every row of `papers`
papers = papers.merge(iso, on='Country', how='left')
papers.head()
In [ ]:
papers.shape
In [ ]:
# attach latitude/longitude; a left join keeps every row of `papers`
papers = papers.merge(lat_lon, on='Country', how='left')
papers.head()
In [ ]:
papers.shape
In [ ]:
papers = papers.set_index('Country')
papers.head()
In [ ]:
# Earnings by school and gender, taken from the plotly docs
# (original source unknown).
url = "https://raw.githubusercontent.com/plotly/datasets/master/school_earnings.csv"
earnings = pd.read_csv(url)
print(earnings.head())

# index by school name and order the rows by the "Women" column
earnings = earnings.set_index("School").sort_values("Women")
print("\n\nAfter set_index and sort_values:\n")
print(earnings.head())
In [ ]:
# tips at restaurants in NYC. Source unknown, but classic dataset
tips = sns.load_dataset("tips")
tips.head()
In [ ]:
# info on titanic passengers. Source unknown.
titanic = sns.load_dataset("titanic")
titanic.head()
Plotly is a javascript based plotting library. Plotly leverages industry grade javascript technologies to provide great flexibility and good performance.
Being a javascript library, plotly graphics are inherently interactive and meant to be viewed in a web browser. The good news is that we can embed our interactive plots in any website: Jupyter notebooks, blog posts, etc. The great news is that we don't have to write any javascript ourselves!
The plotly project was started about five years ago. Over that time, plotly has transitioned between three phases:
As a warmup, let's utilize our expertise of Matplotlib to quickly generate some basic plotly graphics.
The main steps in this process are:
Figure
object (usually named fig
in our examples) to the function iplot_mpl
.
That's it!
Disclaimer: the functions that convert matplotlib figures to plotly figures are not perfect. We'll see some issues below, but will show how to build the plots using plotly's API so they look as we expect.
We'll start by looking at some examples from the seaborn documentation. The actual figures are not important here. We are mostly concerned with how well matplotlylib can take a matplotlib figure and construct a plotly figure.
In [ ]:
ax = sns.swarmplot(x="day", y="total_bill", data=tips)
fig_mpl = ax.get_figure()
Now let's convert our Matplotlib figure fig_mpl
into a plotly figure named fig_py
. To do this we will use the function iplot_mpl
as follows:
In [ ]:
iplot_mpl(fig_mpl)
For this example we see that the converter did a decent job, though it didn't quite get the xlabels correct
Let's try another example
In [ ]:
out = sns.pointplot(x="class", y="survived", hue="sex", data=titanic,
palette={"male": "g", "female": "m"},
markers=["^", "o"], linestyles=["-", "--"]);
In [ ]:
iplot_mpl(out.get_figure())
What worked well in this conversion? What didn't work?
Let's do one more example using the college graduate data
In [ ]:
# one tall matplotlib figure whose axes are shared by both bar plots
fig_mpl, ax = plt.subplots(figsize=(6, 10))
# draw the "Men" bars, then the "Women" bars on the same axes
earnings.plot.barh(ax=ax, y="Men", color="Blue")
earnings.plot.barh(ax=ax, y="Women", color="Pink")
In [ ]:
iplot_mpl(fig_mpl)
What did and didn't work here?
Below we'll recreate this same figure using plotly's api and overcome these issues
Let's now consider how to use plotly's own API to construct plots instead of building the graphics through matplotlib.
Plotly has over 20 core chart types and many more can be created by combining one or more chart types in the same figure. We don't have time to cover all of them here, but please check out the documentation.
Plotly has a purely declarative API. This means that we describe all the features we want in our figure at once, without worrying about which functions to call in what order.
Plotly achieves this by fully describing the plot in a data format called JSON. For our purposes we can think of JSON as dictionaries, where values can be of any type, including other dictionaries.
Plotly figures are composed of two things:
The trace
s describe the data that should be plotted as well as how it should be displayed. Here's an example of a trace defining a scatter plot:
trace = dict(type="scatter", # trace type
x=[1, 2, 3], # x data
y=[1, 4, 9], # y data
name="Squares" # legend label
)
In this example, x
, y
, name
and type
are called the attributes of the trace. All traces have a type
attribute that describes the type of chart to generate for a particular piece of data. Here we chose scatter
, which is what plotly calls scatter plots or line plots.
An example of a layout
is
l = dict(title="Penguins food", # plot title
yaxis=dict(title="Quantity (%)", # yaxis label
range=(0, 1) # set limits for y axis
)
)
Notice that the value associated with yaxis
had type dict
. This allowed us to control features of the yaxis.
For an overwhelmingly comprehensive overview of all trace types and their associated attributes (everything plotly can do) see the chart attribute reference in the plotly python documentation
After we have defined one or more traces and a layout, we build the plotly figure using the function plotly.graph_objs.Figure
. The imports up top allow us to refer to this function as go.Figure
. This is how we call the function:
fig = go.Figure(data=D, layout=L)
where D
is a list of traces and L
describes the layout.
Finally, the last thing we need to know how to do is display the figure. In the notebook we will use the function plotly.offline.iplot
, which we imported directly as iplot
. To display the figure above we would do
iplot(fig)
To get a feel for what this looks like, let's revisit the horizontal bar chart using the college grad earnings data.
Here's how we might build that figure using plotly's API
In [ ]:
# the long way: plain dicts for every trace and for the layout
def _earnings_bar(column, color):
    """Return a horizontal bar trace for one column of ``earnings``."""
    return dict(type="bar",                  # trace type
                orientation="h",             # make bars horizontal
                name=column,                 # legend entry
                x=earnings[column],          # x data
                y=earnings.index,            # y data
                marker={"color": color})     # bar color


men = _earnings_bar("Men", "Blue")
women = _earnings_bar("Women", "Pink")

layout = {
    "width": 650,                                      # plot width
    "height": 750,                                     # plot height
    "yaxis": {"title": "School"},                      # yaxis label
    "title": "Gender earnings disparity",              # title
    "xaxis": {"title": "Annual Salary (thousands)"},   # xaxis label
}

iplot(go.Figure(data=[men, women], layout=layout))
Exercise: now generate a similar plot using the df.iplot
method. (Hint you can reuse the layout
object from above)
In [ ]:
earnings[['Men', 'Women']].iplot(kind='barh', layout=layout)
Example: Dumbbell plot. The striking fact in the data is that there is a gap between earnings of men and women. To highlight that, here's another way we might visualize the same data:
In [ ]:
# one dot trace per gender: same attributes, only the column and color differ
men2, women2 = (
    dict(type="scatter",
         name=column,                              # legend entry
         mode="markers",                           # draw dots
         x=earnings[column],                       # x data
         y=earnings.index,                         # y data
         marker={"color": dot_color, "size": 12})  # dot color/size
    for column, dot_color in (("Men", "Blue"), ("Women", "Pink"))
)
def draw_line(row, start="Women", end="Men", color="gray"):
    """Build a flat line trace joining two values of one row.

    Intended for ``DataFrame.apply(draw_line, axis=1)``: the row's ``name``
    (its index label) is used both as the y position and as the trace name.

    Parameters
    ----------
    row : pandas.Series
        One row of the data; must contain the ``start`` and ``end`` labels.
    start, end : str, optional
        Labels of the two values the line connects. The defaults ("Women",
        "Men") reproduce the original hard-coded behavior.
    color : str, optional
        Line color (default "gray").

    Returns
    -------
    dict
        A plotly "scatter" trace in "lines" mode with no legend entry.
    """
    label = row.name
    return dict(type="scatter",               # trace type
                x=[row[start], row[end]],     # x data
                y=[label, label],             # y data flat
                mode="lines",                 # draw line
                name=label,                   # name trace
                showlegend=False,             # no legend entry
                line={"color": color})        # line color
# one connecting line per school (row of `earnings`)
lines = [draw_line(school_row) for _, school_row in earnings.iterrows()]

# dots first, then all of the connecting lines
data = [men2, women2, *lines]

# build and display the figure
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Exercise: Look at the figure attribute reference and figure out how to remove the grid lines from the figure above. First remove the vertical ones, then horizontal, then both. (Hint: Look for an attribute on the xaxis
and yaxis
of the layout) (Hint 2 you can get the layout by doing fig.layout
)
In [ ]:
fig.layout
In [ ]:
# take the layout of the existing figure and switch off both sets of grid lines
layout = fig.layout
for axis_name in ("xaxis", "yaxis"):
    layout[axis_name]['showgrid'] = False

fig = go.Figure(data=data, layout=layout)
iplot(fig)
There are two map-based traces in plotly:
scattergeo
: this allows you to draw lines or dots on a map
choropleth
: this allows you to fill regions with different colors
There is also the geo
layout attribute. We'll look at it in the examples below.
In [ ]:
# layout and marker settings shared by the map plots below
layout = {
    "geo": {"scope": "europe", "resolution": 50},
    "width": 750,
    "height": 550,
}

# color and size of each marker both come from the publication count
publications = papers["publications"]
marker = dict(color=publications,
              size=publications/5000,
              colorscale="Reds",
              colorbar={"title": "# of papers"})
scattergeo
dots
We'll look at the scattergeo
trace type first. Suppose we want to draw dots on the map. There are two possible sets of trace attributes we can work with:
lat
and lon
each to a list that specifies the latitude and longitude for each point, respectively
locationmode
to be one of "ISO-3"
, "USA-states"
, or "country names"
and then set locations
to be a valid member of that mode.
We can then set any other attributes
Let's see an example of each version:
In [ ]:
# using location mode: points are placed by ISO country code
trace = {
    "type": "scattergeo",          # trace type
    "mode": "markers",             # draw points
    "locations": papers["ISO"],    # use ISO code
    "marker": marker,              # marker settings (size, color, ...)
}

iplot(go.Figure(data=[trace], layout=layout), link_text="")
In [ ]:
# using lat/lon mode: points are placed by explicit coordinates
trace = {
    "type": "scattergeo",          # trace type
    "mode": "markers",             # draw dots
    "lat": papers["Latitude"],     # latitude coordinate
    "lon": papers["Longitude"],    # longitude coordinate
    "marker": marker,              # marker settings (color, size...)
}

iplot(go.Figure(data=[trace], layout=layout), link_text="")
In [ ]:
papers["Latitude"][papers.ISO == "ITA"]
In [ ]:
papers["Latitude"][papers.ISO == "ITA"].iloc[0]
In [ ]:
def get_lat_lon_for(df, iso):
    """Return ``(latitude, longitude)`` of the first row with the given ISO code.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have "ISO", "Latitude" and "Longitude" columns.
    iso : str
        ISO code to look up in ``df["ISO"]``.

    Returns
    -------
    tuple of float
        ``(latitude, longitude)`` taken from the first matching row.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``iso`` in its "ISO" column.
    """
    # build the mask once and select both coordinate columns with it
    matches = df.loc[df["ISO"] == iso, ["Latitude", "Longitude"]]
    if matches.empty:
        # same exception type as positional indexing into an empty
        # selection, but the message says which code was missing
        raise IndexError("no row with ISO code {!r}".format(iso))
    first = matches.iloc[0]
    return float(first["Latitude"]), float(first["Longitude"])
# every line starts at Italy's coordinates
italy_lat, italy_lon = get_lat_lon_for(papers, "ITA")

# one line trace from Italy to each of the other countries
traces = []
for country in ["FRA", "ESP", "DEU"]:
    lat, lon = get_lat_lon_for(papers, country)
    trace = {
        "type": "scattergeo",                  # trace type
        "mode": "lines",                       # draw lines
        "lat": [italy_lat, lat],               # latitude coordinates
        "lon": [italy_lon, lon],               # longitude coordinates
        "line": {"width": 4.0},                # thick lines
        "name": "ITA to {}".format(country),   # legend entry
    }
    traces.append(trace)

iplot(go.Figure(data=traces, layout=layout))
In [ ]:
# fill each country according to its publication count
trace = {
    "type": "choropleth",
    "locations": papers["ISO"],     # use ISO names
    "z": papers["publications"],    # defines the color
    "colorscale": "Viridis",        # change palette
    "text": papers.index,           # change text on hover
}

# reuse the same layout
iplot(go.Figure(data=[trace], layout=layout), link_text="")
This example was taken directly from the plotly python documentation. See here
In [ ]:
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')

# turn every column into text so the pieces can be concatenated below
for col in df.columns:
    df[col] = df[col].astype(str)

# custom colorscale: [position, color] pairs from 0.0 to 1.0
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]

# hover text for each state; '<br>' separates the lines
df['text'] = (
    df['state'] + '<br>'
    + 'Beef ' + df['beef'] + ' Dairy ' + df['dairy'] + '<br>'
    + 'Fruits ' + df['total fruits'] + ' Veggies ' + df['total veggies'] + '<br>'
    + 'Wheat ' + df['wheat'] + ' Corn ' + df['corn']
)

state_trace = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df['code'],
    # total exports were converted to text above, so convert back for coloring
    'z': df['total exports'].astype(float),
    'locationmode': 'USA-states',
    'text': df['text'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Millions USD"},
}
data = [state_trace]

layout = {
    'title': '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

iplot(go.Figure(data=data, layout=layout), link_text="")
In [ ]:
# the curve: x = t + t**2, y = t - t**2, sampled at 100 values of t in [-1, 1]
t = np.linspace(-1, 1, 100)
x = t + t**2
y = t - t**2

# fixed axis limits: the data range padded by 1.5 on each side
xm, xM = x.min() - 1.5, x.max() + 1.5
ym, yM = y.min() - 1.5, y.max() + 1.5

# positions of the moving marker: the same curve sampled at N points
N = 50
s = np.linspace(-1, 1, N)
xx = s + s**2
yy = s - s**2

# the curve appears twice in `data`, exactly as in the original cell
data = [
    dict(x=x, y=y, mode='lines', line=dict(width=2, color='blue'))
    for _ in range(2)
]

layout = dict(
    xaxis=dict(range=[xm, xM], autorange=False, zeroline=False),
    yaxis=dict(range=[ym, yM], autorange=False, zeroline=False),
    title='Kinematic Generation of a Planar Curve',
    hovermode='closest',
    # a single "Play" button that uses the 'animate' method
    updatemenus=[{
        'type': 'buttons',
        'buttons': [{'label': 'Play', 'method': 'animate', 'args': [[]]}],
    }],
)

# one frame per sampled point: a single red marker at (xx[k], yy[k])
frames = [
    dict(data=[dict(x=[marker_x],
                    y=[marker_y],
                    mode='markers',
                    marker=dict(color='red', size=10))])
    for marker_x, marker_y in zip(xx, yy)
]

figure1 = dict(data=data, layout=layout, frames=frames)
iplot(figure1)
Another one from the docs https://plot.ly/python/candlestick-charts/#custom-candlestick-colors
In [ ]:
from plotly.tools import FigureFactory as FF
from plotly.graph_objs import Line, Marker
from datetime import datetime

# daily AAPL prices from the 'yahoo' data source
df = web.DataReader("aapl", 'yahoo', datetime(2008, 1, 1), datetime(2009, 4, 1))

# The original cell first built an un-styled candlestick figure here and then
# overwrote it without ever using it; that dead call has been removed.

# price columns shared by both candlestick calls
ohlc = (df.Open, df.High, df.Low, df.Close)

# Make increasing ohlc sticks and customize their color and name
fig_increasing = FF.create_candlestick(*ohlc, dates=df.index,
                                       direction='increasing', name='AAPL',
                                       marker=Marker(color='rgb(150, 200, 250)'),
                                       line=Line(color='rgb(150, 200, 250)'))

# Make decreasing ohlc sticks and customize their color and name
fig_decreasing = FF.create_candlestick(*ohlc, dates=df.index,
                                       direction='decreasing',
                                       marker=Marker(color='rgb(128, 128, 128)'),
                                       line=Line(color='rgb(128, 128, 128)'))

# Initialize the figure
fig = fig_increasing

# Add decreasing data with .extend()
fig['data'].extend(fig_decreasing['data'])

iplot(fig)
In [ ]: