In [ ]:
%matplotlib inline
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

try:
    from pandas.plotting import scatter_matrix
except ImportError:  # pandas < 0.20 only ships the legacy module path
    from pandas.tools.plotting import scatter_matrix
In [ ]:
# URL of the ENB2012 spreadsheet hosted on the UCI machine-learning archive.
ENERGY = "http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
In [ ]:
def download_data(url, path='data'):
    """Download the file at ``url`` into the directory ``path``.

    The directory is created when missing (including any parent
    directories) and the file is saved under the last component of the URL.

    Args:
        url: Address of the file to fetch.
        path: Directory to save into; defaults to ``'data'``.

    Returns:
        The path of the file that was written.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # makedirs, unlike mkdir, also copes with nested target directories.
    if not os.path.exists(path):
        os.makedirs(path)

    response = requests.get(url)
    # Fail loudly instead of saving an error page under the data file's name.
    response.raise_for_status()

    target = os.path.join(path, os.path.basename(url))
    with open(target, 'wb') as f:
        f.write(response.content)
    return target
In [ ]:
# Fetch the spreadsheet into the default 'data' directory.
download_data(ENERGY)
In [ ]:
# Load the downloaded workbook. No delimiter is passed: an Excel workbook is
# not delimited text, and read_excel has no `sep` option.
energy = pd.read_excel('data/ENB2012_data.xlsx')
In [ ]:
# Peek at the first few rows to check that the load worked.
energy.head()
In [ ]:
# Give the ten columns descriptive snake_case names, in order.
energy.columns = [
    'compactness',
    'surface_area',
    'wall_area',
    'roof_area',
    'height',
    'orientation',
    'glazing_area',
    'distribution',
    'heating_load',
    'cooling_load',
]
In [ ]:
# Summary statistics for each column.
energy.describe()
Pandas has a bunch of really useful visualization tools. (Hint: They're all Matplotlib under the hood. Most Python viz libraries are wrappers for Matplotlib!)
In [ ]:
# Switch Matplotlib to the ggplot style sheet, which is a little bit
# nicer-looking than the standard style.
plt.style.use('ggplot')
In [ ]:
# Draw every column as an unstacked (overlaid) area chart.
energy.plot(
    kind='area',
    stacked=False,
    figsize=(20, 10),
)
In [ ]:
# Roof area against cooling load, with points colored by surface area.
energy.plot(
    kind='scatter',
    x='roof_area',
    y='cooling_load',
    c='surface_area',
    figsize=(20, 10),
)
In [ ]:
# Wall area against heating load; marker size scales with glazing area
# (multiplied by 500 so the differences are visible).
energy.plot(
    kind='scatter',
    x='wall_area',
    y='heating_load',
    s=energy['glazing_area'] * 500,
    figsize=(20, 10),
)
In [ ]:
# One box plot per column.
energy.plot(
    kind='box',
    figsize=(20, 10),
)
In [ ]:
# Half-transparent histogram of the compactness column.
energy['compactness'].plot(
    kind='hist',
    alpha=0.5,
    figsize=(20, 10),
)
In [ ]:
# A grid of histograms, one per column.
energy.hist(
    figsize=(20, 10),
)
In [ ]:
# Kernel density estimate of the wall_area column.
energy['wall_area'].plot(
    kind='kde',
)
In [ ]:
# Pairwise scatter plots of the four area columns, with a kernel density
# estimate on the diagonal.
areas = energy[[
    'glazing_area',
    'roof_area',
    'surface_area',
    'wall_area',
]]
scatter_matrix(
    areas,
    alpha=0.2,
    figsize=(18, 18),
    diagonal='kde',
)
Sometimes you'll want to do something a bit more custom (or you'll want to figure out how to tweak the labels, change the colors, make small multiples, etc.), so you'll want to go straight to the Matplotlib documentation.
Matplotlib is the O.G. visualization library for Python. If you've used MATLAB or Mathematica before, it will immediately start to look familiar.
It produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms. matplotlib can be used in python scripts, the Python and iPython shell, web application servers, and six graphical user interface toolkits.
In [ ]:
# Four sample points, each labelled with a name on the x axis.
x = [1, 2, 3, 4]
y = [1, 4, 9, 6]
labels = ['Frogs', 'Hogs', 'Bogs', 'Slogs']

# Work on the current Axes/Figure explicitly rather than through the
# pyplot shortcuts.
axes = plt.gca()
axes.plot(x, y, 'ro')

# You can specify a rotation for the tick labels in degrees or with keywords.
axes.set_xticks(x)
axes.set_xticklabels(labels, rotation=30)

# Pad margins so that markers don't get clipped by the axes
axes.margins(0.2)

# Tweak spacing to prevent clipping of tick-labels
plt.gcf().subplots_adjust(bottom=0.15)

plt.show()
See also: Matplotlib colormaps
In [ ]:
# Make up some fake data: a sine surface sampled on a 50x50 grid
x = np.linspace(-np.pi, np.pi, 50)
y = np.linspace(-np.pi, np.pi, 50)
X, Y = np.meshgrid(x, y)
Z = np.sin(X + Y/4)

fig = plt.figure(figsize=(12, 2.5))
fig.subplots_adjust(wspace=0.3)

# One panel per colormap, as (colormap name, panel title). The colormap is
# passed by its registered name, which pcolormesh resolves itself; this
# avoids plt.cm.get_cmap, which newer Matplotlib releases no longer provide.
panels = [
    ('Blues', 'Sequential'),
    ('RdBu', 'Diverging'),
    ('plasma', 'Fancy!'),
]
for position, (cmap_name, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.pcolormesh(X, Y, Z, cmap=cmap_name)
    plt.colorbar()
    plt.axis([-3, 3, -3, 3])
    plt.title(title)
Seaborn is another great Python visualization library to have up your sleeve.
Seaborn is a Python visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. For a brief introduction to the ideas behind the package, you can read the introductory notes. More practical information is on the installation page. You may also want to browse the example gallery to get a sense for what you can do with seaborn and then check out the tutorial and API reference to find out how.
Seaborn has a lot of the same methods as Pandas, like boxplots and histograms (albeit with slightly different syntax!), but also comes with some novel tools...
Violinplots are a combination of a boxplot and a kernel density estimate. Very useful for visualizing categorical data!
In [ ]:
# White background with grid lines for the plot below.
sns.set_style('whitegrid')

# Distribution of cooling load for each value of height.
sns.violinplot(
    data=energy,
    x='height',
    y='cooling_load',
)
See more: https://seaborn.pydata.org/tutorial/regression.html
In [ ]:
# Regression of cooling load on wall area. x_estimator=np.mean plots the
# mean cooling load at each distinct wall_area value instead of every point.
sns.regplot(
    data=energy,
    x='wall_area',
    y='cooling_load',
    x_estimator=np.mean,
)
In [ ]:
sns.set(style="ticks")

# Create a dataset with many short random walks:
# 20 walks of 5 steps, each increment drawn from {-1, 0, 1}.
rs = np.random.RandomState(4)
pos = rs.randint(-1, 2, (20, 5)).cumsum(axis=1)
pos -= pos[:, 0, np.newaxis]  # shift every walk so that it starts at 0
step = np.tile(range(5), 20)
walk = np.repeat(range(20), 5)
df = pd.DataFrame(np.c_[pos.flat, step, walk],
                  columns=["position", "step", "walk"])

# Initialize a grid of plots with an Axes for each walk.
# seaborn 0.9 renamed the per-facet `size` argument to `height` and later
# releases dropped the old name, so try the current spelling first and fall
# back to the legacy one on older installs.
facet_kwargs = dict(col="walk", hue="walk", col_wrap=5)
try:
    grid = sns.FacetGrid(df, height=1.5, **facet_kwargs)
except TypeError:
    grid = sns.FacetGrid(df, size=1.5, **facet_kwargs)

# Draw a horizontal line to show the starting point
grid.map(plt.axhline, y=0, ls=":", c=".5")

# Draw a line plot to show the trajectory of each random walk
grid.map(plt.plot, "step", "position", marker="o", ms=4)

# Adjust the tick positions and labels
grid.set(xticks=np.arange(5), yticks=[-3, 3],
         xlim=(-.5, 4.5), ylim=(-3.5, 3.5))

# Adjust the arrangement of the plots
grid.fig.tight_layout(w_pad=1)
In [ ]:
sns.set()

# Load the long-form example flights dataset and pivot it into a wide
# month-by-year table of passenger counts. The pivot arguments are passed by
# keyword because recent pandas releases no longer accept them positionally.
flights_long = sns.load_dataset('flights')
flights = flights_long.pivot(index='month', columns='year', values='passengers')

# Draw a heatmap with the numeric values in each cell
sns.heatmap(flights, annot=True, fmt='d', linewidths=.5)
In [ ]:
from string import ascii_letters as letters

sns.set(style="white")

# Generate a large random dataset: 100 normal draws for each of 26 columns
# named after the first 26 ASCII letters
rs = np.random.RandomState(33)
d = pd.DataFrame(data=rs.normal(size=(100, 26)),
                 columns=list(letters[:26]))

# Compute the correlation matrix
corr = d.corr()

# Generate a mask for the upper triangle (diagonal included).
# The builtin `bool` is used as the dtype; the `np.bool` alias is gone from
# current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)