Plotting and Visualization


In [1]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

In [2]:
%matplotlib inline

In [3]:
%pwd


Out[3]:
u'/Users/pmui/datascience/lecture04.visualization'

A brief matplotlib API primer


In [4]:
import matplotlib.pyplot as plt

Figures and Subplots


In [3]:
fig = plt.figure()


<matplotlib.figure.Figure at 0x10ffba1d0>

In [14]:
ax1 = fig.add_subplot(2, 2, 1)

In [15]:
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)

The 'k--' is a style option instructing matplotlib to plot a black dashed line. The objects returned by fig.add_subplot above are AxesSubplot objects, on which you can directly plot on the other empty subplots by calling each one’s instance methods,


In [16]:
from numpy.random import randn

# "random walk"
plt.plot(randn(500).cumsum(), 'k--')


Out[16]:
[<matplotlib.lines.Line2D at 0x113817250>]

In [17]:
_ =ax1.hist(randn(100), bins=20, color='k', alpha=0.3)
ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))


Out[17]:
<matplotlib.collections.PathCollection at 0x11391f290>

In [18]:
plt.close('all')

In [19]:
fig, axes = plt.subplots(2, 3)
axes


Out[19]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x113ab3690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x113c5d4d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x113ce0510>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x113d46110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x113dc5590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x113e12e10>]], dtype=object)

Adjusting the spacing around subplots


In [20]:
plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                wspace=None, hspace=None)


<matplotlib.figure.Figure at 0x113f400d0>

In [21]:
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
    for j in range(2):
        axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)
plt.subplots_adjust(wspace=0, hspace=0)



In [32]:
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
    for j in range(2):
        axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)
plt.subplots_adjust(wspace=0, hspace=0)


Colors, markers, and line styles


In [14]:
plt.figure()


Out[14]:
<matplotlib.figure.Figure at 0x1132a3750>
<matplotlib.figure.Figure at 0x1132a3750>

In [15]:
plt.plot(randn(30).cumsum(), 'ko--')


Out[15]:
[<matplotlib.lines.Line2D at 0x113f49410>]

In [16]:
plt.close('all')

In [17]:
data = randn(30).cumsum()
plt.plot(data, 'k--', label='Default')
plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')
plt.legend(loc='best')


Out[17]:
<matplotlib.legend.Legend at 0x1132df710>

Ticks, labels, and legends

Setting the title, axis labels, ticks, and ticklabels


In [18]:
fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)
ax.plot(randn(1000).cumsum())

ticks = ax.set_xticks([0, 250, 500, 750, 1000])
labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],
                            rotation=30, fontsize='small')
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')


Out[18]:
<matplotlib.text.Text at 0x11438f5d0>

Adding legends


In [19]:
fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)
ax.plot(randn(1000).cumsum(), 'k', label='one')
ax.plot(randn(1000).cumsum(), 'k--', label='two')
ax.plot(randn(1000).cumsum(), 'k.', label='three')

ax.legend(loc='best')


Out[19]:
<matplotlib.legend.Legend at 0x114588750>

Annotations and drawing on a subplot


In [20]:
from datetime import datetime

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

data = pd.read_csv('ch08/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

spx.plot(ax=ax, style='k-')

crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy')
]

for date, label in crisis_data:
    ax.annotate(label, xy=(date, spx.asof(date) + 50),
                xytext=(date, spx.asof(date) + 200),
                arrowprops=dict(facecolor='black'),
                horizontalalignment='left', verticalalignment='top')

# Zoom in on 2007-2010
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600, 1800])

ax.set_title('Important dates in 2008-2009 financial crisis')


Out[20]:
<matplotlib.text.Text at 0x114711550>

In [21]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)

ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)


Out[21]:
<matplotlib.patches.Polygon at 0x114848750>

Saving plots to file


In [22]:
fig


Out[22]:

In [23]:
fig.savefig('figpath.svg')

In [24]:
fig.savefig('figpath.png', dpi=400, bbox_inches='tight')

In [25]:
from io import BytesIO
buffer = BytesIO()
plt.savefig(buffer)
plot_data = buffer.getvalue()


<matplotlib.figure.Figure at 0x11356d5d0>

matplotlib configuration


In [26]:
plt.rc('figure', figsize=(10, 10))

Plotting functions in pandas

Line plots


In [27]:
plt.close('all')

In [28]:
s = Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))
s.plot()


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1147ccfd0>

In [29]:
df = DataFrame(np.random.randn(10, 4).cumsum(0),
               columns=['A', 'B', 'C', 'D'],
               index=np.arange(0, 100, 10))
df.plot()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x114e72a50>

Bar plots


In [30]:
fig, axes = plt.subplots(2, 1)
data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))
data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)
data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x114e000d0>

In [31]:
df = DataFrame(np.random.rand(6, 4),
               index=['one', 'two', 'three', 'four', 'five', 'six'],
               columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df
df.plot(kind='bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x117908fd0>

In [32]:
plt.figure()


Out[32]:
<matplotlib.figure.Figure at 0x114f6f310>
<matplotlib.figure.Figure at 0x114f6f310>

In [33]:
df.plot(kind='barh', stacked=True, alpha=0.5)


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1138d6fd0>

In [34]:
tips = pd.read_csv('ch08/tips.csv')
party_counts = pd.crosstab(tips.day, tips.size)
party_counts
# Not many 1- and 6-person parties
party_counts = party_counts.ix[:, 2:5]

In [35]:
# Normalize to sum to 1
party_pcts = party_counts.div(party_counts.sum(1).astype(float), axis=0)
party_pcts

party_pcts.plot(kind='bar', stacked=True)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-3ff72fa59bb3> in <module>()
      3 party_pcts
      4 
----> 5 party_pcts.plot(kind='bar', stacked=True)

/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/tools/plotting.pyc in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3735                           fontsize=fontsize, colormap=colormap, table=table,
   3736                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3737                           sort_columns=sort_columns, **kwds)
   3738     __call__.__doc__ = plot_frame.__doc__
   3739 

/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/tools/plotting.pyc in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2609                  yerr=yerr, xerr=xerr,
   2610                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2611                  **kwds)
   2612 
   2613 

/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _plot(data, x, y, subplots, ax, kind, **kwds)
   2436         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2437 
-> 2438     plot_obj.generate()
   2439     plot_obj.draw()
   2440     return plot_obj.result

/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/tools/plotting.pyc in generate(self)
   1021     def generate(self):
   1022         self._args_adjust()
-> 1023         self._compute_plot_data()
   1024         self._setup_subplots()
   1025         self._make_plot()

/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _compute_plot_data(self)
   1130         if is_empty:
   1131             raise TypeError('Empty {0!r}: no numeric data to '
-> 1132                             'plot'.format(numeric_data.__class__.__name__))
   1133 
   1134         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot

Histograms and density plots


In [ ]:
plt.figure()

In [ ]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips['tip_pct'].hist(bins=50)

In [ ]:
plt.figure()

In [ ]:
tips['tip_pct'].plot(kind='kde')

In [ ]:
plt.figure()

In [ ]:
comp1 = np.random.normal(0, 1, size=200)  # N(0, 1)
comp2 = np.random.normal(10, 2, size=200)  # N(10, 4)
values = Series(np.concatenate([comp1, comp2]))
values.hist(bins=100, alpha=0.3, color='k', normed=True)
values.plot(kind='kde', style='k--')

Scatter plots


In [ ]:
macro = pd.read_csv('ch08/macrodata.csv')
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
trans_data = np.log(data).diff().dropna()
trans_data[-5:]

In [ ]:
plt.figure()

In [ ]:
plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

In [ ]:
pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)

Plotting Maps: Visualizing Haiti Earthquake Crisis data


In [ ]:
data = pd.read_csv('ch08/Haiti.csv')
data.info()

In [ ]:
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]

In [ ]:
data['CATEGORY'][:6]

In [ ]:
data.describe()

In [ ]:
data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
            (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
            & data.CATEGORY.notnull()]

In [ ]:
def to_cat_list(catstr):
    stripped = (x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]

def get_all_categories(cat_series):
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    return sorted(set.union(*cat_sets))

def get_english(cat):
    code, names = cat.split('.')
    if '|' in names:
        names = names.split(' | ')[1]
    return code, names.strip()

In [ ]:
get_english('2. Urgences logistiques | Vital Lines')

In [ ]:
all_cats = get_all_categories(data.CATEGORY)
# Generator expression
english_mapping = dict(get_english(x) for x in all_cats)
english_mapping['2a']
english_mapping['6c']

In [ ]:
def get_code(seq):
    return [x.split('.')[0] for x in seq if x]

all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

In [ ]:
dummy_frame.ix[:, :6].info()

In [ ]:
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.ix[row, codes] = 1

data = data.join(dummy_frame.add_prefix('category_'))

In [ ]:
data.ix[:, 10:15].info()

In [ ]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                    lllon=-75, urlon=-71):
    # create polar stereographic Basemap instance.
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
                lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    # draw coastlines, state and country boundaries, edge of map.
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m

In [ ]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)

to_plot = ['2a', '1', '3c', '7a']

lllat=17.25; urlat=20.25; lllon=-75; urlon=-71

for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                        lllon=lllon, urlon=urlon)

    cat_data = data[data['category_%s' % code] == 1]

    # compute map proj coordinates.
    x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)

    m.plot(x, y, 'k.', alpha=0.5)
    ax.set_title('%s: %s' % (code, english_mapping[code]))

In [ ]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)

to_plot = ['2a', '1', '3c', '7a']

lllat=17.25; urlat=20.25; lllon=-75; urlon=-71

def make_plot():

    for i, code in enumerate(to_plot):
        cat_data = data[data['category_%s' % code] == 1]
        lons, lats = cat_data.LONGITUDE, cat_data.LATITUDE

        ax = axes.flat[i]
        m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                            lllon=lllon, urlon=urlon)

        # compute map proj coordinates.
        x, y = m(lons.values, lats.values)

        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))

In [ ]:
make_plot()

In [ ]:
shapefile_path = 'ch08/PortAuPrince_Roads/PortAuPrince_Roads'
m.readshapefile(shapefile_path, 'roads')