Create Basic Charts (Plots)

In this notebook we'll create a number of basic charts from our data, including histograms, a box plot, and a bar chart.


In [ ]:
# To show matplotlib plots in iPython Notebook we can use an iPython magic function
%matplotlib inline

# Import everything we need
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

Import The Data


In [ ]:
# Import the dataset from the CSV file.
# NOTE(review): this is an absolute path on the original author's machine --
# point it at your own copy of Accidents7904.csv before running.
accidents_data_file = (
    '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/'
    'Books/Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/Accidents7904.csv'
)

# Read at most the first 1,000,000 rows, parsing 'Date' as day-first dates.
# The tupleize_cols, error_bad_lines and warn_bad_lines keywords were dropped
# from this call: they have been removed from pandas.read_csv (passing them
# raises TypeError), and the values used here were the defaults, so a
# malformed row still raises an error.
accidents = pd.read_csv(
    accidents_data_file,
    sep=',',
    header=0,            # first row holds the column names
    index_col=False,     # do not use the first column as the index
    parse_dates=['Date'],
    dayfirst=True,       # dates are written DD/MM/YYYY
    skip_blank_lines=True,
    low_memory=False,    # read the file in one pass instead of in chunks
    nrows=1000000,
)
accidents.head()

Create a Histogram for a Column

Create a histogram of the number of casualties


In [ ]:
# Total the casualties recorded on each date (one row per distinct date).
# The aggregation is named with the string 'sum' rather than np.sum: recent
# pandas emits a FutureWarning when a NumPy callable is passed to agg, and
# the string form keeps the current behaviour.
casualty_count = accidents.groupby('Date').agg({'Number_of_Casualties': 'sum'})

In [ ]:
# Histogram of the per-date casualty totals, split into 30 bins
plt.hist(x=casualty_count['Number_of_Casualties'], bins=30)

# Label the chart so it can be read on its own
plt.title('Number of Casualties Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.show()

Plot the Data as a Probability Distribution


In [ ]:
# Normalised histogram: bar heights are scaled so the total area is 1, which
# makes each bar's area the probability of a value falling in that bin.
# `density=True` replaces the `normed=True` keyword, which has been removed
# from matplotlib's hist().
plt.hist(casualty_count['Number_of_Casualties'],
         bins=30,
         density=True)
plt.title('Probability Distribution')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.show()

Plot a Cumulative Distribution Function


In [ ]:
# Cumulative, normalised histogram: each bar shows the probability of a value
# falling in that bin or any lower bin, so the last bar reaches 1.
# `density=True` replaces the `normed=True` keyword, which has been removed
# from matplotlib's hist().
plt.hist(casualty_count['Number_of_Casualties'],
         bins=20,
         density=True,
         cumulative=True)
plt.title('Cumulative Distribution')
plt.xlabel('Value')
# The y-axis runs from 0 to 1 here, so it is a probability, not a raw count
plt.ylabel('Cumulative Probability')
plt.show()

Show the Histogram as a Stepped Line


In [ ]:
# Same casualty histogram (20 bins), drawn as an unfilled stepped outline
# instead of solid bars
plt.hist(x=casualty_count['Number_of_Casualties'], bins=20, histtype='step')

plt.title('Number of Casualties Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.show()

Plot Two Sets of Values in a Probability Distribution


In [ ]:
# Total the vehicles involved in accidents on each date (one row per distinct
# date). The string 'sum' is used instead of np.sum because recent pandas
# emits a FutureWarning when a NumPy callable is passed to agg; the string
# form keeps the current behaviour.
vehicle_count = accidents.groupby('Date').agg({'Number_of_Vehicles': 'sum'})

In [ ]:
# Overlay the two normalised histograms on the same axes.
# `density=True` replaces the `normed=True` keyword, which has been removed
# from matplotlib's hist().
plt.hist(casualty_count['Number_of_Casualties'],
         bins=20,
         histtype='stepfilled',
         density=True,
         color='b',
         label='Casualties')
# The second histogram is half-transparent so the first stays visible beneath it
plt.hist(vehicle_count['Number_of_Vehicles'],
         bins=20,
         histtype='stepfilled',
         density=True,
         color='r',
         alpha=0.5,
         label='Vehicles')
plt.title("Casualties/Vehicles Histogram")
plt.xlabel("Value")
plt.ylabel("Probability")
plt.legend()
plt.show()

Create a Customized Box Plot with Whiskers


In [ ]:
# One series per box: casualty totals first, vehicle totals second
data_to_plot = [
    casualty_count['Number_of_Casualties'],
    vehicle_count['Number_of_Vehicles'],
]

In [ ]:
# Create the figure (figure number 1, 9x6 inches) with a single axes
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)

# Draw one box per series in data_to_plot. The returned dict maps artist
# groups ('caps', 'medians', 'fliers', ...) to the lines that draw them.
bp = ax.boxplot(data_to_plot)

# Change the color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#7570b3', linewidth=2)

# Change the color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#b2df8a', linewidth=2)

# Change the style of the fliers and their fill.
# Fliers are drawn as markers only, and boxplot sets their face and edge
# colours explicitly, so `color` alone does not recolour them; the marker
# colours must be set directly.
for flier in bp['fliers']:
    flier.set(marker='o',
              color='#e7298a',
              markerfacecolor='#e7298a',
              markeredgecolor='#e7298a',
              alpha=0.5)

# Add x-axis labels, in the same order as the series in data_to_plot
ax.set_xticklabels(['Casualties', 'Vehicles'])

# Save the figure to disk, trimming the surrounding whitespace
fig.savefig('fig1.png', bbox_inches='tight')

Create a Basic Bar Chart of Casualties Over Time


In [ ]:
# Create a new figure with a single axes
fig = plt.figure()
ax = fig.add_subplot(111)

# One bar per row of casualty_count. Bars are placed at integer positions
# 0..n-1 rather than at the dates themselves; groupby sorts its keys by
# default, so the positions run in date order.
bar_positions = range(len(casualty_count))
ax.bar(bar_positions, casualty_count['Number_of_Casualties'])

# Label the chart so it can be read on its own, like the other charts here
ax.set_title('Casualties Over Time')
ax.set_xlabel('Date index (row position, sorted by date)')
ax.set_ylabel('Number of Casualties')

# Save the figure
fig.savefig('fig2.png')

In [ ]: