Part 2: Data Exploration and Visualization in Pandas


In [ ]:
"""
----------------------------------------------------------------------
Filename : 02_data_exploration.py
Date     : 12th Dec, 2013
Author   : Jaidev Deshpande
Purpose  : Introducing exploratory data analysis with Pandas.
Libraries: Pandas, NumPy, SciPy, Matplotlib
----------------------------------------------------------------------
"""

In [ ]:
# standard library imports
from os.path import join

# system library imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Problem: Describing a dataset.


In [ ]:
# Read the heights/weights/genders CSV into a DataFrame, then pull each
# column out into its own Series for the cells below.
filename = join('data', '01_heights_weights_genders.csv')
hwg = pd.read_csv(filename, sep=',')
heights, weights, gender = hwg['Height'], hwg['Weight'], hwg['Gender']

In [ ]:
# Names of the Series methods that print_summary_statistics calls, in the
# order in which they are printed.
summary_stats = 'min max mean median var std nunique'.split()

In [ ]:
def print_summary_statistics(series, stats=None):
    '''
    Given a pandas series, this function prints its summary statistics.

    `stats` is an optional list of names of zero-argument methods of
    `series`; each one is called and its result printed on its own line as
    "<Name>:<TAB><value>". When `stats` is omitted, the module-level
    `summary_stats` list is used, which covers:
    1. minimum and maximum values,
    2. mean
    3. median
    4. variance
    5. standard deviation
    6. number of unique values

    The function returns nothing; all output goes to stdout, framed by
    80-character "=" rules.
    '''
    if stats is None:
        stats = summary_stats
    rule = "=" * 80
    print(rule)
    print("Summary Statistics")
    print(rule)
    for stat in stats:
        # getattr is the public way to look a method up by name; calling
        # the __getattribute__ dunder directly is unnecessary.
        value = getattr(series, stat)()
        print(stat.capitalize() + ":" + "\t" + str(value))
    print(rule)

In [ ]:
# Print the summary statistics of the 'Height' column.
print_summary_statistics(heights)

In [ ]:
# DataFrame.describe??

In [ ]:
def plot_sumstats_visualizations(series, show_hist=False):
    '''
    Given a series, visualize its summary statistics.

    A 2x2 grid of axes is drawn, one panel per statistic, each marking the
    statistic with vertical lines:
    1. minimum and maximum (green)
    2. mean (red)
    3. median (black)
    4. mean (red) together with mean + 1 std, mean + 2 std and
       mean - 1 std (cyan)

    If `show_hist` is True, a histogram of `series` is drawn on every panel
    as well. The figure is displayed with `plt.show()`; nothing is returned.

    The axis labels are fixed to 'Height in "' and "# People".
    '''
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

    # All statistics come from the `series` argument, not from a
    # module-level variable, so the marker lines always describe the same
    # data as the histogram.
    mean = series.mean()
    std = series.std()

    # One entry per panel, in row-major order: (title, [(x, color), ...]).
    panels = [
        ("Min & Max", [(series.min(), 'g'), (series.max(), 'g')]),
        ("Mean", [(mean, 'r')]),
        ("Median", [(series.median(), 'k')]),
        ("Standard Deviation", [(mean, 'r'),
                                (mean + std, 'c'),
                                (mean + 2 * std, 'c'),
                                (mean - std, 'c')]),
    ]

    # axes.flat walks the 2x2 grid in row-major order, matching `panels`.
    for ax, (title, markers) in zip(axes.flat, panels):
        for x, color in markers:
            ax.axvline(x, color=color, linewidth=3)
        ax.set_title(title)
        ax.set_xlabel('Height in "')
        ax.set_ylabel("# People")
        if show_hist:
            series.hist(ax=ax, fc='steelblue')

    plt.show()

In [ ]:
# Draw the summary-statistic marker lines for the 'Height' column;
# show_hist defaults to False, so no histogram is overlaid.
plot_sumstats_visualizations(heights)

In [ ]:
# Same plot again, this time with the histogram drawn on every panel.
plot_sumstats_visualizations(heights, show_hist=True)

Problem: How many bins in the histogram?


In [ ]:
def draw_density_estimates(df,cols):
    '''
    Plot a kernel density estimate for each selected column of `df`.

    `cols` lists the names of the columns to plot. Every column gets its
    own 20x5 figure, labelled with the column name on the x-axis; all
    figures are shown together once the loop has finished.
    '''
    for name in cols:
        plt.figure(figsize=(20,5))
        # Series.plot draws on the figure just created and returns its axes.
        kde_axes = df[name].plot(kind='kde',label=name)
        kde_axes.set_xlabel(name)
        kde_axes.set_ylabel('Density')
        kde_axes.set_title('Density Plot')
    plt.show()

In [ ]:
# Density plots of the 'Height' and 'Weight' columns, one figure each.
draw_density_estimates(hwg, cols=['Height', 'Weight'])

In [ ]:
def draw_density_estimates_by_gender(df, col='Height'):
    """
    Plot the kernel density estimate of `col` separately for each gender.

    Rows of `df` are split on the 'Gender' column into 'Male' and 'Female'
    groups, and the two density curves of `col` are drawn on one 20x5
    figure with a legend.
    """
    plt.figure(figsize=(20,5))
    for group in ('Male', 'Female'):
        values = df.loc[df['Gender'] == group, col]
        values.plot(kind='kde',label=group,legend=True)
    plt.title('Density Plot')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.show()

In [ ]:
# Per-gender density plots: first for the default column ('Height'),
# then for 'Weight'.
draw_density_estimates_by_gender(hwg)
draw_density_estimates_by_gender(hwg,'Weight')

In [ ]:
def draw_scatter_plot(df,cols=('Height','Weight')):
    """
    Draw a scatter plot of two columns of `df`.

    `cols` is a pair of column names: the first is plotted on the x-axis
    and the second on the y-axis. The plot is drawn on a single 20x10
    figure with a grid and shown with `plt.show()`; nothing is returned.
    """
    x_col, y_col = cols[0], cols[1]
    fig = plt.figure(figsize=(20,10))
    axes = fig.add_subplot(111)
    # Plot through matplotlib directly: `pd.tools.plotting.scatter_plot`
    # is not available in current pandas releases, so calling it fails.
    axes.scatter(df[x_col], df[y_col], color='steelblue')
    # Switch the grid on explicitly instead of toggling its current state.
    axes.grid(True)
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    axes.set_title('Scatter Plot')
    plt.show()

In [ ]:
# Scatter plot of the default column pair ('Height', 'Weight').
draw_scatter_plot(hwg)

In [ ]:
def draw_binary_color_scatter_plot(df, x, y, colorby, labels=('Male', 'Female')):
    """
    Draw a scatter plot of data in `df`, with `x` and `y` specifying the columns,
    and `colorby` specifying which column to use for mapping to colors.
    `colorby` should be binary data.

    Rows where `colorby` equals True are drawn in black and rows where it
    equals False in mid-grey. `labels` holds the legend text for those two
    groups, in that order; it defaults to ('Male', 'Female').
    The plot is drawn on a single 20x10 figure with a legend and a grid and
    shown with `plt.show()`; nothing is returned.
    """
    # Compare against True/False explicitly (rather than using truthiness)
    # so that rows holding any other value are left out of both groups.
    true_rows = df[df[colorby] == True]    # noqa: E712
    false_rows = df[df[colorby] == False]  # noqa: E712
    fig = plt.figure(figsize=(20,10))
    axes = fig.add_subplot(111)
    # Plot through matplotlib directly: `pd.tools.plotting.scatter_plot`
    # is not available in current pandas releases, so calling it fails.
    # '0' and '0.5' are matplotlib grayscale colors (black and mid-grey).
    axes.scatter(true_rows[x], true_rows[y], color='0', label=labels[0])
    axes.scatter(false_rows[x], false_rows[y], color='0.5', label=labels[1])
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    axes.legend()
    # Switch the grid on explicitly instead of toggling its current state.
    axes.grid(True)
    plt.show()

In [ ]:
# One boolean per row: True where the 'Gender' column equals 'Male'.
# The comparison is vectorized, so there is no Python-level loop and no
# loop variable named `gender`; that name is bound to the Gender column
# near the top of the file, and under Python 2 a list-comprehension
# variable of the same name would overwrite it.
male_inds = (hwg['Gender'] == 'Male').tolist()
hwg['colorby'] = male_inds

In [ ]:
# Height-vs-weight scatter plot, with points split on the boolean
# 'colorby' column.
draw_binary_color_scatter_plot(hwg, 'Height','Weight','colorby')