In [ ]:
"""
----------------------------------------------------------------------
Filename : 02_data_exploration.py
Date : 12th Dec, 2013
Author : Jaidev Deshpande
Purpose : Introducing exploratory data analysis with Pandas.
Libraries: Pandas, NumPy, SciPy, Matplotlib
----------------------------------------------------------------------
"""
In [ ]:
# standard library imports
from os.path import join
# system library imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [ ]:
# Read the heights/weights/genders table, then pull each column out as its
# own Series so later cells can refer to them directly.
filename = join('data', '01_heights_weights_genders.csv')
hwg = pd.read_csv(filename, sep=',')
heights, weights, gender = (
    hwg[column] for column in ('Height', 'Weight', 'Gender')
)
In [ ]:
# Names of the zero-argument Series methods reported by
# `print_summary_statistics`, in the order they are printed.
summary_stats = [
    'min',
    'max',
    'mean',
    'median',
    'var',      # variance
    'std',      # standard deviation
    'nunique',  # number of distinct values
]
In [ ]:
def print_summary_statistics(series, stats=None):
    '''
    Print the summary statistics of a pandas series.

    Parameters
    ----------
    series : pandas.Series
        The data to summarise.
    stats : iterable of str, optional
        Names of zero-argument methods of `series` to call and report.
        When omitted, the module-level `summary_stats` list is used, i.e.

        1. minimum and maximum values,
        2. mean
        3. median
        4. variance
        5. standard deviation
        6. number of unique values

    Each statistic is printed on its own line as ``Name:<TAB>value``,
    framed by 80-character rules. Nothing is returned.
    '''
    if stats is None:
        # Looked up at call time so edits to the global list are honoured.
        stats = summary_stats
    rule = "=" * 80
    print(rule)
    print("Summary Statistics")
    print(rule)
    for stat in stats:
        # getattr() is the supported way to fetch a method by name; calling
        # __getattribute__ directly bypasses nothing useful and reads badly.
        value = getattr(series, stat)()
        print(stat.capitalize() + ":" + "\t" + str(value))
    print(rule)
In [ ]:
# Print the summary statistics of the 'Height' column loaded above.
print_summary_statistics(heights)
In [ ]:
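# Tip: in IPython, `pd.DataFrame.describe??` shows the source of pandas'
# built-in summary method, which reports similar statistics.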
In [ ]:
def plot_sumstats_visualizations(series, show_hist=False, xlabel='Height in "'):
    '''
    Given a series, visualize its summary statistics.

    A 2x2 grid of panels is drawn, each marking statistics of `series`
    with vertical lines:

    * top-left: minimum and maximum (green)
    * top-right: mean (red)
    * bottom-left: median (black)
    * bottom-right: mean (red) plus mean + 1 std, mean + 2 std and
      mean - 1 std (cyan)

    Parameters
    ----------
    series : pandas.Series
        The data whose statistics are drawn.
    show_hist : bool, optional
        When true, a histogram of `series` is added to every panel.
    xlabel : str, optional
        Label for the x axis of every panel. Defaults to the label used
        for the heights data.
    '''
    # All statistics come from the `series` argument; reading them from the
    # global `heights` would mislabel any other data passed in.
    mean = series.mean()
    std = series.std()

    # (panel title, [(x position, line colour), ...]) in row-major order.
    # NOTE(review): there is a +2 std marker but no -2 std marker; this
    # asymmetry is kept as-is -- confirm whether it is intentional.
    panels = [
        ("Min & Max", [(series.min(), 'g'), (series.max(), 'g')]),
        ("Mean", [(mean, 'r')]),
        ("Median", [(series.median(), 'k')]),
        ("Standard Deviation", [(mean, 'r'),
                                (mean + std, 'c'),
                                (mean + 2 * std, 'c'),
                                (mean - std, 'c')]),
    ]

    _fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
    for ax, (title, markers) in zip(axes.flat, panels):
        for position, color in markers:
            ax.axvline(position, color=color, linewidth=3)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("# People")
        if show_hist:
            series.hist(ax=ax, fc='steelblue')
    plt.show()
In [ ]:
# Statistic markers only; `show_hist` keeps its default of False.
plot_sumstats_visualizations(heights)
In [ ]:
# Same panels again, this time with the histogram of the data in each one.
plot_sumstats_visualizations(heights, show_hist=True)
In [ ]:
def draw_density_estimates(df,cols):
    '''
    Plot a kernel density estimate for selected columns of `df`.

    `cols` lists the column names to plot. Every column gets its own
    20x5 figure, with the column name on the x axis.
    '''
    for column in cols:
        plt.figure(figsize=(20, 5))
        column_data = df[column]
        column_data.plot(kind='kde', label=column)
        plt.title('Density Plot')
        plt.ylabel('Density')
        plt.xlabel(column)
    plt.show()
In [ ]:
# Density estimates of the 'Height' and 'Weight' columns.
draw_density_estimates(hwg, cols=['Height', 'Weight'])
In [ ]:
def draw_density_estimates_by_gender(df, col='Height'):
    """
    Overlay the kernel density estimate of `col` for each gender.

    Like `draw_density_estimates`, but the rows of `df` are first split on
    the 'Gender' column, and the 'Male' and 'Female' curves are drawn on
    one shared figure with a legend.
    """
    plt.figure(figsize=(20, 5))
    for group in ('Male', 'Female'):
        in_group = df['Gender'] == group
        values = df[in_group][col]
        values.plot(kind='kde', label=group, legend=True)
    plt.title('Density Plot')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.show()
In [ ]:
# Per-gender densities: first for the default column ('Height'), then 'Weight'.
draw_density_estimates_by_gender(hwg)
draw_density_estimates_by_gender(hwg,'Weight')
In [ ]:
def draw_scatter_plot(df,cols=('Height','Weight')):
    """
    Draw a scatter plot of two columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to plot.
    cols : pair of str, optional
        Column names for the x axis (first) and y axis (second).
    """
    x_col, y_col = cols[0], cols[1]
    fig = plt.figure(figsize=(20, 10))
    axes = fig.add_subplot(111)
    # Plot with Matplotlib directly: `pd.tools.plotting.scatter_plot` no
    # longer exists in current pandas releases.
    axes.scatter(df[x_col].values, df[y_col].values, c='steelblue')
    # Explicit True rather than a bare toggle, so the grid is always shown.
    axes.grid(True)
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    axes.set_title('Scatter Plot')
    plt.show()
In [ ]:
# Scatter plot using the default column pair ('Height', 'Weight').
draw_scatter_plot(hwg)
In [ ]:
def draw_binary_color_scatter_plot(df, x, y, colorby, labels=('Male', 'Female')):
    """
    Draw a scatter plot of data in `df`, with `x` and `y` specifying the columns,
    and `colorby` specifying which column to use for mapping to colors.
    `colorby` should be binary data.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to plot.
    x, y : str
        Names of the columns plotted on the x and y axes.
    colorby : str
        Name of the binary column that splits the rows into two groups.
    labels : pair of str, optional
        Legend labels for the rows where `colorby` is True (first, drawn
        in black) and False (second, drawn in mid-grey).
    """
    # Element-wise comparisons on purpose: rows whose `colorby` value is
    # neither True nor False (e.g. missing) fall into neither group.
    true_rows = df[df[colorby] == True]    # noqa: E712
    false_rows = df[df[colorby] == False]  # noqa: E712

    fig = plt.figure(figsize=(20, 10))
    axes = fig.add_subplot(111)
    # Plot with Matplotlib directly: `pd.tools.plotting.scatter_plot` no
    # longer exists in current pandas releases.
    axes.scatter(true_rows[x].values, true_rows[y].values,
                 c='0', label=labels[0])
    axes.scatter(false_rows[x].values, false_rows[y].values,
                 c='0.5', label=labels[1])
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    axes.legend()
    # Explicit True rather than a bare toggle, so the grid is always shown.
    axes.grid(True)
    plt.show()
In [ ]:
# Boolean flag per row: True where the 'Gender' column equals 'Male'.
# The comparison is vectorised, which also avoids a loop variable named
# `gender` (on Python 2 that would leak out of the list comprehension and
# overwrite the module-level `gender` Series). `.tolist()` keeps
# `male_inds` a plain list of bools.
male_inds = (hwg['Gender'] == 'Male').tolist()
hwg['colorby'] = male_inds
In [ ]:
# Height vs. weight, with points split by the boolean 'colorby' column.
draw_binary_color_scatter_plot(hwg, 'Height','Weight','colorby')