In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seaborn is a visualization library on top of matplotlib.
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the turnstile/weather CSV from the working directory and ask pandas
# to parse the 'datetime' column as dates while reading.
df = pd.read_csv(
    'turnstile_weather_v2.csv',
    parse_dates=['datetime'],
)

In [4]:
# Average ENTRIESn_hourly per station, sorted so the station with the
# highest average comes first.
# NOTE(review): `.sort(...)` and the tuple-style column selection below are
# legacy pandas spellings; newer pandas releases expect `sort_values` and a
# list of columns -- confirm the target pandas version before upgrading.
station_group = df.groupby(["station"], as_index=False)
ordered = (
    station_group["station", "ENTRIESn_hourly"]
    .agg(np.mean)
    .sort("ENTRIESn_hourly", ascending=False)
)

In [5]:
from IPython.html.widgets import interact
import IPython
%matplotlib inline

# Interactive components available starting with IPython 2.0
#IPython.__version__

In [6]:
def get_station_data(station_name, data=None):
    """Split one station's hourly entries into rain and no-rain observations.

    This is a helper function for the visualizations.

    Parameters
    ----------
    station_name
        Value matched against the "station" column.
    data : pandas.DataFrame, optional
        Frame with "station", "rain" and "ENTRIESn_hourly" columns.
        When omitted, the notebook-level ``df`` is used, which keeps
        existing ``get_station_data(name)`` calls working unchanged.

    Returns
    -------
    (station_rain, station_no_rain)
        Two pandas Series of "ENTRIESn_hourly" values for the station:
        rows where ``rain == 1`` and rows where ``rain == 0``.

    Side effects
    ------------
    Prints the number of elements in each of the two Series.
    """
    frame = df if data is None else data

    # Build the station mask once and reuse it for both selections.
    at_station = frame["station"] == station_name
    station_rain = frame.loc[at_station & (frame["rain"] == 1), "ENTRIESn_hourly"]
    station_no_rain = frame.loc[at_station & (frame["rain"] == 0), "ENTRIESn_hourly"]

    # A single-argument print(...) behaves the same under Python 2 and 3.
    # The two spaces after the colon reproduce the output of the former
    # `print "label: ", value` statement.
    print("RAIN number of elements:  " + str(len(station_rain)))
    print("NO RAIN number of elements:  " + str(len(station_no_rain)))
    return station_rain, station_no_rain

In [7]:
def histogram_vis(station_name):
    """Draw side-by-side ridership histograms for one station.

    The left panel shows the station's ENTRIESn_hourly values for rain
    observations and the right panel those for no-rain observations, as
    returned by ``get_station_data`` (which also prints the element counts).
    Both panels share the same x-axis limits so they can be compared directly.

    Parameters
    ----------
    station_name : str
        Station to plot; also appended to each panel title.
    """
    station_rain, station_no_rain = get_station_data(station_name)

    # Upper x-limit shared by both panels. Series.max() yields NaN for an
    # empty Series, and the builtin max(a, b) returns `a` whenever `b > a`
    # is false -- so max(NaN, x) is NaN. Dropping missing maxima first keeps
    # the limit valid when only one of the two series has data.
    maxima = [
        value
        for value in (station_rain.max(), station_no_rain.max())
        if not pd.isnull(value)
    ]
    max_value = max(maxima) if maxima else None

    # create figure and axes
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(14, 6))

    panels = (
        (ax0, station_rain, "Ridership with RAIN for "),
        (ax1, station_no_rain, "Ridership with NO RAIN for "),
    )
    for ax, entries, title_prefix in panels:
        # pass ax explicitly so each histogram lands on its own panel
        entries.hist(bins=20, ax=ax)
        if max_value is not None:
            # small margin on both sides so the outermost bars are not clipped
            ax.set_xlim(-1, max_value * 1.1)
        ax.set_title(title_prefix + station_name)

In [8]:
# Station names from the first rows of `ordered` (head() with its default
# row count), i.e. the stations with the highest average hourly entries.
top_stations = list(ordered["station"].head())

In [9]:
# Interactive histogram view: `interact` builds a selection control from the
# list of top stations and calls histogram_vis with the chosen station.
i1 = interact(
    histogram_vis,
    station_name=top_stations,
)


RAIN number of elements:  42
NO RAIN number of elements:  144

In [10]:
def compare_vis(station_name, type_vis):
    """Compare rain vs. no-rain ridership for a station on a single axes.

    Uses Seaborn to draw the two distributions ("RAIN" and "NO RAIN")
    next to each other.

    Parameters
    ----------
    station_name : str
        Station to plot; also appended to the figure title.
    type_vis : str
        "box" draws box plots, "violin" draws violin plots. For any other
        value only the titled, empty axes is produced.
    """
    rain_entries, dry_entries = get_station_data(station_name)
    samples = [rain_entries, dry_entries]
    labels = ["RAIN", "NO RAIN"]

    fig, ax0 = plt.subplots(ncols=1, figsize=(7, 5))
    ax0.set_title("Compare Ridership for " + station_name)

    if type_vis == "violin":
        # bw controls the bandwidth (amount of smoothing) of the kernel
        # density estimate drawn for each violin
        sns.violinplot(samples, names=labels, bw=0.3, ax=ax0)
    elif type_vis == "box":
        sns.boxplot(samples, names=labels, ax=ax0)

In [11]:
# Interactive comparison view: choose one of the top stations and whether
# the rain / no-rain distributions are drawn as box plots or violin plots.
i2 = interact(
    compare_vis,
    station_name=top_stations,
    type_vis=["box", "violin"],
)


RAIN number of elements:  42
NO RAIN number of elements:  144