In [1]:
import pandas as pd
import numpy as np
In [2]:
# Seaborn is a visualization library on top of matplotlib.
import seaborn as sns
import matplotlib.pyplot as plt
In [3]:
# Load turnstile_weather_v2.csv into a DataFrame, asking pandas to parse the
# "datetime" column as timestamps instead of leaving it as plain strings.
df = pd.read_csv(
    "turnstile_weather_v2.csv",
    parse_dates=["datetime"],
)
# For a quick statistical summary of the loaded columns, evaluate:
# df.describe()
In [4]:
# Group the rows of df by their "station" value. as_index=False asks pandas to
# keep "station" as an ordinary column of the aggregated result rather than
# turning it into the index.
station_group = df.groupby(["station"], as_index=False)
# group by station and aggregate to obtain average ridership per station
# The aggregated frame is then ordered by mean ENTRIESn_hourly, largest first
# (ascending=False), so the busiest stations come at the top.
# NOTE(review): DataFrame.sort(...) and tuple-style column selection on a
# groupby are legacy pandas spellings — confirm the pandas version this
# notebook is expected to run against.
ordered = station_group["station", "ENTRIESn_hourly"].aggregate(np.mean).sort("ENTRIESn_hourly", ascending=False)
#ordered.tail()
In [5]:
from IPython.html.widgets import interact
import IPython
%matplotlib inline
# Interactive components available starting with IPython 2.0
#IPython.__version__
In [6]:
def get_station_data(station_name, data=None):
    """Split one station's hourly entries into rainy and non-rainy observations.

    This is a helper for the visualization functions.

    Parameters
    ----------
    station_name : value compared against the "station" column
        Station whose rows should be selected.
    data : pandas.DataFrame, optional
        Frame to filter. It must provide the columns "station", "rain" and
        "ENTRIESn_hourly". When omitted, the notebook-level ``df`` is used,
        which matches the original single-argument behaviour.

    Returns
    -------
    (station_rain, station_no_rain) : tuple of pandas.Series
        "ENTRIESn_hourly" values for rows where ``rain == 1`` and rows where
        ``rain == 0`` respectively. Either series may be empty.

    Side effects
    ------------
    Prints the number of elements in each of the two series.
    """
    if data is None:
        # Fall back to the frame loaded at the top of the notebook.
        data = df

    # Build the station mask once and reuse it for both rain conditions.
    at_station = data["station"] == station_name
    station_rain = data.loc[at_station & (data["rain"] == 1), "ENTRIESn_hourly"]
    station_no_rain = data.loc[at_station & (data["rain"] == 0), "ENTRIESn_hourly"]

    # Single-argument print(...) runs on both Python 2 and Python 3. The two
    # spaces after the colon reproduce the output of the original
    # `print "label: ", value` statement form exactly.
    print("RAIN number of elements:  %d" % len(station_rain))
    print("NO RAIN number of elements:  %d" % len(station_no_rain))
    return station_rain, station_no_rain
In [7]:
def histogram_vis(station_name):
    """Draw side-by-side histograms of hourly entries for one station.

    The left panel shows observations with rain, the right panel observations
    without rain. Both panels use 20 bins and the same x-axis limits so the
    two distributions can be compared visually.

    Parameters
    ----------
    station_name : str
        Station to plot; it is forwarded to ``get_station_data`` and appended
        to each panel title.

    Returns
    -------
    None. The figure is created through ``plt.subplots``.
    """
    station_rain, station_no_rain = get_station_data(station_name)

    # Upper x-limit is based on the largest value seen in either series.
    # An empty (or all-NaN) series has a NaN maximum, and feeding NaN to the
    # builtin max() gives an order-dependent result that can end up as a NaN
    # axis limit. Series.count() counts non-NaN values only, so such series
    # are skipped here.
    peaks = [s.max() for s in (station_rain, station_no_rain) if s.count() > 0]
    max_value = max(peaks) if peaks else 0
    x_limits = (-1, max_value * 1.1)

    # create figure and axes
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(14, 6))

    panels = (
        (ax0, station_rain, "Ridership with RAIN for "),
        (ax1, station_no_rain, "Ridership with NO RAIN for "),
    )
    for ax, series, title_prefix in panels:
        # Pass the axis explicitly so each histogram lands in its own panel.
        series.hist(bins=20, ax=ax)
        ax.set_xlim(*x_limits)
        ax.set_title(title_prefix + station_name)
        ax.set_xlabel("ENTRIESn_hourly")
        ax.set_ylabel("Number of observations")
In [8]:
# Station names taken from the first rows of `ordered` (head() defaults to
# five rows); these are the choices offered by the interactive widgets.
top_stations = list(ordered["station"].head())
In [9]:
# Interactive histogram view: the widget calls histogram_vis with the
# station selected from top_stations.
i1 = interact(
    histogram_vis,
    station_name=top_stations,
)
In [10]:
def compare_vis(station_name, type_vis):
    """Compare rainy vs. non-rainy ridership for one station in a single plot.

    Uses Seaborn to draw two distributions side by side on one axis, labelled
    "RAIN" and "NO RAIN".

    Parameters
    ----------
    station_name : str
        Station to plot; it is forwarded to ``get_station_data`` and appended
        to the plot title.
    type_vis : str
        ``"box"`` draws box plots, ``"violin"`` draws violin plots.

    Raises
    ------
    ValueError
        If ``type_vis`` is neither ``"box"`` nor ``"violin"``. Without this
        check an unknown value would silently leave an empty, titled figure.
    """
    # Validate before touching the data or creating a figure.
    if type_vis not in ("box", "violin"):
        raise ValueError(
            "type_vis must be 'box' or 'violin', got %r" % (type_vis,)
        )

    station_rain, station_no_rain = get_station_data(station_name)
    fig, ax0 = plt.subplots(ncols=1, figsize=(7, 5))
    ax0.set_title("Compare Ridership for " + station_name)

    samples = [station_rain, station_no_rain]
    labels = ["RAIN", "NO RAIN"]
    # NOTE(review): `names=` and `bw=` follow the seaborn API this notebook was
    # written against — confirm they are still accepted by the installed
    # seaborn version.
    if type_vis == "box":
        sns.boxplot(samples, names=labels, ax=ax0)
    else:
        # bw controls the bandwidth of the kernel density estimate that shapes
        # each violin.
        sns.violinplot(samples, names=labels, bw=0.3, ax=ax0)
In [11]:
# Interactive comparison view: the widget calls compare_vis with the selected
# station and the selected plot type.
i2 = interact(
    compare_vis,
    station_name=top_stations,
    type_vis=["box", "violin"],
)