In [1]:
# All imports in one place.
# Bug fix: `from hillmaker import bydatetime` was commented out, but
# bydatetime.make_bydatetime is called later in the notebook -- restoring
# the import fixes the resulting NameError on a fresh Restart & Run All.
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import hillmaker as hm
from hillmaker import bydatetime
#from hillmaker import hillpylib
from pandas import Timestamp
# Let's check what version of pandas, numpy and matplotlib we are using
print ("pandas version ", pd.__version__)
print ("numpy version ", np.version.version)
print ("matplotlib version ", mp.__version__)
In [ ]:
# Echo the hillmaker module object to confirm which installation was imported.
hm
In [3]:
# Set up autoreloading of modules so that I can debug code in external files
%load_ext autoreload
%autoreload 2
In [ ]:
# Read the stop (stay) records, parsing the room in/out timestamps as datetimes.
file_stopdata = 'data/ShortStay.csv'
df = pd.read_csv(file_stopdata, parse_dates=['InRoomTS','OutRoomTS'])
In [36]:
# Scenario configuration: field names, analysis date range, and bin size.
scenario_name = 'sstest_60'
in_fld_name = 'InRoomTS'    # entry timestamp field in df
out_fld_name = 'OutRoomTS'  # exit timestamp field in df
cat_fld_name = 'PatType'    # category field in df
start_analysis = '1/2/1996'
end_analysis = '3/31/1996 23:45'
bin_size_mins = 60          # width of each time-of-day bin, in minutes
# This next field wasn't in original Hillmaker. Use it to specify the name to use for the overall totals.
tot_fld_name = 'Total'
## Convert string dates to actual datetimes
start_analysis_dt = pd.Timestamp(start_analysis)
end_analysis_dt = pd.Timestamp(end_analysis)
In [13]:
# Peek at the parsed stop data.
df.head()
Out[13]:
In [37]:
# Build the by-datetime table from the stop records.
# NOTE(review): `bydatetime` is referenced here, but its import
# (`from hillmaker import bydatetime`) is commented out at the top of the
# notebook -- this cell raises NameError unless that import is restored.
bydt_df = bydatetime.make_bydatetime(df,
in_fld_name,
out_fld_name,
cat_fld_name,
start_analysis_dt,
end_analysis_dt,
tot_fld_name,
bin_size_mins)
In [17]:
# Peek at the resulting by-datetime table.
bydt_df.head()
Out[17]:
In [38]:
# Persist the by-datetime table for this scenario to a CSV file.
file_bydt_csv = ''.join(['data/bydate_', scenario_name, '.csv'])
bydt_df.to_csv(file_bydt_csv)
With this data frame we can compute all kinds of interesting summary statistics by category, by day of week and time of day. To facilitate this type of "group by" analysis, pandas takes what is known as the Split-Apply-Combine approach. The pandas documentation has a nice discussion of this. To really understand split-apply-combine, check out the article by Hadley Wickham who created the plyr package for R. I also created a tutorial on Getting started with Python (with pandas and matplotlib) for group by analysis that covers some of the basics. A companion tutorial shows how to do the same analysis using R instead of Python.
Pandas provides a GroupBy
object to facilitate computing aggregate statistics by grouping fields.
In [18]:
# Create a GroupBy object for the summary stats
bydt_dfgrp1 = bydt_df.groupby(['category','bin_of_week'])
In [19]:
# Having a group by object makes it easy to compute statistics such as the mean of all of the fields other than the grouping fields.
# You'll see that the result is simply another DataFrame.
bydt_dfgrp1.mean()
Out[19]:
In [20]:
# Let's explore some of the means.
# Slice rows 100-119 of the aggregated frame to look at a small window.
bydt_dfgrp1.mean()[100:120]
Out[20]:
Now that we've seen how a GroupBy
object works, let's see how we can compute a whole bunch of summary statistics at once. Specifically we want to compute the mean, standard deviation, min, max and several percentiles. First let's create a slightly different GroupBy
object.
Now let's define a function that will return a bunch of statistics in a dictionary for a column of data.
In [23]:
def get_occstats(group, stub=''):
    """Return a dict of summary statistics for a series of values.

    Parameters
    ----------
    group : pandas.Series (or any object supporting count/mean/min/max/std/quantile)
        The values to summarize, e.g. one group produced by a GroupBy.
    stub : str, optional
        Prefix prepended to every statistic name in the returned dict.

    Returns
    -------
    dict mapping statistic name (e.g. stub+'mean', stub+'p95') to its value.
    """
    # Bug fix: 'stdev' previously lacked the stub prefix, so it was the one
    # statistic that ignored the caller-supplied prefix. Behavior for the
    # default stub='' (as used throughout this notebook) is unchanged.
    return {stub+'count': group.count(), stub+'mean': group.mean(),
            stub+'min': group.min(),
            stub+'max': group.max(), stub+'stdev': group.std(),
            stub+'p50': group.quantile(0.5), stub+'p55': group.quantile(0.55),
            stub+'p60': group.quantile(0.6), stub+'p65': group.quantile(0.65),
            stub+'p70': group.quantile(0.7), stub+'p75': group.quantile(0.75),
            stub+'p80': group.quantile(0.8), stub+'p85': group.quantile(0.85),
            stub+'p90': group.quantile(0.9), stub+'p95': group.quantile(0.95),
            stub+'p975': group.quantile(0.975),
            stub+'p99': group.quantile(0.99)}
In [39]:
# Group by category, day of week and time-of-day bin for day/time summaries.
bydt_dfgrp2 = bydt_df.groupby(['category','day_of_week','bin_of_day'])
Now we can use the apply function to apply the get_occstats() function to a data series. We'll create separate output data series for occupancy, arrivals and departures.
In [40]:
# Apply get_occstats to each group. Because it returns a dict, the result is
# a Series whose index gains an extra level for the statistic name.
occ_stats = bydt_dfgrp2['occupancy'].apply(get_occstats)
arr_stats = bydt_dfgrp2['arrivals'].apply(get_occstats)
dep_stats = bydt_dfgrp2['departures'].apply(get_occstats)
So, what is occ_stats
?
In [26]:
# What kind of object did apply() give us?
type(occ_stats)
Out[26]:
It's a pandas Series
object. What does its index look like?
In [27]:
# Inspect the (multi-level) index of the stats Series.
occ_stats.index
Out[27]:
Notice it's a MultiIndex
with 4 levels: category, dayofweek, binofday, statistic. It would be nice to "un-pivot" the statistic from the index and have it correspond to a set of columns. That's what unstack()
will do. It will leave us with a DataFrame
with all of the statistics as columns and a 3 level multi-index of category, dayofweek and binofday. Perfect for plotting.
In [29]:
# Pivot the statistic level of the index out into columns; show a slice.
occ_stats.unstack()[100:125]
Out[29]:
In [41]:
# Unstack each stats Series into a DataFrame with statistics as columns.
occ_stats_summary = occ_stats.unstack()
arr_stats_summary = arr_stats.unstack()
dep_stats_summary = dep_stats.unstack()
In [32]:
print (occ_stats_summary[200:220].values) # Let's peek into the middle of the table.
Let's write these out to CSV files so we can check them out and maybe play with them in a spreadsheet.
In [33]:
# Scenario-specific output paths for the summary tables.
file_occ_csv = 'data/occ_stats_' + scenario_name + '.csv'
file_arr_csv = 'data/arr_stats_' + scenario_name + '.csv'
file_dep_csv = 'data/dep_stats_' + scenario_name + '.csv'
# Bug fix: these writes previously went to hardcoded
# 'data/*_stats_summary.csv' paths, ignoring the scenario-specific
# filenames built just above (so different scenarios silently
# overwrote each other's output).
occ_stats_summary.to_csv(file_occ_csv)
arr_stats_summary.to_csv(file_arr_csv)
dep_stats_summary.to_csv(file_dep_csv)
In [42]:
# The cells below re-run the whole pipeline end-to-end:
# load data -> configure scenario -> build by-datetime table.
file_stopdata = 'data/ShortStay.csv'
df = pd.read_csv(file_stopdata, parse_dates=['InRoomTS','OutRoomTS'])
In [43]:
# Scenario configuration (same values as the earlier configuration cell).
scenario_name = 'sstest_60'
in_fld_name = 'InRoomTS'
out_fld_name = 'OutRoomTS'
cat_fld_name = 'PatType'
start_analysis = '1/2/1996'
end_analysis = '3/31/1996 23:45'
bin_size_mins = 60
# This next field wasn't in original Hillmaker. Use it to specify the name to use for the overall totals.
tot_fld_name = 'Total'
## Convert string dates to actual datetimes
start_analysis_dt = pd.Timestamp(start_analysis)
end_analysis_dt = pd.Timestamp(end_analysis)
In [44]:
# NOTE(review): as above, `bydatetime` requires the commented-out
# `from hillmaker import bydatetime` import at the top of the notebook.
bydt_df = bydatetime.make_bydatetime(df,
in_fld_name,
out_fld_name,
cat_fld_name,
start_analysis_dt,
end_analysis_dt,
tot_fld_name,
bin_size_mins)
In [23]:
# NOTE(review): this re-defines get_occstats identically to the earlier
# cell (the later definition silently shadows the earlier one). Kept so
# the re-run section below is self-contained, but consider moving the
# function to a .py module and importing it once.
def get_occstats(group, stub=''):
    """Return a dict of summary statistics for a series of values.

    Parameters
    ----------
    group : pandas.Series (or any object supporting count/mean/min/max/std/quantile)
        The values to summarize, e.g. one group produced by a GroupBy.
    stub : str, optional
        Prefix prepended to every statistic name in the returned dict.

    Returns
    -------
    dict mapping statistic name (e.g. stub+'mean', stub+'p95') to its value.
    """
    # Bug fix: 'stdev' previously lacked the stub prefix, so it was the one
    # statistic that ignored the caller-supplied prefix. Behavior for the
    # default stub='' (as used throughout this notebook) is unchanged.
    return {stub+'count': group.count(), stub+'mean': group.mean(),
            stub+'min': group.min(),
            stub+'max': group.max(), stub+'stdev': group.std(),
            stub+'p50': group.quantile(0.5), stub+'p55': group.quantile(0.55),
            stub+'p60': group.quantile(0.6), stub+'p65': group.quantile(0.65),
            stub+'p70': group.quantile(0.7), stub+'p75': group.quantile(0.75),
            stub+'p80': group.quantile(0.8), stub+'p85': group.quantile(0.85),
            stub+'p90': group.quantile(0.9), stub+'p95': group.quantile(0.95),
            stub+'p975': group.quantile(0.975),
            stub+'p99': group.quantile(0.99)}
In [45]:
# Recompute the grouped summary statistics in one cell:
# group -> apply get_occstats -> unstack statistics into columns.
bydt_dfgrp2 = bydt_df.groupby(['category','day_of_week','bin_of_day'])
occ_stats = bydt_dfgrp2['occupancy'].apply(get_occstats)
arr_stats = bydt_dfgrp2['arrivals'].apply(get_occstats)
dep_stats = bydt_dfgrp2['departures'].apply(get_occstats)
occ_stats_summary = occ_stats.unstack()
arr_stats_summary = arr_stats.unstack()
dep_stats_summary = dep_stats.unstack()
In [46]:
# Persist the by-datetime table and the summary tables, using the
# scenario-specific filenames so runs for different scenarios don't collide.
file_bydt_csv = 'data/bydate_' + scenario_name + '.csv'
bydt_df.to_csv(file_bydt_csv)
file_occ_csv = 'data/occ_stats_' + scenario_name + '.csv'
file_arr_csv = 'data/arr_stats_' + scenario_name + '.csv'
file_dep_csv = 'data/dep_stats_' + scenario_name + '.csv'
# Bug fix: the summary CSVs previously went to hardcoded
# 'data/*_stats_summary.csv' paths, ignoring the variables built above.
occ_stats_summary.to_csv(file_occ_csv)
arr_stats_summary.to_csv(file_arr_csv)
dep_stats_summary.to_csv(file_dep_csv)