In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt

import hillmaker as hm
from hillmaker import bydatetime  # make_bydatetime() is called below
from pandas import Timestamp

# Let's check which versions of pandas, numpy and matplotlib we are using
print("pandas version ", pd.__version__)
print("numpy version ", np.__version__)
print("matplotlib version ", mp.__version__)


pandas version  0.16.2
numpy version  1.9.2
matplotlib version  1.4.3

In [3]:
# Set up autoreloading of modules so that I can debug code in external files
%load_ext autoreload
%autoreload 2

In [ ]:
file_stopdata = 'data/ShortStay.csv'
df = pd.read_csv(file_stopdata, parse_dates=['InRoomTS','OutRoomTS'])

In [36]:
scenario_name = 'sstest_60'
in_fld_name = 'InRoomTS'
out_fld_name = 'OutRoomTS'
cat_fld_name = 'PatType'
start_analysis = '1/2/1996'
end_analysis = '3/31/1996 23:45'
bin_size_mins = 60

# This next field wasn't in the original Hillmaker. Use it to specify the name to use for the overall totals.
tot_fld_name = 'Total'

## Convert string dates to actual datetimes
start_analysis_dt = pd.Timestamp(start_analysis)
end_analysis_dt = pd.Timestamp(end_analysis)

In [13]:
df.head()


Out[13]:
PatID InRoomTS OutRoomTS PatType
0 1 1996-01-01 07:44:00 1996-01-01 08:50:00 IVT
1 2 1996-01-01 08:28:00 1996-01-01 09:20:00 IVT
2 3 1996-01-01 11:44:00 1996-01-01 13:30:00 MYE
3 4 1996-01-01 11:51:00 1996-01-01 12:55:00 CAT
4 5 1996-01-01 12:10:00 1996-01-01 13:00:00 IVT

In [37]:
bydt_df = bydatetime.make_bydatetime(df,
                                     in_fld_name,
                                     out_fld_name,
                                     cat_fld_name,
                                     start_analysis_dt,
                                     end_analysis_dt,
                                     tot_fld_name,
                                     bin_size_mins)


rng_bydt created: 0.0006284789997152984
found unique categories: 0.004088978999789106
Seeded bydatetime DataFrame created: 0.0356079929997577
dayofweek, bin_of_day, bin_of_week computed: 0.34738577700045425
Multi-index on bydatetime DataFrame created: 0.3504454660005649
Multi-index fully lexsorted: 0.3538809880001281
Num inner: 19793
Done processing 19793 stop recs: 14.286970832999941

In [17]:
bydt_df.head()


Out[17]:
category datetime arrivals departures occupancy day_of_week bin_of_day bin_of_week
category datetime
ART 1996-01-02 00:00:00 ART 1996-01-02 00:00:00 0 0 0 1 0 24
1996-01-02 01:00:00 ART 1996-01-02 01:00:00 0 0 0 1 1 25
1996-01-02 02:00:00 ART 1996-01-02 02:00:00 0 0 0 1 2 26
1996-01-02 03:00:00 ART 1996-01-02 03:00:00 0 0 0 1 3 27
1996-01-02 04:00:00 ART 1996-01-02 04:00:00 0 0 0 1 4 28

In [38]:
file_bydt_csv = 'data/bydate_' + scenario_name + '.csv'
bydt_df.to_csv(file_bydt_csv)

With this data frame we can compute all kinds of interesting summary statistics by category, day of week, and time of day. To facilitate this type of "group by" analysis, pandas takes what is known as the split-apply-combine approach. The pandas documentation has a nice discussion of this. To really understand split-apply-combine, check out the article by Hadley Wickham, who created the plyr package for R. I also created a tutorial, Getting started with Python (with pandas and matplotlib) for group by analysis, that covers some of the basics. A companion tutorial shows how to do the same analysis using R instead of Python.

Pandas provides a GroupBy object to facilitate computing aggregate statistics by grouping fields.
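
To see the split-apply-combine idea in isolation before applying it to the Hillmaker data, here's a tiny sketch on made-up data (not part of the Hillmaker run):

In [ ]:
# groupby() splits the rows by key, mean() is applied to each piece, and the
# results are combined into a new DataFrame indexed by the group keys.
toy = pd.DataFrame({'key': ['a', 'a', 'b', 'b'], 'val': [1, 2, 3, 4]})
toy.groupby('key').mean()   # val is 1.5 for 'a' and 3.5 for 'b'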


In [18]:
# Create a GroupBy object for the summary stats    
bydt_dfgrp1 = bydt_df.groupby(['category','bin_of_week'])

In [19]:
# Having a GroupBy object makes it easy to compute statistics such as the mean of all of the fields other than the grouping fields.
# You'll see that the result is simply another DataFrame.
bydt_dfgrp1.mean()


Out[19]:
arrivals departures occupancy day_of_week bin_of_day
category bin_of_week
ART 0 0.000000 0.000000 0.000000 0 0
1 0.000000 0.000000 0.000000 0 1
2 0.000000 0.000000 0.000000 0 2
3 0.000000 0.000000 0.000000 0 3
4 0.000000 0.000000 0.000000 0 4
5 1.000000 0.000000 0.309722 0 5
6 5.916667 0.000000 4.154167 0 6
7 3.583333 4.916667 6.440278 0 7
8 4.166667 3.833333 5.390278 0 8
9 2.916667 4.083333 5.215278 0 9
10 3.166667 3.000000 4.443056 0 10
11 2.750000 3.333333 4.855556 0 11
12 3.000000 3.500000 3.987500 0 12
13 2.000000 2.666667 3.558333 0 13
14 0.583333 2.583333 2.095833 0 14
15 0.333333 1.000000 0.668056 0 15
16 0.000000 0.333333 0.319444 0 16
17 0.083333 0.250000 0.090278 0 17
18 0.000000 0.000000 0.000000 0 18
19 0.000000 0.000000 0.000000 0 19
20 0.000000 0.000000 0.000000 0 20
21 0.000000 0.000000 0.000000 0 21
22 0.000000 0.000000 0.000000 0 22
23 0.000000 0.000000 0.000000 0 23
24 0.000000 0.000000 0.000000 1 0
25 0.000000 0.000000 0.000000 1 1
26 0.000000 0.000000 0.000000 1 2
27 0.000000 0.000000 0.000000 1 3
28 0.000000 0.000000 0.000000 1 4
29 0.000000 0.000000 0.000000 1 5
... ... ... ... ... ... ...
OTH 138 0.000000 0.000000 0.000000 5 18
139 0.000000 0.000000 0.000000 5 19
140 0.000000 0.000000 0.000000 5 20
141 0.000000 0.000000 0.000000 5 21
142 0.000000 0.000000 0.000000 5 22
143 0.000000 0.000000 0.000000 5 23
144 0.000000 0.000000 0.000000 6 0
145 0.000000 0.000000 0.000000 6 1
146 0.000000 0.000000 0.000000 6 2
147 0.000000 0.000000 0.000000 6 3
148 0.000000 0.000000 0.000000 6 4
149 0.000000 0.000000 0.000000 6 5
150 0.000000 0.000000 0.000000 6 6
151 0.000000 0.000000 0.000000 6 7
152 0.000000 0.000000 0.000000 6 8
153 0.000000 0.000000 0.000000 6 9
154 0.000000 0.000000 0.000000 6 10
155 0.000000 0.000000 0.000000 6 11
156 0.000000 0.000000 0.000000 6 12
157 0.000000 0.000000 0.000000 6 13
158 0.000000 0.000000 0.000000 6 14
159 0.000000 0.000000 0.000000 6 15
160 0.000000 0.000000 0.000000 6 16
161 0.000000 0.000000 0.000000 6 17
162 0.000000 0.000000 0.000000 6 18
163 0.000000 0.000000 0.000000 6 19
164 0.000000 0.000000 0.000000 6 20
165 0.000000 0.000000 0.000000 6 21
166 0.000000 0.000000 0.000000 6 22
167 0.000000 0.000000 0.000000 6 23

840 rows × 5 columns


In [20]:
# Let's explore some of the means.
bydt_dfgrp1.mean()[100:120]


Out[20]:
arrivals departures occupancy day_of_week bin_of_day
category bin_of_week
ART 100 0.000000 0.000000 0.000000 4 4
101 0.769231 0.000000 0.167949 4 5
102 7.153846 0.000000 3.471795 4 6
103 3.692308 5.538462 6.602564 4 7
104 4.307692 4.153846 5.932051 4 8
105 3.692308 4.461538 5.807692 4 9
106 4.769231 3.923077 6.028205 4 10
107 3.230769 3.846154 6.094872 4 11
108 3.769231 4.461538 4.975641 4 12
109 2.615385 3.461538 4.260256 4 13
110 0.846154 3.000000 2.921795 4 14
111 0.538462 2.000000 0.970513 4 15
112 0.461538 0.307692 0.635897 4 16
113 0.000000 0.692308 0.230769 4 17
114 0.000000 0.000000 0.000000 4 18
115 0.000000 0.000000 0.000000 4 19
116 0.000000 0.000000 0.000000 4 20
117 0.000000 0.000000 0.000000 4 21
118 0.000000 0.000000 0.000000 4 22
119 0.000000 0.000000 0.000000 4 23

Now that we've seen how a GroupBy object works, let's see how we can compute a whole bunch of summary statistics at once. Specifically, we want to compute the mean, standard deviation, min, max, and several percentiles.

First, let's define a function that returns a bunch of statistics in a dictionary for a column of data. Then we'll create a slightly different GroupBy object to apply it to.


In [23]:
def get_occstats(group, stub=''):
    return {stub+'count': group.count(), stub+'mean': group.mean(), 
            stub+'min': group.min(),
            stub+'max': group.max(), stub+'stdev': group.std(),
            stub+'p50': group.quantile(0.5), stub+'p55': group.quantile(0.55),
            stub+'p60': group.quantile(0.6), stub+'p65': group.quantile(0.65),
            stub+'p70': group.quantile(0.7), stub+'p75': group.quantile(0.75),
            stub+'p80': group.quantile(0.8), stub+'p85': group.quantile(0.85),
            stub+'p90': group.quantile(0.9), stub+'p95': group.quantile(0.95),
            stub+'p975': group.quantile(0.975), 
            stub+'p99': group.quantile(0.99)}
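
As a quick sanity check, you can call get_occstats() directly on a plain pandas Series (toy data, just to show the shape of the result): each entry in the returned dictionary is a single summary statistic.

In [ ]:
get_occstats(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]))
# e.g. {'count': 5, 'mean': 3.0, 'min': 1.0, 'max': 5.0, 'p50': 3.0, ...}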

In [39]:
bydt_dfgrp2 = bydt_df.groupby(['category','day_of_week','bin_of_day'])

Now we can use apply() to run the get_occstats() function on each group of a data series. We'll create separate output series for occupancy, arrivals, and departures.


In [40]:
occ_stats = bydt_dfgrp2['occupancy'].apply(get_occstats)
arr_stats = bydt_dfgrp2['arrivals'].apply(get_occstats)
dep_stats = bydt_dfgrp2['departures'].apply(get_occstats)

So, what is occ_stats?


In [26]:
type(occ_stats)


Out[26]:
pandas.core.series.Series

It's a pandas Series object. What does its index look like?


In [27]:
occ_stats.index


Out[27]:
MultiIndex(levels=[['ART', 'CAT', 'IVT', 'MYE', 'OTH'], [0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], ['count', 'max', 'mean', 'min', 'p50', 'p55', 'p60', 'p65', 'p70', 'p75', 'p80', 'p85', 'p90', 'p95', 'p975', 'p99', 'stdev']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...]],
           names=['category', 'day_of_week', 'bin_of_day', None])

Notice it's a MultiIndex with 4 levels: category, day_of_week, bin_of_day, statistic. It would be nice to "un-pivot" the statistic level from the index and have it correspond to a set of columns. That's what unstack() will do. It leaves us with a DataFrame that has all of the statistics as columns and a 3-level multi-index of category, day_of_week, and bin_of_day. Perfect for plotting.
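
A toy example makes the effect of unstack() easy to see (made-up data): the innermost index level moves out of the rows and becomes the columns.

In [ ]:
s = pd.Series([1, 2, 3, 4],
              index=pd.MultiIndex.from_tuples(
                  [('a', 'min'), ('a', 'max'), ('b', 'min'), ('b', 'max')]))
s.unstack()   # rows 'a' and 'b', columns 'max' and 'min'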


In [29]:
occ_stats.unstack()[100:125]


Out[29]:
count max mean min p50 p55 p60 p65 p70 p75 p80 p85 p90 p95 p975 p99 stdev
category day_of_week bin_of_day
ART 4 4 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5 13 0.866667 0.167949 0.000000 0.000000 0.030000 0.090000 0.210000 0.276667 0.316667 0.326667 0.340000 0.360000 0.566667 0.716667 0.806667 0.257432
6 13 6.033333 3.471795 1.283333 3.333333 3.503333 3.693333 3.923333 4.246667 4.616667 4.736667 4.846667 4.936667 5.393333 5.713333 5.905333 1.371644
7 13 8.283333 6.602564 2.750000 7.383333 7.513333 7.626667 7.706667 7.760000 7.800000 8.000000 8.140000 8.160000 8.213333 8.248333 8.269333 1.846153
8 13 8.400000 5.932051 2.166667 6.383333 6.483333 6.580000 6.670000 6.820000 7.000000 7.480000 7.803333 7.813333 8.050000 8.225000 8.330000 1.769422
9 13 7.433333 5.807692 3.100000 6.016667 6.206667 6.340000 6.360000 6.626667 7.016667 7.026667 7.103333 7.313333 7.403333 7.418333 7.427333 1.332389
10 13 9.700000 6.028205 3.383333 5.933333 6.083333 6.203333 6.263333 6.290000 6.300000 6.680000 7.186667 7.946667 8.800000 9.250000 9.520000 1.636595
11 13 8.400000 6.094872 4.083333 5.666667 6.176667 6.520000 6.530000 6.666667 6.866667 7.266667 7.676667 8.106667 8.310000 8.355000 8.382000 1.386081
12 13 8.000000 4.975641 3.366667 4.800000 4.920000 5.070000 5.280000 5.356667 5.366667 5.736667 6.076667 6.356667 7.070000 7.535000 7.814000 1.283759
13 13 7.200000 4.260256 2.300000 3.816667 4.356667 4.760000 4.890000 5.053333 5.233333 5.383333 5.720000 6.430000 6.880000 7.040000 7.136000 1.610808
14 13 5.933333 2.921795 1.166667 2.683333 2.703333 2.800000 3.050000 3.246667 3.416667 3.586667 3.980000 4.820000 5.433333 5.683333 5.833333 1.373347
15 13 3.233333 0.970513 0.000000 0.916667 0.966667 1.066667 1.266667 1.346667 1.366667 1.466667 1.546667 1.586667 2.253333 2.743333 3.037333 0.887259
16 13 2.000000 0.635897 0.000000 0.516667 0.526667 0.540000 0.560000 0.640000 0.750000 1.340000 1.746667 1.786667 1.880000 1.940000 1.976000 0.739208
17 13 0.833333 0.230769 0.000000 0.000000 0.050000 0.116667 0.216667 0.283333 0.333333 0.583333 0.750000 0.750000 0.783333 0.808333 0.823333 0.330113
18 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
19 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
20 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
21 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
22 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
23 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5 0 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
3 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

In [41]:
occ_stats_summary = occ_stats.unstack()
arr_stats_summary = arr_stats.unstack()
dep_stats_summary = dep_stats.unstack()

In [32]:
print (occ_stats_summary[200:220].values) # Let's peek into the middle of the table.


[[  1.30000000e+01   5.00000000e-01   8.33333333e-02   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e-01   2.16666667e-01
    3.66666667e-01   4.50000000e-01   4.75000000e-01   4.90000000e-01
    1.73472167e-01]
 [  1.30000000e+01   2.35000000e+00   5.34615385e-01   0.00000000e+00
    4.66666667e-01   4.76666667e-01   4.86666667e-01   4.96666667e-01
    5.60000000e-01   6.50000000e-01   6.90000000e-01   7.83333333e-01
    9.83333333e-01   1.57000000e+00   1.96000000e+00   2.19400000e+00
    6.33642525e-01]
 [  1.30000000e+01   6.01666667e+00   3.26410256e+00   1.25000000e+00
    3.10000000e+00   3.26000000e+00   3.41333333e+00   3.55333333e+00
    3.63333333e+00   3.68333333e+00   3.78333333e+00   4.11333333e+00
    4.90333333e+00   5.50666667e+00   5.76166667e+00   5.91466667e+00
    1.26978502e+00]
 [  1.30000000e+01   9.55000000e+00   5.77179487e+00   3.43333333e+00
    5.43333333e+00   5.47333333e+00   5.51000000e+00   5.54000000e+00
    5.90333333e+00   6.43333333e+00   6.69333333e+00   7.14000000e+00
    7.96000000e+00   8.76000000e+00   9.15500000e+00   9.39200000e+00
    1.64928975e+00]
 [  1.30000000e+01   8.13333333e+00   5.15641026e+00   1.16666667e+00
    5.41666667e+00   5.44666667e+00   5.57666667e+00   5.90666667e+00
    6.11666667e+00   6.26666667e+00   6.43666667e+00   6.72333333e+00
    7.24333333e+00   7.70333333e+00   7.91833333e+00   8.04733333e+00
    1.92178710e+00]
 [  1.30000000e+01   7.41666667e+00   4.99871795e+00   1.21666667e+00
    5.20000000e+00   5.27000000e+00   5.39333333e+00   5.62333333e+00
    6.18666667e+00   6.91666667e+00   6.99666667e+00   7.06333333e+00
    7.10333333e+00   7.23666667e+00   7.32666667e+00   7.38066667e+00
    1.87804643e+00]
 [  1.30000000e+01   7.38333333e+00   4.92051282e+00   3.21666667e+00
    4.86666667e+00   5.04666667e+00   5.25666667e+00   5.52666667e+00
    5.71666667e+00   5.86666667e+00   6.07666667e+00   6.23000000e+00
    6.27000000e+00   6.72333333e+00   7.05333333e+00   7.25133333e+00
    1.29336186e+00]
 [  1.30000000e+01   8.55000000e+00   4.81410256e+00   1.71666667e+00
    5.16666667e+00   5.32666667e+00   5.45666667e+00   5.52666667e+00
    5.55666667e+00   5.56666667e+00   6.13666667e+00   6.59000000e+00
    6.81000000e+00   7.55000000e+00   8.05000000e+00   8.35000000e+00
    1.97326865e+00]
 [  1.30000000e+01   6.91666667e+00   3.95128205e+00   2.60000000e+00
    3.20000000e+00   3.44000000e+00   3.73000000e+00   4.12000000e+00
    4.29000000e+00   4.35000000e+00   5.07000000e+00   5.75000000e+00
    6.35000000e+00   6.69666667e+00   6.80666667e+00   6.87266667e+00
    1.49453420e+00]
 [  1.30000000e+01   8.88333333e+00   3.88974359e+00   9.83333333e-01
    3.31666667e+00   3.36666667e+00   3.59333333e+00   4.17333333e+00
    4.70666667e+00   5.21666667e+00   5.35666667e+00   5.65000000e+00
    6.25000000e+00   7.42333333e+00   8.15333333e+00   8.59133333e+00
    2.16674720e+00]
 [  1.30000000e+01   5.08333333e+00   2.66025641e+00   0.00000000e+00
    2.41666667e+00   2.85666667e+00   3.16666667e+00   3.21666667e+00
    3.42666667e+00   3.71666667e+00   4.26666667e+00   4.70666667e+00
    4.92666667e+00   5.03333333e+00   5.05833333e+00   5.07333333e+00
    1.66110932e+00]
 [  1.30000000e+01   3.15000000e+00   2.07820513e+00   7.33333333e-01
    2.06666667e+00   2.09666667e+00   2.14000000e+00   2.21000000e+00
    2.32000000e+00   2.45000000e+00   2.67000000e+00   2.83000000e+00
    2.87000000e+00   2.99000000e+00   3.07000000e+00   3.11800000e+00
    6.77221468e-01]
 [  1.30000000e+01   3.06666667e+00   1.21666667e+00   1.66666667e-01
    1.08333333e+00   1.17333333e+00   1.25000000e+00   1.30000000e+00
    1.31666667e+00   1.31666667e+00   1.32666667e+00   1.45000000e+00
    1.80000000e+00   2.37666667e+00   2.72166667e+00   2.92866667e+00
    6.90779216e-01]
 [  1.30000000e+01   2.56666667e+00   1.24102564e+00   0.00000000e+00
    1.25000000e+00   1.39000000e+00   1.50333333e+00   1.56333333e+00
    1.73666667e+00   1.96666667e+00   1.98666667e+00   2.01333333e+00
    2.05333333e+00   2.26666667e+00   2.41666667e+00   2.50666667e+00
    7.91377590e-01]
 [  1.30000000e+01   1.81666667e+00   4.84615385e-01   0.00000000e+00
    4.16666667e-01   4.16666667e-01   4.33333333e-01   4.83333333e-01
    5.66666667e-01   6.66666667e-01   7.16666667e-01   8.00000000e-01
    9.50000000e-01   1.32666667e+00   1.57166667e+00   1.71866667e+00
    5.03481469e-01]
 [  1.30000000e+01   1.38333333e+00   4.14102564e-01   0.00000000e+00
    2.50000000e-01   4.00000000e-01   5.16666667e-01   5.66666667e-01
    6.50000000e-01   7.50000000e-01   7.50000000e-01   7.70000000e-01
    8.30000000e-01   1.06333333e+00   1.22333333e+00   1.31933333e+00
    4.32630100e-01]
 [  1.30000000e+01   1.63333333e+00   2.42307692e-01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   3.33333333e-03   1.33333333e-02
    1.66666667e-02   1.66666667e-02   4.46666667e-01   7.36666667e-01
    7.46666667e-01   1.10333333e+00   1.36833333e+00   1.52733333e+00
    5.00444959e-01]
 [  1.30000000e+01   7.50000000e-01   1.88461538e-01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.26666667e-01   3.16666667e-01   5.16666667e-01   6.66666667e-01
    7.16666667e-01   7.40000000e-01   7.45000000e-01   7.48000000e-01
    3.11090124e-01]
 [  1.30000000e+01   9.00000000e-01   1.19230769e-01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   4.00000000e-02   1.70000000e-01
    4.80000000e-01   7.10000000e-01   8.05000000e-01   8.62000000e-01
    2.84299509e-01]
 [  1.30000000e+01   5.83333333e-01   7.05128205e-02   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   6.66666667e-02
    2.66666667e-01   4.33333333e-01   5.08333333e-01   5.53333333e-01
    1.79525332e-01]]

Let's fire these out to CSV files so we can check them out and maybe play with them in a spreadsheet.


In [33]:
file_occ_csv = 'data/occ_stats_' + scenario_name + '.csv'
file_arr_csv = 'data/arr_stats_' + scenario_name + '.csv'
file_dep_csv = 'data/dep_stats_' + scenario_name + '.csv'

occ_stats_summary.to_csv(file_occ_csv)
arr_stats_summary.to_csv(file_arr_csv)
dep_stats_summary.to_csv(file_dep_csv)
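
If you later want to pull one of these summaries back into pandas, something like this should work (a sketch; the index columns are the grouping fields we used above):

In [ ]:
occ_check = pd.read_csv(file_occ_csv,
                        index_col=['category', 'day_of_week', 'bin_of_day'])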

Put it all together

Below I've strung together all the pieces to do an entire Hillmaker run. Change inputs as needed (e.g. scenario_name and associated parameter values) and run all the cells below. You can skip rereading the main input file if that isn't changing.

Read main stop data file


In [42]:
file_stopdata = 'data/ShortStay.csv'
df = pd.read_csv(file_stopdata, parse_dates=['InRoomTS','OutRoomTS'])

Set input parameters


In [43]:
scenario_name = 'sstest_60'
in_fld_name = 'InRoomTS'
out_fld_name = 'OutRoomTS'
cat_fld_name = 'PatType'
start_analysis = '1/2/1996'
end_analysis = '3/31/1996 23:45'
bin_size_mins = 60

# This next field wasn't in the original Hillmaker. Use it to specify the name to use for the overall totals.
tot_fld_name = 'Total'

## Convert string dates to actual datetimes
start_analysis_dt = pd.Timestamp(start_analysis)
end_analysis_dt = pd.Timestamp(end_analysis)

Create the by-datetime table


In [44]:
bydt_df = bydatetime.make_bydatetime(df,
                                     in_fld_name,
                                     out_fld_name,
                                     cat_fld_name,
                                     start_analysis_dt,
                                     end_analysis_dt,
                                     tot_fld_name,
                                     bin_size_mins)


rng_bydt created: 0.0009996110002248315
found unique categories: 0.005060685000898957
Seeded bydatetime DataFrame created: 0.03853595900000073
dayofweek, bin_of_day, bin_of_week computed: 0.3139086690007389
Multi-index on bydatetime DataFrame created: 0.3179300960000546
Multi-index fully lexsorted: 0.32116212800065114
Num inner: 19793
Done processing 19793 stop recs: 15.905689233000885

Compute summary stats


In [23]:
def get_occstats(group, stub=''):
    return {stub+'count': group.count(), stub+'mean': group.mean(), 
            stub+'min': group.min(),
            stub+'max': group.max(), stub+'stdev': group.std(),
            stub+'p50': group.quantile(0.5), stub+'p55': group.quantile(0.55),
            stub+'p60': group.quantile(0.6), stub+'p65': group.quantile(0.65),
            stub+'p70': group.quantile(0.7), stub+'p75': group.quantile(0.75),
            stub+'p80': group.quantile(0.8), stub+'p85': group.quantile(0.85),
            stub+'p90': group.quantile(0.9), stub+'p95': group.quantile(0.95),
            stub+'p975': group.quantile(0.975), 
            stub+'p99': group.quantile(0.99)}

In [45]:
bydt_dfgrp2 = bydt_df.groupby(['category','day_of_week','bin_of_day'])

occ_stats = bydt_dfgrp2['occupancy'].apply(get_occstats)
arr_stats = bydt_dfgrp2['arrivals'].apply(get_occstats)
dep_stats = bydt_dfgrp2['departures'].apply(get_occstats)

occ_stats_summary = occ_stats.unstack()
arr_stats_summary = arr_stats.unstack()
dep_stats_summary = dep_stats.unstack()

Write the summaries and the by-datetime table out to CSV


In [46]:
file_bydt_csv = 'data/bydate_' + scenario_name + '.csv'
bydt_df.to_csv(file_bydt_csv)

file_occ_csv = 'data/occ_stats_' + scenario_name + '.csv'
file_arr_csv = 'data/arr_stats_' + scenario_name + '.csv'
file_dep_csv = 'data/dep_stats_' + scenario_name + '.csv'

occ_stats_summary.to_csv(file_occ_csv)
arr_stats_summary.to_csv(file_arr_csv)
dep_stats_summary.to_csv(file_dep_csv)