In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [29]:
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # used below for tsplot
from pandas import Timestamp
from datetime import datetime
from datetime import timedelta
import re
import hillmaker as hm
In [4]:
file_stopdata = 'data/unit_stop_log_Experiment1_Scenario1_Rep1.csv'
scenario_name = 'log_unitocc_test_steadystate'
in_fld_name = 'EnteredTS'
out_fld_name = 'ExitedTS'
cat_fld_name = 'Unit'
start_analysis = '6/24/2015 00:00'
end_analysis = '6/16/2016 00:00'
# Optional inputs
tot_fld_name = 'OBTot'
bin_size_mins = 1440
includecats = ['LDR','PP']
In [5]:
rx = re.compile(r'Scenario([0-9]{1,3})_Rep([0-9]{1,2})')  # quantifiers belong inside the groups so the full numbers are captured
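# Sanity check of the pattern on a hypothetical filename (illustrative only;
# any filename following the naming scheme would do):
assert rx.search('unit_stop_log_Experiment1_Scenario12_Rep3.csv').groups() == ('12', '3')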
In [7]:
stops_df = pd.read_csv(file_stopdata,index_col=0)
basedate = Timestamp('20150215 00:00:00')
# Convert simulated clock times (hours since basedate) to Timestamps, rounding
# the integer nanosecond value to the nearest whole second.
stops_df['EnteredTS'] = stops_df.apply(lambda row:
    Timestamp(round((basedate + pd.DateOffset(hours=row['Entered'])).value, -9)), axis=1)
stops_df['ExitedTS'] = stops_df.apply(lambda row:
    Timestamp(round((basedate + pd.DateOffset(hours=row['Exited'])).value, -9)), axis=1)
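# Example of the rounding above: Timestamp.value is integer nanoseconds, so
# round(value, -9) snaps to the nearest whole second, e.g. a simulated time of
# 10.2501 hours past basedate becomes 2015-02-15 10:15:00 (not 10:15:00.36).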
stops_df = stops_df[stops_df[cat_fld_name].isin(includecats)]
start_analysis_dt = pd.Timestamp(start_analysis)
end_analysis_dt = pd.Timestamp(end_analysis)
stops_df.shape
Out[7]:
In [8]:
# Now filter the stops dataframe so that only records counting toward occupancy in the
# analysis window are kept. This lets us analyze the blocking-related columns directly,
# since no blocking that occurred during the warmup period is included.
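# Interval-overlap test: a stop [entered, exited] intersects the window [start, end]
# exactly when entered <= end and exited >= start, which is the filter applied below.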
stops_df = stops_df[(stops_df['EnteredTS'] <= end_analysis_dt) & (stops_df['ExitedTS'] >= start_analysis_dt)]
stops_df.shape
Out[8]:
In [9]:
m = re.search(rx, file_stopdata)
scenario_num = m.group(1)
rep_num = m.group(2)
print(scenario_num, rep_num)
In [66]:
def dt_floor_original(dt, binsizemins):
    """
    Find floor of a datetime object to a specified number of minutes.

    dt : Pandas Timestamp object
    binsizemins : bin size in minutes; dt is floored to the start of its bin.
    """
    floor_seconds = binsizemins * 60
    dt_date = Timestamp(dt.date())
    tot_seconds = (dt - dt_date).seconds
    floor_time = (tot_seconds // floor_seconds) * floor_seconds
    return dt_date + pd.DateOffset(seconds=floor_time)
def dt_floor(dt, binsizemins=60):
    """Floor a datetime object to a multiple of a bin size in minutes.

    dt : datetime.datetime object
    binsizemins : bin size in minutes; dt is floored to a multiple of this, default 60.

    Note: this relies on dt.min being midnight-aligned, which holds for
    datetime.datetime (0001-01-01 00:00:00) but not for pandas Timestamp
    (Timestamp.min is 1677-09-21 00:12:43.145225), so bins can be offset
    when dt is a Timestamp.

    Adapted from a snippet by Thierry Husson (2012), modified by Stijn Nevens
    (2014) to use only datetime objects as variables.
    """
    dateDelta = pd.Timedelta(minutes=binsizemins)
    roundTo = dateDelta.total_seconds()
    totseconds = (dt - dt.min).seconds
    # // is floor division
    rounding = (totseconds // roundTo) * roundTo
    return dt + timedelta(0, rounding - totseconds, -dt.microsecond)
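# Note: newer pandas versions (0.18+) provide Timestamp.floor(), which makes a
# hand-rolled helper unnecessary. A minimal sketch, assuming such a version is
# installed (dt_floor_builtin is not part of hillmaker):
def dt_floor_builtin(dt, binsizemins=60):
    """Floor a Timestamp to a bin boundary using pandas' built-in floor()."""
    return dt.floor('%dmin' % binsizemins)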
In [12]:
testdate = Timestamp('20150215 14:24:00')
In [76]:
%load_ext Cython
In [79]:
%%cython
import pandas as pd
from pandas import Timestamp

def dt_floor_cython(dt, binsizemins):
    """
    Find floor of a datetime object to a specified number of minutes.

    Same logic as dt_floor_original above, compiled through Cython.

    dt : Pandas Timestamp object
    binsizemins : bin size in minutes; dt is floored to the start of its bin.
    """
    floor_seconds = binsizemins * 60
    dt_date = Timestamp(dt.date())
    tot_seconds = (dt - dt_date).seconds
    floor_time = (tot_seconds // floor_seconds) * floor_seconds
    return dt_date + pd.DateOffset(seconds=floor_time)
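# Note that compiling this unchanged Python through Cython can only buy modest
# gains: the time is spent inside pandas/Timestamp calls rather than interpreter
# overhead, so there is little for Cython to specialize. The %timeit cells below
# compare the two versions.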
In [50]:
testdate.minute
Out[50]:
In [38]:
dateDelta = timedelta(minutes=60)
In [39]:
dateDelta
Out[39]:
In [40]:
dateDeltaPD = pd.Timedelta(minutes=60)
In [41]:
dateDeltaPD
Out[41]:
In [24]:
%%timeit
x = 5
x ^ 2  # note: ^ is bitwise XOR in Python; use x ** 2 for squaring
In [59]:
dt_floor(testdate,60)
Out[59]:
In [80]:
%timeit dt_floor_original(testdate,1440)
In [81]:
%timeit dt_floor_cython(testdate,1440)
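# For flooring an entire timestamp column at once, integer nanosecond arithmetic
# avoids the per-row Python overhead entirely. A sketch, assuming the column is
# convertible to datetime64[ns] (floored_entry is illustrative only):
ns_bin = bin_size_mins * 60 * 10 ** 9  # bin size in nanoseconds
entered_ns = pd.to_datetime(stops_df['EnteredTS']).values.astype(np.int64)
floored_entry = pd.to_datetime((entered_ns // ns_bin) * ns_bin)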
In [8]:
fn_bydatetime = 'testing/bydatetime_' + scenario_name + '.csv'
fn_occ_summary = 'testing/occ_stats_summary_' + scenario_name + '.csv'
fn_arr_summary = 'testing/arr_stats_summary_' + scenario_name + '.csv'
fn_dep_summary = 'testing/dep_stats_summary_' + scenario_name + '.csv'
In [82]:
%prun -l 50 hm.run_hillmaker(scenario_name, stops_df, in_fld_name, out_fld_name, cat_fld_name, start_analysis, end_analysis, tot_fld_name, bin_size_mins, categories=includecats, outputpath='./testing')
5368175 function calls (5360007 primitive calls) in 17.618 seconds

   Ordered by: internal time
   List reduced from 955 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    70524    4.363    0.000    8.160    0.000 hillpylib.py:97(dt_floor)
    17631    1.436    0.000    5.527    0.000 hillpylib.py:146(occ_frac)
    70524    1.129    0.000    1.235    0.000 offsets.py:198(apply)
        1    0.871    0.871   16.276   16.276 bydatetime.py:25(make_bydatetime)
    70526    0.800    0.000    1.124    0.000 offsets.py:177(_determine_offset)
    17631    0.749    0.000    0.780    0.000 hillpylib.py:136(numbins)
    70524    0.648    0.000    2.123    0.000 offsets.py:44(wrapper)
    68659    0.494    0.000    0.494    0.000 {method 'set_value' of 'pandas.index.IndexEngine' objects}
    70968    0.476    0.000    0.480    0.000 {method 'get_value' of 'pandas.index.IndexEngine' objects}
    68659    0.466    0.000    1.868    0.000 indexing.py:1518(setitem)
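The profile shows dt_floor and occ_frac dominating the runtime (about 8.2 and 5.5 of the 17.6 cumulative seconds), which is what motivated the floor-function timing experiments above.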
In [40]:
hm.run_hillmaker(scenario_name, stops_df, in_fld_name, out_fld_name, cat_fld_name, start_analysis, end_analysis, tot_fld_name, bin_size_mins, categories=includecats, outputpath='./testing')
In [9]:
occ_df = pd.read_csv(fn_occ_summary)
In [10]:
bydt_df = pd.read_csv(fn_bydatetime)
In [11]:
def num_gt_0(column):
    return (column != 0).sum()

def get_stats(group, stub=''):
    return {stub + 'count': group.count(), stub + 'mean': group.mean(),
            stub + 'min': group.min(), stub + 'num_gt_0': num_gt_0(group),
            stub + 'max': group.max(), stub + 'stdev': group.std(),
            stub + 'p01': group.quantile(0.01), stub + 'p025': group.quantile(0.025),
            stub + 'p05': group.quantile(0.05), stub + 'p25': group.quantile(0.25),
            stub + 'p50': group.quantile(0.5), stub + 'p75': group.quantile(0.75),
            stub + 'p90': group.quantile(0.9), stub + 'p95': group.quantile(0.95),
            stub + 'p975': group.quantile(0.975), stub + 'p99': group.quantile(0.99)}
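# Quick illustrative check of get_stats on a toy series:
# get_stats(pd.Series([0, 1, 2, 3]), 'x_')['x_num_gt_0'] returns 3 and
# get_stats(pd.Series([0, 1, 2, 3]), 'x_')['x_mean'] returns 1.5.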
In [14]:
pp_occ = bydt_df[(bydt_df['category'] == 'PP')]['occupancy']
In [15]:
pp_occ.describe()
Out[15]:
In [16]:
get_stats(pp_occ)
Out[16]:
In [17]:
ldr_occ = bydt_df[(bydt_df['category'] == 'LDR')]['occupancy']
In [22]:
ldr_occ.describe()
Out[22]:
In [15]:
get_stats(ldr_occ)
Out[15]:
In [15]:
plt.hist(pp_occ.values,12)
Out[15]:
In [16]:
plt.hist(ldr_occ.values,20)
Out[16]:
In [17]:
bydt_df.head()
Out[17]:
In [10]:
sns.tsplot(pp_occ);
In [32]:
pp_occ.head()
Out[32]:
In [33]:
occ_df
Out[33]:
In [18]:
ldr_occ_stats = get_stats(ldr_occ)
pp_occ_stats = get_stats(pp_occ)
In [26]:
grp_all = stops_df.groupby(['Unit'])
In [28]:
blocked_uncond_stats = grp_all['Entered_TriedToEnter'].apply(get_stats,'delay_')
blocked_uncond_stats
Out[28]:
In [29]:
grp_blocked = stops_df[(stops_df['Entered_TriedToEnter'] > 0)].groupby(['Unit'])
In [30]:
blocked_cond_stats = grp_blocked['Entered_TriedToEnter'].apply(get_stats,'delay_')
blocked_cond_stats
Out[30]:
In [23]:
blocked_cond_stats.index
Out[23]:
In [24]:
blocked_cond_stats[('LDR','delay_mean')]
Out[24]:
In [31]:
newrec = {}
newrec['scenario'] = scenario_num
newrec['rep'] = rep_num
newrec['occ_mean_ldr'] = ldr_occ_stats['mean']
newrec['occ_p05_ldr'] = ldr_occ_stats['p05']
newrec['occ_p25_ldr'] = ldr_occ_stats['p25']
newrec['occ_p50_ldr'] = ldr_occ_stats['p50']
newrec['occ_p75_ldr'] = ldr_occ_stats['p75']
newrec['occ_p95_ldr'] = ldr_occ_stats['p95']
newrec['occ_mean_pp'] = pp_occ_stats['mean']
newrec['occ_p05_pp'] = pp_occ_stats['p05']
newrec['occ_p25_pp'] = pp_occ_stats['p25']
newrec['occ_p50_pp'] = pp_occ_stats['p50']
newrec['occ_p75_pp'] = pp_occ_stats['p75']
newrec['occ_p95_pp'] = pp_occ_stats['p95']
newrec['pct_waitq_ldr'] = blocked_uncond_stats[('LDR','delay_num_gt_0')]/blocked_uncond_stats[('LDR','delay_count')]
newrec['waitq_ldr_mean'] = blocked_cond_stats[('LDR','delay_mean')]
newrec['waitq_ldr_p95'] = blocked_cond_stats[('LDR','delay_p95')]
newrec['pct_blocked_ldr'] = blocked_uncond_stats[('PP','delay_num_gt_0')]/blocked_uncond_stats[('PP','delay_count')]
newrec['blocked_ldr_mean'] = blocked_cond_stats[('PP','delay_mean')]
newrec['blocked_ldr_p95'] = blocked_cond_stats[('PP','delay_p95')]
print(newrec)
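# Sketch of how records like newrec could be accumulated across scenario/rep
# files into one summary table (the results list and loop are hypothetical
# scaffolding, not part of the original workflow):
results = [newrec]  # in a full run, append one dict per input file
results_df = pd.DataFrame(results).set_index(['scenario', 'rep'])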
In [4]:
a_start = pd.Timestamp(start_analysis)
a_end = pd.Timestamp(end_analysis)
print(a_start,a_end)
In [5]:
# PP stops that straddle the start of the analysis window (entered before, exited within)
left_PP_df = stops_df[(stops_df['EnteredTS'] < a_start) & (a_start <= stops_df['ExitedTS']) &
                      (stops_df['ExitedTS'] < a_end) & (stops_df['Unit'] == 'PP')]
In [9]:
# PP stops that straddle the end of the analysis window (entered within, exited after)
right_PP_df = stops_df[(a_start <= stops_df['EnteredTS']) & (stops_df['EnteredTS'] < a_end) &
                       (stops_df['ExitedTS'] >= a_end) & (stops_df['Unit'] == 'PP')]
In [10]:
print(right_PP_df.shape)
right_PP_df[['EnteredTS','ExitedTS']]
Out[10]:
In [39]:
print(left_PP_df.shape)
left_PP_df[['EnteredTS','ExitedTS']]
Out[39]: