In [1]:
import pandas as pd
# Load the raw Bluetooth detections. The file is read without a header row,
# so the three column names are supplied explicitly.
f = pd.read_csv(
    '../data/BLUETH_20150819.filtered.BT',
    header=None,
    names=['Site', 'Unix Time', 'Anonymized Bluetooth ID'],
)
f.head(n=5)
Out[1]:
In [2]:
# Order detections by device first, then chronologically within each device,
# so that adjacent rows of one device are consecutive sightings.
f_sorted = f.sort_values(
    by=['Anonymized Bluetooth ID', 'Unix Time'],
)
f_sorted.head(n=5)
Out[2]:
In [3]:
# Group detections per device. The key is passed as a plain column label
# rather than a one-element list: with a list key, newer pandas yields
# 1-tuples as group keys when iterating and expects a tuple in get_group().
f_groups = f_sorted.groupby('Anonymized Bluetooth ID')
In [4]:
# Full detection trace of one example vehicle (bluetooth id 000B18...).
sample_veh = f_groups.get_group(name='000B1865B7FAA931B56B92C344F6B56B')
sample_veh
Out[4]:
In [5]:
def segments(df):
    """
    Convert an ordered table of visited sites into segments between adjacent nodes.

    Each pair of consecutive rows whose "Site" values differ produces one
    segment; consecutive detections at the same site produce nothing, so a
    segment always starts at the last detection before the site changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single device in the order they should be walked, with
        columns "Site", "Unix Time" and "Anonymized Bluetooth ID".

    Returns
    -------
    list of tuple
        One ``(bluetooth_id, site_a, site_b, time_a, time_b)`` tuple per
        change of site, in row order. Empty when ``df`` has fewer than two
        rows or the site never changes.
    """
    # Pull the columns out once as plain lists. Unlike ``iterrows()`` this
    # avoids building a Series per row and keeps each column's own dtype
    # (``iterrows()`` upcasts ints to float when all columns are numeric).
    bt_ids = df["Anonymized Bluetooth ID"].tolist()
    sites = df["Site"].tolist()
    times = df["Unix Time"].tolist()

    results = []
    # Compare every row with the row immediately before it (positional, so
    # the frame's index labels do not matter).
    for pos in range(1, len(sites)):
        prev = pos - 1
        if sites[pos] != sites[prev]:
            results.append(
                (bt_ids[prev], sites[prev], sites[pos], times[prev], times[pos])
            )
    return results
# Try the conversion on the single example vehicle's trace.
segments(df=sample_veh)
Out[5]:
In [6]:
# Run segments() on every device's trace and flatten the per-device lists
# into one table with a row per segment.
results = [
    segment
    for _bt_id, device_rows in f_groups
    for segment in segments(device_rows)
]
all_segments = pd.DataFrame(
    results,
    columns=('Anonymized Bluetooth ID', 'Site A', 'Site B', 'Time A', 'Time B'),
)
In [7]:
# Preview the first few segments.
all_segments.head(n=5)
Out[7]:
In [8]:
# Keep only the segments that start at site 2409.
# NOTE(review): "Site B" is not filtered here, although the later plot titles
# describe a 2409 -> 2425 trip labelled "Outbound" - confirm that the input
# only contains those two sites and that the name `inbound` is intended.
inbound = all_segments.loc[all_segments["Site A"].eq(2409)]
In [9]:
# Work on an independent copy so that adding columns to it does not write
# into a slice of all_segments.
inbound = inbound.copy(deep=True)
inbound.head(n=5)
Out[9]:
In [10]:
# Duration of each segment: detection time at Site B minus detection time at Site A.
travel_time = inbound["Time B"].sub(inbound["Time A"])
inbound["Travel Time"] = travel_time
In [11]:
# Preview the segments with the new "Travel Time" column.
inbound.head(n=5)
Out[11]:
In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Histogram of travel times using 100 equal-width bins (20 wide) from 0 to 2000.
tt = list(travel_time)
bins = np.linspace(0, 2000, 101)
plt.hist(tt, bins=bins)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title("Distribution of travel times for segments starting at site 2409")
plt.xlabel("Travel Time (seconds)")
plt.ylabel("Number of segments")
plt.show()
In [13]:
import calendar
# Scatter of travel time against departure time over the day.
# Midnight of 19 Aug 2015, expressed as a timestamp, is the zero of the x axis.
start_of_day = calendar.timegm((2015, 8, 19, 0, 0, 0))
hours_since_midnight = (inbound["Time A"] - start_of_day) / 3600

plt.figure(figsize=(16, 8))
plt.scatter(list(hours_since_midnight), list(inbound["Travel Time"]))
plt.title("Travel time from site 2409 (Chapel St) to 2425 (Warrigal Rd) along Princes Highway (Outbound). Wed 19 Aug 2015.")
plt.xlabel("Time Leave (Hour)")
plt.ylabel("Travel Time (seconds)")
plt.xticks(np.arange(24))
plt.xlim([0, 24])
plt.ylim([0, 2000])
# Horizontal guide at the 1800 s cut-off used to discard extreme travel times.
plt.axhline(y=1800, color='grey')
plt.show()
In [14]:
# Discard extreme travel times: keep only rows at or below the 1800 threshold.
inbound = inbound.loc[inbound["Travel Time"].le(1800)]
In [15]:
# Sanity check: the largest remaining travel time should not exceed the threshold.
# Uses the vectorized Series reduction instead of iterating with the builtin max();
# int() keeps the displayed value a plain Python integer.
int(inbound["Travel Time"].max())
Out[15]:
Experiment with timezones. Times should be stated as seconds since the Unix Epoch (00:00:00 UTC on 1 January 1970). Melbourne local time (AEST) is UTC+10, so with true UTC timestamps the start of the Australian day would decode to 14:00:00 UTC on the previous day. Since this is not the case, it seems that VicRoads has set their server clock to local time rather than UTC. This is wrong, but convenient.
In [16]:
import datetime
# Earliest departure timestamp in the data (vectorized reduction rather than
# iterating the Series with the builtin min()).
start_of_day = inbound["Time A"].min()
print(start_of_day)
# Decode the timestamp as UTC and print it as a naive datetime.
# datetime.utcfromtimestamp() is deprecated since Python 3.12; this is the
# equivalent spelling and prints the same text.
print(datetime.datetime.fromtimestamp(start_of_day, datetime.timezone.utc).replace(tzinfo=None))
In [17]:
import datetime
def parse_date(unix_time):
    """
    Convert a timestamp from the data into a naive ``datetime``.

    The value is decoded as seconds since the Unix epoch in UTC and the
    timezone is then dropped, so the result carries no tzinfo.

    Parameters
    ----------
    unix_time : int or float
        Timestamp in seconds as found in the "Unix Time" / "Time A" columns.

    Returns
    -------
    datetime.datetime
        Naive datetime with the same wall-clock fields as the UTC decoding.
    """
    # Unix servers *should* have their system clock set to UTC.
    # So, theoretically, we need to convert from UTC to AEST (local time).
    # However, VicRoads seems to have set their operating system clock to AEST.
    # The easiest way to deal with this is to treat all datetimes as naive (ignore timezone).
    # TL;DR: VicRoads didn't handle timezones correctly. We need to copy their error for consistency.
    #
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; decoding with an
    # explicit UTC tzinfo and stripping it gives the identical naive datetime.
    d_utc = datetime.datetime.fromtimestamp(unix_time, datetime.timezone.utc)
    # Naive datetime. It's already shifted to AEST (but shouldn't be).
    return d_utc.replace(tzinfo=None)
# Travel times as a time series indexed by (naive) departure datetime.
ts = pd.Series(
    inbound["Travel Time"].tolist(),
    index=[parse_date(unix_time) for unix_time in inbound["Time A"]],
)
In [18]:
# Show the latest departures once the series is in chronological order.
ts.sort_index(ascending=True).tail(n=5)
Out[18]:
In [19]:
# Median travel time per 15-minute departure window.
# (`resample(..., how='median')` was removed from pandas; the aggregation is
# now a method call on the resampler.)
ts_resampled = ts.resample('15min').median()
# Index over the entire day, even if some times are missing. The last 15 minutes are usually not present.
rng = pd.date_range('2015-08-19 00:00:00', periods=24*4, freq='15min')
ts_resampled = ts_resampled.reindex(rng)
# Fill in missing values by carrying the previous window's value forward
# (`fillna(method='pad')` is deprecated in favour of `ffill()`).
ts_resampled = ts_resampled.ffill()
In [20]:
# Inspect the final 15-minute windows of the day.
ts_resampled.tail(n=5)
Out[20]:
In [21]:
# Plot the resampled travel times against their position in the day:
# the x value is the 0-based index of each 15-minute window.
window_offsets = np.arange(len(ts_resampled))

plt.figure(figsize=(16, 8))
plt.scatter(window_offsets, ts_resampled.values)
plt.title("Travel time from site 2409 (Chapel St) to 2425 (Warrigal Rd) along Princes Highway (Outbound). Wed 19 Aug 2015")
plt.xlabel("Time Leave (15 min offset)")
plt.ylabel("Travel Time (seconds)")
plt.xlim([0, 95])
plt.ylim([0, 2000])
plt.show()