When working with spatio-temporal data sets, it is handy to visualize the spatial components of the data while understanding how they relate to the time series. In this post, I present an example of how to visualize bus bunching (buses of the same service number arriving at the same stop at the same time) in New York City.
The original data set is available at New York City Bus Data. In addition, we download the bus stop information of New York City.
This Jupyter notebook is available on my GitHub page: VisualizeBusBunching.ipynb, and it is part of the repository jqlearning.
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the New York City bus records from the downloaded CSV file.
df = pd.read_csv("mta_1706.csv")
In [3]:
# Parse the timestamp column into pandas datetimes so that it can be compared
# against pd.Timestamp values and floored to fixed intervals.
df['RecordedAtTime'] = pd.to_datetime(df['RecordedAtTime'])
In [4]:
# Keep only the records recorded strictly after 2017-05-31 00:00:00 and
# strictly before 2017-06-02 00:00:00 (both bounds are exclusive).
df = df[(df['RecordedAtTime'] < pd.Timestamp('2017-06-02')) & (df['RecordedAtTime'] > pd.Timestamp('2017-05-31'))]
In [5]:
# Filter missing values: drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')
In [6]:
# BusCoord stores each record's (longitude, latitude) pair as a single tuple.
df['BusCoord'] = list(zip(df["VehicleLocation.Longitude"], df["VehicleLocation.Latitude"]))
In [7]:
# Preview the first five rows of the filtered data.
df.head(5)
Out[7]:
In [8]:
# Group the records by timestamp, line, direction and vehicle. A group that
# holds more than one row means the same vehicle was reported more than once
# for the same timestamp.
vehicle_gb = df.groupby(["RecordedAtTime", "PublishedLineName", "DirectionRef", "VehicleRef"])
In [9]:
# Number of non-null entries per column within each vehicle group.
vehicle_cnt_df = vehicle_gb.count()
In [10]:
# The following demonstrates an example of a duplicate record: take the first
# group key whose BusCoord count exceeds one and display that group's rows.
vehicle_gb.get_group(vehicle_cnt_df[vehicle_cnt_df["BusCoord"] > 1].index[0])
Out[10]:
In [11]:
# Keep only the first row of every vehicle group, which discards the duplicate
# records; copy() yields an independent frame that new columns are added to.
bus_df = vehicle_gb.head(1).copy()
In [12]:
# Load the bus stop tables, one file per borough.
stop_bronx_df = pd.read_csv("stops_bronx.txt")
stop_brooklyn_df = pd.read_csv("stops_brooklyn.txt")
stop_manhattan_df = pd.read_csv("stops_manhattan.txt")
stop_queens_df = pd.read_csv("stops_queens.txt")
stop_staten_island_df = pd.read_csv("stops_staten_island.txt")
In [13]:
# Stack the five borough tables row-wise into a single city-wide stop table.
# No ignore_index is passed, so each row keeps the index label it had in its
# borough table.
stop_new_york_df = pd.concat([stop_bronx_df,
stop_brooklyn_df,
stop_manhattan_df,
stop_queens_df,
stop_staten_island_df], axis=0)
In [14]:
# Remove, in place, rows that are exact duplicates across all columns
# (presumably stops listed in more than one borough file — confirm).
stop_new_york_df.drop_duplicates(inplace=True)
In [15]:
# Keep only the records whose proximity text reports the bus as "at stop".
bus_at_stop_df = bus_df[bus_df["ArrivalProximityText"] == "at stop"]
In [16]:
# Bus bunching: buses of the same service number arriving at the same stop.
# Group the at-stop records by timestamp, line, direction and stop name, then
# count the non-null entries per column in every group.
bus_at_stop_gb = bus_at_stop_df.groupby(["RecordedAtTime", "PublishedLineName", "DirectionRef", "NextStopPointName"])
bus_at_stop_cnt = bus_at_stop_gb.count()
In [17]:
# A group with more than one BusCoord entry means several buses of the same
# line and direction were at the same stop at the same timestamp, i.e. bunched.
bunched_bus_df = bus_at_stop_cnt[bus_at_stop_cnt["BusCoord"] > 1]
In [18]:
# An example of a bus bunching scenario: the rows of the first bunched group.
bus_at_stop_gb.get_group(bunched_bus_df.index[0])
Out[18]:
In [19]:
# Flag every record whose (time, line, direction, next stop) key belongs to a
# bunched group. The membership test is vectorised through MultiIndex.isin
# rather than evaluated by a Python lambda once per row; no separate False
# initialisation is needed because isin yields a full boolean array.
bunch_key_columns = ["RecordedAtTime", "PublishedLineName", "DirectionRef", "NextStopPointName"]
bus_df["BunchedStatus"] = pd.MultiIndex.from_arrays(
    [bus_df[column] for column in bunch_key_columns]
).isin(bunched_bus_df.index)
In [20]:
# Round every timestamp down to the start of its 30-second interval, using the
# vectorised datetime accessor instead of a per-element lambda.
bus_df['TimeInterval'] = bus_df['RecordedAtTime'].dt.floor('30s')
In [21]:
from pyproj import Proj, transform
# helper function to convert lat/long to easting/northing for mapping
def LongLat_to_EN(long, lat):
    """Convert a longitude/latitude pair to easting/northing for mapping.

    The point is reprojected from EPSG:4326 to EPSG:3857 with pyproj.

    Args:
        long: Longitude of the point in EPSG:4326.
        lat: Latitude of the point in EPSG:4326.

    Returns:
        An ``(easting, northing)`` tuple in EPSG:3857, or ``(None, None)``
        when the conversion raises an error.
    """
    try:
        easting, northing = transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), long, lat)
    except Exception:
        # Best-effort conversion: a failing point is reported as missing
        # rather than aborting the whole run. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate.
        return None, None
    return easting, northing
In [22]:
# Convert every bus position to easting/northing. apply() yields one
# (easting, northing) tuple per row; zip(*...) transposes those tuples into
# two sequences, one for each new column.
bus_df['VehicleLocation.E'], bus_df['VehicleLocation.N'] = zip(*bus_df.apply(
lambda x: LongLat_to_EN(x['VehicleLocation.Longitude'], x['VehicleLocation.Latitude']), axis=1))
In [23]:
# Convert every bus stop position to easting/northing. apply() yields one
# (easting, northing) tuple per row; zip(*...) transposes those tuples into
# two sequences, one for each new column.
stop_new_york_df["stop.E"], stop_new_york_df["stop.N"] = zip(*stop_new_york_df.apply(
lambda x: LongLat_to_EN(x["stop_lon"], x["stop_lat"]), axis=1))
In [24]:
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.io import output_notebook, push_notebook
from bokeh.models import HoverTool
# Configure Bokeh to display its output inline in the notebook.
output_notebook()
In [25]:
def _bus_column_source(rows):
    """Wrap the plotted columns of the given bus rows in a ColumnDataSource."""
    # Keys are the names the glyphs and hover tooltips refer to; values are
    # the matching bus_df column names.
    source_columns = {
        "lon": "VehicleLocation.E",
        "lat": "VehicleLocation.N",
        "PublishedLineName": "PublishedLineName",
        "DirectionRef": "DirectionRef",
        "VehicleRef": "VehicleRef",
        "RecordedAtTime": "RecordedAtTime",
        "NextStopPoint": "NextStopPointName",
    }
    return ColumnDataSource(
        data={key: rows[column] for key, column in source_columns.items()})


def busSources(datetime):
    """Build the bunched and non-bunched bus data sources for one interval.

    Args:
        datetime: Value compared against ``bus_df['TimeInterval']``; only the
            rows whose interval equals it are included.

    Returns:
        A ``(sourceBunched, sourceNonBunched)`` tuple of ``ColumnDataSource``
        objects. The first holds the rows whose ``BunchedStatus`` is True,
        the second the rows whose ``BunchedStatus`` is False.
    """
    # Select the interval once instead of rebuilding the same boolean mask
    # for every column of both sources.
    in_interval = bus_df[bus_df["TimeInterval"] == datetime]
    bunched_rows = in_interval[in_interval["BunchedStatus"].eq(True)]
    non_bunched_rows = in_interval[in_interval["BunchedStatus"].eq(False)]
    sourceBunched = _bus_column_source(bunched_rows)
    sourceNonBunched = _bus_column_source(non_bunched_rows)
    return sourceBunched, sourceNonBunched
In [26]:
def visualize_selected_time(datetime):
    """Draw the bus stops and the buses of one time interval on a map.

    Args:
        datetime: Time interval forwarded to ``busSources`` to select the
            buses to draw.

    Returns:
        A ``(plot, sourceBunched, sourceNonBunched)`` tuple: the figure and
        the two bus data sources that back its bus glyphs.
    """
    eastings = bus_df["VehicleLocation.E"]
    northings = bus_df["VehicleLocation.N"]
    # The view spans the extent of all recorded bus positions.
    plot = figure(
        x_range=(eastings.min(), eastings.max()),
        y_range=(northings.min(), northings.max()),
        x_axis_type="mercator",
        y_axis_type="mercator",
    )
    plot.add_tile(CARTODBPOSITRON)

    sourceBunched, sourceNonBunched = busSources(datetime)

    # Bus stop information
    stop_columns = dict(
        lon=stop_new_york_df["stop.E"],
        lat=stop_new_york_df["stop.N"],
        Name=stop_new_york_df["stop_name"],
    )
    bus_stops = ColumnDataSource(data=stop_columns)

    # Glyphs are added in this order: stops, then non-bunched buses, then
    # bunched buses.
    stop_dots = plot.circle('lon', 'lat', size=2.5, color="orange", alpha=0.3, source=bus_stops)
    regular_dots = plot.circle('lon', 'lat', size=5, color="navy", alpha=0.5, source=sourceNonBunched)
    bunched_dots = plot.circle('lon', 'lat', size=8, color="red", alpha=0.5, source=sourceBunched)

    bus_tooltips = [
        ("BusService", "@PublishedLineName"),
        ("Direction", "@DirectionRef"),
        ("Vehicle", "@VehicleRef"),
        # By default the timestamp is shown as epoch time, so it is given an
        # explicit datetime format here.
        ("Time", "@RecordedAtTime{%F %T}"),
        ("NextStopPoint", "@NextStopPoint"),
    ]
    bus_hover = HoverTool(
        renderers=[regular_dots, bunched_dots],
        tooltips=bus_tooltips,
        formatters={"RecordedAtTime": "datetime"},
    )
    plot.add_tools(bus_hover)

    stop_hover = HoverTool(renderers=[stop_dots], tooltips=[("Stop Name", "@Name")])
    plot.add_tools(stop_hover)

    return plot, sourceBunched, sourceNonBunched
In [27]:
# Build the initial figure for the interval starting at 2017-06-01 12:44:00 and
# keep the two bus data sources at module level; update_plot reads them.
plot, sourceBunched, sourceNonBunched = visualize_selected_time(pd.Timestamp("20170601 12:44:00"))
In [28]:
# Display the figure; notebook_handle=True requests a handle so that later
# push_notebook() calls can update this output (see the Bokeh docs).
show(plot, notebook_handle=True)
Out[28]:
In [29]:
# String labels of every record's time interval, and the sorted list of the
# distinct labels.
TimeIntervalStr = bus_df["TimeInterval"].astype(str)
UniqueTimeIntervalStr = sorted(TimeIntervalStr.unique())
In [30]:
def update_plot(datetime="2017-06-01 12:44:00"):
    """Refresh the displayed bus glyphs for the selected time interval.

    Args:
        datetime: Interval to show, in any form accepted by ``pd.Timestamp``.

    Replaces the data of the module-level ``sourceBunched`` and
    ``sourceNonBunched`` sources and pushes the change to the notebook.
    """
    timestamp = pd.Timestamp(datetime)
    newBunched, newNonBunched = busSources(timestamp)
    # Assign plain dict copies: newer Bokeh releases reject assigning one
    # ColumnDataSource's .data object directly to another source.
    sourceBunched.data = dict(newBunched.data)
    sourceNonBunched.data = dict(newNonBunched.data)
    push_notebook()
In [31]:
from ipywidgets import interact
# Build an interactive control over the sorted interval labels; each selected
# label is passed to update_plot as its datetime argument.
interact_panel = interact(update_plot, datetime=UniqueTimeIntervalStr)