In [1]:
# Make all plots inline.
%matplotlib inline
# Import dependencies
import folium
from folium.plugins import HeatMap
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from IPython.core.display import display
import folium.colormap as cm
In [110]:
# Load accident data.
accident_data = pd.read_csv('./data/NYPD_Motor_Vehicle_Collisions_sampled.csv')
In [111]:
accident_data.head()
Out[111]:
In [112]:
accident_data.describe()
Out[112]:
In [113]:
# Number of non-null values in each column (roughly the number of rows).
print(accident_data.count())
In [114]:
# Creating the map object to hold the different layers.
# Starting coordinates to load map view.
NYC_coordinates = (40.7142700, -74.0059700)
# Create Map object.
map = folium.Map(location=NYC_coordinates,
                 zoom_start=12,
                 tiles='Cartodb Positron',
                 control_scale=True)
# Create layer group for collisions (optional overlay)
col_group = folium.FeatureGroup(name='Collisions').add_to(map)
# Limit to 1000 records
MAX_RECORDS = 1000
# Add marker clusters to the feature group for collisions
collision_cluster = folium.MarkerCluster().add_to(col_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Reason</strong>: {2}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['CONTRIBUTING FACTOR VEHICLE 1']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        mark_color = 'red'
        if row[1]['NUMBER OF CYCLIST INJURED'] >= 1:
            mark_color = 'blue'
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color=mark_color, icon='asterisk'),
            popup=popup).add_to(collision_cluster)
In [116]:
folium.LayerControl().add_to(map)
map
Out[116]:
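The marker map above can also be exported as a standalone page, in the same way map3 is saved further down in this notebook; the filename below is just an example.
# Optional: write the collision-marker map to HTML (example filename, not used elsewhere).
map.save('../website/collision_markers.html')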
In [2]:
# Load pickle files containing aggregate data.
import pickle
agg_accident_data = pickle.load(open("./../traffic-density/accident_counts.pkl", "rb"))
agg_traffic_data = pickle.load(open("./../traffic-density/intersection_counts_yellow_updated.pkl", "rb"))
nodes_to_coord_data = pickle.load(open("./../traffic-density/nodes_to_coordinates.pkl", "rb"))
# Accident Counts by Year
acc_count_2013 = pickle.load(open("./../traffic-density/accident_counts_2013.pkl", "rb"))
acc_count_2014 = pickle.load(open("./../traffic-density/accident_counts_2014.pkl", "rb"))
acc_count_2015 = pickle.load(open("./../traffic-density/accident_counts_2015.pkl", "rb"))
acc_count_2016 = pickle.load(open("./../traffic-density/accident_counts_2016.pkl", "rb"))
In [3]:
# Function that provides 10 percentile buckets for provided array.
def computePercentileRanges(array):
    percentiles = []
    for x in np.linspace(0, 90, num=10):
        percentiles.append(np.nanpercentile(array, x))
    return percentiles

# Function that computes a rank between 1-10 for a node's score
# based on what percentile it falls in.
def computeNodeRank(score, percentiles):
    # Don't compute rank if score is nan (nan != nan, so use np.isnan here).
    if np.isnan(score):
        return np.nan
    for i, x in reversed(list(enumerate(percentiles))):
        if score >= x:
            return i + 1
sampleData = False
sampleSize = 1000
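# Quick illustrative check of the two helpers above (toy numbers only, not project data):
# scores spread evenly across [0.1, 1.0] should land in buckets 1 through 10.
toy_scores = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
toy_percentiles = computePercentileRanges(toy_scores)
print(computeNodeRank(0.95, toy_percentiles))  # top decile -> 10
print(computeNodeRank(0.15, toy_percentiles))  # bottom decile -> 1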
def computeData(input, output, file_suffix):
    detailed_output = []
    for k, num_accidents in input.items():
        if (sampleData and k > sampleSize):
            break
        else:
            # Get lat/long of node.
            node = nodes_to_coord_data[k]
            node_id = node[0]
            lat = node[1]['lat']
            lon = node[1]['lon']
            cross_streets = node[1]['intersection_name']
            if cross_streets == "":
                cross_streets = "N/A"
            # Get amount of traffic at that node.
            traffic = agg_traffic_data[node_id]
            # There are some weird cases where traffic is less than the num accidents.
            # For now, we update traffic to equal num_accidents.
            # Maybe this assumption should be revisited?
            if (traffic < num_accidents):
                traffic = num_accidents
            # Calculate intersection score.
            score = num_accidents/traffic
            nan_score = np.nan if traffic == num_accidents else score
            # Third value in row is a placeholder for the percentile bucket,
            # which will be computed later.
            row = [lat, lon, 0, score, num_accidents, traffic]
            # First value in detailed_row is a placeholder for the percentile bucket,
            # which will be computed later.
            detailed_row = np.array([0, cross_streets, lat, lon, nan_score, num_accidents, traffic])
            output.append(row)
            detailed_output.append(detailed_row)
    # Compute percentiles for scores.
    pd_detailed_output = pd.DataFrame(detailed_output,
                                      columns=['Bucketed Score', 'Intersection', 'Lat', 'Long',
                                               'Score', 'Accidents', 'Traffic'])
    dtype = {'Lat': 'float64', 'Long': 'float64', 'Accidents': 'int64',
             'Score': 'float64', 'Bucketed Score': 'int64', 'Intersection': 'str'}
    for k, v in dtype.items():
        pd_detailed_output[k] = pd_detailed_output[k].astype(v)
    output_percentiles = computePercentileRanges(np.array(pd_detailed_output['Score']))
    # Populate percentiles in output and detailed_output arrays.
    for i, x in enumerate(output):
        output[i][2] = computeNodeRank(x[3], output_percentiles)/10
    pd_detailed_output['Bucketed Score'] = [computeNodeRank(x, output_percentiles) \
                                            for x in pd_detailed_output['Score']]
    # Write detailed data to file, sorted from highest to lowest score.
    pd_detailed_output_sorted = pd_detailed_output.sort_values(['Bucketed Score', 'Score'],
                                                               ascending=False)
    # Convert the raw score to accidents per 10,000 trips, rounded to 2 decimal places.
    pd_detailed_output_sorted["Score"] = [round(x*10000, 2) for x in pd_detailed_output_sorted["Score"]]
    pd_detailed_output_sorted.to_csv('../website/data/accident_scores_data_' + file_suffix + '.csv',
                                     index=False)
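A quick worked example of how a single intersection is scored (the numbers are made up for illustration; they are not taken from the data):
example_accidents = 12      # hypothetical accident count at one intersection
example_traffic = 80000     # hypothetical number of taxi trips through it
example_score = example_accidents / example_traffic   # 0.00015
print(round(example_score * 10000, 2))                # 1.5 accidents per 10,000 trips, as written to the CSV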
In [4]:
# All Accidents
data = []
computeData(agg_accident_data, data, 'all')
# 2013 Accidents
data_13 = []
computeData(acc_count_2013, data_13, '13')
# 2014 Accidents
data_14 = []
computeData(acc_count_2014, data_14, '14')
# 2015 Accidents
data_15 = []
computeData(acc_count_2015, data_15, '15')
# 2016 Accidents
data_16 = []
computeData(acc_count_2016, data_16, '16')
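As a sanity check, one of the generated files can be read back; the path below follows the suffix convention used in computeData (a sketch, assuming the 2016 file was written successfully):
scores_2016 = pd.read_csv('../website/data/accident_scores_data_16.csv')
scores_2016.head()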
In [159]:
# Starting heatmap coordinates.
NYC_coordinates_HM = (40.78, -73.98)
radius = 15
blur = 15
min_opacity = 0.1
max_zoom = 13
max_val = 0.8
# Heatmap
map2 = folium.Map(location=NYC_coordinates_HM,
                  zoom_start=12,
                  tiles='Cartodb Positron',
                  control_scale=True)
# Can adjust radius and max_val to change the heatmap concentrations.
HeatMap(data=data, name='Traffic Score (All Years)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
# Adding Layers for Each Year
# These could be shown as radio buttons by setting overlay=False, but that causes problems with map loading/refreshing.
#HeatMap(data = data_13, name='Traffic Score (2013)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_14, name='Traffic Score (2014)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_15, name='Traffic Score (2015)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_16, name='Traffic Score (2016)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
folium.LayerControl().add_to(map2)
map2
Out[159]:
In [59]:
# Playing around with legend.
colormap = cm.LinearColormap(['blue', 'cyan', 'lime', 'yellow', 'red'],
                             vmin=0, vmax=10)
colormap.caption = 'Bucketed Score (Heatmaps)'
colormap
Out[59]:
In [11]:
sampleData = True
sampleSize = 500
# Plot traffic score markers
NYC_coordinates_HM = (40.78, -73.98)
radius = 15
blur = 15
min_opacity = 0.2
max_zoom = 16
max_val = 1
# Map Object
map3 = folium.Map(location=NYC_coordinates_HM,
                  zoom_start=14,
                  min_zoom=12,
                  tiles='Cartodb Positron',
                  control_scale=True)
# Create feature group for markers.
feature_group = folium.FeatureGroup(name='Detailed Markers')
for k, row in enumerate(data_16):
    if (sampleData and k > sampleSize):
        break
    lat = row[0]
    lon = row[1]
    bucketed_score = row[2] * 10
    raw_score = row[3]
    num_accidents = row[4]
    traffic = row[5]
    # Plot point.
    node_metadata = """
    <ul>
        <li><strong>Num accidents</strong>: {0}</li>
        <li><strong>Traffic</strong>: {1}</li>
        <li><strong>Accidents per 10,000 trips</strong>: {2}</li>
        <li><strong>Bucketed score</strong>: {3}</li>
    </ul>""".format(
        str(num_accidents), str(traffic), str(round(raw_score*10000, 2)), str(bucketed_score))
    iframe = folium.element.IFrame(html=node_metadata, width=250, height=100)
    popup = folium.Popup(iframe, max_width=2650)
    score_color = "gray"
    if (bucketed_score == 10 or bucketed_score == 9):
        score_color = "red"
    elif (bucketed_score == 8 or bucketed_score == 7):
        score_color = "orange"
    elif (bucketed_score == 6 or bucketed_score == 5):
        score_color = "green"
    elif (bucketed_score == 4 or bucketed_score == 3):
        score_color = "blue"
    elif (bucketed_score == 2 or bucketed_score == 1):
        score_color = "purple"
    feature_group.add_children(folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color=score_color),
        popup=popup))
feature_group.add_to(map3)
# Can adjust radius and max_val to change the heatmap concentrations
HeatMap(data=data, name='Traffic Score (All Years)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
# Adding layers for each year.
# These could be shown as radio buttons by setting overlay=False, but that causes problems with map loading/refreshing.
HeatMap(data=data_13, name='Traffic Score (2013)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_14, name='Traffic Score (2014)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_15, name='Traffic Score (2015)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_16, name='Traffic Score (2016)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
# Add legend for heatmaps.
# Don't add to map because it's hard to see when it's overlaying map.
# colormap = cm.LinearColormap(['blue', 'cyan', 'lime', 'yellow', 'red'],
# vmin=0, vmax=10)
# colormap.caption = 'Bucketed Score (Heatmaps)'
# map3.add_child(colormap)
# Only print the map in the notebook if sampling is on.
# Otherwise, it will crash your notebook.
# If not sampling the data, run the method in the next section
# to write out the generated html file and open that to see the results.
folium.LayerControl().add_to(map3)
if (sampleData):
    display(map3)
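The if/elif chain above could be collapsed into a dictionary lookup; this is only a sketch of an alternative, not something used elsewhere in the notebook (SCORE_COLORS and scoreToColor are hypothetical names):
# Hypothetical alternative to the if/elif color chain above.
SCORE_COLORS = {10: 'red', 9: 'red', 8: 'orange', 7: 'orange',
                6: 'green', 5: 'green', 4: 'blue', 3: 'blue',
                2: 'purple', 1: 'purple'}

def scoreToColor(bucketed_score, default='gray'):
    # Bucketed scores arrive as floats (e.g. 9.0), so cast to int; anything unexpected falls back to gray.
    try:
        return SCORE_COLORS.get(int(bucketed_score), default)
    except ValueError:
        return default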
In [57]:
# Save html version of map.
map3.save('../website/heatmap_traffic_scores_v5.html')
In [60]:
# Creating the map object to hold the different layers.
# Starting coordinates to load map view.
NYC_coordinates = (40.7142700, -74.0059700)
# Create Map object.
map = folium.Map(location=NYC_coordinates,
                 zoom_start=12,
                 tiles='Cartodb Positron',
                 control_scale=True)
# Tile layer for cycle map
# cycle_tile = folium.TileLayer(tiles='http://b.tile.opencyclemap.org/cycle/{z}/{x}/{y}.png',
#                               attr='Attributed').add_to(map)
# cycle_tile.layer_name = 'Cycling Route Map'
# Create a feature group for controlling multiple aspects
col_group = folium.FeatureGroup(name='Collisions').add_to(map)
injury_group = folium.FeatureGroup(name='Injuries').add_to(map)
# Plot accidents.
# Limit number of points to plot for testing.
MAX_RECORDS = 1000
collision_cluster = folium.MarkerCluster().add_to(col_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Reason</strong>: {2}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['CONTRIBUTING FACTOR VEHICLE 1']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='red', icon='asterisk'),
            popup=popup).add_to(collision_cluster)
# Create new cluster group for cyclist injuries
bike_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a cyclist was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF CYCLIST INJURED'] >= 1 or row[1]['NUMBER OF CYCLIST KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Cyclists Injured</strong>: {2}</li>
            <li><strong>Cyclists Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF CYCLIST INJURED']), str(row[1]['NUMBER OF CYCLIST KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='blue', icon='asterisk'),
            popup=popup).add_to(bike_cluster)
# Create new cluster group for pedestrian injuries
ped_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a pedestrian was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF PEDESTRIANS INJURED'] >= 1 or row[1]['NUMBER OF PEDESTRIANS KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Pedestrians Injured</strong>: {2}</li>
            <li><strong>Pedestrians Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF PEDESTRIANS INJURED']), str(row[1]['NUMBER OF PEDESTRIANS KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='green', icon='asterisk'),
            popup=popup).add_to(ped_cluster)
# Create new cluster group for automobile injuries
auto_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a motorist was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF MOTORIST INJURED'] >= 1 or row[1]['NUMBER OF MOTORIST KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Motorists Injured</strong>: {2}</li>
            <li><strong>Motorists Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF MOTORIST INJURED']), str(row[1]['NUMBER OF MOTORIST KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='red', icon='asterisk'),
            popup=popup).add_to(auto_cluster)
# layer names for items separately
# collision_cluster.layer_name = 'Collisions'
# bike_cluster.layer_name = 'Cyclist Injuries'
# ped_cluster.layer_name = 'Pedestrian Injuries'
folium.LayerControl().add_to(map)
map
Out[60]:
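The three injury loops in the cell above differ only in the columns checked, the popup label, and the marker color. A possible refactor into a single helper is sketched below (addInjuryMarkers is a hypothetical name; it is not defined or used elsewhere in this notebook):
def addInjuryMarkers(df, cluster, injured_col, killed_col, label, color, max_records=MAX_RECORDS):
    # Add one marker per row where someone in the given category was injured or killed.
    for _, r in df[0:max_records].iterrows():
        has_coords = not np.isnan(r['LATITUDE']) and not np.isnan(r['LONGITUDE'])
        if has_coords and (r[injured_col] >= 1 or r[killed_col] >= 1):
            html = """
            <ul>
                <li><strong>On street</strong>: {0}</li>
                <li><strong>Cross street</strong>: {1}</li>
                <li><strong>{2} Injured</strong>: {3}</li>
                <li><strong>{2} Killed</strong>: {4}</li>
            </ul>""".format(str(r['ON STREET NAME']), str(r['CROSS STREET NAME']),
                            label, str(r[injured_col]), str(r[killed_col]))
            iframe = folium.element.IFrame(html=html, width=250, height=100)
            folium.Marker(location=[r['LATITUDE'], r['LONGITUDE']],
                          icon=folium.Icon(color=color, icon='asterisk'),
                          popup=folium.Popup(iframe, max_width=2650)).add_to(cluster)

# Example usage mirroring the cyclist loop above:
# addInjuryMarkers(accident_data, bike_cluster, 'NUMBER OF CYCLIST INJURED',
#                  'NUMBER OF CYCLIST KILLED', 'Cyclists', 'blue')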
In [20]:
# Get sizes of data sets.
print(len(agg_accident_data))
print(len(agg_traffic_data))
print(len(nodes_to_coord_data))
In [13]:
# Print sample of datasets.
dict(list(agg_accident_data.items())[0:5])
Out[13]:
In [14]:
dict(list(agg_traffic_data.items())[0:5])
Out[14]:
In [57]:
nodes_to_coord_data[0:5]
Out[57]:
In [210]:
# Compute histogram of accident counts.
accident_numbers_list = list(agg_accident_data.values())
print(min(accident_numbers_list))
print(max(accident_numbers_list))
plt.xlim(min(accident_numbers_list), 300)
plt.hist(accident_numbers_list, bins=200)
plt.show()
In [211]:
# Compute histogram of traffic counts.
traffic_numbers_list = list(agg_traffic_data.values())
print(min(traffic_numbers_list))
print(max(traffic_numbers_list))
plt.xlim(min(traffic_numbers_list), 300000)
plt.hist(traffic_numbers_list, bins=200)
plt.show()
In [274]:
# Compute scores.
scores = []
for k, num_accidents in agg_accident_data.items():
    if (num_accidents > 0):
        node = nodes_to_coord_data[k]
        node_id = node[0]
        # Get amount of traffic at that node.
        traffic = agg_traffic_data[node_id]
        # There are some weird cases where traffic is 0 but the accident count is > 0.
        # For now, we update traffic to equal num_accidents.
        # Maybe this assumption should be revisited?
        if (traffic < num_accidents):
            traffic = num_accidents
        # Calculate intersection score.
        score = num_accidents/traffic
        scores.append(score)
# Compute histogram of scores.
plt.xlim(0, 1)
plt.hist(np.sqrt(scores), bins=100)
plt.show()
# Compute histogram of scores after filtering out high scores.
filtered_scores = [x for x in scores if x < 0.01]
plt.xlim(0, 0.1)
plt.hist(np.sqrt(filtered_scores), bins = 50)
plt.show()
In [275]:
# Put scores on a 1-to-10 scale (min-max normalization).
scores_1to10 = [(1 + (x - min(filtered_scores)) * (10 - 1) /
                 (max(filtered_scores) - min(filtered_scores))) for x in filtered_scores]
min(scores_1to10), max(scores_1to10)
Out[275]:
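For reference, the same min-max rescaling as a small reusable helper (rescale is a hypothetical name; this is only a sketch, not used elsewhere in the notebook):
def rescale(values, lo=1, hi=10):
    # Min-max normalization of a list of numbers onto the range [lo, hi].
    v_min, v_max = min(values), max(values)
    return [lo + (x - v_min) * (hi - lo) / (v_max - v_min) for x in values]

# Equivalent to the list comprehension above:
# scores_1to10 = rescale(filtered_scores)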
In [276]:
plt.xlim(1, 10)
plt.hist(scores_1to10, bins=100)
plt.show()
In [ ]: