In [1]:
# Make all plots inline.
%matplotlib inline
# Import dependencies
import folium
from folium.plugins import HeatMap
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from IPython.core.display import display
import folium.colormap as cm
In [110]:
# Load accident data.
accident_data = pd.read_csv('./data/NYPD_Motor_Vehicle_Collisions_sampled.csv')
In [111]:
accident_data.head()
Out[111]:
In [112]:
accident_data.describe()
Out[112]:
In [113]:
# Number of non-null values in each column (roughly the number of rows).
print(accident_data.count())
In [114]:
# Creating the map object to hold the different layers.
# Starting coordinates to load map view.
NYC_coordinates = (40.7142700, -74.0059700)
# Create Map object.
map = folium.Map(location=NYC_coordinates,
                 zoom_start=12,
                 tiles='Cartodb Positron',
                 control_scale=True)
# Create layer group for collisions (optional overlay)
col_group = folium.FeatureGroup(name='Collisions').add_to(map)
# Limit to 1000 records
MAX_RECORDS = 1000
# Add marker clusters to the feature group for collisions
collision_cluster = folium.MarkerCluster().add_to(col_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Reason</strong>: {2}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['CONTRIBUTING FACTOR VEHICLE 1']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        mark_color = 'red'
        if row[1]['NUMBER OF CYCLIST INJURED'] >= 1:
            mark_color = 'blue'
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color=mark_color, icon='asterisk'),
            popup=popup).add_to(collision_cluster)
In [116]:
folium.LayerControl().add_to(map)
map
Out[116]:
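The marker map above can also be exported as a standalone page, in the same way map3 is saved further down in this notebook; the filename below is just an example.
# Optional: write the collision-marker map to HTML (example filename, not used elsewhere).
map.save('../website/collision_markers.html')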
In [2]:
# Load pickle files containing aggregate data.
import pickle
agg_accident_data = pickle.load(open("./../traffic-density/accident_counts.pkl", "rb"))
agg_traffic_data = pickle.load(open("./../traffic-density/intersection_counts_yellow_updated.pkl", "rb"))
nodes_to_coord_data = pickle.load(open("./../traffic-density/nodes_to_coordinates.pkl", "rb"))
# Accident Counts by Year
acc_count_2013 = pickle.load(open("./../traffic-density/accident_counts_2013.pkl", "rb"))
acc_count_2014 = pickle.load(open("./../traffic-density/accident_counts_2014.pkl", "rb"))
acc_count_2015 = pickle.load(open("./../traffic-density/accident_counts_2015.pkl", "rb"))
acc_count_2016 = pickle.load(open("./../traffic-density/accident_counts_2016.pkl", "rb"))
In [3]:
# Function that provides 10 percentile buckets for provided array.
def computePercentileRanges(array):
    percentiles = []
    for x in np.linspace(0, 90, num=10):
        percentiles.append(np.nanpercentile(array, x))
    return percentiles

# Function that computes a rank between 1-10 for a node's score
# based on what percentile it falls in.
def computeNodeRank(score, percentiles):
    # Don't compute rank if score is nan (nan != nan, so use np.isnan here).
    if np.isnan(score):
        return np.nan
    for i, x in reversed(list(enumerate(percentiles))):
        if score >= x:
            return i + 1
sampleData = False
sampleSize = 1000
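# Quick illustrative check of the two helpers above (toy numbers only, not project data):
# scores spread evenly across [0.1, 1.0] should land in buckets 1 through 10.
toy_scores = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
toy_percentiles = computePercentileRanges(toy_scores)
print(computeNodeRank(0.95, toy_percentiles))  # top decile -> 10
print(computeNodeRank(0.15, toy_percentiles))  # bottom decile -> 1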
def computeData(input, output, file_suffix):
    detailed_output = []
    for k, num_accidents in input.items():
        if (sampleData and k > sampleSize):
            break
        else:
            # Get lat/long of node.
            node = nodes_to_coord_data[k]
            node_id = node[0]
            lat = node[1]['lat']
            lon = node[1]['lon']
            cross_streets = node[1]['intersection_name']
            if cross_streets == "":
                cross_streets = "N/A"
            # Get amount of traffic at that node.
            traffic = agg_traffic_data[node_id]
            # There are some weird cases where traffic is less than the num accidents.
            # For now, we update traffic to equal num_accidents.
            # Maybe this assumption should be revisited?
            if (traffic < num_accidents):
                traffic = num_accidents
            # Calculate intersection score.
            score = num_accidents/traffic
            nan_score = np.nan if traffic == num_accidents else score
            # Third value in row is a placeholder for the percentile bucket,
            # which will be computed later.
            row = [lat, lon, 0, score, num_accidents, traffic]
            # First value in detailed_row is a placeholder for the percentile bucket,
            # which will be computed later.
            detailed_row = np.array([0, cross_streets, lat, lon, nan_score, num_accidents, traffic])
            output.append(row)
            detailed_output.append(detailed_row)
    # Compute percentiles for scores.
    pd_detailed_output = pd.DataFrame(detailed_output,
                                      columns=['Bucketed Score', 'Intersection', 'Lat', 'Long',
                                               'Score', 'Accidents', 'Traffic'])
    dtype = {'Lat': 'float64', 'Long': 'float64', 'Accidents': 'int64',
             'Score': 'float64', 'Bucketed Score': 'int64', 'Intersection': 'str'}
    for k, v in dtype.items():
        pd_detailed_output[k] = pd_detailed_output[k].astype(v)
    output_percentiles = computePercentileRanges(np.array(pd_detailed_output['Score']))
    # Populate percentiles in output and detailed_output arrays.
    for i, x in enumerate(output):
        output[i][2] = computeNodeRank(x[3], output_percentiles)/10
    pd_detailed_output['Bucketed Score'] = [computeNodeRank(x, output_percentiles) \
                                            for x in pd_detailed_output['Score']]
    # Write detailed data to file, sorted from highest to lowest score.
    pd_detailed_output_sorted = pd_detailed_output.sort_values(['Bucketed Score', 'Score'],
                                                               ascending=False)
    # Convert the raw score to accidents per 10,000 trips, rounded to 2 decimal places.
    pd_detailed_output_sorted["Score"] = [round(x*10000, 2) for x in pd_detailed_output_sorted["Score"]]
    pd_detailed_output_sorted.to_csv('../website/data/accident_scores_data_' + file_suffix + '.csv',
                                     index=False)
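A quick worked example of how a single intersection is scored (the numbers are made up for illustration; they are not taken from the data):
example_accidents = 12      # hypothetical accident count at one intersection
example_traffic = 80000     # hypothetical number of taxi trips through it
example_score = example_accidents / example_traffic   # 0.00015
print(round(example_score * 10000, 2))                # 1.5 accidents per 10,000 trips, as written to the CSV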
In [4]:
# All Accidents
data = []
computeData(agg_accident_data, data, 'all')
# 2013 Accidents
data_13 = []
computeData(acc_count_2013, data_13, '13')
# 2014 Accidents
data_14 = []
computeData(acc_count_2014, data_14, '14')
# 2015 Accidents
data_15 = []
computeData(acc_count_2015, data_15, '15')
# 2016 Accidents
data_16 = []
computeData(acc_count_2016, data_16, '16')
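As a sanity check, one of the generated files can be read back; the path below follows the suffix convention used in computeData (a sketch, assuming the 2016 file was written successfully):
scores_2016 = pd.read_csv('../website/data/accident_scores_data_16.csv')
scores_2016.head()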
In [159]:
# Starting heatmap coordinates.
NYC_coordinates_HM = (40.78, -73.98)
radius = 15
blur = 15
min_opacity = 0.1
max_zoom = 13
max_val = 0.8
# Heatmap
map2 = folium.Map(location=NYC_coordinates_HM,
                  zoom_start=12,
                  tiles='Cartodb Positron',
                  control_scale=True)
# Can adjust radius and max_val to change the heatmap concentrations.
HeatMap(data=data, name='Traffic Score (All Years)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
# Adding Layers for Each Year
# These could be shown as radio buttons by setting overlay=False, but that causes problems with map loading/refreshing.
#HeatMap(data = data_13, name='Traffic Score (2013)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_14, name='Traffic Score (2014)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_15, name='Traffic Score (2015)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
#HeatMap(data = data_16, name='Traffic Score (2016)', radius=radius, blur=blur,
# min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map2)
folium.LayerControl().add_to(map2)
map2
Out[159]:
In [59]:
# Playing around with legend.
colormap = cm.LinearColormap(['blue', 'cyan', 'lime', 'yellow', 'red'],
                             vmin=0, vmax=10)
colormap.caption = 'Bucketed Score (Heatmaps)'
colormap
Out[59]:
In [11]:
sampleData = True
sampleSize = 500
# Plot traffic score markers
NYC_coordinates_HM = (40.78, -73.98)
radius = 15
blur = 15
min_opacity = 0.2
max_zoom = 16
max_val = 1
# Map Object
map3 = folium.Map(location=NYC_coordinates_HM,
                  zoom_start=14,
                  min_zoom=12,
                  tiles='Cartodb Positron',
                  control_scale=True)
# Create feature group for markers.
feature_group = folium.FeatureGroup(name='Detailed Markers')
for k, row in enumerate(data_16):
    if (sampleData and k > sampleSize):
        break
    lat = row[0]
    lon = row[1]
    bucketed_score = row[2] * 10
    raw_score = row[3]
    num_accidents = row[4]
    traffic = row[5]
    # Plot point.
    node_metadata = """
    <ul>
        <li><strong>Num accidents</strong>: {0}</li>
        <li><strong>Traffic</strong>: {1}</li>
        <li><strong>Accidents per 10,000 trips</strong>: {2}</li>
        <li><strong>Bucketed score</strong>: {3}</li>
    </ul>""".format(
        str(num_accidents), str(traffic), str(round(raw_score*10000, 2)), str(bucketed_score))
    iframe = folium.element.IFrame(html=node_metadata, width=250, height=100)
    popup = folium.Popup(iframe, max_width=2650)
    score_color = "gray"
    if (bucketed_score == 10 or bucketed_score == 9):
        score_color = "red"
    elif (bucketed_score == 8 or bucketed_score == 7):
        score_color = "orange"
    elif (bucketed_score == 6 or bucketed_score == 5):
        score_color = "green"
    elif (bucketed_score == 4 or bucketed_score == 3):
        score_color = "blue"
    elif (bucketed_score == 2 or bucketed_score == 1):
        score_color = "purple"
    feature_group.add_children(folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color=score_color),
        popup=popup))
feature_group.add_to(map3)
# Can adjust radius and max_val to change the heatmap concentrations
HeatMap(data=data, name='Traffic Score (All Years)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
# Adding layers for each year.
# These could be shown as radio buttons by setting overlay=False, but that causes problems with map loading/refreshing.
HeatMap(data=data_13, name='Traffic Score (2013)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_14, name='Traffic Score (2014)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_15, name='Traffic Score (2015)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
HeatMap(data=data_16, name='Traffic Score (2016)', radius=radius, blur=blur,
        min_opacity=min_opacity, max_zoom=max_zoom, max_val=max_val).add_to(map3)
# Add legend for heatmaps.
# Don't add to map because it's hard to see when it's overlaying map.
# colormap = cm.LinearColormap(['blue', 'cyan', 'lime', 'yellow', 'red'],
# vmin=0, vmax=10)
# colormap.caption = 'Bucketed Score (Heatmaps)'
# map3.add_child(colormap)
# Only print the map in the notebook if sampling is on.
# Otherwise, it will crash your notebook.
# If not sampling the data, run the method in the next section
# to write out the generated html file and open that to see the results.
folium.LayerControl().add_to(map3)
if (sampleData):
    display(map3)
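The if/elif chain above could be collapsed into a dictionary lookup; this is only a sketch of an alternative, not something used elsewhere in the notebook (SCORE_COLORS and scoreToColor are hypothetical names):
# Hypothetical alternative to the if/elif color chain above.
SCORE_COLORS = {10: 'red', 9: 'red', 8: 'orange', 7: 'orange',
                6: 'green', 5: 'green', 4: 'blue', 3: 'blue',
                2: 'purple', 1: 'purple'}

def scoreToColor(bucketed_score, default='gray'):
    # Bucketed scores arrive as floats (e.g. 9.0), so cast to int; anything unexpected falls back to gray.
    try:
        return SCORE_COLORS.get(int(bucketed_score), default)
    except ValueError:
        return default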
In [57]:
# Save html version of map.
map3.save('../website/heatmap_traffic_scores_v5.html')
In [60]:
# Creating the map object to hold the different layers.
# Starting coordinates to load map view.
NYC_coordinates = (40.7142700, -74.0059700)
# Create Map object.
map = folium.Map(location=NYC_coordinates,
                 zoom_start=12,
                 tiles='Cartodb Positron',
                 control_scale=True)
# Tile layer for cycle map
# cycle_tile = folium.TileLayer(tiles='http://b.tile.opencyclemap.org/cycle/{z}/{x}/{y}.png',
#                               attr='Attributed').add_to(map)
# cycle_tile.layer_name = 'Cycling Route Map'
# Create a feature group for controlling multiple aspects
col_group = folium.FeatureGroup(name='Collisions').add_to(map)
injury_group = folium.FeatureGroup(name='Injuries').add_to(map)
# Plot accidents.
# Limit number of points to plot for testing.
MAX_RECORDS = 1000
collision_cluster = folium.MarkerCluster().add_to(col_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Reason</strong>: {2}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['CONTRIBUTING FACTOR VEHICLE 1']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='red', icon='asterisk'),
            popup=popup).add_to(collision_cluster)
# Create new cluster group for cyclist injuries
bike_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a cyclist was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF CYCLIST INJURED'] >= 1 or row[1]['NUMBER OF CYCLIST KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Cyclists Injured</strong>: {2}</li>
            <li><strong>Cyclists Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF CYCLIST INJURED']), str(row[1]['NUMBER OF CYCLIST KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='blue', icon='asterisk'),
            popup=popup).add_to(bike_cluster)
# Create new cluster group for pedestrian injuries
ped_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a pedestrian was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF PEDESTRIANS INJURED'] >= 1 or row[1]['NUMBER OF PEDESTRIANS KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Pedestrians Injured</strong>: {2}</li>
            <li><strong>Pedestrians Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF PEDESTRIANS INJURED']), str(row[1]['NUMBER OF PEDESTRIANS KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='green', icon='asterisk'),
            popup=popup).add_to(ped_cluster)
# Create new cluster group for automobile injuries
auto_cluster = folium.MarkerCluster().add_to(injury_group)
for row in accident_data[0:MAX_RECORDS].iterrows():
    # Only plot point if lat/long is available and a motorist was injured or killed.
    if (not np.isnan(row[1]['LATITUDE']) and not np.isnan(row[1]['LONGITUDE'])
            and (row[1]['NUMBER OF MOTORIST INJURED'] >= 1 or row[1]['NUMBER OF MOTORIST KILLED'] >= 1)):
        accident_metadata = """
        <ul>
            <li><strong>On street</strong>: {0}</li>
            <li><strong>Cross street</strong>: {1}</li>
            <li><strong>Motorists Injured</strong>: {2}</li>
            <li><strong>Motorists Killed</strong>: {3}</li>
        </ul>""".format(
            str(row[1]['ON STREET NAME']), str(row[1]['CROSS STREET NAME']),
            str(row[1]['NUMBER OF MOTORIST INJURED']), str(row[1]['NUMBER OF MOTORIST KILLED']))
        iframe = folium.element.IFrame(html=accident_metadata, width=250, height=100)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location=[row[1]['LATITUDE'], row[1]['LONGITUDE']],
            icon=folium.Icon(color='red', icon='asterisk'),
            popup=popup).add_to(auto_cluster)
# layer names for items separately
# collision_cluster.layer_name = 'Collisions'
# bike_cluster.layer_name = 'Cyclist Injuries'
# ped_cluster.layer_name = 'Pedestrian Injuries'
folium.LayerControl().add_to(map)
map
Out[60]:
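The three injury loops in the cell above differ only in the columns checked, the popup label, and the marker color. A possible refactor into a single helper is sketched below (addInjuryMarkers is a hypothetical name; it is not defined or used elsewhere in this notebook):
def addInjuryMarkers(df, cluster, injured_col, killed_col, label, color, max_records=MAX_RECORDS):
    # Add one marker per row where someone in the given category was injured or killed.
    for _, r in df[0:max_records].iterrows():
        has_coords = not np.isnan(r['LATITUDE']) and not np.isnan(r['LONGITUDE'])
        if has_coords and (r[injured_col] >= 1 or r[killed_col] >= 1):
            html = """
            <ul>
                <li><strong>On street</strong>: {0}</li>
                <li><strong>Cross street</strong>: {1}</li>
                <li><strong>{2} Injured</strong>: {3}</li>
                <li><strong>{2} Killed</strong>: {4}</li>
            </ul>""".format(str(r['ON STREET NAME']), str(r['CROSS STREET NAME']),
                            label, str(r[injured_col]), str(r[killed_col]))
            iframe = folium.element.IFrame(html=html, width=250, height=100)
            folium.Marker(location=[r['LATITUDE'], r['LONGITUDE']],
                          icon=folium.Icon(color=color, icon='asterisk'),
                          popup=folium.Popup(iframe, max_width=2650)).add_to(cluster)

# Example usage mirroring the cyclist loop above:
# addInjuryMarkers(accident_data, bike_cluster, 'NUMBER OF CYCLIST INJURED',
#                  'NUMBER OF CYCLIST KILLED', 'Cyclists', 'blue')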
In [20]:
# Get sizes of data sets.
print(len(agg_accident_data))
print(len(agg_traffic_data))
print(len(nodes_to_coord_data))
In [13]:
# Print sample of datasets.
dict(list(agg_accident_data.items())[0:5])
Out[13]:
In [14]:
dict(list(agg_traffic_data.items())[0:5])
Out[14]:
In [57]:
nodes_to_coord_data[0:5]
Out[57]:
In [210]:
# Compute histogram of accident counts.
accident_numbers_list = list(agg_accident_data.values())
print(min(accident_numbers_list))
print(max(accident_numbers_list))
plt.xlim(min(accident_numbers_list), 300)
plt.hist(accident_numbers_list, bins=200)
plt.show()
In [211]:
# Compute histogram of traffic counts.
traffic_numbers_list = list(agg_traffic_data.values())
print(min(traffic_numbers_list))
print(max(traffic_numbers_list))
plt.xlim(min(traffic_numbers_list), 300000)
plt.hist(traffic_numbers_list, bins=200)
plt.show()
In [274]:
# Compute scores.
scores = []
for k, num_accidents in agg_accident_data.items():
    if (num_accidents > 0):
        node = nodes_to_coord_data[k]
        node_id = node[0]
        # Get amount of traffic at that node.
        traffic = agg_traffic_data[node_id]
        # There are some weird cases where traffic is 0 but the accident count is > 0.
        # For now, we update traffic to equal num_accidents.
        # Maybe this assumption should be revisited?
        if (traffic < num_accidents):
            traffic = num_accidents
        # Calculate intersection score.
        score = num_accidents/traffic
        scores.append(score)
# Compute histogram of scores.
plt.xlim(0, 1)
plt.hist(np.sqrt(scores), bins=100)
plt.show()
# Compute histogram of scores after filtering out high scores.
filtered_scores = [x for x in scores if x < 0.01]
plt.xlim(0, 0.1)
plt.hist(np.sqrt(filtered_scores), bins = 50)
plt.show()
In [275]:
# Put scores on a 1-to-10 scale (min-max normalization).
scores_1to10 = [(1 + (x - min(filtered_scores)) * (10 - 1) /
                 (max(filtered_scores) - min(filtered_scores))) for x in filtered_scores]
min(scores_1to10), max(scores_1to10)
Out[275]:
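For reference, the same min-max rescaling as a small reusable helper (rescale is a hypothetical name; this is only a sketch, not used elsewhere in the notebook):
def rescale(values, lo=1, hi=10):
    # Min-max normalization of a list of numbers onto the range [lo, hi].
    v_min, v_max = min(values), max(values)
    return [lo + (x - v_min) * (hi - lo) / (v_max - v_min) for x in values]

# Equivalent to the list comprehension above:
# scores_1to10 = rescale(filtered_scores)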
In [276]:
plt.xlim(1, 10)
plt.hist(scores_1to10, bins=100)
plt.show()
In [ ]: