In [1]:
import numpy as np
import pandas as pd
import itertools
import geoplotlib as glp
from collections import Counter
from geoplotlib.utils import BoundingBox, DataAccessObject
from __future__ import division
%matplotlib inline
In [2]:
# Location of the NYPD motor-vehicle collision CSV, relative to the notebook.
dataset = 'datasets/NYPD_Motor_Vehicle_Collisions.csv'
# Read the whole file into the `collisions` DataFrame used by the later cells.
collisions = pd.read_csv(dataset)
In [3]:
# Keep only the rows whose BOROUGH value is present; rows without a borough
# are dropped from `collisions` from here on.
has_borough = collisions['BOROUGH'].notnull()
collisions = collisions[has_borough]
# Preview the first rows of the filtered frame.
collisions.head()
Out[3]:
In [4]:
# Report the number of rows currently held in `collisions`.
print("Number of registered collisions since 2012: " + str(len(collisions.index)))
In [5]:
# Distinct values that occur in the per-collision 'NUMBER OF PERSONS KILLED' column.
collisions.loc[:, 'NUMBER OF PERSONS KILLED'].unique()
Out[5]:
In [3]:
# KDE map of all collision incidents.
# The view box is derived from the lat/lon extent of the data, with each edge
# offset by 0.055 degrees.
# NOTE(review): north and south are both lowered and west and east are both
# raised, so the box is shifted rather than padded -- confirm this is intended.
lat = collisions['LATITUDE']
lon = collisions['LONGITUDE']
bbox = BoundingBox(
    north=lat.max() - 0.055,
    west=lon.min() + 0.055,
    south=lat.min() - 0.055,
    east=lon.max() + 0.055,
)
# Fixed-coordinate alternative kept for reference:
# BoundingBox(north=40.915256, west=-74.255735, south=40.496044, east=-73.700272)
coords = {'lat': lat.values.tolist(), 'lon': lon.values.tolist()}
glp.kde(coords, bw=2, cut_below=1e-4)
glp.set_bbox(bbox)
# Render in the notebook; glp.show() is the commented-out alternative.
glp.inline()
In [6]:
def filter_cause(cause):
    """Return the rows of the global `collisions` frame that list `cause`.

    A row is kept when at least one of the five
    'CONTRIBUTING FACTOR VEHICLE n' columns is exactly equal to `cause`.
    """
    factor_columns = (
        'CONTRIBUTING FACTOR VEHICLE 1',
        'CONTRIBUTING FACTOR VEHICLE 2',
        'CONTRIBUTING FACTOR VEHICLE 3',
        'CONTRIBUTING FACTOR VEHICLE 4',
        'CONTRIBUTING FACTOR VEHICLE 5',
    )
    # OR the per-column equality tests together, one column at a time.
    matches = collisions[factor_columns[0]] == cause
    for column in factor_columns[1:]:
        matches = matches | (collisions[column] == cause)
    return collisions[matches]
def draw_kde(data):
    """Draw a kernel-density layer for the rows of `data` and render it inline.

    `data` must expose LATITUDE and LONGITUDE columns. The view box is taken
    from the lat/lon extent of `data`, each edge offset by 0.055 degrees.
    """
    latitudes = data['LATITUDE']
    longitudes = data['LONGITUDE']
    # NOTE(review): both latitude edges are lowered and both longitude edges
    # are raised, so the box is shifted rather than padded -- confirm intent.
    view = BoundingBox(
        north=latitudes.max() - 0.055,
        west=longitudes.min() + 0.055,
        south=latitudes.min() - 0.055,
        east=longitudes.max() + 0.055,
    )
    points = {
        'lat': latitudes.values.tolist(),
        'lon': longitudes.values.tolist(),
    }
    glp.kde(points, bw=2, cut_below=1e-4)
    glp.set_bbox(view)
    glp.inline()
# Draw one density map per contributing factor reported for vehicle 1.
# Missing values are dropped before unique(): a NaN entry cannot be
# concatenated to the "CAUSE: " label (TypeError) and, because NaN never
# compares equal, filter_cause would return no rows for it anyway.
for c in collisions['CONTRIBUTING FACTOR VEHICLE 1'].dropna().unique().tolist():
    filtered = filter_cause(c)
    print("CAUSE: " + c)
    print("COUNT: " + str(len(filtered)) + " of " + str(len(collisions)))
    draw_kde(filtered)
In [8]:
def draw_dot(data, type_color):
    """Add a dot layer for the rows of `data`, drawn in `type_color`.

    The view box is computed from the global `collisions` frame rather than
    from `data`, so it does not depend on which subset is being plotted.
    This function does not render; the caller does that (e.g. glp.inline()).
    """
    points = {
        'lat': data['LATITUDE'].values.tolist(),
        'lon': data['LONGITUDE'].values.tolist(),
    }
    all_lat = collisions['LATITUDE']
    all_lon = collisions['LONGITUDE']
    view = BoundingBox(
        north=all_lat.max() - 0.055,
        west=all_lon.min() + 0.055,
        south=all_lat.min() - 0.055,
        east=all_lon.max() + 0.055,
    )
    glp.set_bbox(view)
    glp.dot(points, color=type_color)
# Plot every collision as a dot in colour 'r', then render the map inline.
draw_dot(data=collisions, type_color='r')
glp.inline()
In [9]:
# Collisions in which at least one cyclist / pedestrian / motorist was injured.
inj_cyclists = collisions[collisions['NUMBER OF CYCLIST INJURED'].gt(0)]
inj_pedestrians = collisions[collisions['NUMBER OF PEDESTRIANS INJURED'].gt(0)]
inj_motorists = collisions[collisions['NUMBER OF MOTORIST INJURED'].gt(0)]

# The printed figures are row counts (collisions), not numbers of injured people.
for label, frame in (
    ("CYCLIST INJURIES: ", inj_cyclists),
    ("PEDESTRIANS INJURIES: ", inj_pedestrians),
    ("MOTORISTS INJURIES: ", inj_motorists),
):
    print(label + str(len(frame)))
In [10]:
# One dot layer per injured road-user group, added in the same order as
# before (motorists, pedestrians, cyclists), then a single inline render.
for frame, colour in (
    (inj_motorists, 'b'),
    (inj_pedestrians, 'r'),
    (inj_cyclists, 'g'),
):
    draw_dot(frame, colour)
glp.inline()
In [11]:
# Collisions in which at least one cyclist / pedestrian / motorist was killed.
kill_cyclists = collisions[collisions['NUMBER OF CYCLIST KILLED'].gt(0)]
kill_pedestrians = collisions[collisions['NUMBER OF PEDESTRIANS KILLED'].gt(0)]
kill_motorists = collisions[collisions['NUMBER OF MOTORIST KILLED'].gt(0)]

# The printed figures are row counts (collisions), not numbers of people killed.
for label, frame in (
    ("CYCLIST KILLED: ", kill_cyclists),
    ("PEDESTRIANS KILLED: ", kill_pedestrians),
    ("MOTORISTS KILLED: ", kill_motorists),
):
    print(label + str(len(frame)))
In [12]:
# One dot layer per fatal-collision group, added in the same order as before
# (motorists, pedestrians, cyclists), then a single inline render.
for frame, colour in (
    (kill_motorists, 'b'),
    (kill_pedestrians, 'r'),
    (kill_cyclists, 'g'),
):
    draw_dot(frame, colour)
glp.inline()
In [13]:
# Number of collisions per hour of day (0-23), one series per borough.
hours = range(0, 24)
series = {}
for b in collisions.BOROUGH.unique():
    borough_times = collisions.loc[collisions.BOROUGH == b, 'TIME']
    # The hour is the part of each TIME string before the first ':'.
    hour_counts = Counter(int(t.split(":")[0]) for t in borough_times.values)
    # Hours with no collisions read as 0 from the Counter.
    series[b] = pd.Series([hour_counts[h] for h in hours], index=hours)

d = pd.DataFrame(series)
# First figure: one bar chart per borough on a 3x2 grid.
d.plot(
    kind='bar',
    figsize=(16, 6),
    subplots=True,
    layout=(3, 2),
    legend=False,
)
# Second figure: all boroughs side by side on one set of axes.
d.plot(kind='bar', figsize=(16, 6), subplots=False, legend=True)
Out[13]:
In [14]:
# Fatal collisions, and the subset of them for which at least one vehicle has
# a contributing factor that is both present and not "Unspecified".
factor_columns = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
]
killed = collisions[collisions['NUMBER OF PERSONS KILLED'] > 0]

# One boolean mask per vehicle column, then OR them together.
per_vehicle = [
    killed[column].notnull() & (killed[column] != "Unspecified")
    for column in factor_columns
]
has_known_cause = per_vehicle[0]
for vehicle_mask in per_vehicle[1:]:
    has_known_cause = has_known_cause | vehicle_mask
killed2 = killed[has_known_cause]

print(str(len(killed2)) + " of " + str(len(killed)) + " KILLS")
# Show only the five factor columns for the first few matching rows.
killed2[factor_columns].head()
Out[14]:
In [15]:
# Collisions with at least one injured person, and the subset of them for
# which at least one vehicle has a contributing factor that is both present
# and not "Unspecified".
factor_columns = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
]
injured = collisions[collisions['NUMBER OF PERSONS INJURED'] > 0]

# Every per-vehicle mask is built from `injured` itself so that it shares the
# frame's index. A mask taken from another frame (such as `killed`) would be
# index-aligned, silently counting rows absent from that frame as False, and
# would make this cell depend on that other frame having been created.
has_known_cause = None
for column in factor_columns:
    specified = injured[column].notnull() & (injured[column] != "Unspecified")
    if has_known_cause is None:
        has_known_cause = specified
    else:
        has_known_cause = has_known_cause | specified
injured2 = injured[has_known_cause]

print(str(len(injured2)) + " of " + str(len(injured)) + " INJURED")
# Show only the five factor columns for the first few matching rows.
injured2.head()[factor_columns]
Out[15]:
In [16]:
# Display the column labels of the collisions frame.
collisions.columns
Out[16]:
In [17]:
print("TOP THREE MOST DANGEROUS STREET BY KILL COUNT")
# Sum the columns per 'ON STREET NAME', then rank the streets by total
# persons killed and keep the three highest.
street_totals = collisions.groupby(by='ON STREET NAME').sum()
street_totals.sort_values(by='NUMBER OF PERSONS KILLED', ascending=False).head(3)
Out[17]:
In [18]:
print("TOP THREE MOST DANGEROUS STREET BY INJURY COUNT")
# Sum the columns per 'ON STREET NAME', then rank the streets by total
# persons injured and keep the three highest.
street_totals = collisions.groupby(by='ON STREET NAME').sum()
street_totals.sort_values(by='NUMBER OF PERSONS INJURED', ascending=False).head(3)
Out[18]:
In [19]:
# Total persons injured in collisions that have no 'ON STREET NAME' recorded.
no_street = collisions['ON STREET NAME'].isnull()
collisions.loc[no_street, 'NUMBER OF PERSONS INJURED'].fillna(0).sum()
Out[19]:
In [21]:
# Total persons injured in collisions that do have an 'ON STREET NAME'.
# Rows are selected with a boolean not-null mask: the street-name column holds
# strings, and indexing the frame with it is treated as a column-label lookup
# that fails instead of filtering rows.
has_street = pd.notnull(collisions['ON STREET NAME'])
collisions[has_street].fillna(0)['NUMBER OF PERSONS INJURED'].sum()
In [ ]: