In [1]:
import numpy as np
import pandas as pd
import itertools
import geoplotlib as glp
from collections import Counter
from geoplotlib.utils import BoundingBox, DataAccessObject
from __future__ import division

%matplotlib inline

In [2]:
dataset = 'datasets/NYPD_Motor_Vehicle_Collisions.csv'
collisions = pd.read_csv(dataset)

In [3]:
collisions = collisions[pd.notnull(collisions.BOROUGH)]
collisions.head()


Out[3]:
DATE TIME BOROUGH ZIP CODE LATITUDE LONGITUDE LOCATION ON STREET NAME CROSS STREET NAME OFF STREET NAME ... CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5 UNIQUE KEY VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5
0 03/14/2016 3:27 QUEENS 11372 40.747734 -73.882999 (40.7477341, -73.8829986) ROOSEVELT AVENUE 83 STREET NaN ... NaN NaN NaN NaN 3405169 OTHER NaN NaN NaN NaN
3 03/14/2016 0:45 MANHATTAN 10035 40.808279 -73.938793 (40.8082795, -73.9387929) EAST 129 STREET MADISON AVENUE NaN ... Unspecified NaN NaN NaN 3405059 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN
4 03/13/2016 23:00 BROOKLYN 11206 40.706653 -73.950406 (40.7066527, -73.9504063) UNION AVENUE MONTROSE AVENUE NaN ... Unspecified NaN NaN NaN 3405121 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN
6 03/13/2016 9:48 BROOKLYN 11212 40.661997 -73.919593 (40.661997, -73.9195931) KINGS HIGHWAY EAST 98 STREET NaN ... Unspecified NaN NaN NaN 3404744 PASSENGER VEHICLE UNKNOWN NaN NaN NaN
9 03/13/2016 9:46 QUEENS 11106 40.756580 -73.929752 (40.75658, -73.9297516) 36 AVENUE 31 STREET NaN ... Unspecified NaN NaN NaN 3404995 LIVERY VEHICLE PASSENGER VEHICLE NaN NaN NaN

5 rows × 29 columns


In [4]:
print "Number of registered collisions since 2012: " + str(len(collisions))


Number of registered collisions since 2012: 584753

In [5]:
# Types of contributing factors
collisions['NUMBER OF PERSONS KILLED'].unique()


Out[5]:
array([0, 1, 2, 3, 4, 5])

In [3]:
# KDE MAP OF ALL COLLISION INCIDENTS
bbox = BoundingBox(north=collisions.LATITUDE.max()-0.055,\
                   west=collisions.LONGITUDE.min()+0.055,\
                   south=collisions.LATITUDE.min()-0.055,\
                   east=collisions.LONGITUDE.max()+0.055)

#bbox = BoundingBox(north=40.915256, west=-74.255735, south=40.496044, east=-73.700272)

coords = {'lat': collisions.LATITUDE.values.tolist(), 'lon': collisions.LONGITUDE.values.tolist()}

glp.kde(coords, bw=2, cut_below=1e-4)
glp.set_bbox(bbox)
glp.inline()
#glp.show()


('smallest non-zero count', 8.955579620337621e-09)
('max count:', 54.536583126377039)

In [6]:
def filter_cause(cause):
    return collisions[(collisions['CONTRIBUTING FACTOR VEHICLE 1'] == cause) | \
                (collisions['CONTRIBUTING FACTOR VEHICLE 2'] == cause) | \
                (collisions['CONTRIBUTING FACTOR VEHICLE 3'] == cause) | \
                (collisions['CONTRIBUTING FACTOR VEHICLE 4'] == cause) | \
                (collisions['CONTRIBUTING FACTOR VEHICLE 5'] == cause)]

def draw_kde(data):
    bbox = BoundingBox(north=data.LATITUDE.max()-0.055,\
                       west=data.LONGITUDE.min()+0.055,\
                       south=data.LATITUDE.min()-0.055,\
                       east=data.LONGITUDE.max()+0.055)

    #bbox = BoundingBox(north=40.915256, west=-74.255735, south=40.496044, east=-73.700272)

    coords = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    
    glp.kde(coords, bw=2, cut_below=1e-4)
    glp.set_bbox(bbox)
    glp.inline()

for c in collisions['CONTRIBUTING FACTOR VEHICLE 1'].unique().tolist():
    filtered = filter_cause(c)
    print "CAUSE: " + c
    print "COUNT: " + str(len(filtered)) + " of " + str(len(collisions))
    draw_kde(filtered)


CAUSE: Pavement Slippery
COUNT: 9151 of 769054
('smallest non-zero count', 7.1647865443840454e-10)
('max count:', 0.23171796205253486)

In [8]:
def draw_dot(data, type_color):
    gridDots = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    
    bbox = BoundingBox(north=collisions.LATITUDE.max()-0.055,\
                   west=collisions.LONGITUDE.min()+0.055,\
                   south=collisions.LATITUDE.min()-0.055,\
                   east=collisions.LONGITUDE.max()+0.055)
    glp.set_bbox(bbox)
    glp.dot(gridDots, color=type_color)

draw_dot(collisions, 'r')
glp.inline()
#collisions.head()



In [9]:
inj_cyclists = collisions[collisions['NUMBER OF CYCLIST INJURED'] > 0]
inj_pedestrians = collisions[collisions['NUMBER OF PEDESTRIANS INJURED'] > 0]
inj_motorists = collisions[collisions['NUMBER OF MOTORIST INJURED'] > 0]

print "CYCLIST INJURIES: " + str(len(inj_cyclists))
print "PEDESTRIANS INJURIES: " + str(len(inj_pedestrians))
print "MOTORISTS INJURIES: " + str(len(inj_motorists))


CYCLIST INJURIES: 13612
PEDESTRIANS INJURIES: 35368
MOTORISTS INJURIES: 61214

In [10]:
draw_dot(inj_motorists, 'b')
draw_dot(inj_pedestrians, 'r')
draw_dot(inj_cyclists, 'g')
glp.inline()



In [11]:
kill_cyclists = collisions[collisions['NUMBER OF CYCLIST KILLED'] > 0]
kill_pedestrians = collisions[collisions['NUMBER OF PEDESTRIANS KILLED'] > 0]
kill_motorists = collisions[collisions['NUMBER OF MOTORIST KILLED'] > 0]

print "CYCLIST KILLED: " + str(len(kill_cyclists))
print "PEDESTRIANS KILLED: " + str(len(kill_pedestrians))
print "MOTORISTS KILLED: " + str(len(kill_motorists))


CYCLIST KILLED: 49
PEDESTRIANS KILLED: 423
MOTORISTS KILLED: 218

In [12]:
draw_dot(kill_motorists, 'b')
draw_dot(kill_pedestrians, 'r')
draw_dot(kill_cyclists, 'g')
glp.inline()



In [13]:
# Collisions on the hour for each borough
series = {}

for b in collisions.BOROUGH.unique():
    filtered = collisions[collisions.BOROUGH == b]
    collisions_on_hour = Counter([int(c.split(":")[0]) for c in filtered.TIME.values])
    series[b] = pd.Series([collisions_on_hour[h] for h in range(0,24)], index=range(0,24))

d = pd.DataFrame(series)
d.plot(kind='bar', figsize=(16,6), subplots=True, layout=(3,2), legend=False)
d.plot(kind='bar', figsize=(16,6), subplots=False, legend=True)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x111f98310>

In [14]:
# GET ALL ENTRIES FROM THE DATASET WHERE CAUSATION AND KILLED ARE FILLED OUT
killed = collisions[((collisions['NUMBER OF PERSONS KILLED'] > 0))]

killed2 = killed[(killed['CONTRIBUTING FACTOR VEHICLE 1'] != "Unspecified") & (pd.notnull(killed['CONTRIBUTING FACTOR VEHICLE 1'])) | \
                (killed['CONTRIBUTING FACTOR VEHICLE 2'] != "Unspecified") & (pd.notnull(killed['CONTRIBUTING FACTOR VEHICLE 2'])) | \
                (killed['CONTRIBUTING FACTOR VEHICLE 3'] != "Unspecified") & (pd.notnull(killed['CONTRIBUTING FACTOR VEHICLE 3'])) | \
                (killed['CONTRIBUTING FACTOR VEHICLE 4'] != "Unspecified") & (pd.notnull(killed['CONTRIBUTING FACTOR VEHICLE 4'])) | \
                (killed['CONTRIBUTING FACTOR VEHICLE 5'] != "Unspecified") & (pd.notnull(killed['CONTRIBUTING FACTOR VEHICLE 5']))]

print str(len(killed2)) + " of " + str(len(killed)) +  " KILLS"
killed2.head()[['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5']]


372 of 688 KILLS
Out[14]:
CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5
5718 Following Too Closely NaN NaN NaN NaN
7660 Failure to Yield Right-of-Way NaN NaN NaN NaN
8341 Driver Inattention/Distraction Unspecified NaN NaN NaN
8342 Driver Inattention/Distraction Unspecified NaN NaN NaN
8506 Physical Disability NaN NaN NaN NaN

In [15]:
# GET ALL ENTRIES FROM THE DATASET WHERE CAUSATION AND INJURED ARE FILLED OUT
injured = collisions[((collisions['NUMBER OF PERSONS INJURED'] > 0))]

injured2 = injured[(injured['CONTRIBUTING FACTOR VEHICLE 1'] != "Unspecified") & (pd.notnull(injured['CONTRIBUTING FACTOR VEHICLE 1'])) | \
                (injured['CONTRIBUTING FACTOR VEHICLE 2'] != "Unspecified") & (pd.notnull(injured['CONTRIBUTING FACTOR VEHICLE 2'])) | \
                (injured['CONTRIBUTING FACTOR VEHICLE 3'] != "Unspecified") & (pd.notnull(injured['CONTRIBUTING FACTOR VEHICLE 3'])) | \
                (injured['CONTRIBUTING FACTOR VEHICLE 4'] != "Unspecified") & (pd.notnull(injured['CONTRIBUTING FACTOR VEHICLE 4'])) | \
                (killed['CONTRIBUTING FACTOR VEHICLE 5'] != "Unspecified") & (pd.notnull(injured['CONTRIBUTING FACTOR VEHICLE 5']))]
injured
print str(len(injured2)) + " of " + str(len(injured)) +  " INJURED"
injured2.head()[['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5']]


51364 of 109552 INJURED
Out[15]:
CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5
36 Other Vehicular Physical Disability NaN NaN NaN
45 Following Too Closely Unspecified NaN NaN NaN
50 Traffic Control Disregarded NaN NaN NaN NaN
78 Driver Inattention/Distraction Fatigued/Drowsy NaN NaN NaN
83 Physical Disability Physical Disability NaN NaN NaN

In [16]:
collisions.columns


Out[16]:
Index([u'DATE', u'TIME', u'BOROUGH', u'ZIP CODE', u'LATITUDE', u'LONGITUDE',
       u'LOCATION', u'ON STREET NAME', u'CROSS STREET NAME',
       u'OFF STREET NAME', u'NUMBER OF PERSONS INJURED',
       u'NUMBER OF PERSONS KILLED', u'NUMBER OF PEDESTRIANS INJURED',
       u'NUMBER OF PEDESTRIANS KILLED', u'NUMBER OF CYCLIST INJURED',
       u'NUMBER OF CYCLIST KILLED', u'NUMBER OF MOTORIST INJURED',
       u'NUMBER OF MOTORIST KILLED', u'CONTRIBUTING FACTOR VEHICLE 1',
       u'CONTRIBUTING FACTOR VEHICLE 2', u'CONTRIBUTING FACTOR VEHICLE 3',
       u'CONTRIBUTING FACTOR VEHICLE 4', u'CONTRIBUTING FACTOR VEHICLE 5',
       u'UNIQUE KEY', u'VEHICLE TYPE CODE 1', u'VEHICLE TYPE CODE 2',
       u'VEHICLE TYPE CODE 3', u'VEHICLE TYPE CODE 4', u'VEHICLE TYPE CODE 5'],
      dtype='object')

In [17]:
print "TOP THREE MOST DANGEROUS STREET BY KILL COUNT"
collisions.groupby(by='ON STREET NAME').sum().sort_values(by='NUMBER OF PERSONS KILLED', ascending=False).head(3)


TOP THREE MOST DANGEROUS STREET BY KILL COUNT
Out[17]:
ZIP CODE LATITUDE LONGITUDE NUMBER OF PERSONS INJURED NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED NUMBER OF MOTORIST KILLED UNIQUE KEY
ON STREET NAME
ATLANTIC AVENUE 77846497 281639.996307 -511764.262074 2227 13 299 9 88 0 1840 4 9898840005
HYLAN BOULEVARD 27364377 107699.757766 -196792.205979 890 11 132 4 13 1 745 6 3699354661
LINDEN BOULEVARD 43082235 155350.329904 -282160.813662 1708 11 171 8 52 0 1485 3 5387103651

In [18]:
print "TOP THREE MOST DANGEROUS STREET BY INJURY COUNT"
collisions.groupby(by='ON STREET NAME').sum().sort_values(by='NUMBER OF PERSONS INJURED', ascending=False).head(3)


TOP THREE MOST DANGEROUS STREET BY INJURY COUNT
Out[18]:
ZIP CODE LATITUDE LONGITUDE NUMBER OF PERSONS INJURED NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED NUMBER OF MOTORIST KILLED UNIQUE KEY
ON STREET NAME
BROADWAY 82963096 323402.733870 -586578.169632 2231 11 602 8 354 0 1275 3 11109402338
ATLANTIC AVENUE 77846497 281639.996307 -511764.262074 2227 13 299 9 88 0 1840 4 9898840005
LINDEN BOULEVARD 43082235 155350.329904 -282160.813662 1708 11 171 8 52 0 1485 3 5387103651

In [19]:
collisions[pd.isnull(collisions['ON STREET NAME'])].fillna(0)['NUMBER OF PERSONS INJURED'].sum()


Out[19]:
960

In [21]:
collisions[collisions['ON STREET NAME']].fillna(0)['NUMBER OF PERSONS INJURED'].sum()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-5a56a3110d95> in <module>()
----> 1 collisions[collisions['ON STREET NAME']].fillna(0)['NUMBER OF PERSONS INJURED'].sum()

/Users/masve/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1961         if isinstance(key, (Series, np.ndarray, Index, list)):
   1962             # either boolean or fancy integer index
-> 1963             return self._getitem_array(key)
   1964         elif isinstance(key, DataFrame):
   1965             return self._getitem_frame(key)

/Users/masve/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   1988     def _getitem_array(self, key):
   1989         # also raises Exception if object array with NA values
-> 1990         if com.is_bool_indexer(key):
   1991             # warning here just in case -- previously __setitem__ was
   1992             # reindexing but __getitem__ was not; it seems more reasonable to

/Users/masve/anaconda/lib/python2.7/site-packages/pandas/core/common.pyc in is_bool_indexer(key)
   1737             if not lib.is_bool_array(key):
   1738                 if isnull(key).any():
-> 1739                     raise ValueError('cannot index with vector containing '
   1740                                      'NA / NaN values')
   1741                 return False

ValueError: cannot index with vector containing NA / NaN values

Now lets focus on some contributing factors that happens to occur in specific areas...


In [ ]: