In [ ]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
import geoplotlib as glp
import geoplotlib as gpl
from geoplotlib.utils import BoundingBox, DataAccessObject

%matplotlib inline
pd.set_option('display.max_columns', None)

In [ ]:


In [ ]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions.csv'

collisions = pd.read_csv(filePath)
collisions = collisions[pd.notnull(collisions.BOROUGH)]
collisions.head()

In [ ]:


In [ ]:
collisions['CROSS STREET NAME'].value_counts()

In [ ]:
mask = ((collisions['CROSS STREET NAME'] == 'BUSHWICK AVENUE') & (collisions['ON STREET NAME'] == 'COOK STREET'))
#mask = ((collisions['CROSS STREET NAME'] == 'BUSHWICK AVENUE'))

bushwick = collisions[mask]
bushwick

In [ ]:
locationMask = (collisions.LOCATION == '(40.702713, -73.9373173)')

collisions[locationMask]

In [ ]:
locationsCounts = collisions.LOCATION.value_counts()
locationsCountsFrame  = locationsCounts.to_frame()
locationsCountsFrame
#locationsCountsFrame.count

In [ ]:
locationsCountsFrame['LAT'] = locationsCountsFrame.index.str.split(',').str.get(0).str.replace('(', '')
locationsCountsFrame['LON'] = locationsCountsFrame.index.str.split(',').str.get(1).str.replace(')', '')

#locationsCountsFrame.index

In [ ]:
#locationsCountsFrame

sums = collisions.groupby(['LOCATION'])[['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']].sum()

In [ ]:
result = locationsCountsFrame.combine_first(sums)
result

In [ ]:


In [ ]:
result.sort_values(by='LOCATION')

In [ ]:
intersections = result

In [ ]:
result.to_csv('datasets/collision_counts.csv', sep=',')

In [ ]:
collisions['year'] = collisions.DATE.str.split('/').str.get(2)

top10Intersections = collisions.LOCATION.value_counts()[:10]


top10 = collisions[collisions.LOCATION.isin(top10Intersections.index)]

In [ ]:


In [ ]:
top10Intersections.to_frame()

In [ ]:
def addStreetName(e):
    return top10[top10.LOCATION == e.name][['ON STREET NAME', 'CROSS STREET NAME']].iloc[0]
    
top10InDf = top10Intersections.to_frame()
    
    
top10InDf[['ON STREET NAME', 'CROSS STREET NAME']] = top10InDf.apply(addStreetName, axis=1)
top10InDf.rename(columns={'LOCATION': 'COLLISIONS'}, inplace=True)
top10InDf.index.name = "LOCATION"
top10InDf.to_csv('datasets/intersection_top_10.csv', sep=',')

In [ ]:
series = {};

for location in top10.LOCATION:
    a_location = top10[top10.LOCATION == location]
    #print a_location.year
    index = []
    data = []
    key = a_location['ON STREET NAME'].iloc[0] + ' / ' + a_location['CROSS STREET NAME'].iloc[0]
    for year in range(2012, 2016):
        index.append(year)
        data.append(len(a_location[a_location.year == str(year)].index))
    series[key] = pd.Series(data, index=index);

Here are the top 10 most dangerous intersection plotted by year

Note that 2012 and 2016 are not complete.


In [ ]:
pd.DataFrame(series).plot(kind='bar',figsize=(15,10), subplots=True, layout=(5,2), legend=False, sharey=True)

Here we see the location of these 10 intersection


In [ ]:
lat = top10.LATITUDE.values
lon = top10.LONGITUDE.values

glp.dot({'lat': lat, 'lon': lon}, color="r")
glp.inline()

lat

That the intersections with the most collions

However how many get injured or killed in these?


In [ ]:
top10Injuries = collisions.groupby(['LOCATION'])['NUMBER OF PERSONS INJURED'].sum().sort_values(ascending=False)[:10]
top10Injuries

In [ ]:
top10Injuries

In [ ]:
top10Killed = collisions.groupby(['LOCATION'])['NUMBER OF PERSONS KILLED'].sum().sort_values(ascending=False)[:11]
top10Killed

In [ ]:
def splitLocation(df):
    result = df.copy()
    result['LAT'] = result.index.str.split(',').str.get(0).str.replace('(', '')
    result['LON'] = result.index.str.split(',').str.get(1).str.replace(')', '')
    return result
    

top10KilledDf = splitLocation(top10Killed.to_frame())
top10InjuredDf = splitLocation(top10Injuries.to_frame())

In [ ]:
lon = top10KilledDf.LON.astype(float).values
lat = top10KilledDf.LAT.astype(float).values

glp.dot({'lat': lat, 'lon': lon}, color="r")
glp.dot({'lat': top10.LATITUDE.values, 'lon': top10.LONGITUDE.values}, color='g')
glp.dot({'lat': top10InjuredDf.LAT.astype(float).values, 'lon': top10InjuredDf.LON.astype(float).values}, color='b')

glp.inline()

Lets save them


In [ ]:
def get_intersection_name(e):
    name = collisions[collisions.LOCATION == e.name][['CROSS STREET NAME','ON STREET NAME']].iloc[0]
    return name['ON STREET NAME'] + ' / '+ name['CROSS STREET NAME']
    
top10InjuredDf['NAME'] = top10InjuredDf.apply(get_intersection_name, axis=1, raw=True)
top10KilledDf['NAME'] = top10KilledDf.apply(get_intersection_name, axis=1, raw=True)

In [ ]:
top10InjuredDf

In [ ]:
top10KilledDf

In [ ]:
top10InjuredDf.to_csv('datasets/intersection_top10_injuries.csv', sep=',')
top10KilledDf.to_csv('datasets/intersection_top10_killed.csv', sep=',')

Finding outliers

Finding the intersection with the most injured, relative to the collisions


In [ ]:
# Lets find the average collisions per intersection

means = intersection.mean()
means

In [ ]:
print "For each collision %f number of persons gets injured" % (means['NUMBER OF PERSONS INJURED'] / means.LOCATION)

In [ ]:
top10InjuredDf

In [ ]:
top10Means = top10.mean()
top10InjuredMeans = top10InjuredDf.mean()

top10KilledMean = top10KilledDf.mean()
print "In the intersections with the most collisions, only %f amount of persons where injured\n" % top10Means['NUMBER OF PERSONS INJURED']

print "However looking at the top most intersection with the most injured,\n then we have %f persons injured per collison.\n"\
% (top10InjuredMeans['NUMBER OF PERSONS INJURED'] / top10InjuredMeans['COLLISIONS'])

#print "And lastly looking at the intersections with the most killed people, only %f persons get injured (not including killed) per collision"\
#% (top10KilledMean['NUMBER OF PERSONS INJURED'] / top10KilledMean['COLLISIONS'])

In [ ]:
loc_count = collisions.LOCATION.value_counts()


def get_collision_count(e):
    return loc_count[e.name]
    

top10InjuredDf['COLLISIONS'] = top10InjuredDf.copy().apply(get_collision_count, axis=1)

top10KilledDf['COLLISIONS'] = top10KilledDf.copy().apply(get_collision_count, axis=1)

In [ ]:
collisions.groupby(['LOCATION']).value_counts()

Look for some intersections that have a perticular high ration

Meaning where is it perticular dangous to drive.


In [ ]:
def calc_collision_inju_ration(e):
    return (e['NUMBER OF PERSONS INJURED'] / e.LOCATION)

intersection[intersection.LOCATION > 20].apply(calc_collision_inju_ration, axis=1, raw=True).sort_values()

In [ ]:
collisions[collisions.LOCATION == '(40.6790346, -73.9385077)']

In [ ]:
top10['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()

In [ ]:
def causesTop(n):
    danger_zone = collisions.LOCATION.value_counts()[n-1:n]
    return collisions[collisions.LOCATION.isin(danger_zone.index)]['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()


series = {}

for i in range(1,10):
    series[i] = causesTop(i)
    
pd.DataFrame(series).plot(kind='bar', figsize=(15,10), subplots=True)

In [ ]:
numberTop6 = collisions[collisions.LOCATION.isin(collisions.LOCATION.value_counts()[5:6].index)]

In [ ]:
pd.to_datetime(numberTop6.DATE).value_counts().plot(figsize=(15,15))


#pd.Series(range(0,numberTop6['NUMBER OF PERSONS INJURED'].size), index=pd.to_datetime(numberTop6.DATE)).plot(kind='bar',figsize=(15,15))

In [ ]:
numberTop6.size

In [ ]:
top10

In [ ]:
#top10['HOUR'] = top10.TIME.str.split(':').str.get(0).astype('int')
#top1['HOUR'] = top1.TIME.str.split(':').str.get(0).astype('int')
collisions['HOUR'] = collisions.TIME.str.split(':').str.get(0).astype('int')

In [ ]:
top10.HOUR.value_counts().sort_index().plot(figsize=(15,10), kind='bar')

In [ ]:
top1.HOUR.value_counts().sort_index().plot(figsize=(15,10), kind='bar')

In [ ]:
top10.HOUR.value_counts().index

In [ ]:
top1 = collisions[collisions.LOCATION.isin(collisions.LOCATION.value_counts()[0:1].index)]

In [ ]:
series = {}

for r in collisions['CONTRIBUTING FACTOR VEHICLE 1'].unique():
    series[r] = collisions[collisions['CONTRIBUTING FACTOR VEHICLE 1'] == r].HOUR.value_counts().sort_index()
series['Combined'] = collisions.HOUR.value_counts().sort_index()
pd.DataFrame(series).plot(figsize=(15,100), subplots=True)

In [ ]:
collisions['CONTRIBUTING FACTOR VEHICLE 1'].unique()

In [ ]:
collisions

In [ ]:
collisions.['']