In [ ]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
import geoplotlib as glp
import geoplotlib as gpl
from geoplotlib.utils import BoundingBox, DataAccessObject
%matplotlib inline
pd.set_option('display.max_columns', None)
In [ ]:
In [ ]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions.csv'
collisions = pd.read_csv(filePath)
collisions = collisions[pd.notnull(collisions.BOROUGH)]
collisions.head()
In [ ]:
In [ ]:
collisions['CROSS STREET NAME'].value_counts()
In [ ]:
mask = ((collisions['CROSS STREET NAME'] == 'BUSHWICK AVENUE') & (collisions['ON STREET NAME'] == 'COOK STREET'))
#mask = ((collisions['CROSS STREET NAME'] == 'BUSHWICK AVENUE'))
bushwick = collisions[mask]
bushwick
In [ ]:
locationMask = (collisions.LOCATION == '(40.702713, -73.9373173)')
collisions[locationMask]
In [ ]:
locationsCounts = collisions.LOCATION.value_counts()
locationsCountsFrame = locationsCounts.to_frame()
locationsCountsFrame
#locationsCountsFrame.count
In [ ]:
locationsCountsFrame['LAT'] = locationsCountsFrame.index.str.split(',').str.get(0).str.replace('(', '')
locationsCountsFrame['LON'] = locationsCountsFrame.index.str.split(',').str.get(1).str.replace(')', '')
#locationsCountsFrame.index
In [ ]:
#locationsCountsFrame
sums = collisions.groupby(['LOCATION'])[['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']].sum()
In [ ]:
result = locationsCountsFrame.combine_first(sums)
result
In [ ]:
In [ ]:
result.sort_values(by='LOCATION')
In [ ]:
intersections = result
In [ ]:
result.to_csv('datasets/collision_counts.csv', sep=',')
In [ ]:
collisions['year'] = collisions.DATE.str.split('/').str.get(2)
top10Intersections = collisions.LOCATION.value_counts()[:10]
top10 = collisions[collisions.LOCATION.isin(top10Intersections.index)]
In [ ]:
In [ ]:
top10Intersections.to_frame()
In [ ]:
def addStreetName(e):
return top10[top10.LOCATION == e.name][['ON STREET NAME', 'CROSS STREET NAME']].iloc[0]
top10InDf = top10Intersections.to_frame()
top10InDf[['ON STREET NAME', 'CROSS STREET NAME']] = top10InDf.apply(addStreetName, axis=1)
top10InDf.rename(columns={'LOCATION': 'COLLISIONS'}, inplace=True)
top10InDf.index.name = "LOCATION"
top10InDf.to_csv('datasets/intersection_top_10.csv', sep=',')
In [ ]:
series = {};
for location in top10.LOCATION:
a_location = top10[top10.LOCATION == location]
#print a_location.year
index = []
data = []
key = a_location['ON STREET NAME'].iloc[0] + ' / ' + a_location['CROSS STREET NAME'].iloc[0]
for year in range(2012, 2016):
index.append(year)
data.append(len(a_location[a_location.year == str(year)].index))
series[key] = pd.Series(data, index=index);
In [ ]:
pd.DataFrame(series).plot(kind='bar',figsize=(15,10), subplots=True, layout=(5,2), legend=False, sharey=True)
In [ ]:
lat = top10.LATITUDE.values
lon = top10.LONGITUDE.values
glp.dot({'lat': lat, 'lon': lon}, color="r")
glp.inline()
lat
In [ ]:
top10Injuries = collisions.groupby(['LOCATION'])['NUMBER OF PERSONS INJURED'].sum().sort_values(ascending=False)[:10]
top10Injuries
In [ ]:
top10Injuries
In [ ]:
top10Killed = collisions.groupby(['LOCATION'])['NUMBER OF PERSONS KILLED'].sum().sort_values(ascending=False)[:11]
top10Killed
In [ ]:
def splitLocation(df):
result = df.copy()
result['LAT'] = result.index.str.split(',').str.get(0).str.replace('(', '')
result['LON'] = result.index.str.split(',').str.get(1).str.replace(')', '')
return result
top10KilledDf = splitLocation(top10Killed.to_frame())
top10InjuredDf = splitLocation(top10Injuries.to_frame())
In [ ]:
lon = top10KilledDf.LON.astype(float).values
lat = top10KilledDf.LAT.astype(float).values
glp.dot({'lat': lat, 'lon': lon}, color="r")
glp.dot({'lat': top10.LATITUDE.values, 'lon': top10.LONGITUDE.values}, color='g')
glp.dot({'lat': top10InjuredDf.LAT.astype(float).values, 'lon': top10InjuredDf.LON.astype(float).values}, color='b')
glp.inline()
In [ ]:
def get_intersection_name(e):
name = collisions[collisions.LOCATION == e.name][['CROSS STREET NAME','ON STREET NAME']].iloc[0]
return name['ON STREET NAME'] + ' / '+ name['CROSS STREET NAME']
top10InjuredDf['NAME'] = top10InjuredDf.apply(get_intersection_name, axis=1, raw=True)
top10KilledDf['NAME'] = top10KilledDf.apply(get_intersection_name, axis=1, raw=True)
In [ ]:
top10InjuredDf
In [ ]:
top10KilledDf
In [ ]:
top10InjuredDf.to_csv('datasets/intersection_top10_injuries.csv', sep=',')
top10KilledDf.to_csv('datasets/intersection_top10_killed.csv', sep=',')
In [ ]:
# Lets find the average collisions per intersection
means = intersection.mean()
means
In [ ]:
print "For each collision %f number of persons gets injured" % (means['NUMBER OF PERSONS INJURED'] / means.LOCATION)
In [ ]:
top10InjuredDf
In [ ]:
top10Means = top10.mean()
top10InjuredMeans = top10InjuredDf.mean()
top10KilledMean = top10KilledDf.mean()
print "In the intersections with the most collisions, only %f amount of persons where injured\n" % top10Means['NUMBER OF PERSONS INJURED']
print "However looking at the top most intersection with the most injured,\n then we have %f persons injured per collison.\n"\
% (top10InjuredMeans['NUMBER OF PERSONS INJURED'] / top10InjuredMeans['COLLISIONS'])
#print "And lastly looking at the intersections with the most killed people, only %f persons get injured (not including killed) per collision"\
#% (top10KilledMean['NUMBER OF PERSONS INJURED'] / top10KilledMean['COLLISIONS'])
In [ ]:
loc_count = collisions.LOCATION.value_counts()
def get_collision_count(e):
return loc_count[e.name]
top10InjuredDf['COLLISIONS'] = top10InjuredDf.copy().apply(get_collision_count, axis=1)
top10KilledDf['COLLISIONS'] = top10KilledDf.copy().apply(get_collision_count, axis=1)
In [ ]:
collisions.groupby(['LOCATION']).value_counts()
In [ ]:
def calc_collision_inju_ration(e):
return (e['NUMBER OF PERSONS INJURED'] / e.LOCATION)
intersection[intersection.LOCATION > 20].apply(calc_collision_inju_ration, axis=1, raw=True).sort_values()
In [ ]:
collisions[collisions.LOCATION == '(40.6790346, -73.9385077)']
In [ ]:
top10['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
In [ ]:
def causesTop(n):
danger_zone = collisions.LOCATION.value_counts()[n-1:n]
return collisions[collisions.LOCATION.isin(danger_zone.index)]['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
series = {}
for i in range(1,10):
series[i] = causesTop(i)
pd.DataFrame(series).plot(kind='bar', figsize=(15,10), subplots=True)
In [ ]:
numberTop6 = collisions[collisions.LOCATION.isin(collisions.LOCATION.value_counts()[5:6].index)]
In [ ]:
pd.to_datetime(numberTop6.DATE).value_counts().plot(figsize=(15,15))
#pd.Series(range(0,numberTop6['NUMBER OF PERSONS INJURED'].size), index=pd.to_datetime(numberTop6.DATE)).plot(kind='bar',figsize=(15,15))
In [ ]:
numberTop6.size
In [ ]:
top10
In [ ]:
#top10['HOUR'] = top10.TIME.str.split(':').str.get(0).astype('int')
#top1['HOUR'] = top1.TIME.str.split(':').str.get(0).astype('int')
collisions['HOUR'] = collisions.TIME.str.split(':').str.get(0).astype('int')
In [ ]:
top10.HOUR.value_counts().sort_index().plot(figsize=(15,10), kind='bar')
In [ ]:
top1.HOUR.value_counts().sort_index().plot(figsize=(15,10), kind='bar')
In [ ]:
top10.HOUR.value_counts().index
In [ ]:
top1 = collisions[collisions.LOCATION.isin(collisions.LOCATION.value_counts()[0:1].index)]
In [ ]:
series = {}
for r in collisions['CONTRIBUTING FACTOR VEHICLE 1'].unique():
series[r] = collisions[collisions['CONTRIBUTING FACTOR VEHICLE 1'] == r].HOUR.value_counts().sort_index()
series['Combined'] = collisions.HOUR.value_counts().sort_index()
pd.DataFrame(series).plot(figsize=(15,100), subplots=True)
In [ ]:
collisions['CONTRIBUTING FACTOR VEHICLE 1'].unique()
In [ ]:
collisions
In [ ]:
collisions.['']