In [2]:
import pandas as pd
import numpy as np
In [246]:
crashes = pd.read_csv('../crash_data/crashes.csv', header=0,  # the file carries its own header row
                      names=['DATE', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
                             'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME',
                             'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
                             'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
                             'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
                             'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
                             'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
                             'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
                             'CONTRIBUTING FACTOR VEHICLE 5', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
                             'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
                             'VEHICLE TYPE CODE 5'])
crashes.head()
Out[246]:
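Since crashes.csv carries its own header row, it is worth confirming that header=0 kept that row out of the data. A minimal sanity check (nothing assumed beyond the columns named above):
In [ ]:
# sanity check: header text should not appear as data, and counts should be numeric
print crashes.shape
print crashes['NUMBER OF PERSONS INJURED'].dtype  # expect a numeric dtype, not object
assert 'BOROUGH' not in crashes['BOROUGH'].values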
In [245]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
# CLEAN CRASH DATA SET
crashes = open('../crash_data/crashes.csv', 'r')
heading = crashes.readline().strip().split(',')  # skip the header line
w = open('../crash_data/crashes_clean.csv', 'w')

# Example: METROPOLITAN AVENUE and 74 AVENUE
def latlon_geocode(latlon):
    r = requests.get('http://nominatim.openstreetmap.org/search?q='+latlon+'&addressdetails=1&format=json')
    return r.json()

def address_geocode(address):
    r = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address='+address)
    return r.json()

# go through each line and clean it
count = 0
for crash in crashes:
    time.sleep(1)
    if count > 4:  # debug limiter: only the first five rows for now
        break
    count += 1
    # parse the line
    line = crash.strip().split(',')
    print line[:6],
    bor = line[2]
    zip_ = line[3]
    lat = line[4]
    lon = line[5]
    st1 = line[6]
    st2 = line[7]
    st3 = line[8]
    # fill in missing location info if needed
    try:
        if bor != '' and zip_ != '' and st1 != '' and st2 != '':  # this line is okay
            print 'okay'
        elif lat != '' and lon != '':
            print 'lat/lon geocoding'
            geo = latlon_geocode(lat+','+lon)
            bor = geo[0]['address']['county'][:geo[0]['address']['county'].index(' ')]
            zip_ = geo[0]['address']['postcode']
            st1 = geo[0]['address']['road']
        elif st1 != '' and st2 != '':
            print 'st1/st2 geocoding'
            geo = address_geocode(st1+' and '+st2)
            bor = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'sublocality' in ac['types']][0]
            zip_ = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'postal_code' in ac['types']][0]
            lat = geo['results'][0]['geometry']['location']['lat']
            lon = geo['results'][0]['geometry']['location']['lng']
            #neigh = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'neighborhood' in ac['types']][0]
        elif st3 != '':
            print 'st3 geocoding'
            geo = address_geocode(st3)
            bor = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'sublocality' in ac['types']][0]
            zip_ = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'postal_code' in ac['types']][0]
            lat = geo['results'][0]['geometry']['location']['lat']
            lon = geo['results'][0]['geometry']['location']['lng']
            #neigh = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'neighborhood' in ac['types']][0]
        else:
            print "no location details!"
    except Exception as e:
        print "exception:", e
    # replace line entries
    line[2] = bor.upper()
    line[3] = zip_
    line[4] = str(lat)
    line[5] = str(lon)
    line[6] = st1.upper()
    line[7] = st2.upper()
    w.write(','.join(line)+'\n')
crashes.close()
w.close()
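Nominatim and the Google geocoder both rate-limit and intermittently fail, so a full pass over the file (rather than the five-row test above) benefits from retries. A minimal sketch built on the latlon_geocode/address_geocode helpers above; the retry count and backoff values are illustrative:
In [ ]:
# retry wrapper for flaky geocode calls (sketch; retries/backoff are illustrative)
def geocode_with_retry(geocode_fun, query, retries=3, backoff=2.0):
    for attempt in range(retries):
        try:
            return geocode_fun(query)
        except Exception as e:
            print 'geocode attempt', attempt + 1, 'failed:', e
            time.sleep(backoff * (attempt + 1))  # wait a little longer each attempt
    return None  # caller must handle a failed lookup

# usage: geo = geocode_with_retry(latlon_geocode, lat + ',' + lon)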
In [247]:
# add index for intersection
crashes['INTERSECTION'] = crashes['LATITUDE'].apply(str) + "_" + crashes['LONGITUDE'].apply(str)
crashes['CRASHES'] = 1
print crashes.columns
crashes.head(n=5)
Out[247]:
In [257]:
# GROUP CRASHES BY INTERSECTION: SUM THE COUNT COLUMNS, KEEP A REPRESENTATIVE VALUE FOR THE REST
gb = crashes.groupby(by='INTERSECTION', as_index=False)
longest = lambda col: col.get(col.map(lambda v: len(str(v))).idxmax())  # value with the longest string form
mirror = lambda col: col.iloc[0]  # first value in the group
agg_fun = {u'BOROUGH': longest,
           u'ZIP CODE': mirror,
           u'LATITUDE': mirror,
           u'LONGITUDE': mirror,
           u'ON STREET NAME': longest,
           u'CROSS STREET NAME': longest,
           u'OFF STREET NAME': longest,
           u'NUMBER OF PERSONS INJURED': np.sum,
           u'NUMBER OF PERSONS KILLED': np.sum,
           u'NUMBER OF PEDESTRIANS INJURED': np.sum,
           u'NUMBER OF PEDESTRIANS KILLED': np.sum,
           u'NUMBER OF CYCLIST INJURED': np.sum,
           u'NUMBER OF CYCLIST KILLED': np.sum,
           u'NUMBER OF MOTORIST INJURED': np.sum,
           u'NUMBER OF MOTORIST KILLED': np.sum,
           'CRASHES': np.sum
           }
groupby = gb.aggregate(agg_fun)  # aggregate

# REMOVE ANY INTERSECTIONS WITH BAD GEOCODING -- lat is north/south, lon is east/west
# define a bounding box around New York City
top_lat = 40.935206
bot_lat = 40.488640
right_lon = -73.699619
left_lon = -74.261982
intersections = groupby[(groupby.LONGITUDE > left_lon) & (groupby.LONGITUDE < right_lon) &
                        (groupby.LATITUDE > bot_lat) & (groupby.LATITUDE < top_lat)]
intersections.to_csv('../crash_data/intersections.csv', index=False)  # save to csv
intersections.sort_values('INTERSECTION').head()
Out[257]:
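To make the longest aggregator concrete, here is what it selects on a toy series (the values are made up):
In [ ]:
# toy demonstration of the 'longest' aggregator (values are made up)
s = pd.Series(['AVE', 'METROPOLITAN AVENUE', ''])
print longest(s)  # 'METROPOLITAN AVENUE' -- the value with the longest string form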
In [256]:
# Select only those intersections that fall within the New York bounding box
# lat is north/south, lon is east/west
top_lat = 40.935206
bot_lat = 40.488640
right_lon = -73.699619
left_lon = -74.261982
intersections = pd.read_csv('../crash_data/intersections.csv')
print 'intersections currently on disk:', len(intersections)
print 'initial number of intersections:', len(groupby)
new_intersections = groupby[(groupby.LONGITUDE > left_lon) & (groupby.LONGITUDE < right_lon) &
                            (groupby.LATITUDE > bot_lat) & (groupby.LATITUDE < top_lat)]
print 'new intersections:', len(new_intersections)
new_intersections.sort_values('INTERSECTION').head()
Out[256]:
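The same bounding box is spelled out twice above, so it could be factored into a helper; a sketch (the function name is mine, the box is the one used above):
In [ ]:
# reusable NYC bounding-box filter (helper name is mine)
def in_nyc_bbox(df, top_lat=40.935206, bot_lat=40.488640,
                right_lon=-73.699619, left_lon=-74.261982):
    return df[(df.LONGITUDE > left_lon) & (df.LONGITUDE < right_lon) &
              (df.LATITUDE > bot_lat) & (df.LATITUDE < top_lat)]

# usage: new_intersections = in_nyc_bbox(groupby)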
In [93]:
#!!! THIS PART HAS BEEN MADE OBSOLETE !!!
# CREATE COUNTS FILE, SAVE, AND THEN MERGE
#counts = pd.DataFrame(gb.size())
#counts.to_csv('groupby_counts_junk.csv') # save counts to csv
#read csvs and merge
#groupby = pd.read_csv('groupby_agg_junk.csv')
#counts = pd.read_csv('groupby_counts_junk.csv', names=["INTERSECTION", "CRASHES"])
#merge together and clean up
#df = groupby.merge(counts, on="INTERSECTION")
#df.drop(df.columns[:1], axis=1, inplace=True) # remove indexing column
#df = df[df["INTERSECTION"]!="nan_nan"] # eliminating all intersections with lat/lon ~80k crashes
#df.rename(columns={u"0":"CRASHES"}, inplace=True) # rename "0" to "CRASHES"
#df.to_csv('crash_data/intersections.csv', index=False)
In [ ]:
# ADD YELP COLUMN TO INTERSECTIONS
import pandas as pd
import numpy as np
import time
import oauth2
import requests
def number_businesses(intersection):
    """
    Queries the Yelp API for the number of businesses
    around a given lat and lon.
    """
    split = intersection.split('_')
    if len(split) < 2:
        return 0
    lat, lon = map(float, split)
    # radius is picked up from module scope (set below the function)
    # OAuth credentials from Yelp
    CONSUMER_KEY = 'T-urieHnNUVui5RcKb1XyQ'
    CONSUMER_SECRET = '7rpMeGcyngvXcmXbNuMlGq1VqwU'
    TOKEN = 'byRjsqCx1h7lHuu0xAq32L8WB2pEWQHR'
    TOKEN_SECRET = '43aSvtErO0nH7YO5Rmg8XSWYuvU'
    # build and sign the request
    url = 'http://api.yelp.com/v2/search?term=FOOD&ll={0},{1}&radius_filter={2}'.format(lat, lon, radius)
    consumer = oauth2.Consumer(CONSUMER_KEY, CONSUMER_SECRET)
    oauth_request = oauth2.Request(method="GET", url=url, parameters={})
    oauth_request.update(
        {
            'oauth_nonce': oauth2.generate_nonce(),
            'oauth_timestamp': oauth2.generate_timestamp(),
            'oauth_token': TOKEN,
            'oauth_consumer_key': CONSUMER_KEY
        }
    )
    token = oauth2.Token(TOKEN, TOKEN_SECRET)
    oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(), consumer, token)
    signed_url = oauth_request.to_url()
    try:
        r = requests.get(signed_url)
        #print r.json()
        time.sleep(1.0)
        print lat, lon, len(r.json()['businesses'])
        return len(r.json()[u'businesses'])
    except Exception as e:
        print lat, lon, "ERROR!", e
        return -1

intersections = pd.read_csv('crash_data/intersections.csv')
radius = 80  # approx 1/2 block in meters
intersections['YELP'+str(radius)] = intersections["INTERSECTION"].apply(number_businesses)
#intersections.head(n=10)
intersections.to_csv('crash_data/intersections.csv', index=False)
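number_businesses returns -1 on any API error, so failed rows can be retried in place without re-querying the successes. A minimal sketch, assuming the YELP80 column written above:
In [ ]:
# second pass: retry only the rows that errored (-1) on the first run (sketch)
failed = intersections['YELP' + str(radius)] == -1
print failed.sum(), 'rows to retry'
intersections.loc[failed, 'YELP' + str(radius)] = \
    intersections.loc[failed, 'INTERSECTION'].apply(number_businesses)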
In [17]:
# JOIN THE YELP COLUMNS ONTO THE INTERSECTIONS TABLE
import glob
files = sorted(glob.glob('../*.csv'))  # one csv per Yelp run; the last column holds the counts
intersections = pd.read_csv('../crash_data/intersections.csv')
for fn in files:
    yelp_intersections = pd.read_csv(fn)
    new_column = yelp_intersections.columns[-1]
    intersections = intersections.merge(yelp_intersections[["INTERSECTION", new_column]], on="INTERSECTION")
#intersections.drop("YELP80", axis=1, inplace=True)
intersections.to_csv('../crash_data/intersections.csv', index=False)
intersections.head()
Out[17]:
In [ ]:
# ADD FOUR SQUARE COLUMNS
import pandas as pd
import numpy as np
import time
import requests
# READ IN INTERSECTION DATA
intersections = pd.read_csv('crash_data/intersections.csv')
def number_businesses(intersection):
    """
    Queries the Foursquare API for the total check-in count
    of venues around a given lat and lon.
    """
    split = intersection.split('_')
    if len(split) < 2:
        return 0
    lat, lon = map(float, split)
    # radius and term are picked up from module scope (set below the function)
    # credentials from Foursquare (fill in before running)
    client_id = ""
    client_secret = ""
    # build the request
    url = 'https://api.foursquare.com/v2/venues/search?limit=50&client_id={0}&client_secret={1}&ll={2},{3}&radius={4}&query={5}'
    url = url.format(client_id, client_secret, lat, lon, radius, term)
    try:
        time.sleep(0.5)
        r = requests.get(url)
        j = r.json()
        checkins = sum([v['stats']['checkinsCount'] for v in j['response']['venues']])
        print lat, lon, checkins
        return checkins
    except Exception as e:
        print lat, lon, "ERROR!", e, "|", url, "|", r.json()
        return -1

radius = 80  # approx 1/2 block in meters
term = 'FOOD'
intersections['4SQ_'+term+'_'+str(radius)] = intersections["INTERSECTION"].apply(number_businesses)
#intersections.head(n=10)
intersections.to_csv('intersections_4SQ_'+term+str(radius)+'.csv', index=False)
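Because number_businesses reads term and radius from module scope and the output filename encodes both, the same cell can be swept over several combinations; a sketch (the extra terms are illustrative, not ones used above):
In [ ]:
# sweep several term/radius combinations (extra terms are illustrative)
for term in ['FOOD', 'BARS', 'SHOPS']:
    for radius in [80, 160]:
        col = '4SQ_' + term + '_' + str(radius)
        intersections[col] = intersections['INTERSECTION'].apply(number_businesses)
        intersections.to_csv('intersections_4SQ_' + term + str(radius) + '.csv', index=False)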
In [274]:
# ADD SPEED HUMP COLUMN TO INTERSECTIONS
# assumes the speed hump table is already loaded, e.g.:
# humps = pd.read_csv('crash_data/humps.csv')  # path is illustrative
def speed_hump(st):
    """
    Check the humps table for this intersection's pair of
    streets, in either order.
    """
    return not humps[(st["CROSS STREET NAME"] == humps.FIRST_Main) & (st["ON STREET NAME"] == humps.FIRST_ToSt)].empty or \
           not humps[(st["ON STREET NAME"] == humps.FIRST_Main) & (st["CROSS STREET NAME"] == humps.FIRST_ToSt)].empty

intersections = pd.read_csv('crash_data/intersections.csv')
intersections['SPEED HUMP'] = intersections[["ON STREET NAME", "CROSS STREET NAME"]].apply(speed_hump, axis=1)
#intersections[['ON STREET NAME','CROSS STREET NAME','SPEED HUMP']].head(n=25)
intersections.to_csv('crash_data/intersections.csv', index=False)
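The hump lookup relies on exact string equality, so stray whitespace or case differences in either table silently yield False. A normalization sketch worth running before the apply above (column names follow the humps table as used there):
In [ ]:
# normalize street names on both sides before matching (sketch)
for col in ['ON STREET NAME', 'CROSS STREET NAME']:
    intersections[col] = intersections[col].astype(str).str.strip().str.upper()
for col in ['FIRST_Main', 'FIRST_ToSt']:
    humps[col] = humps[col].astype(str).str.strip().str.upper()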
In [24]:
# READ IN INTERSECTIONS TO CHECK
intersections = pd.read_csv('../crash_data/clean_intersections.csv')
#intersections = intersections[intersections.name != "0.0_0.0"]
intersections = intersections[intersections.name != "44.686796_-75.5084676"]  # a point far outside NYC
intersections = intersections.sort_values("name", ascending=False)
print len(intersections)
#intersections[intersections.name=="44.686796_-75.5084676"].head(n=10)
intersections.to_csv('../crash_data/clean_intersections.csv', index=False)
intersections.head()
Out[24]:
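Rather than dropping stray points one at a time, the name column (lat_lon) can be parsed back into coordinates and re-checked against the same NYC bounding box used earlier; a sketch:
In [ ]:
# re-check every intersection against the NYC bounding box via its lat_lon name (sketch)
latlon = intersections['name'].str.split('_', expand=True).astype(float)
in_box = ((latlon[0] > 40.488640) & (latlon[0] < 40.935206) &
          (latlon[1] > -74.261982) & (latlon[1] < -73.699619))
print (~in_box).sum(), 'intersections still outside the box'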
In [ ]: