In [2]:
import pandas as pd
import numpy as np
In [4]:
crashes = pd.read_csv('../crash_data/crashes.csv')
crashes.head()
Out[4]:
In [245]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
# CLEAN CRASH DATA SET
crashes = open('../crash_data/crashes.csv', 'r')
heading = crashes.readline().strip().split(',') # read the header line so the loop below skips it
w = open('../crash_data/crashes_clean.csv', 'w')
w.write(','.join(heading) + '\n') # keep the header row in the cleaned file
# Example: METROPOLITAN AVENUE and 74 AVENUE
def latlon_geocode(latlon):
    r = requests.get('http://nominatim.openstreetmap.org/search?q='+latlon+'&addressdetails=1&format=json')
    return r.json()
def address_geocode(address):
    r = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address='+address)
    return r.json()
# go through each line and clean it
count = 0
for crash in crashes:
    time.sleep(1) # be polite to the geocoding APIs
    if count > 4: # only clean the first few rows while testing
        break
    count += 1
    # parse the line (assumes no quoted fields with embedded commas)
    line = crash.strip().split(',')
    print line[:6],
    bor = line[2]
    zip_ = line[3]
    lat = line[4]
    lon = line[5]
    st1 = line[6]
    st2 = line[7]
    st3 = line[8]
    # find missing info if needed
    try:
        if bor != '' and zip_ != '' and st1 != '' and st2 != '': # this line is okay
            print 'okay'
        elif lat != '' and lon != '':
            print 'lat/lon geocoding'
            geo = latlon_geocode(lat+','+lon)
            # county comes back as e.g. "Queens County"; keep only the first word
            bor = geo[0]['address']['county'][:geo[0]['address']['county'].index(' ')]
            zip_ = geo[0]['address']['postcode']
            st1 = geo[0]['address']['road']
        elif st1 != '' and st2 != '':
            print 'st1/st2 geocoding'
            geo = address_geocode(st1+' and '+st2)
            bor = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'sublocality' in ac['types']][0]
            zip_ = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'postal_code' in ac['types']][0]
            lat = geo['results'][0]['geometry']['location']['lat']
            lon = geo['results'][0]['geometry']['location']['lng']
            #neigh = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'neighborhood' in ac['types']][0]
        elif st3 != '':
            print 'st3 geocoding'
            geo = address_geocode(st3)
            bor = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'sublocality' in ac['types']][0]
            zip_ = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'postal_code' in ac['types']][0]
            lat = geo['results'][0]['geometry']['location']['lat']
            lon = geo['results'][0]['geometry']['location']['lng']
            #neigh = [ac['short_name'] for ac in geo['results'][0]['address_components'] if 'neighborhood' in ac['types']][0]
        else:
            print "no location details!"
    except Exception as e:
        print "exception:", e
    # replace line entries
    line[2] = bor.upper()
    line[3] = zip_
    line[4] = str(lat)
    line[5] = str(lon)
    line[6] = st1.upper()
    line[7] = st2.upper()
    w.write(','.join(line)+'\n')
crashes.close()
w.close()
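In [ ]:
# Sketch (not part of the original run): the Nominatim fields the cleaner relies
# on can be missing, so pulling them with dict.get avoids tripping the bare
# except above. The coordinate is an arbitrary NYC point.
geo = latlon_geocode('40.7128,-74.0060')
if geo:
    addr = geo[0].get('address', {})
    print addr.get('county', '').split(' ')[0], addr.get('postcode', ''), addr.get('road', '')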
In [247]:
# add index for intersection
# NB: `crashes` must be the DataFrame from the read_csv cell above, not the file handle
crashes['INTERSECTION'] = crashes['LATITUDE'].astype(str) + "_" + crashes['LONGITUDE'].astype(str)
crashes['CRASHES'] = 1
print crashes.columns
crashes.head(n=5)
Out[247]:
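In [ ]:
# Sketch: the "lat_lon" key built above is split back into floats by the
# yelp/foursquare lookups below, so it should round-trip cleanly.
key = crashes['INTERSECTION'].iloc[0]
lat, lon = map(float, key.split('_'))
print key, '->', lat, lon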
In [257]:
# GROUP CRASHES BY INTERSECTION, SUMMING COUNTS AND KEEPING ONE VALUE FOR THE REST
gb = crashes.groupby(by='INTERSECTION', as_index=False)
longest = lambda s: s.get(s.map(lambda v: len(str(v))).idxmax()) # keep the longest string in the group
mirror = lambda s: s.iloc[0] # keep the first value in the group
agg_fun = {u'BOROUGH': longest,
           u'ZIP CODE': mirror,
           u'LATITUDE': mirror,
           u'LONGITUDE': mirror,
           u'ON STREET NAME': longest,
           u'CROSS STREET NAME': longest,
           u'OFF STREET NAME': longest,
           u'NUMBER OF PERSONS INJURED': np.sum,
           u'NUMBER OF PERSONS KILLED': np.sum,
           u'NUMBER OF PEDESTRIANS INJURED': np.sum,
           u'NUMBER OF PEDESTRIANS KILLED': np.sum,
           u'NUMBER OF CYCLIST INJURED': np.sum,
           u'NUMBER OF CYCLIST KILLED': np.sum,
           u'NUMBER OF MOTORIST INJURED': np.sum,
           u'NUMBER OF MOTORIST KILLED': np.sum,
           'CRASHES': np.sum
           }
groupby = gb.aggregate(agg_fun) # aggregate
# REMOVE ANY INTERSECTIONS WITH BAD GEOCODING -- lat north/south, lon east/west
# define a bounding box around NYC
top_lat = 40.935206
bot_lat = 40.488640
right_lon = -73.699619
left_lon = -74.261982
intersections = groupby[(groupby.LONGITUDE > left_lon) & (groupby.LONGITUDE < right_lon) &
                        (groupby.LATITUDE > bot_lat) & (groupby.LATITUDE < top_lat)]
intersections.to_csv('../crash_data/intersections.csv', index=False) # save to csv
intersections.sort('INTERSECTION').head()
Out[257]:
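In [ ]:
# Sketch illustrating the two aggregators on a toy Series: `longest` keeps the
# most descriptive string in a group, `mirror` just keeps the first value.
s = pd.Series(['QUEENS BLVD', '', 'QUEENS BOULEVARD'])
print longest(s)  # QUEENS BOULEVARD
print mirror(s)   # QUEENS BLVD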
In [256]:
# Select only intersections within the New York bounding box
# lat north/south, lon east/west
top_lat = 40.935206
bot_lat = 40.488640
right_lon = -73.699619
left_lon = -74.261982
intersections = pd.read_csv('../crash_data/intersections.csv')
print 'old number of intersections:', len(intersections)
print 'initial number of intersections:', len(groupby)
new_intersections = groupby[(groupby.LONGITUDE > left_lon) & (groupby.LONGITUDE < right_lon) &
                            (groupby.LATITUDE > bot_lat) & (groupby.LATITUDE < top_lat)]
print 'new intersections:', len(new_intersections)
new_intersections.sort('INTERSECTION').head()
Out[256]:
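In [ ]:
# Sketch: the bounding-box filter appears twice above; a small helper keeps the
# NYC coordinates in one place (same box as used in the cells above).
def in_nyc_bbox(df, top=40.935206, bot=40.488640, right=-73.699619, left=-74.261982):
    return df[(df.LONGITUDE > left) & (df.LONGITUDE < right) &
              (df.LATITUDE > bot) & (df.LATITUDE < top)]
print len(in_nyc_bbox(groupby))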
In [ ]:
# ADD YELP COLUMN TO INTERSECTIONS
import pandas as pd
import numpy as np
import time
import oauth2
import requests
def number_businesses(intersection):
    """
    Queries the yelp api for the number of businesses
    around a given lat and lon.
    """
    split = intersection.split('_')
    if len(split) < 2:
        return 0
    lat, lon = map(float, split)
    # `radius` is read from the module-level variable set below
    # OAuth credentials from Yelp
    CONSUMER_KEY = 'T-urieHnNUVui5RcKb1XyQ'
    CONSUMER_SECRET = '7rpMeGcyngvXcmXbNuMlGq1VqwU'
    TOKEN = 'byRjsqCx1h7lHuu0xAq32L8WB2pEWQHR'
    TOKEN_SECRET = '43aSvtErO0nH7YO5Rmg8XSWYuvU'
    # build and sign the request
    url = 'http://api.yelp.com/v2/search?term=FOOD&ll={0},{1}&radius_filter={2}'.format(lat, lon, radius)
    consumer = oauth2.Consumer(CONSUMER_KEY, CONSUMER_SECRET)
    oauth_request = oauth2.Request(method="GET", url=url, parameters={})
    oauth_request.update(
        {
            'oauth_nonce': oauth2.generate_nonce(),
            'oauth_timestamp': oauth2.generate_timestamp(),
            'oauth_token': TOKEN,
            'oauth_consumer_key': CONSUMER_KEY
        }
    )
    token = oauth2.Token(TOKEN, TOKEN_SECRET)
    oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(), consumer, token)
    signed_url = oauth_request.to_url()
    try:
        r = requests.get(signed_url)
        businesses = r.json()[u'businesses']
        time.sleep(1.0) # stay under the rate limit
        print lat, lon, len(businesses)
        return len(businesses)
    except Exception as e:
        print lat, lon, "ERROR!", e
        return -1
intersections = pd.read_csv('crash_data/intersections.csv')
radius = 80 # approx 1/2 block in meters
intersections['YELP'+str(radius)] = intersections["INTERSECTION"].apply(number_businesses)
#intersections.head(n=10)
intersections.to_csv('crash_data/intersections.csv', index=False)
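In [ ]:
# Sketch (an addition, not part of the original pipeline): number_businesses
# returns -1 on any failure, so a thin retry wrapper smooths over transient
# API errors instead of leaving -1 sentinels in the column.
def with_retries(fn, arg, tries=3, wait=2.0):
    for _ in range(tries):
        result = fn(arg)
        if result >= 0:
            return result
        time.sleep(wait)
    return -1
# usage: intersections['YELP80'] = intersections['INTERSECTION'].apply(
#            lambda i: with_retries(number_businesses, i))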
In [17]:
# JOIN THE YELP COLUMNS
import glob
files = sorted(glob.glob('../*.csv')) # one csv of yelp counts per query
intersections = pd.read_csv('../crash_data/intersections.csv')
for fn in files:
    yelp_intersections = pd.read_csv(fn)
    new_column = yelp_intersections.columns[-1] # the yelp count is the last column
    intersections = intersections.merge(yelp_intersections[["INTERSECTION", new_column]], on="INTERSECTION")
#intersections.drop("YELP80", axis=1, inplace=True)
intersections.to_csv('../crash_data/intersections.csv', index=False)
intersections.head()
Out[17]:
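In [ ]:
# Sketch: merge defaults to an inner join, so any intersection missing from a
# yelp file silently drops out above; a left join would keep it with NaNs, e.g.
# intersections.merge(yelp_intersections[["INTERSECTION", new_column]],
#                     on="INTERSECTION", how='left')
print len(intersections), 'intersections survived the joins'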
In [ ]:
# ADD FOUR SQUARE COLUMNS
import pandas as pd
import numpy as np
import time
import requests
# READ IN INTERSECTION DATA
intersections = pd.read_csv('crash_data/intersections.csv')
def number_businesses(intersection):
    """
    Queries the four square api for the total number of
    checkins around a given lat and lon.
    """
    split = intersection.split('_')
    if len(split) < 2:
        return 0
    lat, lon = map(float, split)
    # creds from four square; `radius` and `term` come from the module level below
    client_id = ""
    client_secret = ""
    # build the request
    url = 'https://api.foursquare.com/v2/venues/search?limit=50&client_id={0}&client_secret={1}&ll={2},{3}&radius={4}&query={5}'
    url = url.format(client_id, client_secret, lat, lon, radius, term)
    try:
        time.sleep(0.5) # stay under the rate limit
        r = requests.get(url)
        j = r.json()
        checkins = sum([v['stats']['checkinsCount'] for v in j['response']['venues']])
        print lat, lon, checkins
        return checkins
    except Exception as e:
        print lat, lon, "ERROR!", e, "|", url, "|", r.json()
        return -1
radius = 80 # approx 1/2 block in meters
term = 'FOOD'
intersections['4SQ_'+term+'_'+str(radius)] = intersections["INTERSECTION"].apply(number_businesses)
#intersections.head(n=10)
intersections.to_csv('intersections_4SQ_'+term+str(radius)+'.csv', index=False)
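In [ ]:
# Sketch: a defensive version of the checkin sum, in case a venue comes back
# without a 'stats' block (an assumption about the response, not observed above).
venues = [{'stats': {'checkinsCount': 12}}, {}]  # toy response fragment
print sum(v.get('stats', {}).get('checkinsCount', 0) for v in venues)  # 12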
In [10]:
# JOIN THE 4SQ COLUMNS
import glob
files = sorted(glob.glob('../*.csv'))[:-1] # the original listing also skipped the last csv
intersections = pd.read_csv('../crash_data/intersections.csv')
for fn in files:
    foursq_intersections = pd.read_csv(fn)
    new_column = foursq_intersections.columns[-1] # the 4sq count is the last column
    intersections = intersections.merge(foursq_intersections[["INTERSECTION", new_column]], on="INTERSECTION")
intersections.drop("INTERSECTION", inplace=True, axis=1) # drop the key only after all joins are done
#intersections.drop("YELP80", axis=1, inplace=True)
intersections.to_csv('../crash_data/intersections.csv', index=False)
intersections.head()
Out[10]:
In [25]:
# RENAME COLUMNS
old_names = ['YELP_BAR_80', 'YELP_DRINK_80', 'YELP_FOOD_80', 'YELP_MUSIC_80', 'YELP_SHOW_80', '4SQ_DRINK_80', '4SQ_FOOD_80']
new_names = ['YELP_BAR', 'YELP_DRINK', 'YELP_FOOD', 'YELP_MUSIC', 'YELP_SHOW', 'FOUR_SQUARE_DRINK', 'FOUR_SQUARE_FOOD']
intersections[new_names] = intersections[old_names]
intersections.drop(old_names, axis=1, inplace=True)
intersections.to_csv('../crash_data/intersections.csv', index=False)
intersections.head()
Out[25]:
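In [ ]:
# Sketch: the copy-then-drop rename above can also be done in one step with
# DataFrame.rename, mapping old column names to new ones.
# intersections.rename(columns=dict(zip(old_names, new_names)), inplace=True)
print dict(zip(old_names, new_names))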
In [274]:
# ADD SPEED HUMP COLUMN TO INTERSECTIONS
# (`humps` is assumed to be the speed-hump table, loaded elsewhere with pd.read_csv)
def speed_hump(st):
    """
    Check the humps table for a speed hump at this
    intersection, matching the street pair in either order.
    """
    return not humps[(st["CROSS STREET NAME"] == humps.FIRST_Main) & (st["ON STREET NAME"] == humps.FIRST_ToSt)].empty or \
           not humps[(st["ON STREET NAME"] == humps.FIRST_Main) & (st["CROSS STREET NAME"] == humps.FIRST_ToSt)].empty
intersections = pd.read_csv('crash_data/intersections.csv')
intersections['SPEED HUMP'] = intersections[["ON STREET NAME", "CROSS STREET NAME"]].apply(speed_hump, axis=1)
#intersections[['ON STREET NAME','CROSS STREET NAME','SPEED HUMP']].head(n=25)
intersections.to_csv('crash_data/intersections.csv', index=False)
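In [ ]:
# Sketch: the hump lookup above needs exact string equality, so normalizing
# case and whitespace on both tables first would make the match less brittle
# (assumes `humps` is the raw speed-hump table).
norm = lambda s: ' '.join(str(s).upper().split())
print norm('  metropolitan   avenue ')  # METROPOLITAN AVENUE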
In [18]:
# READ IN INTERSECTIONS TO CHECK
intersections = pd.read_csv('../crash_data/intersections.csv')
intersections.sort("INTERSECTION", inplace=True, ascending=False)
print len(intersections)
intersections.head(n=10)
Out[18]: