In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))
Out[2]:
In [6]:
import geocoder
import csv
location = [43.65501,-79.42021]
In [13]:
from numpy import random
from scipy.spatial import distance
In [14]:
# Bike-count stations (lat, lon) and their observed bike counts for three
# areas with dedicated counts: campus, waterfront, and Castle Frank.
campus = [[43.657946,-79.39993],
[43.663502,-79.40005],
[43.663051,-79.402196],
[43.665429,-79.398975]
]
campus_dict = {(43.657946,-79.39993):4495.8,
(43.663502,-79.400050):2653,
(43.663051,-79.402196):3574,
(43.665429,-79.398975):2304
}
waterfront = [[43.648208,-79.370923],
[43.642711,-79.394043],
[43.639944,-79.387032],
[43.640625,-79.3932],
[43.640093,-79.380152]
]
waterfront_dict = {(43.648208,-79.370923):330,
(43.642711,-79.394043):745,
(43.639944,-79.387032):128,
(43.640625,-79.3932):154,
(43.640093,-79.380152):235
}
castleFrank = [[43.673792,-79.368187]]
castleFrank_dict = {(43.673792,-79.368187):3413}
In [15]:
col_df = pd.read_csv('toronto_cycling_central_mode.csv', index_col='id')
col_df = col_df.sort_index()
col_df.head()
Out[15]:
In [102]:
coords = central_col_df[["stname1","stname_2","street_1_type","street_2_type"]]
coords = coords.values.tolist()
coordinate_list = []
for pair in coords:
    # Put the two cross streets in alphabetical order so every collision at
    # the same intersection produces one canonical name.
    if isinstance(pair[1], str):
        if pair[1] <= pair[0]:
            pair[0], pair[1] = pair[1], pair[0]
            pair[2], pair[3] = pair[3], pair[2]
for pair in coords:
    coordinate_list.append(str(pair[0]) + ' ' + str(pair[2]) + ' & ' + str(pair[1]) + ' ' + str(pair[3]) + ', Toronto')
In [ ]:
coords = col_df[["STNAME1","STNAME 2","STET 1 TYPE","STET 2 TYPE"]]
coords = coords.values.tolist()
coordinate_list = []
for pair in coords:
    # Same alphabetical normalization as above, applied to the raw column names.
    if isinstance(pair[1], str):
        if pair[1] <= pair[0]:
            pair[0], pair[1] = pair[1], pair[0]
            pair[2], pair[3] = pair[3], pair[2]
for pair in coords:
    coordinate_list.append(str(pair[0]) + ' ' + str(pair[2]) + ' & ' + str(pair[1]) + ' ' + str(pair[3]) + ', Toronto')
In [16]:
ped_counts_df = pd.read_csv('Vehicle and Pedestrian Counts/TrafficPedestrianVolumes_2011.csv', index_col=0)
Using the power fit for the bike/pedestrian ratio, we get a function that predicts the bike volume at any single intersection from its pedestrian count.
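Written out, the power fit gives the ratio $r(P) = 500.2146799711\,P^{-0.8950759596}$, with $P$ the 8-hour pedestrian volume. Multiplying back by $P$, as the cell below does, the exponents combine into a single power law:

$$\mathrm{bikes}(P) = 500.2146799711\,P^{-0.8950759596}\cdot P = 500.2146799711\,P^{0.1049240404}$$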
In [17]:
ped_counts_df['bike_prediction'] = (500.2146799711*ped_counts_df['8HrPedVol']**(-0.8950759596))*ped_counts_df['8HrPedVol']
In [18]:
ped_counts_df.head()
Out[18]:
In [19]:
ped_coords = ped_counts_df[['Latitude','Longitude']]
ped_coords = ped_coords.replace(np.nan,0)
ped_coordinate_list = ped_coords.values.tolist()
In [20]:
ped_counts_df['coordinates'] = ped_counts_df[['Latitude','Longitude']].apply(tuple, axis=1)
ped_counts_df.head()
Out[20]:
In [21]:
ped_dict = ped_counts_df.set_index('coordinates').to_dict()['bike_prediction']
In [22]:
col_df['coordinates'] = col_df[['lat','long']].apply(tuple, axis=1)
col_df.head()
Out[22]:
In [23]:
central_col_df = col_df.copy()  # copy so the column additions below don't also modify col_df
In [38]:
# Only run after the cell above has already produced its output;
# otherwise the file doesn't exist yet.
import csv
with open('closest_traffic_count.csv', 'r') as f:
    reader = csv.reader(f)
    closest_traffic_count = list(reader)
In [24]:
import matplotlib.path as mplPath
import numpy as np
waterfront_Path = mplPath.Path(np.array([[43.635497, -79.398156],
[43.639000, -79.400725],
[43.640822, -79.401427],
[43.646984, -79.376977],
[43.649889, -79.370343],
[43.651614, -79.362725],
[43.648090, -79.361191],
[43.646451, -79.361937],
[43.641209, -79.376739],
[43.639969, -79.379965],
[43.637698, -79.391847],
[43.635666, -79.398368],
[43.636489, -79.399603]]))
# Sanity check: a point far outside the polygon should return False.
waterfront_Path.contains_point((200, 100))
Out[24]:
In [25]:
campus_Path = mplPath.Path(np.array([[43.659838, -79.399772],
[43.661388, -79.401006],
[43.665592, -79.402705],
[43.666768, -79.401354],
[43.668213, -79.393958],
[43.663141, -79.392719],
[43.659264, -79.394100],
[43.658329, -79.398204]]
))
# Sanity check: a point inside the campus polygon should return True.
campus_Path.contains_point((43.661013, -79.397200))
Out[25]:
In [26]:
castleFrank_Path = mplPath.Path(np.array([[43.672105, -79.376696],
[43.671562, -79.370962],
[43.674418, -79.366821],
[43.676086, -79.358731],
[43.677056, -79.354021],
[43.677040, -79.355126],
[43.677622, -79.358516],
[43.676194, -79.359503],
[43.675170, -79.364760],
[43.674580, -79.367539],
[43.672019, -79.371112],
[43.672710, -79.376927]]
))
# Sanity check: a point inside the Castle Frank polygon should return True.
castleFrank_Path.contains_point((43.676120, -79.359035))
Out[26]:
In [27]:
import csv
# For each collision, find the nearest count location. Points inside one of
# the three bike-count polygons get the observed bike count for that area;
# all other points fall back to the nearest pedestrian-count intersection
# and its predicted bike volume.
closest_traffic_point = []
bike_traffic = []
for i in range(len(central_col_df)):
    point = central_col_df['coordinates'].iloc[i]
    if waterfront_Path.contains_point(point):
        closest = waterfront[distance.cdist([point], waterfront).argmin()]
        closest_traffic_point.append(tuple(closest))
        bike_traffic.append(waterfront_dict[tuple(closest)])
    elif campus_Path.contains_point(point):
        closest = campus[distance.cdist([point], campus).argmin()]
        closest_traffic_point.append(tuple(closest))
        bike_traffic.append(campus_dict[tuple(closest)])
    elif castleFrank_Path.contains_point(point):
        closest = castleFrank[distance.cdist([point], castleFrank).argmin()]
        closest_traffic_point.append(tuple(closest))
        bike_traffic.append(castleFrank_dict[tuple(closest)])
    else:
        closest = ped_coordinate_list[distance.cdist([point], ped_coordinate_list).argmin()]
        closest_traffic_point.append(tuple(closest))
        bike_traffic.append(ped_dict[tuple(closest)])

with open('closest_intersection.csv', 'w') as myfile3:
    wr = csv.writer(myfile3)
    wr.writerow(closest_traffic_point)
with open('closest_int_bike_predictions.csv', 'w') as myfile4:
    wr = csv.writer(myfile4)
    wr.writerow(bike_traffic)
In [28]:
bike_traffic[:10]
Out[28]:
In [29]:
central_col_df['closest_traffic'] = tuple(closest_traffic_point)
central_col_df['traffic_count'] = bike_traffic
# Note: rename() returns a copy and the result isn't assigned, so the original
# column names are kept; the cells below still refer to 'traffic_count'.
central_col_df.rename(columns={'closest_traffic': 'closest_ped_count', 'traffic_count': 'predicted_bike_count'})
central_col_df.head()
Out[29]:
In [41]:
intersection_df = central_col_df.groupby('intersection').agg(
    {'traffic_count': [np.size, np.mean]}
).sort_values(by=[('traffic_count', 'mean')], ascending=False)
intersection_df.head(10)
Out[41]:
In [40]:
intersection_df = central_col_df.groupby('intersection').agg(
    {'traffic_count': [np.size, np.mean]}
).sort_values(by=[('traffic_count', 'size')], ascending=False)
intersection_df.head(10)
Out[40]:
In [60]:
# Normalized yearly accident rate: total collisions / (24 * estimated traffic).
intersection_df['traffic_count', 'normalized_accident_rate'] = (
    intersection_df['traffic_count', 'size'] / (24 * intersection_df['traffic_count', 'mean'])
)
intersection_df = intersection_df.sort_values(
    by=[('traffic_count', 'normalized_accident_rate')], ascending=False)
intersection_df.head(20)
Out[60]:
In [61]:
df_test = intersection_df['traffic_count','normalized_accident_rate']
df = df_test.to_frame(name='normalized yearly accident rate')
df['total collisions'] = intersection_df['traffic_count','size']
df['traffic estimate'] = intersection_df['traffic_count','mean']
df = df.dropna()
len(df)
Out[61]:
In [75]:
sns.set_context("notebook", font_scale=1.1)
scatter = sns.jointplot(x='total collisions', y='normalized yearly accident rate',
                        data=df, ylim=(0, 0.005))
In [47]:
intersection_df.to_csv('totals_test.csv')
df.head(10)
Out[47]:
In [52]:
import scipy.stats as stats
In [49]:
sns.distplot(df["traffic estimate"], bins = 100)
Out[49]:
In [35]:
sns.distplot(df["total collisions"], bins = 15)
Out[35]:
In [82]:
sns.distplot(df["total collisions"], fit=stats.gamma, bins=range(0, 75, 1), kde=False)
Out[82]:
A gamma distribution looks like a good choice. We are dealing with sums of Poisson distributions: each intersection is essentially its own Poisson process, with its mean yearly accident count as the parameter. The histogram above is therefore summing up the number of accidents each year over these Poisson processes. While we could try to fit each intersection individually, that would be a great deal of work, and it's not clear how those per-intersection estimates would combine into a prior distribution for the ensemble.
A gamma distribution also makes intuitive sense, since the conjugate prior for a Poisson rate is a gamma. Reading up on how an ensemble of Poisson processes behaves when each has its own parameter (rather than a shared one) hasn't led me to a principled reason to use a gamma for the whole ensemble, but it's a nice fit, as the quick simulation sketched below also suggests.
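As a rough check on that intuition, here is a minimal simulation sketch (not part of the original analysis; the intersection count and the gamma shape/scale below are made-up illustrative values, not fitted to the Toronto data). It draws a rate for each intersection from a gamma distribution, draws accident totals from Poisson processes with those rates, and overlays a fitted gamma on the simulated ensemble. Strictly speaking, a gamma-mixed Poisson is negative binomial rather than gamma, but the fitted gamma tends to track the histogram closely.
In [ ]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)

# Illustrative (not fitted) values: number of intersections, years of data,
# and gamma parameters for the per-intersection accident rates.
n_intersections = 1000
years = 24
rates = rng.gamma(shape=2.0, scale=3.0, size=n_intersections)

# Each intersection is its own Poisson process; accumulate totals over the period.
totals = rng.poisson(lam=rates * years)

# Fit a gamma to the simulated ensemble and compare it to the histogram.
a, loc, scale = stats.gamma.fit(totals)
x = np.linspace(totals.min(), totals.max(), 200)
plt.hist(totals, bins=50, density=True, alpha=0.5, label='simulated totals')
plt.plot(x, stats.gamma.pdf(x, a, loc=loc, scale=scale), label='gamma fit')
plt.legend()
plt.show()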
In [80]:
stats.gamma.fit(df['total collisions'])
Out[80]:
In [ ]: