In [ ]:
from __future__ import division
import pandas as pd
import numpy as np
import datetime
from datetime import date
from dateutil.rrule import rrule, DAILY
import geoplotlib as glp
from geoplotlib.utils import BoundingBox, DataAccessObject
pd.set_option('display.max_columns', None)
%matplotlib inline
In [ ]:
start_date = date(2012, 7, 1)
end_date = date(2016, 2, 29)
# data = pd.DataFrame()
frames = []
url_template = 'https://www.wunderground.com/history/airport/KJFK/%s/%s/%s/DailyHistory.html?req_city=New+York&req_state=NY&req_statename=New+York&reqdb.zip=10001&reqdb.magic=4&reqdb.wmo=99999&format=1.csv'
month = ""
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (month != dt.strftime("%m")):
        month = dt.strftime("%m")
        print 'Downloading to memory: ' + dt.strftime("%Y-%m")
    frames.append(pd.read_csv(url_template % (dt.strftime("%Y"), dt.strftime("%m"), dt.strftime("%d"))))
print "Saving data to csv..."
data = pd.concat(frames)
data.to_csv('weather_data_nyc_kjfk.csv', sep=',')
In [ ]:
from datetime import datetime, timedelta
from dateutil import tz
weather = pd.read_csv('datasets/weather_data_nyc_kjfk_clean.csv')
def UTCtoActual(utcDate):
    # Convert the UTC timestamp of a weather observation to local New York time
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    local = datetime.strptime(utcDate.DateUTC, '%Y-%m-%d %H:%M:%S')\
        .replace(tzinfo=from_zone)\
        .astimezone(to_zone)
    # Label the values so apply() yields Year/Month/Day/Hour columns
    return pd.Series([local.year, local.month, local.day, local.hour],
                     index=['Year', 'Month', 'Day', 'Hour'])
#weather['DateActual'] = weather.DateUTC.map()
In [ ]:
weather[['Year', 'Month', 'Day', 'Hour']] = weather.apply(UTCtoActual, axis=1)
weather.to_csv('datasets/weather_data_nyc_kjfk_clean2.csv')
In [ ]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_kjfk_clean2.csv')
weather.head(1)
In [ ]:
weather[(weather.Year == 2015) & (weather.Month == 11) & (weather.Day == 27)]
In [ ]:
features0 = ['Conditions', 'TemperatureC']
features = ['Conditions', 'Precipitationmm', 'TemperatureC', 'VisibilityKm']
def lookup_weather2(year, month, day, hour):
    w = weather[(weather.Year == year) & (weather.Month == month) & (weather.Day == day) & (weather.Hour == hour)]
    return w

def lookup_weather(date, time):
    month = int(date.split('/')[0])
    day = int(date.split('/')[1])
    year = int(date.split('/')[2])
    hour = int(time.split(':')[0])
    d = lookup_weather2(year, month, day, hour).head(1)
    if d.empty:
        # No observation for this exact hour; fall back to the previous or next hour
        dt_back = datetime(year, month, day, hour) - timedelta(hours=1)
        dt_forward = datetime(year, month, day, hour) + timedelta(hours=1)
        d_back = lookup_weather2(dt_back.year, dt_back.month, dt_back.day, dt_back.hour)
        if not d_back.empty:
            return d_back
        d_forward = lookup_weather2(dt_forward.year, dt_forward.month, dt_forward.day, dt_forward.hour)
        if not d_forward.empty:
            return d_forward
    return d
def merge_weather(incident):
    # Look up the weather observation closest to the incident's date and hour
    date = incident.DATE
    time = incident.TIME
    w = lookup_weather(date, time)
    try:
        con = "-"
        temp = "-"
        rainmm = "-"
        viskm = "-"
        if not pd.isnull(w['Conditions'].iloc[0]):
            con = w['Conditions'].iloc[0]
        if not pd.isnull(w['TemperatureC'].iloc[0]):
            temp = w['TemperatureC'].iloc[0]
        if not pd.isnull(w['Precipitationmm'].iloc[0]):
            rainmm = w['Precipitationmm'].iloc[0]
        if not pd.isnull(w['VisibilityKm'].iloc[0]):
            viskm = w['VisibilityKm'].iloc[0]
        return pd.Series([con, rainmm, temp, viskm])
    except:
        # No weather observation could be matched to this incident
        print date + "x" + time
        return pd.Series([None, None, None, None])
#lookup_weather2(2016, 2, 14, 7)
#lookup_weather('03/14/2016', '3:27').values[0]
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = lookup_weather('01/27/2016', '3:27').values[0]
In [ ]:
print "Applying weather data to incidents..."
incidents[features] = incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].apply(merge_weather, axis=1)
print "Saving weather in-riched incident data..."
incidents.to_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv', sep=',')
In [ ]:
incidents[incidents.DATE.str.split('/').str.get(2) == '2016']
In [ ]:
# Read dataset
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv')
# Filter 2016 incidents
incidents = incidents[(incidents.DATE.str.split('/').str.get(2) != '2016')
                      & (pd.notnull(incidents.Conditions))
                      & (incidents.Conditions != "Mist")]
In [ ]:
incidents
In [ ]:
# Distribution of incidents by weather conditions
ys = []
xs = []
for c in incidents.Conditions.unique():
    mask = (incidents.Conditions == c)
    filtered_incidents = incidents[mask]
    ys.append(len(filtered_incidents.index))
    xs.append(c)
df = pd.DataFrame(pd.Series(ys, index=xs, name="Incidents by weather conditions").sort_values())
df.plot(kind='barh', figsize=(8,8))
In [ ]:
df
Now let's try to find out whether any weather conditions cause more incidents than others. We do this by plotting heatmaps to get an idea of the distributions across the NYC area.
In [ ]:
def plot_zip_weather(condition, data):
    # Bar chart of incident counts per zip code for the given weather condition
    ys = []
    xs = []
    for z in data['ZIP CODE'].unique():
        mask = (data['ZIP CODE'] == z)
        filtered_incidents = data[mask]
        ys.append(len(filtered_incidents.index))
        xs.append(z)
    df = pd.DataFrame(pd.Series(ys, index=xs, name="%s incidents by zip code" % condition).sort_values())
    df.plot(kind='barh', figsize=(8,32))

def draw_kde(data):
    # Kernel density heatmap of incident locations, clipped to the data's bounding box
    bbox = BoundingBox(north=data.LATITUDE.max()-0.055,
                       west=data.LONGITUDE.min()+0.055,
                       south=data.LATITUDE.min()-0.055,
                       east=data.LONGITUDE.max()+0.055)
    coords = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    glp.kde(coords, bw=5, cut_below=1e-4)
    glp.set_bbox(bbox)
    glp.inline()

def plot_stuff(conditions, data):
    print "%s conditions" % conditions
    plot_zip_weather(conditions, data)
    draw_kde(data)
snowy = incidents[incidents['Conditions'].str.contains('Snow')]
rainy = incidents[incidents['Conditions'].str.contains('Rain')]
clear = incidents[incidents['Conditions'].str.contains('Clear')]
cloudy = incidents[(incidents['Conditions'].str.contains('Cloud')) | (incidents['Conditions'].str.contains('Overcast'))]
haze = incidents[incidents['Conditions'].str.contains('Haze')]
plot_stuff("Snowy", snowy)
plot_stuff("Rainy", rainy)
plot_stuff("Clear", clear)
plot_stuff("Cloudy", cloudy)
plot_stuff("Hazy", haze)
Finding, per weather condition, the ratio between each borough's share of incidents and the city-wide share.
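To make the normalization used below concrete, here is a minimal sketch with made-up counts (the condition names and numbers are purely illustrative, not taken from the dataset). The per-borough share of incidents under a condition is divided by the city-wide share, so a ratio above 1 means that borough is over-represented for that condition.
In [ ]:
# Illustrative sketch with hypothetical counts (not real data):
# ratio = P(condition | incident in borough) / P(condition | incident overall)
from __future__ import division
from collections import Counter
overall_counts = Counter({'Clear': 800, 'Rain': 150, 'Snow': 50})   # hypothetical city-wide incident counts
borough_counts = Counter({'Clear': 70, 'Rain': 20, 'Snow': 10})     # hypothetical counts for one borough
overall_total = sum(overall_counts.values())
borough_total = sum(borough_counts.values())
for condition in overall_counts:
    p_overall = overall_counts[condition] / overall_total
    p_borough = borough_counts[condition] / borough_total
    # A ratio above 1 means this borough sees relatively more of its
    # incidents under this condition than the city as a whole does
    print '%s: ratio %.2f' % (condition, p_borough / p_overall)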
In [ ]:
from collections import Counter
ConditionIncidentCounter = Counter(incidents.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(incidents)
p_incident
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions.
from collections import Counter
ConditionIncidentCounter = Counter(incidents.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(incidents)
p_incident
# Do the same again but for individual areas of NYC
p_incident_district = {}
l = len(incidents)
for district in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    filtered = incidents[incidents.BOROUGH == district]
    counter = Counter(filtered.Conditions.values)
    p_incident_district[district] = {}
    for k,v in counter.most_common():
        p_incident_district[district][k] = v / len(list(counter.elements()))
p_incident_district
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    ys = []
    xs = []
    for con in incidents.Conditions.unique():
        ys.append(districtCounter[con] / overAllCounter[con])
        xs.append(con)
    return pd.Series(ys, index=xs)

series = {}
for b in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    series[b] = calcRatioForDistrict(p_incident_district[b], p_incident, b)
df = pd.DataFrame(series)
df.plot(kind="bar", subplots=True, figsize=(14,14), layout=(7,2), legend=False, sharey=True)
Let's try to look at zip codes in Manhattan only
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions.
from collections import Counter
borough = incidents[incidents.BOROUGH == 'MANHATTAN']
ConditionIncidentCounter = Counter(borough.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(borough)
p_incident
# Do the same again but for individual zip codes within the borough
p_incident_borough_zip = {}
l = len(borough)
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    filtered = borough[borough['ZIP CODE'] == z]
    counter = Counter(filtered.Conditions.values)
    # z = str(z).split(".")[0]
    p_incident_borough_zip[z] = {}
    for k,v in counter.most_common():
        p_incident_borough_zip[z][k] = v / len(list(counter.elements()))
p_incident_borough_zip
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    ys = []
    xs = []
    for con in incidents.Conditions.unique():
        if con in districtCounter:
            ys.append(districtCounter[con] / overAllCounter[con])
        else:
            ys.append(0)
        xs.append(con)
    return pd.Series(ys, index=xs)

series = {}
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    series[z] = calcRatioForDistrict(p_incident_borough_zip[z], p_incident, z)
df = pd.DataFrame(series)
In [ ]:
df.plot(kind="bar", subplots=True, figsize=(14,100), layout=(50,2), legend=False, sharey=False)
In [ ]:
# Day with the most incidents and how many occurred on it
worst_day = incidents.DATE.value_counts().index[0]
worst_day_count = incidents.DATE.value_counts().iloc[0]
incidents[incidents.DATE == worst_day]
In [ ]: