In [ ]:
from __future__ import division
import pandas as pd
import numpy as np
import datetime
from datetime import date
from dateutil.rrule import rrule, DAILY
import geoplotlib as glp
from geoplotlib.utils import BoundingBox, DataAccessObject
pd.set_option('display.max_columns', None)
%matplotlib inline
In [ ]:
start_date = date(2012, 7, 1)
end_date = date(2016, 2, 29)
# data = pd.DataFrame()
frames = []
url_template = 'https://www.wunderground.com/history/airport/KJFK/%s/%s/%s/DailyHistory.html?req_city=New+York&req_state=NY&req_statename=New+York&reqdb.zip=10001&reqdb.magic=4&reqdb.wmo=99999&format=1.csv'
month = ""
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (month != dt.strftime("%m")):
        month = dt.strftime("%m")
        print 'Downloading to memory: ' + dt.strftime("%Y-%m")
    frames.append(pd.read_csv(url_template % (dt.strftime("%Y"), dt.strftime("%m"), dt.strftime("%d"))))
print "Saving data to csv..."
data = pd.concat(frames)
data.to_csv('weather_data_nyc_kjfk.csv', sep=',')
In [ ]:
from datetime import datetime, timedelta
from dateutil import tz
weather = pd.read_csv('datasets/weather_data_nyc_kjfk_clean.csv')
def UTCtoActual(utcDate):
    # Convert the UTC timestamp of a weather observation to local New York time
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    local = datetime.strptime(utcDate.DateUTC, '%Y-%m-%d %H:%M:%S')\
        .replace(tzinfo=from_zone)\
        .astimezone(to_zone)
    # Label the values so apply() yields Year/Month/Day/Hour columns
    return pd.Series([local.year, local.month, local.day, local.hour],
                     index=['Year', 'Month', 'Day', 'Hour'])
#weather['DateActual'] = weather.DateUTC.map()
In [ ]:
weather[['Year', 'Month', 'Day', 'Hour']] = weather.apply(UTCtoActual, axis=1)
weather.to_csv('datasets/weather_data_nyc_kjfk_clean2.csv')
In [ ]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_kjfk_clean2.csv')
weather.head(1)
In [ ]:
weather[(weather.Year == 2015) & (weather.Month == 11) & (weather.Day == 27)]
In [ ]:
features0 = ['Conditions', 'TemperatureC']
features = ['Conditions', 'Precipitationmm', 'TemperatureC', 'VisibilityKm']
def lookup_weather2(year, month, day, hour):
    w = weather[(weather.Year == year) & (weather.Month == month) & (weather.Day == day) & (weather.Hour == hour)]
    return w

def lookup_weather(date, time):
    month = int(date.split('/')[0])
    day = int(date.split('/')[1])
    year = int(date.split('/')[2])
    hour = int(time.split(':')[0])
    d = lookup_weather2(year, month, day, hour).head(1)
    if d.empty:
        # No observation for this exact hour; fall back to the previous or next hour
        dt_back = datetime(year, month, day, hour) - timedelta(hours=1)
        dt_forward = datetime(year, month, day, hour) + timedelta(hours=1)
        d_back = lookup_weather2(dt_back.year, dt_back.month, dt_back.day, dt_back.hour)
        if not d_back.empty:
            return d_back
        d_forward = lookup_weather2(dt_forward.year, dt_forward.month, dt_forward.day, dt_forward.hour)
        if not d_forward.empty:
            return d_forward
    return d
def merge_weather(incident):
    # Look up the weather observation closest to the incident's date and hour
    date = incident.DATE
    time = incident.TIME
    w = lookup_weather(date, time)
    try:
        con = "-"
        temp = "-"
        rainmm = "-"
        viskm = "-"
        if not pd.isnull(w['Conditions'].iloc[0]):
            con = w['Conditions'].iloc[0]
        if not pd.isnull(w['TemperatureC'].iloc[0]):
            temp = w['TemperatureC'].iloc[0]
        if not pd.isnull(w['Precipitationmm'].iloc[0]):
            rainmm = w['Precipitationmm'].iloc[0]
        if not pd.isnull(w['VisibilityKm'].iloc[0]):
            viskm = w['VisibilityKm'].iloc[0]
        return pd.Series([con, rainmm, temp, viskm])
    except:
        # No weather observation could be matched to this incident
        print date + "x" + time
        return pd.Series([None, None, None, None])
#lookup_weather2(2016, 2, 14, 7)
#lookup_weather('03/14/2016', '3:27').values[0]
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = lookup_weather('01/27/2016', '3:27').values[0]
In [ ]:
print "Applying weather data to incidents..."
incidents[features] = incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].apply(merge_weather, axis=1)
print "Saving weather in-riched incident data..."
incidents.to_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv', sep=',')
In [ ]:
incidents[incidents.DATE.str.split('/').str.get(2) == '2016']
In [ ]:
# Read dataset
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv')
# Filter 2016 incidents
incidents = incidents[(incidents.DATE.str.split('/').str.get(2) != '2016')
                      & (pd.notnull(incidents.Conditions))
                      & (incidents.Conditions != "Mist")]
In [ ]:
incidents
In [ ]:
# Distribution of incidents by weather conditions
ys = []
xs = []
for c in incidents.Conditions.unique():
    mask = (incidents.Conditions == c)
    filtered_incidents = incidents[mask]
    ys.append(len(filtered_incidents.index))
    xs.append(c)
df = pd.DataFrame(pd.Series(ys, index=xs, name="Incidents by weather conditions").sort_values())
df.plot(kind='barh', figsize=(8,8))
In [ ]:
df
Now let's try to find out whether any weather conditions cause more incidents than others. We do this by plotting heatmaps to get an idea of the distributions across the NYC area.
In [ ]:
def plot_zip_weather(condition, data):
    # Bar chart of incident counts per zip code for the given weather condition
    ys = []
    xs = []
    for z in data['ZIP CODE'].unique():
        mask = (data['ZIP CODE'] == z)
        filtered_incidents = data[mask]
        ys.append(len(filtered_incidents.index))
        xs.append(z)
    df = pd.DataFrame(pd.Series(ys, index=xs, name="%s incidents by zip code" % condition).sort_values())
    df.plot(kind='barh', figsize=(8,32))

def draw_kde(data):
    # Kernel density heatmap of incident locations, clipped to the data's bounding box
    bbox = BoundingBox(north=data.LATITUDE.max()-0.055,
                       west=data.LONGITUDE.min()+0.055,
                       south=data.LATITUDE.min()-0.055,
                       east=data.LONGITUDE.max()+0.055)
    coords = {'lat': data.LATITUDE.values.tolist(), 'lon': data.LONGITUDE.values.tolist()}
    glp.kde(coords, bw=5, cut_below=1e-4)
    glp.set_bbox(bbox)
    glp.inline()

def plot_stuff(conditions, data):
    print "%s conditions" % conditions
    plot_zip_weather(conditions, data)
    draw_kde(data)
snowy = incidents[incidents['Conditions'].str.contains('Snow')]
rainy = incidents[incidents['Conditions'].str.contains('Rain')]
clear = incidents[incidents['Conditions'].str.contains('Clear')]
cloudy = incidents[(incidents['Conditions'].str.contains('Cloud')) | (incidents['Conditions'].str.contains('Overcast'))]
haze = incidents[incidents['Conditions'].str.contains('Haze')]
plot_stuff("Snowy", snowy)
plot_stuff("Rainy", rainy)
plot_stuff("Clear", clear)
plot_stuff("Cloudy", cloudy)
plot_stuff("Hazy", haze)
Finding, per weather condition, the ratio between each borough's share of incidents and the city-wide share.
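To make the normalization used below concrete, here is a minimal sketch with made-up counts (the condition names and numbers are purely illustrative, not taken from the dataset). The per-borough share of incidents under a condition is divided by the city-wide share, so a ratio above 1 means that borough is over-represented for that condition.
In [ ]:
# Illustrative sketch with hypothetical counts (not real data):
# ratio = P(condition | incident in borough) / P(condition | incident overall)
from __future__ import division
from collections import Counter
overall_counts = Counter({'Clear': 800, 'Rain': 150, 'Snow': 50})   # hypothetical city-wide incident counts
borough_counts = Counter({'Clear': 70, 'Rain': 20, 'Snow': 10})     # hypothetical counts for one borough
overall_total = sum(overall_counts.values())
borough_total = sum(borough_counts.values())
for condition in overall_counts:
    p_overall = overall_counts[condition] / overall_total
    p_borough = borough_counts[condition] / borough_total
    # A ratio above 1 means this borough sees relatively more of its
    # incidents under this condition than the city as a whole does
    print '%s: ratio %.2f' % (condition, p_borough / p_overall)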
In [ ]:
from collections import Counter
ConditionIncidentCounter = Counter(incidents.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(incidents)
p_incident
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions.
from collections import Counter
ConditionIncidentCounter = Counter(incidents.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(incidents)
p_incident
# Do the same again but for individual areas of NYC
p_incident_district = {}
l = len(incidents)
for district in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    filtered = incidents[incidents.BOROUGH == district]
    counter = Counter(filtered.Conditions.values)
    p_incident_district[district] = {}
    for k,v in counter.most_common():
        p_incident_district[district][k] = v / len(list(counter.elements()))
p_incident_district
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    ys = []
    xs = []
    for con in incidents.Conditions.unique():
        ys.append(districtCounter[con] / overAllCounter[con])
        xs.append(con)
    return pd.Series(ys, index=xs)

series = {}
for b in incidents[pd.notnull(incidents.BOROUGH)].BOROUGH.unique():
    series[b] = calcRatioForDistrict(p_incident_district[b], p_incident, b)
df = pd.DataFrame(series)
df.plot(kind="bar", subplots=True, figsize=(14,14), layout=(7,2), legend=False, sharey=True)
Let's try to look at zip codes in Manhattan only
In [ ]:
# What is the probability of an incident based on the weather condition?
# Normalize incidents based on the conditions.
from collections import Counter
borough = incidents[incidents.BOROUGH == 'MANHATTAN']
ConditionIncidentCounter = Counter(borough.Conditions.values)
p_incident = {}
for k,v in ConditionIncidentCounter.most_common():
    p_incident[k] = v / len(borough)
p_incident
# Do the same again but for individual zip codes within the borough
p_incident_borough_zip = {}
l = len(borough)
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    filtered = borough[borough['ZIP CODE'] == z]
    counter = Counter(filtered.Conditions.values)
    # z = str(z).split(".")[0]
    p_incident_borough_zip[z] = {}
    for k,v in counter.most_common():
        p_incident_borough_zip[z][k] = v / len(list(counter.elements()))
p_incident_borough_zip
# Are there any areas in NYC that experience incidents based
# on a condition unusually higher or lower compared to other areas?
# Calculate the ratio of incidents based on the condition.
def calcRatioForDistrict(districtCounter, overAllCounter, district):
    ys = []
    xs = []
    for con in incidents.Conditions.unique():
        if con in districtCounter:
            ys.append(districtCounter[con] / overAllCounter[con])
        else:
            ys.append(0)
        xs.append(con)
    return pd.Series(ys, index=xs)

series = {}
for z in borough[pd.notnull(borough['ZIP CODE'])]['ZIP CODE'].unique():
    series[z] = calcRatioForDistrict(p_incident_borough_zip[z], p_incident, z)
df = pd.DataFrame(series)
In [ ]:
df.plot(kind="bar", subplots=True, figsize=(14,100), layout=(50,2), legend=False, sharey=False)
In [ ]:
# Day with the most incidents and how many occurred on it
worst_day = incidents.DATE.value_counts().index[0]
worst_day_count = incidents.DATE.value_counts().iloc[0]
incidents[incidents.DATE == worst_day]
In [ ]: