In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
from datetime import datetime, timedelta
import operator
import matplotlib.pyplot as plt
from collections import namedtuple
%matplotlib notebook

In [2]:
rain_df = pd.read_csv('data/ohare_hourly_20160929.csv')
rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
rain_df = rain_df.set_index(pd.DatetimeIndex(rain_df['datetime']))
rain_df = rain_df['19700101':]
chi_rain_series = rain_df['HOURLYPrecip'].resample('1H', label='right').max()
chi_rain_series.head()


Out[2]:
1970-01-01 04:00:00    0.0
1970-01-01 05:00:00    NaN
1970-01-01 06:00:00    NaN
1970-01-01 07:00:00    0.0
1970-01-01 08:00:00    NaN
Freq: H, Name: HOURLYPrecip, dtype: float64

In [3]:
# We're going to define a storm ending as when there are at least 12 hours of no rain.
_hours_between_events = 12

# Really only track precipitation > than this inches
_measurable_precip = 0.04

In [4]:
hourly_precip = pd.DataFrame(chi_rain_series).fillna(0)
hourly_precip.tail()


Out[4]:
HOURLYPrecip
2016-08-28 19:00:00 0.0
2016-08-28 20:00:00 0.0
2016-08-28 21:00:00 0.0
2016-08-28 22:00:00 0.0
2016-08-28 23:00:00 0.0

In [5]:
hourly_precip.columns = ['hourly_precip']

In [6]:
def subtract_one_hour(timestamp):
    return timestamp - timedelta(hours=1)
hourly_precip['end_time'] = hourly_precip.index.values
hourly_precip['start_time'] =  hourly_precip['end_time'].apply(subtract_one_hour)
hourly_precip.head()


Out[6]:
hourly_precip end_time start_time
1970-01-01 04:00:00 0.0 1970-01-01 04:00:00 1970-01-01 03:00:00
1970-01-01 05:00:00 0.0 1970-01-01 05:00:00 1970-01-01 04:00:00
1970-01-01 06:00:00 0.0 1970-01-01 06:00:00 1970-01-01 05:00:00
1970-01-01 07:00:00 0.0 1970-01-01 07:00:00 1970-01-01 06:00:00
1970-01-01 08:00:00 0.0 1970-01-01 08:00:00 1970-01-01 07:00:00

In [7]:
# Now try to figure out the start of storms.  These are rows in which precipitation is greater than
# _measurable_precip and the previous rows rolling was zero.

# 12 hours before this row
hourly_precip['rolling'] = chi_rain_series.rolling(window=int(_hours_between_events), min_periods=0).sum().fillna(0).round(4)

# See the rolling from the row in front
hourly_precip['rolling_prev'] = hourly_precip['rolling'].shift().fillna(0)

hourly_precip['storm_start'] = (hourly_precip['hourly_precip'] > _measurable_precip) & (hourly_precip['rolling_prev'] < _measurable_precip)

hourly_precip.head()


Out[7]:
hourly_precip end_time start_time rolling rolling_prev storm_start
1970-01-01 04:00:00 0.0 1970-01-01 04:00:00 1970-01-01 03:00:00 0.0 0.0 False
1970-01-01 05:00:00 0.0 1970-01-01 05:00:00 1970-01-01 04:00:00 0.0 0.0 False
1970-01-01 06:00:00 0.0 1970-01-01 06:00:00 1970-01-01 05:00:00 0.0 0.0 False
1970-01-01 07:00:00 0.0 1970-01-01 07:00:00 1970-01-01 06:00:00 0.0 0.0 False
1970-01-01 08:00:00 0.0 1970-01-01 08:00:00 1970-01-01 07:00:00 0.0 0.0 False

In [8]:
# The end time will search the events dataframe for the first row after this one in which the previous 12 hours saw no
# rain.  Then subtract 12.
def find_endtime(timestamp):
    row_marker = hourly_precip[timestamp:][hourly_precip['rolling'] < _measurable_precip].iloc[0]
    return row_marker['start_time'] - timedelta(hours=(_hours_between_events-1))

ts = pd.to_datetime('1970-04-13 04:00:00')
find_endtime(ts)


d:\data_science_projects\chicagorain\virtualenvs\nyear-venv\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Out[8]:
Timestamp('1970-04-13 04:00:00')

In [10]:
# events will now contain the actual events - which is based on rows where rain starts and stops over a duration
events = pd.DataFrame(hourly_precip[hourly_precip['storm_start']][['start_time', 'end_time']])
events.head()


Out[10]:
start_time end_time
1970-03-19 19:00:00 1970-03-19 18:00:00 1970-03-19 19:00:00
1970-03-25 22:00:00 1970-03-25 21:00:00 1970-03-25 22:00:00
1970-04-13 04:00:00 1970-04-13 03:00:00 1970-04-13 04:00:00
1970-04-16 04:00:00 1970-04-16 03:00:00 1970-04-16 04:00:00
1970-04-19 04:00:00 1970-04-19 03:00:00 1970-04-19 04:00:00

In [11]:
events.columns = ['start_time', 'hours_end_time']
events['end_time'] = events['hours_end_time'].apply(find_endtime)
events.drop('hours_end_time', 1)
events = events.set_index(['start_time'])
events['start_time'] = events.index.values
events['duration_hrs'] = (events['end_time'] - events['start_time']).astype('timedelta64[h]')
events.head()


d:\data_science_projects\chicagorain\virtualenvs\nyear-venv\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Out[11]:
hours_end_time end_time start_time duration_hrs
start_time
1970-03-19 18:00:00 1970-03-19 19:00:00 1970-03-19 22:00:00 1970-03-19 18:00:00 4.0
1970-03-25 21:00:00 1970-03-25 22:00:00 1970-03-26 07:00:00 1970-03-25 21:00:00 10.0
1970-04-13 03:00:00 1970-04-13 04:00:00 1970-04-13 04:00:00 1970-04-13 03:00:00 1.0
1970-04-16 03:00:00 1970-04-16 04:00:00 1970-04-16 04:00:00 1970-04-16 03:00:00 1.0
1970-04-19 03:00:00 1970-04-19 04:00:00 1970-04-19 10:00:00 1970-04-19 03:00:00 7.0

In [12]:
for index, event in events.iterrows():
    events.loc[index, 'total_precip'] = hourly_precip[str(event['start_time']):str(event['end_time'])]['hourly_precip'].sum()
events.head()


Out[12]:
hours_end_time end_time start_time duration_hrs total_precip
start_time
1970-03-19 18:00:00 1970-03-19 19:00:00 1970-03-19 22:00:00 1970-03-19 18:00:00 4.0 0.18
1970-03-25 21:00:00 1970-03-25 22:00:00 1970-03-26 07:00:00 1970-03-25 21:00:00 10.0 0.27
1970-04-13 03:00:00 1970-04-13 04:00:00 1970-04-13 04:00:00 1970-04-13 03:00:00 1.0 0.24
1970-04-16 03:00:00 1970-04-16 04:00:00 1970-04-16 04:00:00 1970-04-16 03:00:00 1.0 0.06
1970-04-19 03:00:00 1970-04-19 04:00:00 1970-04-19 10:00:00 1970-04-19 03:00:00 7.0 0.29

In [13]:
events = events[['start_time', 'end_time', 'duration_hrs', 'total_precip']]
events.head()


Out[13]:
start_time end_time duration_hrs total_precip
start_time
1970-03-19 18:00:00 1970-03-19 18:00:00 1970-03-19 22:00:00 4.0 0.18
1970-03-25 21:00:00 1970-03-25 21:00:00 1970-03-26 07:00:00 10.0 0.27
1970-04-13 03:00:00 1970-04-13 03:00:00 1970-04-13 04:00:00 1.0 0.24
1970-04-16 03:00:00 1970-04-16 03:00:00 1970-04-16 04:00:00 1.0 0.06
1970-04-19 03:00:00 1970-04-19 03:00:00 1970-04-19 10:00:00 7.0 0.29

In [14]:
events.to_csv('data/rain_events_ohare.csv', index=False)

In [ ]: