In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
from datetime import datetime, timedelta
import operator
import matplotlib.pyplot as plt
from collections import namedtuple
%matplotlib notebook
In [2]:
# Load the O'Hare observations, index them by their parsed timestamp (the
# 'datetime' column is kept alongside the index) and keep 1970 onwards.
rain_df = (
    pd.read_csv('data/ohare_hourly_20160929.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime', drop=False)
    .loc['19700101':]
)
# One value per hour: the largest reading inside each bin, with the bin
# labelled by its right edge (label='right').
hourly_bins = rain_df['HOURLYPrecip'].resample('1H', label='right')
chi_rain_series = hourly_bins.max()
chi_rain_series.head()
Out[2]:
In [3]:
# A storm is considered over once at least this many consecutive hours pass
# without measurable rain. Also used as the rolling-window length (in hourly rows).
_hours_between_events = 12
# Precipitation threshold, in inches. An hourly value must exceed it to start a
# storm, and a rolling total below it is treated as "no rain".
_measurable_precip = 0.04
In [4]:
# Hours with no reading count as zero precipitation; promote the series to a
# one-column frame so further per-hour columns can be attached to it.
hourly_precip = chi_rain_series.fillna(0).to_frame()
hourly_precip.tail()
Out[4]:
In [5]:
# Give the frame's single column a stable name; later cells refer to it as 'hourly_precip'.
hourly_precip.columns = ['hourly_precip']
In [6]:
def subtract_one_hour(timestamp, hours=1):
    """Return ``timestamp`` moved back by ``hours`` hours (one hour by default).

    Parameters
    ----------
    timestamp : datetime-like
        Any value that supports subtracting a ``datetime.timedelta``.
    hours : int or float, optional
        Size of the backward shift in hours. Defaults to 1, so existing
        single-argument callers (e.g. ``Series.apply``) behave as before.

    Returns
    -------
    The shifted timestamp, of whatever type ``timestamp - timedelta`` yields.
    """
    return timestamp - timedelta(hours=hours)
# The index value of each row is recorded as that hour's end; the hour's
# start is simply one hour earlier.
hourly_precip['end_time'] = hourly_precip.index.values
hourly_precip['start_time'] = hourly_precip['end_time'] - timedelta(hours=1)
hourly_precip.head()
Out[6]:
In [7]:
# Flag the rows where a storm begins: the hour itself has more than
# _measurable_precip, while the rolling total as of the previous row was
# still below that threshold.

# Trailing total over the last _hours_between_events rows (this row included).
# Rounded to 4 decimals -- presumably to keep floating-point residue from the
# running sum out of the threshold comparisons.
trailing_total = chi_rain_series.rolling(window=int(_hours_between_events), min_periods=0).sum()
hourly_precip['rolling'] = trailing_total.fillna(0).round(4)

# The same total as seen from the row before; the first row has no predecessor
# and is given 0.
hourly_precip['rolling_prev'] = hourly_precip['rolling'].shift().fillna(0)

wet_this_hour = hourly_precip['hourly_precip'] > _measurable_precip
dry_beforehand = hourly_precip['rolling_prev'] < _measurable_precip
hourly_precip['storm_start'] = wet_this_hour & dry_beforehand
hourly_precip.head()
Out[7]:
In [8]:
def find_endtime(timestamp):
    """Return the end time of the storm in progress at ``timestamp``.

    Scans ``hourly_precip`` from ``timestamp`` onwards for the first row whose
    ``rolling`` total is below ``_measurable_precip`` (i.e. the preceding
    ``_hours_between_events`` hourly rows saw no measurable rain), then steps
    back ``_hours_between_events - 1`` hours from that row's ``start_time``.
    Because ``start_time`` is already one hour before the row's ``end_time``,
    the result lies ``_hours_between_events`` hours before that row's end.

    Parameters
    ----------
    timestamp : datetime-like
        Index label in ``hourly_precip`` at which to start searching.

    Raises
    ------
    IndexError
        If no row at or after ``timestamp`` has a rolling total below the
        threshold (``.iloc[0]`` on an empty selection).
    """
    # Slice first and build the mask from the slice itself, so the boolean key
    # shares the slice's index. A mask taken from the full frame would have to
    # be reindexed onto the slice, which pandas warns about.
    remaining = hourly_precip.loc[timestamp:]
    quiet_rows = remaining[remaining['rolling'] < _measurable_precip]
    row_marker = quiet_rows.iloc[0]
    return row_marker['start_time'] - timedelta(hours=(_hours_between_events - 1))
# Spot-check the helper on a single hand-picked hour.
ts = pd.Timestamp('1970-04-13 04:00:00')
find_endtime(ts)
Out[8]:
In [10]:
# One row per detected storm: take the hours flagged as a storm start and keep
# only their time bounds. The stop time of each storm is filled in afterwards.
events = hourly_precip.loc[hourly_precip['storm_start'], ['start_time', 'end_time']].copy()
events.head()
Out[10]:
In [11]:
# The 'end_time' taken from the hourly frame is only the end of the storm's
# first hour; rename it so 'end_time' can hold the end of the whole storm.
events.columns = ['start_time', 'hours_end_time']
events['end_time'] = events['hours_end_time'].apply(find_endtime)
# drop() returns a new frame, so the result has to be assigned back. The axis
# is passed by keyword because newer pandas rejects it as a positional argument.
events = events.drop('hours_end_time', axis=1)
# Index the storms by their start while also keeping the start as a column.
events = events.set_index(['start_time'])
events['start_time'] = events.index.values
# Whole hours as floats. Floor division keeps the truncating behaviour of the
# former astype('timedelta64[h]'), an hour-resolution dtype that newer pandas
# no longer accepts.
events['duration_hrs'] = (events['end_time'] - events['start_time']).dt.total_seconds() // 3600
events.head()
Out[11]:
In [12]:
# Total precipitation per storm: sum the hourly values whose index label falls
# between the storm's start and end (both ends inclusive for label slices).
# Building the column in one assignment avoids growing it row by row and
# ensures 'total_precip' exists even when there are no events.
events['total_precip'] = [
    hourly_precip[str(storm_start):str(storm_end)]['hourly_precip'].sum()
    for storm_start, storm_end in zip(events['start_time'], events['end_time'])
]
events.head()
Out[12]:
In [13]:
# Keep just the reporting columns, in the order they should be written out.
events = events.loc[:, ['start_time', 'end_time', 'duration_hrs', 'total_precip']]
events.head()
Out[13]:
In [14]:
# Persist the storm table. The index is omitted (index=False); 'start_time'
# is still written because it is also a regular column.
events.to_csv(path_or_buf='data/rain_events_ohare.csv', index=False)
In [ ]: