notebook.community

Edit and run



In [1]:

    
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
from datetime import datetime, timedelta
import operator
import matplotlib.pyplot as plt
from collections import namedtuple
%matplotlib notebook



In [2]:

    
rain_df = pd.read_csv('data/ohare_hourly_20160929.csv')
rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
rain_df = rain_df.set_index(pd.DatetimeIndex(rain_df['datetime']))
rain_df = rain_df['19700101':]
chi_rain_series = rain_df['HOURLYPrecip'].resample('1H', label='right').max()
chi_rain_series.head()









    Out[2]:





1970-01-01 04:00:00    0.0
1970-01-01 05:00:00    NaN
1970-01-01 06:00:00    NaN
1970-01-01 07:00:00    0.0
1970-01-01 08:00:00    NaN
Freq: H, Name: HOURLYPrecip, dtype: float64



In [3]:

    
# We're going to define a storm ending as when there are at least 12 hours of no rain.
_hours_between_events = 12

# Really only track precipitation > than this inches
_measurable_precip = 0.04



In [4]:

    
hourly_precip = pd.DataFrame(chi_rain_series).fillna(0)
hourly_precip.tail()









    Out[4]:






  
    
      
      HOURLYPrecip
    
  
  
    
      2016-08-28 19:00:00
      0.0
    
    
      2016-08-28 20:00:00
      0.0
    
    
      2016-08-28 21:00:00
      0.0
    
    
      2016-08-28 22:00:00
      0.0
    
    
      2016-08-28 23:00:00
      0.0



In [5]:

    
hourly_precip.columns = ['hourly_precip']



In [6]:

    
def subtract_one_hour(timestamp):
    return timestamp - timedelta(hours=1)
hourly_precip['end_time'] = hourly_precip.index.values
hourly_precip['start_time'] =  hourly_precip['end_time'].apply(subtract_one_hour)
hourly_precip.head()









    Out[6]:






  
    
      
      hourly_precip
      end_time
      start_time
    
  
  
    
      1970-01-01 04:00:00
      0.0
      1970-01-01 04:00:00
      1970-01-01 03:00:00
    
    
      1970-01-01 05:00:00
      0.0
      1970-01-01 05:00:00
      1970-01-01 04:00:00
    
    
      1970-01-01 06:00:00
      0.0
      1970-01-01 06:00:00
      1970-01-01 05:00:00
    
    
      1970-01-01 07:00:00
      0.0
      1970-01-01 07:00:00
      1970-01-01 06:00:00
    
    
      1970-01-01 08:00:00
      0.0
      1970-01-01 08:00:00
      1970-01-01 07:00:00



In [7]:

    
# Now try to figure out the start of storms.  These are rows in which precipitation is greater than
# _measurable_precip and the previous rows rolling was zero.

# 12 hours before this row
hourly_precip['rolling'] = chi_rain_series.rolling(window=int(_hours_between_events), min_periods=0).sum().fillna(0).round(4)

# See the rolling from the row in front
hourly_precip['rolling_prev'] = hourly_precip['rolling'].shift().fillna(0)

hourly_precip['storm_start'] = (hourly_precip['hourly_precip'] > _measurable_precip) & (hourly_precip['rolling_prev'] < _measurable_precip)

hourly_precip.head()









    Out[7]:






  
    
      
      hourly_precip
      end_time
      start_time
      rolling
      rolling_prev
      storm_start
    
  
  
    
      1970-01-01 04:00:00
      0.0
      1970-01-01 04:00:00
      1970-01-01 03:00:00
      0.0
      0.0
      False
    
    
      1970-01-01 05:00:00
      0.0
      1970-01-01 05:00:00
      1970-01-01 04:00:00
      0.0
      0.0
      False
    
    
      1970-01-01 06:00:00
      0.0
      1970-01-01 06:00:00
      1970-01-01 05:00:00
      0.0
      0.0
      False
    
    
      1970-01-01 07:00:00
      0.0
      1970-01-01 07:00:00
      1970-01-01 06:00:00
      0.0
      0.0
      False
    
    
      1970-01-01 08:00:00
      0.0
      1970-01-01 08:00:00
      1970-01-01 07:00:00
      0.0
      0.0
      False



In [8]:

    
# The end time will search the events dataframe for the first row after this one in which the previous 12 hours saw no
# rain.  Then subtract 12.
def find_endtime(timestamp):
    row_marker = hourly_precip[timestamp:][hourly_precip['rolling'] < _measurable_precip].iloc[0]
    return row_marker['start_time'] - timedelta(hours=(_hours_between_events-1))

ts = pd.to_datetime('1970-04-13 04:00:00')
find_endtime(ts)









    



d:\data_science_projects\chicagorain\virtualenvs\nyear-venv\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.






    Out[8]:





Timestamp('1970-04-13 04:00:00')



In [10]:

    
# events will now contain the actual events - which is based on rows where rain starts and stops over a duration
events = pd.DataFrame(hourly_precip[hourly_precip['storm_start']][['start_time', 'end_time']])
events.head()









    Out[10]:






  
    
      
      start_time
      end_time
    
  
  
    
      1970-03-19 19:00:00
      1970-03-19 18:00:00
      1970-03-19 19:00:00
    
    
      1970-03-25 22:00:00
      1970-03-25 21:00:00
      1970-03-25 22:00:00
    
    
      1970-04-13 04:00:00
      1970-04-13 03:00:00
      1970-04-13 04:00:00
    
    
      1970-04-16 04:00:00
      1970-04-16 03:00:00
      1970-04-16 04:00:00
    
    
      1970-04-19 04:00:00
      1970-04-19 03:00:00
      1970-04-19 04:00:00



In [11]:

    
events.columns = ['start_time', 'hours_end_time']
events['end_time'] = events['hours_end_time'].apply(find_endtime)
events.drop('hours_end_time', 1)
events = events.set_index(['start_time'])
events['start_time'] = events.index.values
events['duration_hrs'] = (events['end_time'] - events['start_time']).astype('timedelta64[h]')
events.head()









    



d:\data_science_projects\chicagorain\virtualenvs\nyear-venv\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.






    Out[11]:






  
    
      
      hours_end_time
      end_time
      start_time
      duration_hrs
    
    
      start_time
      
      
      
      
    
  
  
    
      1970-03-19 18:00:00
      1970-03-19 19:00:00
      1970-03-19 22:00:00
      1970-03-19 18:00:00
      4.0
    
    
      1970-03-25 21:00:00
      1970-03-25 22:00:00
      1970-03-26 07:00:00
      1970-03-25 21:00:00
      10.0
    
    
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1970-04-13 04:00:00
      1970-04-13 03:00:00
      1.0
    
    
      1970-04-16 03:00:00
      1970-04-16 04:00:00
      1970-04-16 04:00:00
      1970-04-16 03:00:00
      1.0
    
    
      1970-04-19 03:00:00
      1970-04-19 04:00:00
      1970-04-19 10:00:00
      1970-04-19 03:00:00
      7.0



In [12]:

    
for index, event in events.iterrows():
    events.loc[index, 'total_precip'] = hourly_precip[str(event['start_time']):str(event['end_time'])]['hourly_precip'].sum()
events.head()









    Out[12]:






  
    
      
      hours_end_time
      end_time
      start_time
      duration_hrs
      total_precip
    
    
      start_time
      
      
      
      
      
    
  
  
    
      1970-03-19 18:00:00
      1970-03-19 19:00:00
      1970-03-19 22:00:00
      1970-03-19 18:00:00
      4.0
      0.18
    
    
      1970-03-25 21:00:00
      1970-03-25 22:00:00
      1970-03-26 07:00:00
      1970-03-25 21:00:00
      10.0
      0.27
    
    
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1970-04-13 04:00:00
      1970-04-13 03:00:00
      1.0
      0.24
    
    
      1970-04-16 03:00:00
      1970-04-16 04:00:00
      1970-04-16 04:00:00
      1970-04-16 03:00:00
      1.0
      0.06
    
    
      1970-04-19 03:00:00
      1970-04-19 04:00:00
      1970-04-19 10:00:00
      1970-04-19 03:00:00
      7.0
      0.29



In [13]:

    
events = events[['start_time', 'end_time', 'duration_hrs', 'total_precip']]
events.head()









    Out[13]:






  
    
      
      start_time
      end_time
      duration_hrs
      total_precip
    
    
      start_time
      
      
      
      
    
  
  
    
      1970-03-19 18:00:00
      1970-03-19 18:00:00
      1970-03-19 22:00:00
      4.0
      0.18
    
    
      1970-03-25 21:00:00
      1970-03-25 21:00:00
      1970-03-26 07:00:00
      10.0
      0.27
    
    
      1970-04-13 03:00:00
      1970-04-13 03:00:00
      1970-04-13 04:00:00
      1.0
      0.24
    
    
      1970-04-16 03:00:00
      1970-04-16 03:00:00
      1970-04-16 04:00:00
      1.0
      0.06
    
    
      1970-04-19 03:00:00
      1970-04-19 03:00:00
      1970-04-19 10:00:00
      7.0
      0.29



In [14]:

    
events.to_csv('data/rain_events_ohare.csv', index=False)



In [ ]:

	HOURLYPrecip
2016-08-28 19:00:00	0.0
2016-08-28 20:00:00	0.0
2016-08-28 21:00:00	0.0
2016-08-28 22:00:00	0.0
2016-08-28 23:00:00	0.0

	end_time	start_time
1970-01-01 04:00:00	1970-01-01 04:00:00	1970-01-01 03:00:00
1970-01-01 05:00:00	1970-01-01 05:00:00	1970-01-01 04:00:00
1970-01-01 06:00:00	1970-01-01 06:00:00	1970-01-01 05:00:00
1970-01-01 07:00:00	1970-01-01 07:00:00	1970-01-01 06:00:00
1970-01-01 08:00:00	1970-01-01 08:00:00	1970-01-01 07:00:00

	start_time	end_time
1970-03-19 19:00:00	1970-03-19 18:00:00	1970-03-19 19:00:00
1970-03-25 22:00:00	1970-03-25 21:00:00	1970-03-25 22:00:00
1970-04-13 04:00:00	1970-04-13 03:00:00	1970-04-13 04:00:00
1970-04-16 04:00:00	1970-04-16 03:00:00	1970-04-16 04:00:00
1970-04-19 04:00:00	1970-04-19 03:00:00	1970-04-19 04:00:00

	hours_end_time	end_time	start_time	duration_hrs
start_time
1970-03-19 18:00:00	1970-03-19 19:00:00	1970-03-19 22:00:00	1970-03-19 18:00:00	4.0
1970-03-25 21:00:00	1970-03-25 22:00:00	1970-03-26 07:00:00	1970-03-25 21:00:00	10.0
1970-04-13 03:00:00	1970-04-13 04:00:00	1970-04-13 04:00:00	1970-04-13 03:00:00	1.0
1970-04-16 03:00:00	1970-04-16 04:00:00	1970-04-16 04:00:00	1970-04-16 03:00:00	1.0
1970-04-19 03:00:00	1970-04-19 04:00:00	1970-04-19 10:00:00	1970-04-19 03:00:00	7.0