In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
%matplotlib inline
# These settings are for a dark theme, comment it out if you
# are using the default theme.
sns.set(rc={'text.color':'#cdd2e9',
'figure.facecolor': '#384151',
'axes.facecolor':'#262931',
'axes.labelcolor':'#cdd2e9',
'grid.color':'#3b3e45',
'xtick.color':'#cdd2e9',
'ytick.color':'#cdd2e9',
'xtick.labelsize':13,
'ytick.labelsize':13,
'axes.titlesize':16,
'legend.fontsize':14,
'figure.figsize':(11.5,8)})
sns.set_palette(sns.color_palette("Set2", 8))
In [2]:
# The following code is adopted from Pat's Rolling Rain N-Year Threshold.pynb
# Loading in hourly rain data from CSV, parsing the timestamp, and adding it
# as an index so it's more useful
rain_df = pd.read_csv('data/ohare_full_precip_hourly.csv')
rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
rain_df = rain_df.set_index(pd.DatetimeIndex(rain_df['datetime']))
# Data does not really exist before 1973, data between 11/1991 and 8/1992 is all 0s...
rain_df = rain_df[(rain_df['datetime'] > '1973') &
((rain_df['datetime'] < '19911101') | (rain_df['datetime'] > '19920801'))]
In [3]:
chi_rain_series = rain_df['HOURLYPrecip'].resample('1H').max().dropna()
# We take maximum, because when there are multiple reports within the same hour,
# the values are *accumulated* (and then reset at the next full hour). Thus we
# want to take the maximum reading from any given hour.
ax = chi_rain_series.resample('24H').sum().plot()
_ = ax.set_title('Daily Precipitation (in) at O\'Hare')
In [4]:
n_year_threshes = pd.read_csv('data/n_year_definitions.csv')
n_year_threshes = n_year_threshes.set_index('Duration')
n_year_threshes
Out[4]:
Verify that the precipitation amount is the "rainfall" equivalent, i.e. that snowfall has been melted into liquid water. We will verify this by plotting the total amount of precipitation reported each day from the "Snowpocalypse" in 2011, where 21.2 inches of snow fell at O'Hare from 1/31/2011 to 2/2/2011. We see that each of these days has < 1 inch of precipitation reported, and the total number of inches reported is < 2.
In [5]:
chi_rain_series['20110130':'20110204'].resample('24H').sum().plot(figsize=(6,4))
print('Total of {:.2f} inches of precip reported.'.format(
chi_rain_series['20110130':'20110204'].sum()
))
In [6]:
dur_str_to_hours = {
'5-min':5/60.0,
'10-min':10/60.0,
'15-min':15/60.0,
'30-min':0.5,
'1-hr':1.0,
'2-hr':2.0,
'3-hr':3.0,
'6-hr':6.0,
'12-hr':12.0,
'18-hr':18.0,
'24-hr':24.0,
'48-hr':48.0,
'72-hr':72.0,
'5-day':5*24.0,
'10-day':10*24.0
}
def plot_thresh(duration_str, n_years, ax=None):
'''
For a given duration and a given n, the number of years, plot the
rolling amount of rain of the given duration, and the amount
of rain in the given duration that constitutes an n-year storm.
duration_str: duration as a string, see index of n_year_threshes
n_years : number of years, must be column of n_year_threshes
ax : optional, matplotlib axis object on which to plot
>>> plot_thresh('48-hour', 100)
>>> plot_thresh('5-day', 10)
'''
global rain_df
global n_year_threshes
global dur_str_to_hours
if ax is None:
ax = plt.gca()
thresh = n_year_threshes.loc[duration_str, str(n_years) + '-year']
duration = dur_str_to_hours[duration_str]
duration = max(duration, 1) # cannot upsample to more frequent than hourly
# TODO: want to throw warning?
# Create plot
rain_line = chi_rain_series.rolling(window=int(duration), min_periods=0).sum().plot(
ax=ax, color=sns.color_palette()[0])
x_limits = ax.get_xlim()
ax.plot(x_limits, [thresh, thresh], color=sns.color_palette()[1])
ax.set_ylim([0, ax.get_ylim()[1]])
ax.legend(['moving cumulative rain',
str(n_years) + '-year ' + duration_str + ' threshold'],
loc='best')
return ax
In [7]:
ax = plot_thresh('24-hr', 100)
Essentially replicating bulletin 70.
One of the pre-processing steps requires finding all of the "separate storm systems" for the durations that are less than or equal to 24 hours. Quoting from Bulletin 70, Section 3: Independence of Observations:
As in any statistical analysis, the individual ob- servations or data points should be independent of each other. With a partial-duration series, one must be careful that the observations used are not meteor- ologically dependent; that is, they must be from sepa- rate storm systems. In the present study, data for precipitation durations of 24 hours or less were ob- tained from individual precipitation events, defined as precipitation periods in which there was no pre- cipitation for at least 6 hours before and 6 hours after the precipitation event (Huff, 1967); then, only the maximum value for the particular duration (6 hours, 12 hours, etc.) within such a precipitation event was used. This ensures that the precipitation values are independent of each other and are derived from individual storms. For precipitation durations of 2 to 10 days, no time separation criteria were needed.
In [8]:
def get_independent_storms():
'''
TODO
See page 21 of http://www.isws.illinois.edu/atmos/statecli/PDF/b70-all.pdf,
Section 3: Independence of Observations, also quoted above.
'''
pass
In [9]:
ps = np.linspace(0,1,1000)
f, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(11,4))
ax1.set_title('Quantiles')
ax1.plot(chi_rain_series.rolling(window=int(24), min_periods=0).sum().quantile(ps))
ax1.hold(True)
ax1.plot(chi_rain_series.resample('24H').sum().dropna().quantile(ps), '--')
ax2.set_title('Difference')
junk = ax2.plot(ps,
chi_rain_series.rolling(window=int(24), min_periods=0).sum().quantile(ps) -
chi_rain_series.resample('24H').sum().dropna().quantile(ps)
)
# We hope that these will match well. One is generated using (somewhat) independent
# observations, and the other is generated using highly dependent observations.
In [10]:
def new_recurrence_intervals():
'''
TODO
'''
global rain_df
global n_year_threshes
global dur_str_to_hours
new_recur_ints = n_year_threshes.ix[:-4,:].copy(deep=True)
for recurrence in new_recur_ints.columns:
for dur_str in new_recur_ints.index:
thresh = n_year_threshes.ix[dur_str, recurrence]
dur = dur_str_to_hours[dur_str]
rain = chi_rain_series.rolling(window=int(dur), min_periods=0).sum()
# 24 * 365.25 = 8766 hours per year
# rain.size * dur is the number of total hours we are looking at, so
# rain.size * dur / 8766 is total number of years in dataset
new_recur_ints.ix[dur_str, recurrence] = rain.size * dur / 8766. / sum(rain > thresh)
return new_recur_ints
In [12]:
new_recur_ints = new_recurrence_intervals()
In [16]:
print(new_recur_ints.to_string())
In [ ]: