Assignment CIE 5703 - week 6

Import libraries


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

Rotterdam rain gauge dataset 10 min data from 2003 - 2013

Read in data


In [2]:
data = pd.read_csv('rotterdam_rg_2003-2014.csv', skipinitialspace=True)

Convert the dates to a readable format...


In [3]:
dates = data['Datum']
time = data['Tijd']
dates = dates.map(str)
date_split = dates.str.extract('(.{4})(.{2})(.{2})', expand=True)

time=time.apply(lambda x: '%04d' % x)
time_split = time.str.extract('(.{2})(.{2})', expand=True)

date_split.loc[:,3] = time_split.loc[:,0]
date_split.loc[:,4] = time_split.loc[:,1]

data.loc[:,'dt'] = pd.to_datetime(dict(year=date_split[0], month=date_split[1], day=date_split[2], hour=date_split[3], minute=date_split[4]))
data.index=data['dt']
data.head()


Out[3]:
Datum Tijd R10m dt
dt
2003-04-01 00:10:00 20030401 10 0.0 2003-04-01 00:10:00
2003-04-01 00:20:00 20030401 20 0.0 2003-04-01 00:20:00
2003-04-01 00:30:00 20030401 30 0.0 2003-04-01 00:30:00
2003-04-01 00:40:00 20030401 40 0.0 2003-04-01 00:40:00
2003-04-01 00:50:00 20030401 50 0.0 2003-04-01 00:50:00

Plot all data


In [4]:
plt.plot(data['dt'], data["R10m"])
plt.ylabel('mm/10min')
plt.gcf().autofmt_xdate()


Resample the 10 min dataset to hourly accumulated data


In [5]:
data_1h = pd.DataFrame()
data_1h['mean_rain'] = data.R10m.resample('H').mean()
data_1h['accum_rain'] = data.R10m.resample('H').sum()

In [6]:
data_1h.tail()


Out[6]:
mean_rain accum_rain
dt
2013-12-31 20:00:00 0.200000 1.2
2013-12-31 21:00:00 0.166667 1.0
2013-12-31 22:00:00 0.116667 0.7
2013-12-31 23:00:00 0.050000 0.3
2014-01-01 00:00:00 0.000000 0.0

In [7]:
plt.plot(data_1h["accum_rain"])
plt.ylabel('mm/1h')
plt.gcf().autofmt_xdate()



In [8]:
plt.plot(data_1h["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ of 1h)')
plt.gcf().autofmt_xdate()


Resample 10 min dataset to 24 accumulated data


In [9]:
data_24h = pd.DataFrame()
data_24h['mean_rain'] = data.R10m.resample('D').mean()
data_24h['accum_rain'] = data.R10m.resample('D').sum()

In [10]:
plt.plot(data_24h["accum_rain"])
plt.ylabel('mm/24h')
plt.gcf().autofmt_xdate()



In [11]:
plt.plot(data_24h["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ of 24h)')
plt.gcf().autofmt_xdate()


Select summer and winter months as separate datasets


In [12]:
data_summer_1h = data_1h.loc[(data_1h.index.month>=4) & (data_1h.index.month<=9)]

mask_start = (data_1h.index.month >= 1) & (data_1h.index.month <= 3)
mask_end = (data_1h.index.month >= 10) & (data_1h.index.month <= 12)
mask = mask_start | mask_end

data_winter_1h = data_1h.loc[mask]

In [13]:
plt.plot(data_summer_1h["accum_rain"])
plt.ylabel('mm/h')
plt.gcf().autofmt_xdate()



In [14]:
plt.plot(data_winter_1h["accum_rain"])
plt.ylabel('mm/h')
plt.gcf().autofmt_xdate()


Resample 10 min rain data to monthly accumulated data


In [15]:
data_monthly = pd.DataFrame()
data_monthly['mean_rain'] = data.R10m.resample('M').mean()
data_monthly['accum_rain'] = data.R10m.resample('M').sum()

In [16]:
plt.plot(data_monthly["accum_rain"])
plt.ylabel('mm/month')
plt.gcf().autofmt_xdate()



In [17]:
plt.plot(data_monthly["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ per month)')
plt.gcf().autofmt_xdate()


Answering the assignments

1. General statistics for 24-hour and 10-min datasets: compute mean, standard deviation, skewness; plot histograms

10 min dataset

In [18]:
print('Mean: %s' % str(data.R10m.mean()))
print('Std: %s' % str(data.R10m.std()))
print('Skew: %s' % str(data.R10m.skew()))


Mean: 0.0169286744739
Std: 0.124545711332
Skew: 25.9282818744

In [19]:
data.R10m.hist(bins = 100)
plt.xlabel('mm/10min')
plt.gca().set_yscale("log")



In [20]:
cur_data = data.R10m.loc[data.R10m>0]
hist_d = plt.hist(cur_data, bins=100)
plt.xlabel('mm/10min')
plt.gca().set_yscale("log")


24 h dataset

In [21]:
print('Mean: %s' % str(data_24h.accum_rain.mean()))
print('Std: %s' % str(data_24h.accum_rain.std()))
print('Skew: %s' % str(data_24h.accum_rain.skew()))


Mean: 2.43710867905
Std: 5.00782253544
Skew: 3.98268765871

In [22]:
data_24h.accum_rain.hist(bins = 100)
plt.gca().set_yscale("log")
plt.xlabel('mm/24h')


Out[22]:
<matplotlib.text.Text at 0x7f6b1e075160>

Ignoring zeros:


In [23]:
cur_data = data_24h.accum_rain.loc[data_24h.accum_rain>0]
hist_d = plt.hist(cur_data, bins=100)
plt.gca().set_yscale("log")
plt.xlabel('mm/24h')


Out[23]:
<matplotlib.text.Text at 0x7f6b23f1d588>

In [24]:
data_24h.mean_rain.hist(bins = 100)
plt.xlabel(r'mm/10min ($\varnothing$ per 24h)')
plt.gca().set_yscale("log")



In [25]:
selected_monthly_data = data_monthly
#selected_monthly_data = data_monthly[(data_monthly.index >= '2004-01-01')]
selected_monthly_data.head()


Out[25]:
mean_rain accum_rain
dt
2003-04-30 0.010141 43.8
2003-05-31 0.017921 80.0
2003-06-30 0.006435 27.8
2003-07-31 0.008871 39.6
2003-08-31 0.002263 10.1

2. a. Analysis of seasonal cycles: create boxplots for monthly totals across all year


In [26]:
pd.options.mode.chained_assignment = None  # default='warn'
selected_monthly_data['mon'] = selected_monthly_data.index.month
selected_monthly_data['year'] = selected_monthly_data.index.year
selected_monthly_data.boxplot(column=['accum_rain'], by='mon', sym='+')
plt.ylabel('mm/month')


Out[26]:
<matplotlib.text.Text at 0x7f6b22224438>

Or per year:


In [27]:
selected_monthly_data.boxplot(column=['accum_rain'], by='year', sym='+')
plt.ylabel('mm/month')
plt.gcf().autofmt_xdate()


2. b. Analysis of diurnal cycles: create boxplots for hourly totals for entire dataseries


In [28]:
data_1h['hour'] = data_1h.index.hour
data_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[28]:
<matplotlib.text.Text at 0x7f6b26f25860>

Neglecting events <1mm/h


In [29]:
cur_df = data_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[29]:
<matplotlib.text.Text at 0x7f6b1963a048>

Neglecting events < 3mm/h


In [30]:
cur_df = data_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[30]:
<matplotlib.text.Text at 0x7f6b233cfc50>

2. c. Variation of diurnal cycles with seasons: create boxplots for hourly totals for summer season (April – September) and for winter season (October-March)


In [31]:
data_summer_1h['hour'] = data_summer_1h.index.hour
data_summer_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[31]:
<matplotlib.text.Text at 0x7f6b1be3f438>

Neglecting events <1mm/h


In [32]:
cur_df = data_summer_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[32]:
<matplotlib.text.Text at 0x7f6b17897a58>

Neglecting events <3mm/h


In [33]:
cur_df = data_summer_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[33]:
<matplotlib.text.Text at 0x7f6b184626d8>

Repeat with winter


In [34]:
data_winter_1h['hour'] = data_winter_1h.index.hour
data_winter_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[34]:
<matplotlib.text.Text at 0x7f6b18f983c8>

Neglecting events <1mm/h


In [35]:
cur_df = data_winter_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[35]:
<matplotlib.text.Text at 0x7f6b17e99438>

Neglecting events <3mm/h


In [36]:
cur_df = data_winter_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')


Out[36]:
<matplotlib.text.Text at 0x7f6b19afec18>

2. d. Diurnal cycles of intense storm events: Count nr of exceedances above 10 mm/h threshold for each hour of the day, for entire data series and for summer months only

Show rainfall events > 10mm /h over entire 1h accumulated dataset


In [37]:
rotterdam_1h_exceeds = data_1h.accum_rain[data_1h.accum_rain>5]

In [38]:
y = np.array(rotterdam_1h_exceeds)
N = len(y)
x = range(N)
plt.bar(x, y)
plt.ylabel('mm/h')


Out[38]:
<matplotlib.text.Text at 0x7f6b1b84b2b0>

Amount of events


In [39]:
print(len(rotterdam_1h_exceeds))


183

Events in summer


In [40]:
rotterdam_1h_exceeds_summer = data_summer_1h.accum_rain[data_summer_1h.accum_rain>10]

y = np.array(rotterdam_1h_exceeds_summer)
N = len(y)
x = range(N)
plt.bar(x, y)
plt.ylabel('mm/h')


Out[40]:
<matplotlib.text.Text at 0x7f6b172ed940>

Amount of events


In [41]:
print(len(rotterdam_1h_exceeds_summer))


32

3. Fit GEV-distribution for POT values in the time series

3. a. Create plots: histogram and GEV fit and interpret


In [42]:
exceed_hist = plt.hist(rotterdam_1h_exceeds, bins=100)



In [43]:
from scipy.stats import genextreme

In [44]:
x = np.linspace(5, 30, 1000)
y = np.array(rotterdam_1h_exceeds[:])

In [45]:
np.seterr(divide='ignore', invalid='ignore')
genextreme.fit(y)


Out[45]:
(-0.65409281605640668, 6.1495033811051876, 1.2472888910648763)

In [46]:
np.seterr(divide='ignore', invalid='ignore')
pdf = plt.plot(x, genextreme.pdf(x, *genextreme.fit(y)))
pdf_hist = plt.hist(y, bins=50, normed=True, histtype='stepfilled', alpha=0.8)


3. c. Compute rainfall amounts associated with return periods of 1 year, 10 years and 100 years


In [47]:
genextreme.ppf((1-1/1), *genextreme.fit(y))


Out[47]:
4.2426044510956666

In [48]:
genextreme.ppf((1-1/10), *genextreme.fit(y))


Out[48]:
12.552337032609081

In [49]:
genextreme.ppf((1-1/100), *genextreme.fit(y))


Out[49]:
42.887043608252711

Lower since we're missing out the heavy rainfall events in 2015 and 2016 (only 10 year dataset)

Update 10.10.2017

Block maxima & GEV


In [50]:
from scipy.stats import genpareto

In [51]:
temp_monthly = data_1h.groupby(pd.TimeGrouper(freq='M'))
block_max_y = np.array(temp_monthly.accum_rain.max())
print(block_max_y)


[  6.5  11.4  11.9   9.7   1.4   5.9   3.8   5.7   7.7   3.4   4.8   5.
  19.2   5.7   8.1   9.5  13.    6.2   5.5   3.3   3.6   4.2   4.4   2.3
   2.4   3.5  12.1  14.6   5.5  14.2   6.2   6.9   5.5   2.    3.3   3.4
   2.    4.4   4.1   5.2  16.3   3.1   8.9   3.1   2.7   5.5   2.4  10.7
   0.2  11.9  11.1  15.5   9.4   7.    3.    3.5   4.5   3.1   7.3   5.9
   4.4  16.1   5.    9.3  15.9   2.9   7.9   4.5   4.5   3.7   4.    3.4
   1.8  17.3   5.5   5.5   4.8   3.3  10.7   6.2   4.9   2.5   3.6   5.3
   4.    3.7   7.6  10.2   8.2   8.9   6.2   8.5   3.3   3.4   2.3   1.7
   2.    1.4  11.5  10.8   9.7  11.7   3.1   0.9   8.8   6.1   3.7   3.
   6.1   9.8  12.1   9.1  11.4   4.5   6.2   4.4   5.6   5.3   3.2   4.8
   2.8   7.9   5.    9.7  11.2  16.6   9.6   5.    4.7   0. ]

In [52]:
print(len(block_max_y))


130

In [53]:
x = np.linspace(0, 30, 1000)
pdf_bm = plt.plot(x, genextreme.pdf(x, *genextreme.fit(block_max_y)))
pdf_hist_bm = plt.hist(block_max_y, bins=100, normed=True, histtype='stepfilled', alpha=0.8)


GEV and block maxima of monthly maxima of 1h data


In [54]:
genextreme.fit(block_max_y)


Out[54]:
(-0.102585030922085, 4.5295951009939452, 2.8207912290114203)

In [55]:
genextreme.ppf((1-1/10), *genextreme.fit(block_max_y))


Out[55]:
11.669914176064232

GPD & POT


In [56]:
pdf_bm = plt.plot(x, genpareto.pdf(x, *genpareto.fit(y)))
pdf_hist_bm = plt.hist(y, bins=100, normed=True, histtype='stepfilled', alpha=0.8)


GPD and POT of data>10mm/h


In [57]:
genpareto.fit(y)


Out[57]:
(-0.00022006912025799782, 4.9999999933117056, 2.9071259531053117)

In [58]:
genpareto.ppf((1-1/10), *genpareto.fit(y))


Out[58]:
11.692209168720096

In [ ]:

Boxplot of POT values


In [59]:
event_occurences = pd.DataFrame(rotterdam_1h_exceeds)
event_occurences['hour'] = event_occurences.index.hour
event_occurences.boxplot(column=['accum_rain'], by='hour', sym='+')


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6b0ca975c0>

In [60]:
event_occurences.hour.value_counts(sort=False)


Out[60]:
0      8
1      8
2     10
3      9
4      7
5     10
6      5
7      5
8      4
9      9
10     9
11     7
12     6
13     3
14    12
15    13
16     9
17     9
18     8
19     6
20     7
21     6
22     8
23     5
Name: hour, dtype: int64

In [61]:
cur_hist = plt.hist(event_occurences.hour, bins=24, histtype='stepfilled')
plt.xticks(range(24))
plt.xlabel('hour')


Out[61]:
<matplotlib.text.Text at 0x7f6b0c85ce10>

In [ ]: