Assignment CIE 5703 - week 6

Import libraries



In [1]:

    
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

Rotterdam rain gauge dataset 10 min data from 2003 - 2013

Read in data



In [2]:

    
data = pd.read_csv('rotterdam_rg_2003-2014.csv', skipinitialspace=True)

Convert the dates to a readable format...



In [3]:

    
dates = data['Datum']
time = data['Tijd']
dates = dates.map(str)
date_split = dates.str.extract('(.{4})(.{2})(.{2})', expand=True)

time=time.apply(lambda x: '%04d' % x)
time_split = time.str.extract('(.{2})(.{2})', expand=True)

date_split.loc[:,3] = time_split.loc[:,0]
date_split.loc[:,4] = time_split.loc[:,1]

data.loc[:,'dt'] = pd.to_datetime(dict(year=date_split[0], month=date_split[1], day=date_split[2], hour=date_split[3], minute=date_split[4]))
data.index=data['dt']
data.head()









    Out[3]:







  
    
      
      Datum
      Tijd
      R10m
      dt
    
    
      dt
      
      
      
      
    
  
  
    
      2003-04-01 00:10:00
      20030401
      10
      0.0
      2003-04-01 00:10:00
    
    
      2003-04-01 00:20:00
      20030401
      20
      0.0
      2003-04-01 00:20:00
    
    
      2003-04-01 00:30:00
      20030401
      30
      0.0
      2003-04-01 00:30:00
    
    
      2003-04-01 00:40:00
      20030401
      40
      0.0
      2003-04-01 00:40:00
    
    
      2003-04-01 00:50:00
      20030401
      50
      0.0
      2003-04-01 00:50:00

Plot all data



In [4]:

    
plt.plot(data['dt'], data["R10m"])
plt.ylabel('mm/10min')
plt.gcf().autofmt_xdate()

Resample the 10 min dataset to hourly accumulated data



In [5]:

    
data_1h = pd.DataFrame()
data_1h['mean_rain'] = data.R10m.resample('H').mean()
data_1h['accum_rain'] = data.R10m.resample('H').sum()



In [6]:

    
data_1h.tail()









    Out[6]:







  
    
      
      mean_rain
      accum_rain
    
    
      dt
      
      
    
  
  
    
      2013-12-31 20:00:00
      0.200000
      1.2
    
    
      2013-12-31 21:00:00
      0.166667
      1.0
    
    
      2013-12-31 22:00:00
      0.116667
      0.7
    
    
      2013-12-31 23:00:00
      0.050000
      0.3
    
    
      2014-01-01 00:00:00
      0.000000
      0.0



In [7]:

    
plt.plot(data_1h["accum_rain"])
plt.ylabel('mm/1h')
plt.gcf().autofmt_xdate()



In [8]:

    
plt.plot(data_1h["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ of 1h)')
plt.gcf().autofmt_xdate()

Resample 10 min dataset to 24 accumulated data



In [9]:

    
data_24h = pd.DataFrame()
data_24h['mean_rain'] = data.R10m.resample('D').mean()
data_24h['accum_rain'] = data.R10m.resample('D').sum()



In [10]:

    
plt.plot(data_24h["accum_rain"])
plt.ylabel('mm/24h')
plt.gcf().autofmt_xdate()



In [11]:

    
plt.plot(data_24h["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ of 24h)')
plt.gcf().autofmt_xdate()

Select summer and winter months as separate datasets



In [12]:

    
data_summer_1h = data_1h.loc[(data_1h.index.month>=4) & (data_1h.index.month<=9)]

mask_start = (data_1h.index.month >= 1) & (data_1h.index.month <= 3)
mask_end = (data_1h.index.month >= 10) & (data_1h.index.month <= 12)
mask = mask_start | mask_end

data_winter_1h = data_1h.loc[mask]



In [13]:

    
plt.plot(data_summer_1h["accum_rain"])
plt.ylabel('mm/h')
plt.gcf().autofmt_xdate()



In [14]:

    
plt.plot(data_winter_1h["accum_rain"])
plt.ylabel('mm/h')
plt.gcf().autofmt_xdate()

Resample 10 min rain data to monthly accumulated data



In [15]:

    
data_monthly = pd.DataFrame()
data_monthly['mean_rain'] = data.R10m.resample('M').mean()
data_monthly['accum_rain'] = data.R10m.resample('M').sum()



In [16]:

    
plt.plot(data_monthly["accum_rain"])
plt.ylabel('mm/month')
plt.gcf().autofmt_xdate()



In [17]:

    
plt.plot(data_monthly["mean_rain"])
plt.ylabel(r'mm/10min ($\varnothing$ per month)')
plt.gcf().autofmt_xdate()

Answering the assignments

1. General statistics for 24-hour and 10-min datasets: compute mean, standard deviation, skewness; plot histograms

10 min dataset



In [18]:

    
print('Mean: %s' % str(data.R10m.mean()))
print('Std: %s' % str(data.R10m.std()))
print('Skew: %s' % str(data.R10m.skew()))









    



Mean: 0.0169286744739
Std: 0.124545711332
Skew: 25.9282818744



In [19]:

    
data.R10m.hist(bins = 100)
plt.xlabel('mm/10min')
plt.gca().set_yscale("log")



In [20]:

    
cur_data = data.R10m.loc[data.R10m>0]
hist_d = plt.hist(cur_data, bins=100)
plt.xlabel('mm/10min')
plt.gca().set_yscale("log")

24 h dataset



In [21]:

    
print('Mean: %s' % str(data_24h.accum_rain.mean()))
print('Std: %s' % str(data_24h.accum_rain.std()))
print('Skew: %s' % str(data_24h.accum_rain.skew()))









    



Mean: 2.43710867905
Std: 5.00782253544
Skew: 3.98268765871



In [22]:

    
data_24h.accum_rain.hist(bins = 100)
plt.gca().set_yscale("log")
plt.xlabel('mm/24h')









    Out[22]:





<matplotlib.text.Text at 0x7f6b1e075160>

Ignoring zeros:



In [23]:

    
cur_data = data_24h.accum_rain.loc[data_24h.accum_rain>0]
hist_d = plt.hist(cur_data, bins=100)
plt.gca().set_yscale("log")
plt.xlabel('mm/24h')









    Out[23]:





<matplotlib.text.Text at 0x7f6b23f1d588>



In [24]:

    
data_24h.mean_rain.hist(bins = 100)
plt.xlabel(r'mm/10min ($\varnothing$ per 24h)')
plt.gca().set_yscale("log")



In [25]:

    
selected_monthly_data = data_monthly
#selected_monthly_data = data_monthly[(data_monthly.index >= '2004-01-01')]
selected_monthly_data.head()

2. a. Analysis of seasonal cycles: create boxplots for monthly totals across all year



In [26]:

    
pd.options.mode.chained_assignment = None  # default='warn'
selected_monthly_data['mon'] = selected_monthly_data.index.month
selected_monthly_data['year'] = selected_monthly_data.index.year
selected_monthly_data.boxplot(column=['accum_rain'], by='mon', sym='+')
plt.ylabel('mm/month')









    Out[26]:





<matplotlib.text.Text at 0x7f6b22224438>

Or per year:



In [27]:

    
selected_monthly_data.boxplot(column=['accum_rain'], by='year', sym='+')
plt.ylabel('mm/month')
plt.gcf().autofmt_xdate()

2. b. Analysis of diurnal cycles: create boxplots for hourly totals for entire dataseries



In [28]:

    
data_1h['hour'] = data_1h.index.hour
data_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[28]:





<matplotlib.text.Text at 0x7f6b26f25860>

Neglecting events <1mm/h



In [29]:

    
cur_df = data_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[29]:





<matplotlib.text.Text at 0x7f6b1963a048>

Neglecting events < 3mm/h



In [30]:

    
cur_df = data_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[30]:





<matplotlib.text.Text at 0x7f6b233cfc50>

2. c. Variation of diurnal cycles with seasons: create boxplots for hourly totals for summer season (April – September) and for winter season (October-March)



In [31]:

    
data_summer_1h['hour'] = data_summer_1h.index.hour
data_summer_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[31]:





<matplotlib.text.Text at 0x7f6b1be3f438>

Neglecting events <1mm/h



In [32]:

    
cur_df = data_summer_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[32]:





<matplotlib.text.Text at 0x7f6b17897a58>

Neglecting events <3mm/h



In [33]:

    
cur_df = data_summer_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[33]:





<matplotlib.text.Text at 0x7f6b184626d8>

Repeat with winter



In [34]:

    
data_winter_1h['hour'] = data_winter_1h.index.hour
data_winter_1h.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[34]:





<matplotlib.text.Text at 0x7f6b18f983c8>

Neglecting events <1mm/h



In [35]:

    
cur_df = data_winter_1h.copy()
cur_df.loc[cur_df.accum_rain<1, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[35]:





<matplotlib.text.Text at 0x7f6b17e99438>

Neglecting events <3mm/h



In [36]:

    
cur_df = data_winter_1h.copy()
cur_df.loc[cur_df.accum_rain<3, 'accum_rain'] = np.nan
cur_df.boxplot(column=['accum_rain'], by='hour', sym='+')
plt.ylabel('mm/h')









    Out[36]:





<matplotlib.text.Text at 0x7f6b19afec18>

2. d. Diurnal cycles of intense storm events: Count nr of exceedances above 10 mm/h threshold for each hour of the day, for entire data series and for summer months only

Show rainfall events > 10mm /h over entire 1h accumulated dataset



In [37]:

    
rotterdam_1h_exceeds = data_1h.accum_rain[data_1h.accum_rain>5]



In [38]:

    
y = np.array(rotterdam_1h_exceeds)
N = len(y)
x = range(N)
plt.bar(x, y)
plt.ylabel('mm/h')









    Out[38]:





<matplotlib.text.Text at 0x7f6b1b84b2b0>

Amount of events



In [39]:

    
print(len(rotterdam_1h_exceeds))

Events in summer



In [40]:

    
rotterdam_1h_exceeds_summer = data_summer_1h.accum_rain[data_summer_1h.accum_rain>10]

y = np.array(rotterdam_1h_exceeds_summer)
N = len(y)
x = range(N)
plt.bar(x, y)
plt.ylabel('mm/h')









    Out[40]:





<matplotlib.text.Text at 0x7f6b172ed940>

Amount of events



In [41]:

    
print(len(rotterdam_1h_exceeds_summer))

3. Fit GEV-distribution for POT values in the time series

3. a. Create plots: histogram and GEV fit and interpret



In [42]:

    
exceed_hist = plt.hist(rotterdam_1h_exceeds, bins=100)



In [43]:

    
from scipy.stats import genextreme



In [44]:

    
x = np.linspace(5, 30, 1000)
y = np.array(rotterdam_1h_exceeds[:])



In [45]:

    
np.seterr(divide='ignore', invalid='ignore')
genextreme.fit(y)









    Out[45]:





(-0.65409281605640668, 6.1495033811051876, 1.2472888910648763)



In [46]:

    
np.seterr(divide='ignore', invalid='ignore')
pdf = plt.plot(x, genextreme.pdf(x, *genextreme.fit(y)))
pdf_hist = plt.hist(y, bins=50, normed=True, histtype='stepfilled', alpha=0.8)

3. c. Compute rainfall amounts associated with return periods of 1 year, 10 years and 100 years



In [47]:

    
genextreme.ppf((1-1/1), *genextreme.fit(y))









    Out[47]:





4.2426044510956666



In [48]:

    
genextreme.ppf((1-1/10), *genextreme.fit(y))









    Out[48]:





12.552337032609081



In [49]:

    
genextreme.ppf((1-1/100), *genextreme.fit(y))









    Out[49]:





42.887043608252711

Lower since we're missing out the heavy rainfall events in 2015 and 2016 (only 10 year dataset)

Update 10.10.2017

Block maxima & GEV



In [50]:

    
from scipy.stats import genpareto



In [51]:

    
temp_monthly = data_1h.groupby(pd.TimeGrouper(freq='M'))
block_max_y = np.array(temp_monthly.accum_rain.max())
print(block_max_y)









    



[  6.5  11.4  11.9   9.7   1.4   5.9   3.8   5.7   7.7   3.4   4.8   5.
  19.2   5.7   8.1   9.5  13.    6.2   5.5   3.3   3.6   4.2   4.4   2.3
   2.4   3.5  12.1  14.6   5.5  14.2   6.2   6.9   5.5   2.    3.3   3.4
   2.    4.4   4.1   5.2  16.3   3.1   8.9   3.1   2.7   5.5   2.4  10.7
   0.2  11.9  11.1  15.5   9.4   7.    3.    3.5   4.5   3.1   7.3   5.9
   4.4  16.1   5.    9.3  15.9   2.9   7.9   4.5   4.5   3.7   4.    3.4
   1.8  17.3   5.5   5.5   4.8   3.3  10.7   6.2   4.9   2.5   3.6   5.3
   4.    3.7   7.6  10.2   8.2   8.9   6.2   8.5   3.3   3.4   2.3   1.7
   2.    1.4  11.5  10.8   9.7  11.7   3.1   0.9   8.8   6.1   3.7   3.
   6.1   9.8  12.1   9.1  11.4   4.5   6.2   4.4   5.6   5.3   3.2   4.8
   2.8   7.9   5.    9.7  11.2  16.6   9.6   5.    4.7   0. ]



In [52]:

    
print(len(block_max_y))



In [53]:

    
x = np.linspace(0, 30, 1000)
pdf_bm = plt.plot(x, genextreme.pdf(x, *genextreme.fit(block_max_y)))
pdf_hist_bm = plt.hist(block_max_y, bins=100, normed=True, histtype='stepfilled', alpha=0.8)

GEV and block maxima of monthly maxima of 1h data



In [54]:

    
genextreme.fit(block_max_y)









    Out[54]:





(-0.102585030922085, 4.5295951009939452, 2.8207912290114203)



In [55]:

    
genextreme.ppf((1-1/10), *genextreme.fit(block_max_y))









    Out[55]:





11.669914176064232

GPD & POT



In [56]:

    
pdf_bm = plt.plot(x, genpareto.pdf(x, *genpareto.fit(y)))
pdf_hist_bm = plt.hist(y, bins=100, normed=True, histtype='stepfilled', alpha=0.8)

GPD and POT of data>10mm/h



In [57]:

    
genpareto.fit(y)









    Out[57]:





(-0.00022006912025799782, 4.9999999933117056, 2.9071259531053117)



In [58]:

    
genpareto.ppf((1-1/10), *genpareto.fit(y))









    Out[58]:





11.692209168720096



In [ ]:

Boxplot of POT values



In [59]:

    
event_occurences = pd.DataFrame(rotterdam_1h_exceeds)
event_occurences['hour'] = event_occurences.index.hour
event_occurences.boxplot(column=['accum_rain'], by='hour', sym='+')









    Out[59]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6b0ca975c0>



In [60]:

    
event_occurences.hour.value_counts(sort=False)









    Out[60]:





0      8
1      8
2     10
3      9
4      7
5     10
6      5
7      5
8      4
9      9
10     9
11     7
12     6
13     3
14    12
15    13
16     9
17     9
18     8
19     6
20     7
21     6
22     8
23     5
Name: hour, dtype: int64



In [61]:

    
cur_hist = plt.hist(event_occurences.hour, bins=24, histtype='stepfilled')
plt.xticks(range(24))
plt.xlabel('hour')









    Out[61]:





<matplotlib.text.Text at 0x7f6b0c85ce10>



In [ ]:

	mean_rain	accum_rain
dt
2003-04-30	0.010141	43.8
2003-05-31	0.017921	80.0
2003-06-30	0.006435	27.8
2003-07-31	0.008871	39.6
2003-08-31	0.002263	10.1

	Datum	Tijd	R10m	dt
dt
2003-04-01 00:10:00	20030401	10	0.0	2003-04-01 00:10:00
2003-04-01 00:20:00	20030401	20	0.0	2003-04-01 00:20:00
2003-04-01 00:30:00	20030401	30	0.0	2003-04-01 00:30:00
2003-04-01 00:40:00	20030401	40	0.0	2003-04-01 00:40:00
2003-04-01 00:50:00	20030401	50	0.0	2003-04-01 00:50:00

	mean_rain	accum_rain
dt
2013-12-31 20:00:00	0.200000	1.2
2013-12-31 21:00:00	0.166667	1.0
2013-12-31 22:00:00	0.116667	0.7
2013-12-31 23:00:00	0.050000	0.3
2014-01-01 00:00:00	0.000000	0.0