In [1]:
# import packages for analysis and modeling
# data frame operations
import pandas as pd
# pathname pattern expansion
import glob
# dates treatment
import datetime
import os
# regular expressions
import re
# Plots
import matplotlib.pyplot as plt

In [2]:
# Hourly consumption file from ENTSO-e
file_name = 'Hourly_2013_month10.xls'

In [3]:
# read excel file
wb = pd.read_excel(file_name, skiprows=9, na_values=[u'n.a.'], keep_default_na=False)


WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero

In [4]:
# show columns names
wb.columns


Out[4]:
Index([u'Country', u'Day', u'01:00:00', u'02:00:00', u'3A:00:00', u'3B:00:00', u'04:00:00', u'05:00:00', u'06:00:00', u'07:00:00', u'08:00:00', u'09:00:00', u'10:00:00', u'11:00:00', u'12:00:00', u'13:00:00', u'14:00:00', u'15:00:00', u'16:00:00', u'17:00:00', u'18:00:00', u'19:00:00', u'20:00:00', u'21:00:00', u'22:00:00', u'23:00:00', u'24:00:00'], dtype='object')

In [5]:
# show first rows of the data frame
wb.head()


Out[5]:
Country Day 01:00:00 02:00:00 3A:00:00 3B:00:00 04:00:00 05:00:00 06:00:00 07:00:00 ... 15:00:00 16:00:00 17:00:00 18:00:00 19:00:00 20:00:00 21:00:00 22:00:00 23:00:00 24:00:00
0 AT 2013-10-01 6494 6198 6046 NaN 5938 6264 7314 8724 ... 9281 9189 9114 9226 9603 9206 8556 7779 7507 6881
1 AT 2013-10-02 6544 6315 6186 NaN 6122 6410 7443 8830 ... 9102 8872 8849 9013 9492 9186 8523 7744 7528 6902
2 AT 2013-10-03 6483 6243 6192 NaN 6121 6387 7531 8976 ... 9207 9091 9014 9194 9713 9392 8707 7911 7701 7036
3 AT 2013-10-04 6594 6415 6301 NaN 6232 6539 7522 8972 ... 8807 8714 8669 8834 9377 9078 8349 7752 7591 6971
4 AT 2013-10-05 6522 6180 5945 NaN 5821 5907 6338 6875 ... 7565 7611 7652 7835 8117 7734 7230 6761 6642 6165

5 rows × 27 columns


In [6]:
hourchange='3B:00:00'

In [7]:
# delete a column
del wb[hourchange]

In [8]:
# show columns names
wb.columns


Out[8]:
Index([u'Country', u'Day', u'01:00:00', u'02:00:00', u'3A:00:00', u'04:00:00', u'05:00:00', u'06:00:00', u'07:00:00', u'08:00:00', u'09:00:00', u'10:00:00', u'11:00:00', u'12:00:00', u'13:00:00', u'14:00:00', u'15:00:00', u'16:00:00', u'17:00:00', u'18:00:00', u'19:00:00', u'20:00:00', u'21:00:00', u'22:00:00', u'23:00:00', u'24:00:00'], dtype='object')

In [9]:
# show first rows of the data frame
wb.head()


Out[9]:
Country Day 01:00:00 02:00:00 3A:00:00 04:00:00 05:00:00 06:00:00 07:00:00 08:00:00 ... 15:00:00 16:00:00 17:00:00 18:00:00 19:00:00 20:00:00 21:00:00 22:00:00 23:00:00 24:00:00
0 AT 2013-10-01 6494 6198 6046 5938 6264 7314 8724 9334 ... 9281 9189 9114 9226 9603 9206 8556 7779 7507 6881
1 AT 2013-10-02 6544 6315 6186 6122 6410 7443 8830 9336 ... 9102 8872 8849 9013 9492 9186 8523 7744 7528 6902
2 AT 2013-10-03 6483 6243 6192 6121 6387 7531 8976 9631 ... 9207 9091 9014 9194 9713 9392 8707 7911 7701 7036
3 AT 2013-10-04 6594 6415 6301 6232 6539 7522 8972 9411 ... 8807 8714 8669 8834 9377 9078 8349 7752 7591 6971
4 AT 2013-10-05 6522 6180 5945 5821 5907 6338 6875 7397 ... 7565 7611 7652 7835 8117 7734 7230 6761 6642 6165

5 rows × 26 columns


In [10]:
# check types
wb.dtypes


Out[10]:
Country     object
Day         object
01:00:00     int64
02:00:00     int64
3A:00:00     int64
04:00:00     int64
05:00:00     int64
06:00:00     int64
07:00:00     int64
08:00:00     int64
09:00:00     int64
10:00:00     int64
11:00:00     int64
12:00:00     int64
13:00:00     int64
14:00:00     int64
15:00:00     int64
16:00:00     int64
17:00:00     int64
18:00:00     int64
19:00:00     int64
20:00:00     int64
21:00:00     int64
22:00:00     int64
23:00:00     int64
24:00:00     int64
dtype: object

In [11]:
# create new column as a date object
wb['date'] = pd.to_datetime(wb['Day'])

In [12]:
# show columns names
wb.columns


Out[12]:
Index([u'Country', u'Day', u'01:00:00', u'02:00:00', u'3A:00:00', u'04:00:00', u'05:00:00', u'06:00:00', u'07:00:00', u'08:00:00', u'09:00:00', u'10:00:00', u'11:00:00', u'12:00:00', u'13:00:00', u'14:00:00', u'15:00:00', u'16:00:00', u'17:00:00', u'18:00:00', u'19:00:00', u'20:00:00', u'21:00:00', u'22:00:00', u'23:00:00', u'24:00:00', u'date'], dtype='object')

In [13]:
# check types
wb.dtypes


Out[13]:
Country             object
Day                 object
01:00:00             int64
02:00:00             int64
3A:00:00             int64
04:00:00             int64
05:00:00             int64
06:00:00             int64
07:00:00             int64
08:00:00             int64
09:00:00             int64
10:00:00             int64
11:00:00             int64
12:00:00             int64
13:00:00             int64
14:00:00             int64
15:00:00             int64
16:00:00             int64
17:00:00             int64
18:00:00             int64
19:00:00             int64
20:00:00             int64
21:00:00             int64
22:00:00             int64
23:00:00             int64
24:00:00             int64
date        datetime64[ns]
dtype: object

In [14]:
# Add new columns with info about weekday, month and year
wb['weekday'] = wb.date.apply(lambda x: x.weekday())
wb['month'] = wb.date.apply(lambda x: x.month)
wb['year'] = wb.date.apply(lambda x: x.year)

In [15]:
# show columns names
wb.columns


Out[15]:
Index([u'Country', u'Day', u'01:00:00', u'02:00:00', u'3A:00:00', u'04:00:00', u'05:00:00', u'06:00:00', u'07:00:00', u'08:00:00', u'09:00:00', u'10:00:00', u'11:00:00', u'12:00:00', u'13:00:00', u'14:00:00', u'15:00:00', u'16:00:00', u'17:00:00', u'18:00:00', u'19:00:00', u'20:00:00', u'21:00:00', u'22:00:00', u'23:00:00', u'24:00:00', u'date', u'weekday', u'month', u'year'], dtype='object')

In [16]:
# show first rows of the data frame
wb.head()


Out[16]:
Country Day 01:00:00 02:00:00 3A:00:00 04:00:00 05:00:00 06:00:00 07:00:00 08:00:00 ... 19:00:00 20:00:00 21:00:00 22:00:00 23:00:00 24:00:00 date weekday month year
0 AT 2013-10-01 6494 6198 6046 5938 6264 7314 8724 9334 ... 9603 9206 8556 7779 7507 6881 2013-10-01 1 10 2013
1 AT 2013-10-02 6544 6315 6186 6122 6410 7443 8830 9336 ... 9492 9186 8523 7744 7528 6902 2013-10-02 2 10 2013
2 AT 2013-10-03 6483 6243 6192 6121 6387 7531 8976 9631 ... 9713 9392 8707 7911 7701 7036 2013-10-03 3 10 2013
3 AT 2013-10-04 6594 6415 6301 6232 6539 7522 8972 9411 ... 9377 9078 8349 7752 7591 6971 2013-10-04 4 10 2013
4 AT 2013-10-05 6522 6180 5945 5821 5907 6338 6875 7397 ... 8117 7734 7230 6761 6642 6165 2013-10-05 5 10 2013

5 rows × 30 columns


In [17]:
# change column name for 3A:00 to 3:00
wb = wb.rename(columns=lambda x: re.sub(r'^(\d{1})A:.+', 'H0\\1', x))

In [19]:
# change columns names with regular expressions
wb = wb.rename(columns=lambda x: re.sub(r'^(\d{2}):.+', 'H\\1', x))

In [42]:
# show first rows of the data frame
wb.head()


Out[42]:
Country year month weekday date Day H01 H02 H03 H04 ... H16 H17 H18 H19 H20 H21 H22 H23 H24 Consumption
0 AT 2013 10 1 2013-10-01 2013-10-01 6494 6198 6046 5938 ... 9189 9114 9226 9603 9206 8556 7779 7507 6881 199057
1 AT 2013 10 2 2013-10-02 2013-10-02 6544 6315 6186 6122 ... 8872 8849 9013 9492 9186 8523 7744 7528 6902 198884
2 AT 2013 10 3 2013-10-03 2013-10-03 6483 6243 6192 6121 ... 9091 9014 9194 9713 9392 8707 7911 7701 7036 202271
3 AT 2013 10 4 2013-10-04 2013-10-04 6594 6415 6301 6232 ... 8714 8669 8834 9377 9078 8349 7752 7591 6971 198288
4 AT 2013 10 5 2013-10-05 2013-10-05 6522 6180 5945 5821 ... 7611 7652 7835 8117 7734 7230 6761 6642 6165 171842

5 rows × 31 columns


In [43]:
# show info about the data frame
wb.describe()


Out[43]:
year month weekday H01 H02 H03 H04 H05 H06 H07 ... H16 H17 H18 H19 H20 H21 H22 H23 H24 Consumption
count 1116 1116 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 ... 1116.00000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.000000 1116.00000 1116.000000
mean 2013 10 2.903226 8265.825269 7888.914875 7667.830645 7571.103047 7650.938172 8067.577061 8983.907706 ... 10363.55914 10345.817204 10564.135305 11026.544803 11270.760753 10780.675627 10094.959677 9586.985663 8933.28405 233706.946237
std 0 0 1.941186 11339.612157 10845.052125 10482.110012 10320.189816 10445.270018 11055.675226 12500.518778 ... 14497.86079 14321.691190 14531.393956 15176.437019 15576.936230 14762.143166 13788.339433 13304.146171 12452.24682 323801.860557
min 2013 10 0.000000 286.000000 264.000000 247.000000 226.000000 232.000000 245.000000 266.000000 ... 325.00000 331.000000 335.000000 372.000000 395.000000 383.000000 368.000000 348.000000 0.00000 7842.000000
25% 2013 10 1.000000 1030.500000 985.250000 963.000000 941.750000 984.750000 1109.250000 1271.500000 ... 1394.75000 1388.000000 1431.500000 1493.750000 1467.000000 1377.500000 1299.500000 1201.250000 1056.75000 31059.750000
50% 2013 10 3.000000 3959.500000 3688.000000 3514.500000 3404.500000 3388.000000 3581.500000 3964.500000 ... 4348.50000 4523.500000 4777.000000 5128.500000 5252.500000 5086.500000 4802.500000 4572.000000 4325.00000 105059.500000
75% 2013 10 5.000000 9607.000000 9339.500000 9175.250000 9062.000000 9230.250000 9767.000000 10345.250000 ... 11657.75000 11688.500000 12082.750000 12541.750000 12559.750000 12030.250000 11341.250000 10930.250000 10361.50000 260843.500000
max 2013 10 6.000000 51434.000000 50392.000000 48232.000000 46502.000000 47147.000000 50663.000000 58157.000000 ... 67431.00000 65834.000000 69240.000000 69966.000000 67881.000000 64189.000000 59996.000000 57221.000000 55954.00000 1436330.000000

8 rows × 28 columns


In [21]:
# Set index to make operations easier
wb = wb.set_index(['Country', 'year', "month", "weekday", "date", "Day"])

In [22]:
# show first rows of the data frame
wb.head()


Out[22]:
H01 H02 H03 H04 H05 H06 H07 H08 H09 H10 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
Country year month weekday date Day
AT 2013 10 1 2013-10-01 2013-10-01 6494 6198 6046 5938 6264 7314 8724 9334 9464 9416 ... 9281 9189 9114 9226 9603 9206 8556 7779 7507 6881
2 2013-10-02 2013-10-02 6544 6315 6186 6122 6410 7443 8830 9336 9462 9530 ... 9102 8872 8849 9013 9492 9186 8523 7744 7528 6902
3 2013-10-03 2013-10-03 6483 6243 6192 6121 6387 7531 8976 9631 9758 9705 ... 9207 9091 9014 9194 9713 9392 8707 7911 7701 7036
4 2013-10-04 2013-10-04 6594 6415 6301 6232 6539 7522 8972 9411 9535 9505 ... 8807 8714 8669 8834 9377 9078 8349 7752 7591 6971
5 2013-10-05 2013-10-05 6522 6180 5945 5821 5907 6338 6875 7397 7843 8018 ... 7565 7611 7652 7835 8117 7734 7230 6761 6642 6165

5 rows × 24 columns


In [23]:
# Compute daily consumption
wb['Consumption'] = wb.sum(axis='columns')

In [24]:
# reset index 
wb = wb.reset_index()

In [25]:
#Select a country
df_ES = wb[wb.Country == 'ES']

In [26]:
# delete column
del df_ES['Country']

In [27]:
# show first rows of the data frame
df_ES.head()


Out[27]:
year month weekday date Day H01 H02 H03 H04 H05 ... H16 H17 H18 H19 H20 H21 H22 H23 H24 Consumption
310 2013 10 1 2013-10-01 2013-10-01 23172 22368 22087 21966 22319 ... 31818 31884 31440 31210 33657 32764 29570 26920 25142 696677
311 2013 10 2 2013-10-02 2013-10-02 23741 22878 22408 22344 22595 ... 31654 31708 31089 31182 33495 32782 29664 27039 25462 697502
312 2013 10 3 2013-10-03 2013-10-03 23919 23202 23103 23160 23415 ... 31822 31948 31804 32041 33954 33009 29745 27147 25372 709784
313 2013 10 4 2013-10-04 2013-10-04 23857 22846 22573 22425 22757 ... 30828 30759 30338 30194 31819 30964 28253 26081 24646 688566
314 2013 10 5 2013-10-05 2013-10-05 23229 22067 21341 21045 20728 ... 25034 24819 24749 25355 28149 28220 26221 24490 22845 590847

5 rows × 30 columns


In [28]:
# show only the selected columns
df_ES[['date', 'weekday', 'Consumption']]


Out[28]:
date weekday Consumption
310 2013-10-01 1 696677
311 2013-10-02 2 697502
312 2013-10-03 3 709784
313 2013-10-04 4 688566
314 2013-10-05 5 590847
315 2013-10-06 6 538611
316 2013-10-07 0 648843
317 2013-10-08 1 651812
318 2013-10-09 2 650535
319 2013-10-10 3 665309
320 2013-10-11 4 653868
321 2013-10-12 5 556792
322 2013-10-13 6 530189
323 2013-10-14 0 646747
324 2013-10-15 1 667252
325 2013-10-16 2 671685
326 2013-10-17 3 671159
327 2013-10-18 4 667334
328 2013-10-19 5 592291
329 2013-10-20 6 538449
330 2013-10-21 0 656125
331 2013-10-22 1 677288
332 2013-10-23 2 670606
333 2013-10-24 3 673259
334 2013-10-25 4 659903
335 2013-10-26 5 586298
336 2013-10-27 6 537990
337 2013-10-28 0 656785
338 2013-10-29 1 660768
339 2013-10-30 2 667511
340 2013-10-31 3 633981

In [29]:
# Plot the month consumption
plt.figure()
graphic = df_ES.plot(x='date', y='Consumption', title='Consumption in Spain 10/2013', kind='area')
graphic.set_ylabel('MWh')
plt.show()

In [30]:
# BoxPlots of the hourly values
plt.figure()
df_ES.iloc[:,5:29].boxplot(return_type='dict')
plt.show()

In [31]:
# Compute average hourly consumption per weekday
prot_day = df_ES.groupby(['weekday']).mean()

In [32]:
# delete columns that are not needed
prot_day = prot_day.drop(['year', 'month', 'Consumption'], axis='columns')
# show the data frame
prot_day


Out[32]:
H01 H02 H03 H04 H05 H06 H07 H08 H09 H10 ... H15 H16 H17 H18 H19 H20 H21 H22 H23 H24
weekday
0 20349.25 19833.25 19682.25 19742.00 20796.75 23493.25 27082.50 29052.00 29828.75 30546.75 ... 29415.50 29365.25 29953.50 30042.25 30867.25 32061.25 30436.50 27437.50 25115.75 23620.00
1 22496.60 21858.00 21540.00 21556.00 22291.60 24478.80 27847.40 29751.20 30497.20 30238.80 ... 30083.00 30105.60 30540.20 30615.00 31184.80 32685.20 31191.80 28104.00 25609.20 23938.60
2 22626.20 21906.80 21609.40 21641.60 22308.00 24508.60 27767.00 29604.60 30377.80 31053.60 ... 29928.80 29877.00 30304.60 30256.20 30900.00 32372.80 31016.20 28051.80 25712.40 24104.60
3 22741.20 22024.80 21759.80 21813.00 22454.00 24600.60 27806.20 29621.00 30458.80 31169.20 ... 30065.80 30111.00 30664.80 30642.80 31346.40 32573.40 31230.00 28243.80 25799.20 19717.20
4 23131.50 22226.25 21835.00 21677.25 22031.00 23895.75 27519.50 29553.00 30266.75 31088.50 ... 29809.50 29507.00 29508.75 29170.50 30057.00 31701.25 30737.00 27901.50 25677.50 24290.50
5 22858.75 21724.75 21109.25 20744.25 20465.25 20745.25 21833.75 22700.75 24566.75 26279.50 ... 25132.75 24363.00 24157.00 24118.25 25375.75 27771.50 27751.25 25827.25 24037.50 22457.25
6 20973.00 19810.00 19182.25 18919.50 18894.00 19107.50 19794.75 20333.50 21697.00 23071.75 ... 22795.50 22240.00 22604.25 23187.00 24513.50 26762.25 26696.00 25033.25 23089.50 21491.50

7 rows × 24 columns


In [33]:
# transpose the data frame
prot_dayT=prot_day.transpose()

In [41]:
# Change columns names
prot_dayT.columns = ['Monday', 'Tuesday', 'Wednesday', 'Thrusday', 'Friday', 'Saturday', 'Sunday']
# show the data frame
prot_dayT


Out[41]:
Monday Tuesday Wednesday Thrusday Friday Saturday Sunday
H01 20349.25 22496.6 22626.2 22741.2 23131.50 22858.75 20973.00
H02 19833.25 21858.0 21906.8 22024.8 22226.25 21724.75 19810.00
H03 19682.25 21540.0 21609.4 21759.8 21835.00 21109.25 19182.25
H04 19742.00 21556.0 21641.6 21813.0 21677.25 20744.25 18919.50
H05 20796.75 22291.6 22308.0 22454.0 22031.00 20465.25 18894.00
H06 23493.25 24478.8 24508.6 24600.6 23895.75 20745.25 19107.50
H07 27082.50 27847.4 27767.0 27806.2 27519.50 21833.75 19794.75
H08 29052.00 29751.2 29604.6 29621.0 29553.00 22700.75 20333.50
H09 29828.75 30497.2 30377.8 30458.8 30266.75 24566.75 21697.00
H10 30546.75 30238.8 31053.6 31169.2 31088.50 26279.50 23071.75
H11 30964.00 31662.8 31520.8 31541.6 31452.50 26816.50 23740.00
H12 31355.50 31094.8 31866.2 31969.0 31904.75 27023.25 24058.25
H13 31082.25 30763.8 31674.2 31751.2 31835.00 27223.00 24379.25
H14 30001.75 30625.0 30578.6 30593.6 30640.50 26474.50 23936.25
H15 29415.50 30083.0 29928.8 30065.8 29809.50 25132.75 22795.50
H16 29365.25 30105.6 29877.0 30111.0 29507.00 24363.00 22240.00
H17 29953.50 30540.2 30304.6 30664.8 29508.75 24157.00 22604.25
H18 30042.25 30615.0 30256.2 30642.8 29170.50 24118.25 23187.00
H19 30867.25 31184.8 30900.0 31346.4 30057.00 25375.75 24513.50
H20 32061.25 32685.2 32372.8 32573.4 31701.25 27771.50 26762.25
H21 30436.50 31191.8 31016.2 31230.0 30737.00 27751.25 26696.00
H22 27437.50 28104.0 28051.8 28243.8 27901.50 25827.25 25033.25
H23 25115.75 25609.2 25712.4 25799.2 25677.50 24037.50 23089.50
H24 23620.00 23938.6 24104.6 19717.2 24290.50 22457.25 21491.50

In [36]:
# Plot prototype weekdays
prot_dayT.plot(title= 'Average Consumption per Weekday for Spain 10/2013')
plt.show()