In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import seaborn; seaborn.set()


/home/esport/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:279: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  'Matplotlib is building the font cache using fc-list. '

In [2]:
r11 = pd.read_csv("2011.csv", sep=';', na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r12 = pd.read_csv("2012.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r13 = pd.read_csv("2013.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r14 = pd.read_csv("2014.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r15 = pd.read_csv("2015.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r16 = pd.read_csv("2016.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc [MW]'], parse_dates=["Data"], index_col=['Data'])
r17 = pd.read_csv("2017.csv", sep=";", na_values='#N/D!',\
        usecols=["Data",'Godzina', 'Krajowe zapotrzebowanie na moc'], parse_dates=["Data"], index_col=['Data'])

In [3]:
r17['Krajowe zapotrzebowanie na moc [MW]'] = r17['Krajowe zapotrzebowanie na moc']
r17.drop('Krajowe zapotrzebowanie na moc', axis=1, inplace=True)
zapotrzebowanie = pd.concat([r11, r12, r13, r14, r15, r16, r17])
zapotrzebowanie.columns = ['godz', 'zapotrzebowanie']

In [4]:
zapotrzebowanie.head()


Out[4]:
godz zapotrzebowanie
Data
2011-02-10 1 16 657
2011-02-10 2 16 045
2011-02-10 3 15 928
2011-02-10 4 15 949
2011-02-10 5 16 254

In [5]:
zapotrzebowanie = zapotrzebowanie.replace(' ', '', regex=True)
zapotrzebowanie['zapotrzebowanie'] = pd.to_numeric(zapotrzebowanie['zapotrzebowanie'])

In [6]:
zapotrzebowanie.head()


Out[6]:
godz zapotrzebowanie
Data
2011-02-10 1 16657
2011-02-10 2 16045
2011-02-10 3 15928
2011-02-10 4 15949
2011-02-10 5 16254

In [7]:
tt = pd.DataFrame(zapotrzebowanie.groupby([zapotrzebowanie.index])['godz'].count())
qwe = tt[(tt.godz < 24) | (tt.godz > 24)].index
qwe


Out[7]:
DatetimeIndex(['2011-03-27', '2011-10-30', '2012-03-25', '2012-10-28',
               '2013-03-31', '2013-10-27', '2014-03-30', '2014-10-26',
               '2015-03-29', '2015-10-25', '2016-03-27', '2016-10-30',
               '2017-03-26'],
              dtype='datetime64[ns]', name='Data', freq=None)

In [8]:
zapotrzebowanie.drop(qwe, inplace=True)

In [9]:
tt2 = pd.DataFrame(zapotrzebowanie.groupby([zapotrzebowanie.index])['godz'].count())
tt2[(tt2.godz < 24) | (tt2.godz > 24)]
zapotrzebowanie.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 58248 entries, 2011-02-10 to 2017-10-15
Data columns (total 2 columns):
godz               58248 non-null int64
zapotrzebowanie    58248 non-null int64
dtypes: int64(2)
memory usage: 2.6 MB

In [10]:
rt = pd.DataFrame(pd.date_range('2011-02-10 00:59:59', '2017-10-15 23:59:59', freq='H', close='left'))
rt['zxc'] = rt[0]
rt = rt.set_index(rt[0])
rt.head(3)
a = rt

In [11]:
daty = ['2011-03-27', '2011-10-30', '2012-03-25', '2012-10-28',
               '2013-03-31', '2013-10-27', '2014-03-30', '2014-10-26',
               '2015-03-29', '2015-10-25', '2016-03-27', '2016-10-30',
               '2017-03-26']

In [12]:
for i in range(13):
    
    rt.drop(rt.loc[daty[i]].index,inplace=True)

In [13]:
d = rt.index

In [14]:
clean = zapotrzebowanie
clean = clean.set_index(d)
clean.tail(4)


Out[14]:
godz zapotrzebowanie
0
2017-10-15 20:59:59 21 18748
2017-10-15 21:59:59 22 17811
2017-10-15 22:59:59 23 16841
2017-10-15 23:59:59 24 15769

In [15]:
clean['zapotrzebowanie'].plot(figsize=(20,10), title='Zapotrzebowanie na energie w latach 2011 - 2017');



In [16]:
days = ['Pon', 'Wto', 'Śro', 'Czw', 'Pią', 'Sob', 'Nie']
for i in range(7):
    clean[days[i]] = (clean.index.weekday == i).astype(float)
clean['tydzien'] = clean.index.month.astype(float)
clean.head()


Out[16]:
godz zapotrzebowanie Pon Wto Śro Czw Pią Sob Nie tydzien
0
2011-02-10 00:59:59 1 16657 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0
2011-02-10 01:59:59 2 16045 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0
2011-02-10 02:59:59 3 15928 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0
2011-02-10 03:59:59 4 15949 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0
2011-02-10 04:59:59 5 16254 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0

In [17]:
godz = ['1', '2', '3', '4', '5', '6','7', '8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24']
for i in range(24):
    clean[godz[i]] = (clean.index.hour+1 == i+1).astype(float)

In [18]:
clean.drop(['godz'], inplace=True, axis=1)

In [19]:
clean.head()


Out[19]:
zapotrzebowanie Pon Wto Śro Czw Pią Sob Nie tydzien 1 ... 15 16 17 18 19 20 21 22 23 24
0
2011-02-10 00:59:59 16657 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2011-02-10 01:59:59 16045 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2011-02-10 02:59:59 15928 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2011-02-10 03:59:59 15949 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2011-02-10 04:59:59 16254 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 33 columns


In [20]:
clean.groupby(clean.index.year)['zapotrzebowanie'].describe()


Out[20]:
count mean std min 25% 50% 75% max
0
2011 7752.0 17705.679180 2931.652963 10143.0 15238.00 17999.0 19825.50 24525.0
2012 8736.0 17948.152358 3061.593738 10898.0 15400.00 18157.5 20125.00 25595.0
2013 8712.0 18040.755051 3001.584824 10874.0 15557.75 18284.0 20283.00 24635.0
2014 8712.0 18126.448691 2996.120717 10815.0 15590.00 18433.0 20420.25 25340.0
2015 8712.0 18412.179752 2974.438984 11158.0 15882.25 18727.5 20750.00 24725.0
2016 8736.0 18730.351305 3075.056258 11472.0 16142.75 18990.0 21153.25 25576.0
2017 6888.0 18995.464576 3046.134144 11795.0 16363.50 19257.0 21436.25 25871.0

In [21]:
tygodniowe = clean['zapotrzebowanie'].resample('W').sum()
tygodniowe.rolling(20, center=True).mean().plot(figsize=(20,10));



In [22]:
fig, ax = plt.subplots(1, 2, figsize=(20,7))

by_time = clean.groupby(clean.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.zapotrzebowanie.plot(ax=ax[0], xticks=hourly_ticks, title='Średnie zapotrzebowanie w danej godzinie')

by_weekday = clean.groupby(clean.index.dayofweek).mean()
by_weekday.index = ['Pon', 'Wto', 'Śro', 'Czw', 'Pią', 'Sob', 'Nie']
by_weekday.zapotrzebowanie.plot(ax=ax[1], title='Średnie zapotrzebowanie w danym dniu tygodnia',);



In [23]:
by_weekday


Out[23]:
zapotrzebowanie Pon Wto Śro Czw Pią Sob Nie tydzien 1 ... 15 16 17 18 19 20 21 22 23 24
Pon 18500.780771 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.468391 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Wto 19103.122246 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.468391 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Śro 19221.085129 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.465517 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Czw 19157.981734 0.0 0.0 0.0 1.0 0.0 0.0 0.0 6.449857 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Pią 19063.302292 0.0 0.0 0.0 0.0 1.0 0.0 0.0 6.449857 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Sob 17256.421681 0.0 0.0 0.0 0.0 0.0 1.0 0.0 6.481375 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667
Nie 15467.586558 0.0 0.0 0.0 0.0 0.0 0.0 1.0 6.458333 0.041667 ... 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667 0.041667

7 rows × 33 columns


In [24]:
def dlugosc_dnia(dane, axis=21.01, latitude=52.23):
   
    days = (dane - pd.datetime(2000, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude))
         * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180. - 0.20

clean['d_dnia'] = list(map(dlugosc_dnia, clean.index))
fig, ax = plt.subplots(2, figsize=(20,7))
clean['d_dnia'].plot( ax=ax[0],figsize=(20,10), title='Długość dnia')
clean['zapotrzebowanie'].plot( ax=ax[1],figsize=(20,10),title='Zapotrzebowanie na energie w latach 2011 - 2017' );



In [37]:
column_names = ['Pon', 'Wto', 'Śro', 'Czw', 'Pią', 'Sob', 'Nie', 'tydzien', 'd_dnia','1', '2', '3', '4', '5', '6', '8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24']
X = clean[column_names]
y = clean['zapotrzebowanie']

model = LinearRegression(fit_intercept = False)
model.fit(X, y)
clean['szacunek'] = model.predict(X)

In [38]:
clean[['zapotrzebowanie', 'szacunek']].plot(alpha=0.5, figsize=(20,10));



In [45]:
clean[clean['zapotrzebowanie'] == clean['zapotrzebowanie'].min()]


Out[45]:
zapotrzebowanie Pon Wto Śro Czw Pią Sob Nie tydzien 1 ... 17 18 19 20 21 22 23 24 d_dnia szacunek
0
2011-04-25 05:59:59 10143 1.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13.802557 14864.10899

1 rows × 35 columns


In [27]:
print('Współczynnik determinacji wynosi: %.2f' % r2_score(clean['zapotrzebowanie'], clean['szacunek']))


Współczynnik determinacji wynosi: 0.80
Stworzenie nowych danych w celu predykcji.

In [28]:
s = pd.date_range('2017-10-16 00:59:59', '2017-10-22 23:59:59', freq='H', close='left')
nowe = pd.DataFrame(index=s)
for i in range(7):
    nowe[days[i]] = (nowe.index.weekday == i).astype(float)
nowe['tydzien'] = nowe.index.month.astype(float)
nowe['d_dnia'] = list(map(dlugosc_dnia, nowe.index))
for i in range(24):
    
    nowe[godz[i]] = (nowe.index.hour+1 == i+1 ).astype(float)

In [29]:
nowe.head(2)


Out[29]:
Pon Wto Śro Czw Pią Sob Nie tydzien d_dnia 1 ... 15 16 17 18 19 20 21 22 23 24
2017-10-16 00:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2017-10-16 01:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

2 rows × 33 columns


In [30]:
X = nowe[column_names]
nowe['szacunek'] = model.predict(X)

In [31]:
nowe.head(24)


Out[31]:
Pon Wto Śro Czw Pią Sob Nie tydzien d_dnia 1 ... 16 17 18 19 20 21 22 23 24 szacunek
2017-10-16 00:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16093.996587
2017-10-16 01:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15476.634823
2017-10-16 02:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15196.702808
2017-10-16 03:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15141.453117
2017-10-16 04:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15190.242157
2017-10-16 05:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15627.755548
2017-10-16 06:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17298.214963
2017-10-16 07:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18937.868445
2017-10-16 08:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 19932.901819
2017-10-16 09:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20458.826830
2017-10-16 10:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20627.932310
2017-10-16 11:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20837.530167
2017-10-16 12:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20845.796339
2017-10-16 13:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20761.583731
2017-10-16 14:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20412.002355
2017-10-16 15:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20213.329508
2017-10-16 16:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20292.633999
2017-10-16 17:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 20400.033257
2017-10-16 18:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 20558.210019
2017-10-16 19:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 20824.681795
2017-10-16 20:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 20688.495144
2017-10-16 21:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 19824.179528
2017-10-16 22:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 18512.616694
2017-10-16 23:59:59 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 10.266921 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 17233.344341

24 rows × 34 columns


In [32]:
nowe['szacunek'].plot(alpha=0.5, figsize=(20,10));