We begin by importing all the necessary libraries for data visualization and time series analysis and data frame structure
In [283]:
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import pandas as pd
from datetime import datetime
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as smapi
import statsmodels as sm
Then, we load the data into a data frame with $\texttt{timestamp}$ as index, and the $\texttt{consommation}$ as label.
In [284]:
'''Data loading'''
f_names_national = [
'2012 Conso Prod.csv',
'2013 Conso Prod.csv',
'2014 Conso Prod.csv',
'2015 Conso Prod.csv'
]
datas = []
data_news = []
for f_name in f_names_national:
# print(f_name)
data = pd.read_csv('data/'+ f_name, delimiter='\t', encoding = "ISO-8859-1", low_memory=False)
pd.set_option('max_columns', 100)
headers = list(data)
data = data[data.Consommation.notnull()]
data = data[data.Date.notnull()]
data['timestamp'] = [str(d) + ' ' + str(t) for d, t in zip(data['Date'].values, data['Heures'].values)]
data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M')
datas.append(data)
data_final = pd.concat(datas).reset_index()
data_final = data_final.sort_values(by=['Date','Heures'], ascending=[False,False])
ts = pd.Series(data_final['Consommation'].values, index = data_final['timestamp'].values )
We then visualize our data for all the range provided, to understand what type of model we should use
In [285]:
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(ts[::10].index, ts[::10])
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.set_title(" Electricity consumption in France in MW")
# plt.savefig('visualization.png')
Out[285]:
As we can see, there is too much variation in the plot, so it would be more adequate to visualize the rolling mean since it is smoother and reprensent the data very well
If we take 48 (a day) as the length of the window, we obtain the following
In [286]:
fig, ax = plt.subplots(figsize=(20,5))
rolling_mean = ts.rolling(center=False,window=48).mean()
ax.plot(rolling_mean.index, rolling_mean)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.set_title(" Electricity consumption in France in MW")
Out[286]:
The figure still has a lot of variations, therefore, we try with a larger window, say 1 week (7*48)
In [287]:
fig, ax = plt.subplots(figsize=(20,5))
rolling_mean = ts.rolling(center=False,window=48*7).mean()
ax.plot(rolling_mean.index, rolling_mean)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.set_title(" Electricity consumption in France in MW")
Out[287]:
and 1 month
In [288]:
fig, ax = plt.subplots(figsize=(20,5))
rolling_mean = ts.rolling(center=False,window=48*30).mean()
ax.plot(rolling_mean.index, rolling_mean)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.set_title(" Electricity consumption in France in MW")
Out[288]:
With a 1-month window, we get a smooth curve, which allows us to find the periodicity of our time series. ARMA models are not fit to modelize periodic time series, so we have to do some preprocessing by eliminating the periodicity in the data. Let's find the frequency of our data.
In [289]:
rolling_mean = rolling_mean.dropna()
fig, ax = plt.subplots(figsize=(20,5))
f, PSD = signal.periodogram((rolling_mean.values-np.mean(rolling_mean.values))/np.std(rolling_mean.values), fs = 1/(30*60))
ax.semilogy(f, PSD)
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
plt.ylim([1e-2, 1e9])
plt.show()
In [290]:
fig, ax = plt.subplots(figsize=(20,5))
f, PSD = signal.periodogram((rolling_mean.values-np.mean(rolling_mean.values))/np.std(rolling_mean.values), fs = 1/(30*60))
ax.plot(f, PSD)
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
plt.xlim([0, 0.0000001])
plt.show()
freq_Hz = f[PSD.argmax(axis=0)];
print(freq_Hz)
print(freq_Hz*60*60*24*365.25)
print(1/(freq_Hz*60*30))
In [291]:
rolling_mean.values.shape
Out[291]:
The PSD ( Power Spectral Density) seems be able to detect the frequency, which is equal to $3.23 * 10^{-8}$. in terms of number of points the period is equal to $\texttt{period} = \frac{1}{3.23*10^{-8}*60*30} = 17172$.Manually, We can see that in our range, there is approximatively 4 periods, so the period would be the range divided by 4 i.e $\texttt{period} = \frac{70000}{4} = 17500, it turns out to be a good approximation.$
We can try to decompose our data in trend, seasonality and residuals. Let's split them into train and test
In [292]:
ts_train = data_final[data_final['Date'] <= '2015-10-31']
ts_train = pd.Series(ts_train['Consommation'].values, index = ts_train['timestamp'].values )
plt.plot(ts_train)
Out[292]:
In [293]:
ts_test = data_final[data_final['Date'] > '2015-10-31']
ts_test=pd.Series(ts_test['Consommation'].values, index = ts_test['timestamp'].values )
plt.plot(ts_test)
Out[293]:
We then perform the decomposition
In [294]:
seasonal_dec = seasonal_decompose(ts_train.values, freq = 17172)
data_trend = seasonal_dec.trend
data_seasonal = seasonal_dec.seasonal
data_residual = seasonal_dec.resid
plt.subplot(221)
plt.plot(ts_train, label='Original')
plt.legend(loc='best')
plt.subplot(222)
plt.plot(data_trend, label='Trend')
plt.legend(loc='best')
plt.subplot(223)
plt.plot(data_seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(224)
plt.plot(data_residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
np.mean(seasonal_dec.trend[np.logical_not(np.isnan(seasonal_dec.trend))])
Out[294]:
As we can see, we obtain residuals with a 15000 amplitude (20% of the mean value of the series), and a 25000 amplitude seasonality. The trend is nearly constant (= 55000). The principal part would be the prediction of the residuals, which have the form of a white noise. We could generate a white noise with the same characteristics (variance), but applying ARIMA to forecast these residuals can give good results.
We can use a statistic test named Dickey-Fuller to be more sure of the stationarity of the data. We hence define a function that test the stationarity of the time series.
In [295]:
def test_stationarity(ts, stationarity):
orig = plt.plot(ts, color='blue',label='Original')
rolling_mean = ts.rolling(center=False,window=stationarity).mean()
rolling_std = ts.rolling(center=False,window=stationarity).std()
mean = plt.plot(rolling_mean, color='red', label='Rolling Mean')
std = plt.plot(rolling_std, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
print('Results of Dickey-Fuller Test:')
dftest = adfuller(ts, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
In [296]:
test_stationarity(ts_train, 48*30)
The results of the Dickey-Fuller test confirm the observation that the data are stationnary (since the test statistic $-7.95$ is less than the critical value (1%) $-3.43$), we can try to differentiate to see the effect that could have on the stationarity.
In [297]:
ts_train_diff = ts_train - ts_train.shift()
plt.plot(ts_train_diff)
ts_train_diff.dropna(inplace=True)
test_stationarity(ts_train_diff, 48*30)
By differencing our data once, we get more stationarity (Test Statistic $-32 \leq -8$ that we had before).
In [298]:
ts_train_diff2 = ts_train_diff - ts_train_diff.shift()
ts_train_diff2.dropna(inplace=True)
test_stationarity(ts_train_diff2, 48*30)
We notice that, the more we differenciate the data, the better is the stationarity.
But since the data provided are already stationary, we don't need the integrated part of the ARIMA, we will then fix the d value (degree of differentiation) to 0.
We focus now on finding the parameters of the ARMA model.
In [302]:
data_residual = data_residual[np.logical_not(np.isnan(data_residual))]
mn =np.mean(data_residual)
stdd = np.std(data_residual)
print(mn)
print(stdd)
In [303]:
samples = np.random.normal(mn, stdd, size= 54000)
plt.plot(samples)
Out[303]:
We try to approximate the first degree difference with an ARMA model. We find the parameters of the AR and MA through the autocorrelation and partial autocorrelation functions.
In [304]:
acf_lag = acf(ts_train_diff, nlags=20)
pacf_lag = pacf(ts_train_diff, nlags=20, method='ols')
In [305]:
plt.subplot(211)
plt.plot(acf_lag)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_train_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_train_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
plt.subplot(212)
plt.plot(pacf_lag)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_train_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_train_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
#plt.savefig('auto_partial.png')
The degree of the MA part appears to be equal to 4, because it is the threshold where the autocorrelation function starts to be negative.
The degree of the MA part is equal to 2 (the same argument as before with the partial autocorrelation function instead of the autocorrelation function).
In [306]:
model = ARIMA(ts_train, order=(2, 1, 0))
AR_results = model.fit(disp=-1)
plt.plot(ts_train_diff)
plt.plot(AR_results.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((AR_results.fittedvalues-ts_train_diff)**2))
Out[306]:
In [307]:
model = ARIMA(ts_train, order=(0, 1, 4))
MA_results = model.fit(disp=-1)
plt.plot(ts_train_diff)
plt.plot(MA_results.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((MA_results.fittedvalues-ts_train_diff)**2))
Out[307]:
In [308]:
model = ARIMA(ts_train, order=(2, 1, 4))
ARIMA_results = model.fit(disp=-1)
plt.plot(ts_train_diff)
plt.plot(ARIMA_results.fittedvalues, color='red')
sum(((ARIMA_results.fittedvalues-ts_train_diff))**2)/len(ts_train_diff)
#plt.savefig('ARMA.png')
Out[308]:
We notice that the fitted values have the same form but not the good amplitude, so we multiply by a factor in order to fit with the max and min.
In [309]:
plt.plot(ts_train)
plt.plot(np.cumsum(results_ARIMA.fittedvalues), color='red')
#plt.savefig('cumsum.png')
Out[309]:
In [310]:
res = results_ARIMA.fittedvalues
res = res * (np.max(ts_train_diff)-np.min(ts_train_diff))/(np.max(res)-np.min(res))
plt.plot(ts_train_diff)
plt.plot(res, color='red')
sum(((res-ts_train_diff))**2)/len(ts_train_diff)
Out[310]:
But we notice that we increased the error by doing so.
In [311]:
plt.plot(ts_train)
plt.plot(np.cumsum(res), color='red')
Out[311]:
We can see that we get sort of the opposite of what we should obtain (modulo normalization), we can calculate the error if we fix the problem
In [312]:
plt.plot(ts_train)
res1 = -np.cumsum(res)
res2 = (res1 - np.mean(res1))*np.std(ts_train)/np.std(res1)+np.mean(ts_train)
plt.plot(res2, color='red')
#plt.savefig('modified.png')
MSE = sum(np.abs((res2 - ts_train.values[:-1]))/res2)/len(res2)
MSE
Out[312]:
We get a training error of 13%. Let's apply the predictions on the validation set.
We will train our model and use it to predict one value after, for 15 times (the first 15 values of the test set)
Firstly, we alimen the dynamic set with the observations in order to prediction the next ones
In [314]:
dynamic_train_set = [x for x in ts_train]
predictions = list()
nb_tests = 15
for t in range(nb_tests):
model = ARIMA(dynamic_train_set, order=(2,1,4))
model_fit = model.fit(disp=0)
output = model_fit.forecast()
yhat = output[0]
predictions.append(yhat)
obs = ts_test[t]
dynamic_train_set.append(obs)
print('predicted=%f, expected=%f' % (yhat, obs))
In [315]:
RMSE = np.sqrt(sum(((predictions-ts_test.values[:nb_tests-1])/predictions)**2)[0]/nb_tests)
RMSE
Out[315]:
In [317]:
dynamic_train_set = [x for x in ts_train]
predictions = list()
nb_tests = 15
for t in range(nb_tests):
model = ARIMA(dynamic_train_set, order=(2,1,4))
model_fit = model.fit(disp=0)
output = model_fit.forecast()
yhat = output[0]
predictions.append(yhat)
obs = ts_test[t]
dynamic_train_set.append(yhat)
print('predicted=%f, expected=%f' % (yhat, obs))
In [318]:
RMSE = np.sqrt(sum(((predictions-ts_test.values[:nb_tests-1])/predictions)**2)[0]/nb_tests)
RMSE
Out[318]:
We got a good score (5%) for 1 30-minute forecasting, and 1% if we retrain the model dynamically. But 30-minutes prediction is not enough, we will have to try with more lag prediction.
In [319]:
history = [x for x in ts_train]
model = ARIMA(history, order=(2,1,4))
model_fit = model.fit(disp=0)
In [320]:
tests = list()
for nb_test in range(test_len[0]):
output = model_fit.forecast(nb_test+1)
predictions = output[0]
tests.append(np.sqrt((sum(np.abs(((predictions-ts_test[:nb_test+1]))/predictions)**2))/(nb_test+1)))
In [321]:
plt.plot(tests)
Out[321]:
In [322]:
tests[:47]
Out[322]:
We get a good score for the first iterations, and asymptotically, we have a 12% error. In fact, if we plot the predictions, we will be convinced of the problem
In [323]:
plt.plot(predictions)
Out[323]:
The ARIMA model is not fit to predict our data model. Before we try another approach let's change the parameters a little bit to see the impact on the predictions
In [324]:
history = [x for x in ts_train]
model = ARIMA(history, order=(2,1,9))
model_fit = model.fit(disp=0)
In [325]:
tests = list()
for nb_test in range(test_len[0]):
output = model_fit.forecast(nb_test+1)
predictions = output[0]
tests.append(np.sqrt((sum(np.abs(((predictions-ts_test[:nb_test+1]))/predictions)**2))/(nb_test+1)))
In [326]:
plt.plot(tests)
plt.ylabel('Root Mean Square Error')
plt.xlabel('Number Of Predicted Points')
#plt.savefig('error.png')
Out[326]:
In [327]:
history = [x for x in ts_train]
model = ARIMA(history, order=(2,1,9))
model_fit = model.fit(disp=0)
In [328]:
tests = list()
for nb_test in range(test_len[0]):
output = model_fit.forecast(nb_test+1)
predictions = output[0]
tests.append(np.sqrt((sum(np.abs(((predictions-ts_test[:nb_test+1]))/predictions)**2))/(nb_test+1)))
In [329]:
plt.plot(tests)
plt.ylabel('Root Mean Square Error')
plt.xlabel('Number of predicted points')
Out[329]:
The result is still not satisfying. We will use another approach.
Since the fitted values don't even stick with the training set, the ARIMA approach wasn't convenient in this case (periodic time series). Hence, we will apply the same operations but using an adapted form of ARIMA that is fitted to the periodic case. This form of ARIMA is called SARIMA (for Seasonal ARIMA).
In [330]:
train_len = ts_train.shape
print('size of training data : {}'.format(train_len[0]))
test_len = ts_test.shape
print('size of test data : {}'.format(test_len[0]))
We perform the same operations as before to determine the parameters.
In [331]:
ts_train_diff = ts_train - ts_train.shift(12)
ts_train_diff = ts_train_diff.dropna(inplace=False)
In [332]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = smapi.graphics.tsa.plot_acf(ts_train_diff.iloc[13:], lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = smapi.graphics.tsa.plot_pacf(ts_train_diff.iloc[13:], lags=40, ax=ax2)
We conclude that the p parameter for the Auto Regressive part is equal to 3, and the q parameter for the Moving Average part is equal to 9.
Now we will train our model
In [333]:
mod = sm.tsa.statespace.sarimax.SARIMAX(ts_train, trend='c', order=(2,1,4), seasonal_order=(1,1,1,1))
# c because our trend is nearly constant
# order determined by the autocorrelation and partial autocorrelation function
# s = 1 because we have a 1 year seasonality
results = mod.fit()
print(results.summary())
In [334]:
res3 = results.predict(start = train_len[0]+1, end= train_len[0]+test_len[0], dynamic= True)
plt.figure()
plt.plot(ts_test.values)
plt.plot(res3.values)
Out[334]: