In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
# Render matplotlib figures inline in the notebook
%matplotlib inline
# Global plotting defaults: classic style, white figure background, 12x6 in figures
plt.style.use('classic')
plt.rc("figure", facecolor="white")
rcParams['figure.figsize'] = 12, 6
# Fixed random seed for reproducibility of any stochastic steps
random_state=0
rng = np.random.RandomState(seed=random_state)
In [3]:
# Load the cleaned data and keep the date range with the fewest missing
# values for most houses (April 2014 - March 2015).
data_process = pd.read_csv('houses_clean.csv',parse_dates=['Time'], index_col='Time')
data = data_process.copy(deep=True)
# Explicit label-based slicing via .loc: partial-string date slicing through
# plain [] row slicing is implicit and discouraged in modern pandas.
data = data.loc['2014-04':'2015-03']
In [4]:
# Keep only the house chosen for modelling: drop every other house's column,
# then remove rows with missing observations so the series is gap-free.
houses = [1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20]
house_nr = 1
drop_houses = ['House_%d' % h for h in houses if h != house_nr]
data = data.drop(drop_houses, axis=1)
data = data.dropna(axis=0, how='any')
In [5]:
# Quick visual sanity check of the selected house's hourly consumption series
plt.plot(data)
Out[5]:
In [9]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries):
    """Visually and statistically check stationarity of an hourly series.

    Plots the series together with its 24-sample (one-day) rolling mean and
    rolling standard deviation, then prints the augmented Dickey-Fuller test
    statistic, p-value and critical values.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Single-column frame with a DatetimeIndex; the first column is tested.
    """
    # Rolling statistics over one full day (24 hourly samples)
    roll_mean = timeseries.rolling(center=False, window=24).mean()
    roll_std = timeseries.rolling(center=False, window=24).std()

    # Overlay original series with its rolling statistics
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(roll_mean, color='red', label='Rolling Mean')
    plt.plot(roll_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Augmented Dickey-Fuller test on the first column's values
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.iloc[:, 0].values, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used',
               'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
In [13]:
# Check the raw series for stationarity (rolling stats + Dickey-Fuller test)
test_stationarity(data)
ARIMA stands for Auto-Regressive Integrated Moving Average. An ARIMA forecast for a stationary time series is essentially a linear equation (similar to linear regression). The predictors depend on the parameters (p, d, q) of the ARIMA model:
In [17]:
from statsmodels.graphics.tsaplots import plot_acf

# ACF of the raw hourly series: daily seasonality shows up as peaks at
# multiples of 24 lags.
plot_acf(data['House_1'], lags=100)
plt.title('Autocorrelation of Hourly Electricity Consumption')
plt.xlabel('Hour lags')
plt.ylabel('Autocorrelation')
plt.show()
In [22]:
from statsmodels.graphics.tsaplots import plot_pacf

# PACF of the raw hourly series, used to pick the AR order.
plot_pacf(data.House_1,lags=100)
plt.xlabel('Hour lags')
# Fix: the y-axis was previously mislabelled 'Autocorrelation'
plt.ylabel('Partial Autocorrelation')
plt.title('Partial Autocorrelation of Hourly Electricity Consumption')
plt.show()
In [89]:
# Non-seasonal ARMA with 24 AR lags and 1 MA lag, capturing the daily cycle
# through a long AR polynomial. NOTE: fitting 24 AR terms is slow.
model = sm.tsa.statespace.SARIMAX(data, order=(24, 0, 1))
results = model.fit()
In [90]:
# Coefficient estimates, standard errors and information criteria of the fit
print(results.summary())
In [91]:
# Residual diagnostics: standardized residuals, histogram/KDE, Q-Q plot, ACF
results.plot_diagnostics(figsize=(15, 12))
plt.show()
In [ ]:
In [6]:
# Seasonal differencing at lag 24 removes the daily cycle.
# DataFrame.diff(periods=24) is the idiomatic equivalent of
# `data - data.shift(periods=24)` and produces identical values.
data_diff = data.diff(periods=24)
In [7]:
# Drop the first 24 rows made NaN by the lag-24 differencing
data_diff = data_diff.dropna(axis=0,how='any')
In [12]:
from statsmodels.graphics.tsaplots import plot_acf

# ACF of the 24-hour differenced series, used to choose the MA order.
# Fix: removed the stray `plt.figure(1)` call - plot_acf creates its own
# figure, so the original left an empty extra figure behind.
plot_acf(data_diff.House_1,lags=50)
plt.xlabel('Hour lags')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation of Hourly Electricity Consumption')
# Ensure the output directory exists so savefig does not raise
os.makedirs('figures/95_percent', exist_ok=True)
plt.savefig('figures/95_percent/autocorrelation.eps')
plt.savefig('figures/95_percent/autocorrelation.pdf')
In [15]:
from statsmodels.graphics.tsaplots import plot_pacf

# PACF of the 24-hour differenced series, used to choose the AR order.
# Fix: removed the stray `plt.figure(1)` call (plot_pacf makes its own
# figure) and corrected the y-axis label, previously 'Autocorrelation'.
plot_pacf(data_diff.House_1,lags=50)
plt.xlabel('Hour lags')
plt.ylabel('Partial Autocorrelation')
plt.title('Partial Autocorrelation of Hourly Electricity Consumption')
# Ensure the output directory exists so savefig does not raise
os.makedirs('figures/95_percent', exist_ok=True)
plt.savefig('figures/95_percent/partial_autocorrelation.eps')
plt.savefig('figures/95_percent/partial_autocorrelation.pdf')
The data was differenced at a lag of 24 periods (one day). From the ACF and PACF plots of the differenced data we can determine:
In [34]:
# Seasonal ARIMA(1,0,4)x(1,0,1,24): short non-seasonal ARMA terms plus a
# daily (24-hour) seasonal ARMA component. Stationarity/invertibility
# constraints are relaxed to let the optimizer explore the full space.
mod = sm.tsa.statespace.SARIMAX(data,
order=(1, 0, 4),
seasonal_order=(1, 0, 1, 24),
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
In [35]:
# Coefficient estimates and fit statistics of the seasonal model
print(results.summary())
In [36]:
# Residual diagnostics for the seasonal model (residuals should look like
# white noise if the model captures the structure)
results.plot_diagnostics(figsize=(15, 12))
plt.show()
In [92]:
# One-step-ahead in-sample predictions from March 2015 onward
# (dynamic=False: each step uses the observed history up to that point)
pred = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=False)
pred_ci = pred.conf_int()
In [93]:
# Point-forecast accuracy of the one-step-ahead predictions on the
# March 2015 hold-out window.
y_forecasted = pred.predicted_mean
y_truth = data['2015-03':]
# Evaluate all three error metrics against the observed values
mse, mae, mde = (metric(y_true=y_truth, y_pred=y_forecasted)
                 for metric in (mean_squared_error,
                                mean_absolute_error,
                                median_absolute_error))
print('On testing time over two month period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse, mae, mde))
In [94]:
# Overlay the one-step-ahead forecast and its confidence band on the
# observed March 2015 data.
ax = data['2015-03':].plot(label='Observed data')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)
ci_lower = pred_ci.iloc[:, 0]
ci_upper = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, ci_lower, ci_upper, color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()
plt.show()
In [46]:
# NOTE(review): this cell duplicates In [92] above - consider removing one copy
pred = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=False)
pred_ci = pred.conf_int()
In [47]:
# NOTE(review): this cell duplicates In [94] above - consider removing one copy
ax = data['2015-03':].plot(label='Observed data')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)
ax.fill_between(pred_ci.index,
pred_ci.iloc[:, 0],
pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()
plt.show()
In [49]:
# NOTE(review): this cell duplicates In [93] above - consider removing one copy
y_forecasted = pred.predicted_mean
y_truth = data['2015-03':]
# Compute the mean square error
mse = mean_squared_error(y_true=y_truth,y_pred=y_forecasted)
mae = mean_absolute_error(y_true=y_truth,y_pred=y_forecasted)
mde = median_absolute_error(y_true=y_truth,y_pred=y_forecasted)
print('On testing time over two month period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse,mae,mde))
In [50]:
# Dynamic (multi-step) forecast: from 2015-03 onward each prediction is
# built on previous *predicted* values rather than observed ones.
# Fix: removed the deprecated `full_results=True` keyword, which newer
# statsmodels versions reject; get_prediction already returns the full
# PredictionResults object.
pred_dynamic = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=True)
pred_dynamic_ci = pred_dynamic.conf_int()
In [54]:
# Plot the dynamic forecast against the observed series, with its
# confidence band, and shade the forecast horizon.
ax = data['2015-02':].plot(label='observed', figsize=(20, 15))
pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)
band_lower = pred_dynamic_ci.iloc[:, 0]
band_upper = pred_dynamic_ci.iloc[:, 1]
ax.fill_between(pred_dynamic_ci.index, band_lower, band_upper,
                color='k', alpha=.25)
# Light vertical shading marks the region being forecast dynamically
ax.fill_betweenx(ax.get_ylim(), pd.to_datetime('2015-03'), data.index[-1],
                 alpha=.1, zorder=-1)
ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()
plt.show()
In [53]:
# Accuracy of the dynamic (multi-step) forecast on the hold-out window;
# expected to be worse than the one-step-ahead metrics above since errors
# compound over the horizon.
y_forecasted = pred_dynamic.predicted_mean
y_truth = data['2015-03':]
# Compute the mean square error
mse = mean_squared_error(y_true=y_truth,y_pred=y_forecasted)
mae = mean_absolute_error(y_true=y_truth,y_pred=y_forecasted)
mde = median_absolute_error(y_true=y_truth,y_pred=y_forecasted)
print('On testing time over two month period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse,mae,mde))
In [ ]: