In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
import matplotlib.pyplot as plt
from matplotlib import rcParams

%matplotlib inline
plt.style.use('classic')
plt.rc("figure", facecolor="white")
rcParams['figure.figsize'] = 12, 6

# random seed for reproducibility
random_state = 0
rng = np.random.RandomState(seed=random_state)

Load and preprocess the data


In [3]:
# Load the data and keep the date range with the fewest missing values across houses
data_process = pd.read_csv('houses_clean.csv', parse_dates=['Time'], index_col='Time')
data = data_process.copy(deep=True)
data = data['2014-04':'2015-03']

In [4]:
# keep only the house chosen for modelling
houses = [1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20]
house_nr = 1
drop_houses = ['House_' + str(i) for i in houses if i != house_nr]
data = data.drop(drop_houses, axis=1)

# drop rows with any NaN values
data = data.dropna(axis=0, how='any')

In [5]:
# quick visual inspection of the hourly consumption series
plt.plot(data)


Out[5]:
[<matplotlib.lines.Line2D at 0x222b8fa8cf8>]

Stationarity Tests


In [9]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries):

    # Determine rolling statistics over a 24-hour window
    rolmean = timeseries.rolling(center=False, window=24).mean()
    rolstd = timeseries.rolling(center=False, window=24).std()

    # Plot rolling statistics
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Perform the augmented Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.iloc[:, 0].values, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [13]:
# check for stationarity
test_stationarity(data)


Results of Dickey-Fuller Test:
Test Statistic                -6.249855e+00
p-value                        4.481655e-08
#Lags Used                     3.200000e+01
Number of Observations Used    8.491000e+03
Critical Value (5%)           -2.861880e+00
Critical Value (10%)          -2.566951e+00
Critical Value (1%)           -3.431120e+00
dtype: float64

The test statistic (-6.25) lies well below even the 1% critical value (-3.43) and the p-value is effectively zero, so we reject the null hypothesis of a unit root: the series is stationary and no non-seasonal differencing is needed (d=0).

ARIMA Modeling

ARIMA stands for Auto-Regressive Integrated Moving Average. An ARIMA forecast for a stationary time series is essentially a linear equation, much like a linear regression. The predictors depend on the parameters (p, d, q) of the ARIMA model:

  • Number of AR (Auto-Regressive) terms (p): AR terms are lags of the dependent variable. For instance, if p is 5, the predictors for x(t) will be x(t-1), …, x(t-5).
  • Number of MA (Moving Average) terms (q): MA terms are lagged forecast errors in the prediction equation. For instance, if q is 5, the predictors for x(t) will be e(t-1), …, e(t-5), where e(i) is the forecast error at the i-th instant.
  • Number of differences (d): the number of non-seasonal differences applied to the series. With first-order differencing, we can either pass the differenced series and set d=0, or pass the original series and set d=1; both generate the same results. The mapping of (p, d, q) to a statsmodels call is sketched below.
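To make the (p, d, q) notation concrete, here is a minimal sketch that fits a low-order model on a simulated AR(1) series using the same SARIMAX interface used later in this notebook; the simulated series and the chosen orders are illustrative assumptions only.

In [ ]:
# illustrative only: simulate a toy AR(1) series, x(t) = 0.7*x(t-1) + noise
toy_rng = np.random.RandomState(0)
n = 500
x = np.zeros(n)
for t in range(1, n):
    x[t] = 0.7 * x[t - 1] + toy_rng.normal(scale=0.1)

# order=(p, d, q): one AR lag, no differencing, one MA term
toy_model = sm.tsa.statespace.SARIMAX(x, order=(1, 0, 1))
toy_results = toy_model.fit(disp=False)
print(toy_results.params)  # estimated ar.L1, ma.L1, sigma2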

In [17]:
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data.House_1, lags=100)
plt.xlabel('Hour lags')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation of Hourly Electricity Consumption')
plt.show()



In [22]:
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(data.House_1, lags=100)
plt.xlabel('Hour lags')
plt.ylabel('Partial autocorrelation')
plt.title('Partial Autocorrelation of Hourly Electricity Consumption')
plt.show()



In [89]:
# non-seasonal model: 24 AR lags to capture the daily cycle, plus one MA term
model = sm.tsa.statespace.SARIMAX(data, order=(24, 0, 1))

results = model.fit()

In [90]:
print(results.summary())


                           Statespace Model Results                           
==============================================================================
Dep. Variable:                House_1   No. Observations:                 8524
Model:              SARIMAX(24, 0, 1)   Log Likelihood               -1667.374
Date:                Fri, 23 Jun 2017   AIC                           3386.749
Time:                        22:13:30   BIC                           3570.065
Sample:                    04-01-2014   HQIC                          3449.296
                         - 03-31-2015                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.3951      0.034     11.508      0.000       0.328       0.462
ar.L2          0.0273      0.015      1.823      0.068      -0.002       0.057
ar.L3          0.0093      0.011      0.862      0.389      -0.012       0.031
ar.L4          0.0567      0.010      5.937      0.000       0.038       0.075
ar.L5         -0.0167      0.010     -1.627      0.104      -0.037       0.003
ar.L6         -0.0118      0.011     -1.068      0.286      -0.034       0.010
ar.L7          0.0277      0.010      2.728      0.006       0.008       0.048
ar.L8          0.0160      0.010      1.666      0.096      -0.003       0.035
ar.L9          0.0148      0.011      1.327      0.185      -0.007       0.037
ar.L10         0.0094      0.010      0.899      0.369      -0.011       0.030
ar.L11         0.0005      0.011      0.048      0.962      -0.021       0.022
ar.L12         0.0170      0.011      1.541      0.123      -0.005       0.039
ar.L13         0.0121      0.011      1.123      0.261      -0.009       0.033
ar.L14         0.0172      0.010      1.746      0.081      -0.002       0.036
ar.L15         0.0449      0.010      4.721      0.000       0.026       0.064
ar.L16        -0.0018      0.010     -0.190      0.849      -0.021       0.017
ar.L17         0.0054      0.011      0.484      0.629      -0.016       0.027
ar.L18        -0.0166      0.011     -1.500      0.134      -0.038       0.005
ar.L19        -0.0161      0.010     -1.566      0.117      -0.036       0.004
ar.L20         0.0535      0.010      5.559      0.000       0.035       0.072
ar.L21         0.0148      0.010      1.533      0.125      -0.004       0.034
ar.L22         0.0050      0.010      0.479      0.632      -0.015       0.025
ar.L23         0.1115      0.008     14.069      0.000       0.096       0.127
ar.L24         0.2103      0.009     23.040      0.000       0.192       0.228
ma.L1         -0.0780      0.035     -2.232      0.026      -0.146      -0.010
sigma2         0.0865      0.001    149.631      0.000       0.085       0.088
===================================================================================
Ljung-Box (Q):                       72.71   Jarque-Bera (JB):             65642.96
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.38   Skew:                             2.33
Prob(H) (two-sided):                  0.00   Kurtosis:                        15.77
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

In [91]:
results.plot_diagnostics(figsize=(15, 12))
plt.show()



ARIMA with seasonal differencing


In [6]:
# seasonal differencing: subtract the value 24 hours earlier, y'(t) = y(t) - y(t-24)
data_diff = data - data.shift(periods=24)

In [7]:
# the first 24 rows are NaN after shifting; drop them
data_diff = data_diff.dropna(axis=0, how='any')

In [12]:
from statsmodels.graphics.tsaplots import plot_acf
# pass an explicit axes to plot_acf, otherwise it opens its own (extra empty) figure
fig, ax = plt.subplots()
plot_acf(data_diff.House_1, lags=50, ax=ax)
ax.set_xlabel('Hour lags')
ax.set_ylabel('Autocorrelation')
ax.set_title('Autocorrelation of Hourly Electricity Consumption')
plt.savefig('figures/95_percent/autocorrelation.eps')
plt.savefig('figures/95_percent/autocorrelation.pdf')



In [15]:
from statsmodels.graphics.tsaplots import plot_pacf
fig, ax = plt.subplots()
plot_pacf(data_diff.House_1, lags=50, ax=ax)
ax.set_xlabel('Hour lags')
ax.set_ylabel('Partial autocorrelation')
ax.set_title('Partial Autocorrelation of Hourly Electricity Consumption')
plt.savefig('figures/95_percent/partial_autocorrelation.eps')
plt.savefig('figures/95_percent/partial_autocorrelation.pdf')



The data was differenced at a 24-hour lag, i.e. $y'_t = y_t - y_{t-24}$. From the ACF and PACF plots of the differenced data we can determine:

  • Non-seasonal terms: an AR(1) term from the first few (1, 2, 3, …) lags of the PACF plot, and an MA(1) or MA(2) term from the ACF plot.
  • Seasonal terms: the PACF plot shows clear seasonality with period $S=24$ hours, suggesting a seasonal AR(1) term, whereas the ACF plot shows no sign of seasonal MA terms. A small AIC grid search, sketched below, can double-check these choices.
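
To double-check the orders read off the ACF/PACF plots, a small grid search ranked by AIC can be run; this sketch is illustrative, and the candidate grids below are assumptions rather than part of the original analysis.

In [ ]:
import itertools

# illustrative candidate grids (not exhaustive)
pdq = [(p, 0, q) for p, q in itertools.product(range(2), range(3))]
seasonal_pdq = [(P, 0, Q, 24) for P, Q in itertools.product(range(2), range(2))]

best = None
for order in pdq:
    for sorder in seasonal_pdq:
        try:
            fit = sm.tsa.statespace.SARIMAX(data, order=order,
                                            seasonal_order=sorder,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False).fit(disp=False)
            if best is None or fit.aic < best[0]:
                best = (fit.aic, order, sorder)
        except Exception:
            continue  # skip combinations that fail to fit
print('Best by AIC: order=%s, seasonal_order=%s, AIC=%.1f' % (best[1], best[2], best[0]))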

In [34]:
# seasonal model with the terms identified from the ACF/PACF of the differenced data
mod = sm.tsa.statespace.SARIMAX(data,
                                order=(1, 0, 4),
                                seasonal_order=(1, 0, 1, 24),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

results = mod.fit()

In [35]:
print(results.summary())


                                 Statespace Model Results                                 
==========================================================================================
Dep. Variable:                            House_1   No. Observations:                 8524
Model:             SARIMAX(1, 0, 4)x(1, 0, 1, 24)   Log Likelihood               -1584.685
Date:                            Fri, 23 Jun 2017   AIC                           3185.370
Time:                                    19:27:31   BIC                           3241.775
Sample:                                04-01-2014   HQIC                          3204.615
                                     - 03-31-2015                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9968      0.001    947.604      0.000       0.995       0.999
ma.L1         -0.6961      0.006   -116.156      0.000      -0.708      -0.684
ma.L2         -0.1469      0.009    -15.556      0.000      -0.165      -0.128
ma.L3         -0.0784      0.011     -6.862      0.000      -0.101      -0.056
ma.L4         -0.0252      0.010     -2.554      0.011      -0.044      -0.006
ar.S.L24       0.9164      0.007    128.075      0.000       0.902       0.930
ma.S.L24      -0.7781      0.010    -74.887      0.000      -0.799      -0.758
sigma2         0.0848      0.001    149.647      0.000       0.084       0.086
===================================================================================
Ljung-Box (Q):                      144.46   Jarque-Bera (JB):             71193.41
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.46   Skew:                             2.42
Prob(H) (two-sided):                  0.00   Kurtosis:                        16.33
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

In [36]:
results.plot_diagnostics(figsize=(15, 12))
plt.show()



In [92]:
# one-step-ahead predictions: each forecast uses the full observed history up to that point
pred = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=False)
pred_ci = pred.conf_int()

In [93]:
y_forecasted = pred.predicted_mean
y_truth = data['2015-03':]

# Compute error metrics
mse = mean_squared_error(y_true=y_truth, y_pred=y_forecasted)
mae = mean_absolute_error(y_true=y_truth, y_pred=y_forecasted)
mde = median_absolute_error(y_true=y_truth, y_pred=y_forecasted)
print('Over the one-month test period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse, mae, mde))


Over the one-month test period the model achieves:
MSE: 0.076, MAE: 0.186, MDE: 0.098 

In [94]:
ax = data['2015-03':].plot(label='Observed data')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()

plt.show()


Point-to-point predictions

With dynamic=False, each prediction is a one-step-ahead forecast conditioned on all observed values up to the previous time step.


In [46]:
pred = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=False)
pred_ci = pred.conf_int()

In [47]:
ax = data['2015-03':].plot(label='Observed data')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()

plt.show()



In [49]:
y_forecasted = pred.predicted_mean
y_truth = data['2015-03':]

# Compute error metrics
mse = mean_squared_error(y_true=y_truth, y_pred=y_forecasted)
mae = mean_absolute_error(y_true=y_truth, y_pred=y_forecasted)
mde = median_absolute_error(y_true=y_truth, y_pred=y_forecasted)
print('Over the one-month test period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse, mae, mde))


Over the one-month test period the model achieves:
MSE: 0.085, MAE: 0.199, MDE: 0.126 

Dynamic predictions

With dynamic=True, from the start date onward the model feeds its own predictions back in as lagged inputs instead of the observed values, so errors can compound over the horizon.


In [50]:
# dynamic forecast: predictions after the start date are fed back in as lagged inputs
pred_dynamic = results.get_prediction(start=pd.to_datetime('2015-03'), dynamic=True)
pred_dynamic_ci = pred_dynamic.conf_int()

In [54]:
ax = data['2015-02':].plot(label='Observed data', figsize=(20, 15))
pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)

ax.fill_between(pred_dynamic_ci.index,
                pred_dynamic_ci.iloc[:, 0],
                pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25)

ax.fill_betweenx(ax.get_ylim(), pd.to_datetime('2015-03'), data.index[-1],
                 alpha=.1, zorder=-1)

ax.set_xlabel('Date')
ax.set_ylabel('Consumption')

plt.legend()
plt.show()



In [53]:
y_forecasted = pred_dynamic.predicted_mean
y_truth = data['2015-03':]

# Compute error metrics
mse = mean_squared_error(y_true=y_truth, y_pred=y_forecasted)
mae = mean_absolute_error(y_true=y_truth, y_pred=y_forecasted)
mde = median_absolute_error(y_true=y_truth, y_pred=y_forecasted)
print('Over the one-month test period the model achieves:')
print('MSE: {0:.3f}, MAE: {1:.3f}, MDE: {2:.3f} '.format(mse, mae, mde))


Over the one-month test period the model achieves:
MSE: 0.214, MAE: 0.340, MDE: 0.221 
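
Beyond in-sample one-step and dynamic predictions, get_forecast produces true out-of-sample forecasts past the last observation; a minimal sketch, assuming an arbitrary 48-hour horizon chosen for illustration:

In [ ]:
# forecast 48 hours beyond the end of the sample, with confidence intervals
forecast = results.get_forecast(steps=48)
forecast_ci = forecast.conf_int()

ax = data['2015-03':].plot(label='Observed data')
forecast.predicted_mean.plot(ax=ax, label='Out-of-sample Forecast')
ax.fill_between(forecast_ci.index,
                forecast_ci.iloc[:, 0],
                forecast_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Consumption')
plt.legend()
plt.show()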
