Multivariable regression

Imports and setup


In [ ]:
import os
import pandas as pd

from opengrid.library import houseprint, caching, regression
from opengrid import config

c = config.Config()

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = 16,8

In [ ]:
# Create houseprint from saved file, if not available, parse the google spreadsheet
try:
    hp_filename = os.path.join(c.get('data', 'folder'), 'hp_anonymous.pkl')
    hp = houseprint.load_houseprint_from_file(hp_filename)
    print("Houseprint loaded from {}".format(hp_filename))
except Exception as e:
    print(e)
    print("Because of this error we try to build the houseprint from source")
    hp = houseprint.Houseprint()
hp.init_tmpo()

Load Data

We are going to use daily gas, electricity and water consumption data and weather data. Because we don't want to overload the weather API, we will only use 1 location (Ukkel).

First, let's define the start and end date of the identification data. That is the data to be used to find the model. Later, we will use the model to predict.


In [ ]:
start = pd.Timestamp('2015-01-01', tz='Europe/Brussels')
end = pd.Timestamp('now', tz='Europe/Brussels')
end_model = pd.Timestamp('2016-12-31', tz='Europe/Brussels') #last day of the data period for the model

Energy data

We for each consumption type (electricity, gas and water), we create a daily dataframe and save it in the dictionary dfs. The data is obtained from the daily caches.


In [ ]:
caches = {}
dfs = {}
for cons in ['electricity', 'gas', 'water']:
    caches[cons] = caching.Cache(variable='{}_daily_total'.format(cons))
    dfs[cons] = caches[cons].get(sensors = hp.get_sensors(sensortype=cons))

Weather and other exogenous data

Run this block to download the weather data and save it to a pickle. This is a large request, and you can only do 2 or 3 of these per day before your credit with Forecast.io runs out!

We also add a column for each day-of-week which may be used by the regression algorithm on a daily basis.


In [ ]:
from opengrid.library import forecastwrapper
weather = forecastwrapper.Weather(location=(50.8024, 4.3407), start=start, end=end - pd.Timedelta(days=1))
irradiances=[
    (0, 90), # north vertical
    (90, 90), # east vertical
    (180, 90), # south vertical
    (270, 90), # west vertical
]
orientations = [0, 90, 180, 270]
weather_data = weather.days(irradiances=irradiances, 
                            wind_orients=orientations, 
                            heating_base_temperatures=[0, 6, 8 ,10, 12, 14, 16, 18]).dropna(axis=1)
weather_data.drop(['icon', 'summary', 'moonPhase', 'windBearing', 'temperatureMaxTime', 'temperatureMinTime',
                   'apparentTemperatureMaxTime', 'apparentTemperatureMinTime', 'uvIndexTime', 
                   'sunsetTime', 'sunriseTime'], 
                  axis=1, inplace=True)
# Add columns for the day-of-week
for i, d in zip(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']):
    weather_data[d] = 0
    weather_data.loc[weather_data.index.weekday == i, d] = 1
weather_data = weather_data.applymap(float)

In [ ]:
weather_data.head()

In [ ]:
weather_data.columns

Put data together

The generator below will return a dataframe with sensor id as first column and all exogenous data as other columns.


In [ ]:
def data_generator(consumable):
    dfcons = dfs[consumable]
    for sensorid in dfcons.columns:
        df = pd.concat([dfcons[sensorid], weather_data], axis=1).dropna()
        df = df.tz_convert('Europe/Brussels')
        yield sensorid, df

Let's have a peek


In [ ]:
cons = 'gas'
analysis_data = data_generator(cons)

sensorid, peek = next(analysis_data)
peek = peek.resample(rule='MS').sum()

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot_date(peek.index, peek[sensorid], '-', color='grey', lw=8, label=cons)
for column in peek.columns[1:]:
    if 'heatingDegreeDays' in column:
        ax2.plot_date(peek.index, peek[column], '-', label=column)
plt.legend()

Run Regression Analysis

We run the analysis on monthly and weekly basis.


In [ ]:
cons = 'water' 
save_figures = True

In [ ]:
analysis_data = data_generator(cons)

mrs_month = []
mrs_month_cv = []
mrs_week = []
for sensorid, data in analysis_data: 
    data.rename(columns={sensorid:cons}, inplace=True)
    
    df = data.resample(rule='MS').sum()
    if len(df) < 2:
        continue
    
    # monthly model, statistical validation
    mrs_month.append(regression.MVLinReg(df.ix[:end_model], cons, p_max=0.03)) 
    figures = mrs_month[-1].plot(df=df)
    
    if save_figures:
        figures[0].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_model_'+sensorid+'.png'), dpi=100)
        figures[1].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_results_'+sensorid+'.png'), dpi=100)


    # weekly model, statistical validation
    df = data.resample(rule='W').sum()
    if len(df.ix[:end_model]) < 4:
        continue
    mrs_week.append(regression.MVLinReg(df.ix[:end_model], cons, p_max=0.02))
    if len(df.ix[end_model:]) > 0:
        figures = mrs_week[-1].plot(model=False, bar_chart=True, df=df.ix[end_model:])
        if save_figures:
            figures[0].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_prediction_weekly_'+sensorid+'.png'), dpi=100)

In [ ]: