Multivariable regression

Imports and setup


In [ ]:
import os
import pandas as pd

from opengrid.library import houseprint, regression
from opengrid import config

c = config.Config()

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = 16,8

Load Data

We are going to use gas consumption data and weather data. Because we don't want to overload the weather API, we will only use 1 location (Ukkel).

First, let's define the start and end date of the identification data. That is the data to be used to find the model. Later, we will use the model to predict.


In [ ]:
start = pd.Timestamp('2015-01-01', tz='Europe/Brussels')
end = pd.Timestamp('now', tz='Europe/Brussels')
end_model = pd.Timestamp('2016-12-31', tz='Europe/Brussels') #last day of the data period for the model

Energy data


In [ ]:
try:
    hp_filename = os.path.join(c.get('data', 'folder'), 'hp_anonymous.pkl')
    hp = houseprint.load_houseprint_from_file(hp_filename)
    print("Houseprint loaded from {}".format(hp_filename))
except Exception as e:
    print(e)
    print("Because of this error we try to build the houseprint from source")
    hp = houseprint.Houseprint()
hp.init_tmpo()

In [ ]:
hp.sync_tmpos()

In [ ]:
def gas_data_generator():
    # Daily approach, obtain fully correct daily data.
    # To be aggregated to monthly or weekly or ...
    
    for gas_sensor in hp.get_sensors(sensortype='gas'):
        df = gas_sensor.get_data(head=start-pd.Timedelta(days=1), 
                                 tail=end+pd.Timedelta(days=1), 
                                 resample='day', 
                                 unit='kWh', 
                                 diff=False, 
                                 tz='Europe/Brussels')
        df = df.diff().shift(-1).dropna()
        if df.empty:
            continue
        yield df

def elec_data_generator():
    # Preferred method: as accurate as 3, and faster
    # Daily approach, obtain fully correct daily data.
    # To be aggregated to monthly or weekly or ...
    
    for sensor in hp.get_sensors(sensortype='electricity'):
        df = sensor.get_data(head=start-pd.Timedelta(days=1), 
                             tail=end+pd.Timedelta(days=1), 
                             resample='day', 
                             unit='kWh', 
                             diff=False, 
                             tz='Europe/Brussels')
        df = df.diff().shift(-1).dropna()
        if df.empty:
            continue
        yield df

Let's have a peek


In [ ]:
gas_data = gas_data_generator()
elec_data = elec_data_generator()

In [ ]:
peek = next(gas_data)
peek.plot()
peek = next(elec_data) peek.plot()

Weather Data

Run this block to download the weather data and save it to a pickle. This is a large request, and you can only do 2 or 3 of these per day before your credit with Forecast.io runs out!

To get the data run the cell below


In [ ]:
from opengrid.library import forecastwrapper
weather = forecastwrapper.Weather(location=(50.8024, 4.3407), start=start, end=end - pd.Timedelta(days=1))
irradiances=[
    (0, 90), # north vertical
    (90, 90), # east vertical
    (180, 90), # south vertical
    (270, 90), # west vertical
]
orientations = [0, 90, 180, 270]
weather_data = weather.days(irradiances=irradiances, 
                            wind_orients=orientations, 
                            heating_base_temperatures=[0, 6, 8 ,10, 12, 14, 16, 18]).dropna(axis=1)
weather_data.drop(['icon', 'summary', 'moonPhase', 'windBearing', 'temperatureMaxTime', 'temperatureMinTime',
                   'apparentTemperatureMaxTime', 'apparentTemperatureMinTime', 'uvIndexTime', 
                   'sunsetTime', 'sunriseTime'], 
                  axis=1, inplace=True)
weather_data = weather_data.applymap(float)

In [ ]:
weather_data.head()

In [ ]:
weather_data.columns

Put data together

I wrote a generator that uses our previously defined generator so you can generate while you generate.


In [ ]:
def analysis_data_generator():
    gas_data = gas_data_generator()
    for gas_df in gas_data:
        sensorid = gas_df.name
        gas_df.name='gas'
        df = pd.concat([gas_df, weather_data], axis=1).dropna()
        df = df.tz_convert('Europe/Brussels')
        yield sensorid, df
        
def analysis_data_generator_elec():
    elec_data = elec_data_generator()
    for elec_df in elec_data:
        sensorid = elec_df.name
        elec_df.name='elec'
        df = pd.concat([elec_df, weather_data], axis=1).dropna()
        df = df.tz_convert('Europe/Brussels')
        yield sensorid, df

Let's have another peek


In [ ]:
analysis_data = analysis_data_generator()
peek = next(analysis_data) peek = peek.resample(rule='MS').sum() fig, ax1 = plt.subplots() ax2 = ax1.twinx() ax1.plot_date(peek.index, peek['gas'], '-', color='grey', lw=8, label='gas') for column in peek.columns[1:]: if 'heatingDegreeDays' in column: ax2.plot_date(peek.index, peek[column], '-', label=column) plt.legend()

Run Regression Analysis for gas

We run the analysis on monthly and weekly basis.


In [ ]:
analysis_data = analysis_data_generator()
mrs_month = []
mrs_month_cv = []
mrs_week = []
for sensorid, data in analysis_data: 
    
    df = data.resample(rule='MS').sum()
    if len(df) < 2:
        continue
    
    # monthly model, statistical validation
    mrs_month.append(regression.MVLinReg(df.ix[:end_model], 'gas', p_max=0.03)) 
    figures = mrs_month[-1].plot(df=df)
    
    figures[0].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_model_'+sensorid+'.png'), dpi=100)
    figures[1].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_results_'+sensorid+'.png'), dpi=100)
    

    # weekly model, statistical validation
    df = data.resample(rule='W').sum()
    mrs_week.append(regression.MVLinReg(df.ix[:end_model], 'gas', p_max=0.02))
    if len(df.ix[end_model:]) > 0:
        figures = mrs_week[-1].plot(model=False, bar_chart=True, df=df.ix[end_model:])
        figures[0].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_prediction_weekly_'+sensorid+'.png'), dpi=100)

In [ ]:
data

Run Regression Analysis for electricity

We run the analysis on monthly and weekly basis.


In [ ]:
analysis_data = analysis_data_generator_elec()
mrs_month_elec = []
mrs_month_elec_cv = []
mrs_week_elec = []

In [ ]:
for sensorid, data in analysis_data: 
    
    df = data.resample(rule='MS').sum()
    if len(df) < 2:
        continue
    
    # monthly model, statistical validation
    mrs_month_elec.append(regression.MVLinReg(df.ix[:end_model], 'elec', p_max=0.03)) 
    figures = mrs_month_elec[-1].plot(df=df)
    figures[1].savefig(os.path.join(c.get('data', 'folder'), 'figures', 'multivar_results_'+sensorid+'.png'), dpi=100)

In [ ]:
break
    # weekly model, statistical validation
    df = data.resample(rule='W').sum()
    mrs_week.append(regression.MVLinReg(df.ix[:end_model], 'elec', p_max=0.02))
    if len(df.ix[end_model:]) > 0:
        mrs_week[-1].plot(model=False, bar_chart=True, df=df.ix[end_model:])
for i, fit in enumerate(mrs_month[3].list_of_fits): print('{}: {}'.format(fit.model.formula, mrs_month[3].list_of_cverrors[i]))
print(mrs_month[3].list_of_fits[-1].summary())

In [ ]:
data.info()

In [ ]:
pd.Timestamp(pd.Timestamp('now', tz='Europe/Brussels').date(), tz='Europe/Brussels')

In [ ]: