This script is a port of EnergyID's code that calculates a linear regression on heating data

Imports and setup

General imports



In [ ]:

    
import pandas as pd

OpenGrid-specific imports



In [ ]:

    
from opengrid.library import houseprint
from opengrid import config
from opengrid.library import linearregression

c = config.Config()

Plotting settings



In [ ]:

    
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 16,8

Load Data

We are going to use gas consumption data and weather data. Because we don't want to overload the weather API, we will only use 1 location (Ukkel).

First, let's define the start and end date of our experiment. Let's take 1 year worth of data, starting with last month.



In [ ]:

    
# If we want to get consumption for 12 months, we will need 13 months of data
end = pd.Timestamp.today().replace(day=2).normalize()
start = (end.replace(year=end.year-1) - pd.Timedelta(days=2))

start = start.tz_localize('Europe/Brussels')
end = end.tz_localize('Europe/Brussels')
print(start, end)

Gas Data



In [ ]:

    
# Load the Houseprint, and sync all data
hp = houseprint.Houseprint()
#hp = houseprint.load_houseprint_from_file('cache_hp.hp')
hp.init_tmpo()
#hp.sync_tmpos()



In [ ]:

    
#hp.save('cache_hp.hp')



In [ ]:

    
def gas_data_generator1():
    # original monthly data generator, returns wrong data
    for gas_sensor in hp.get_sensors(sensortype='gas'):
        df = gas_sensor.get_data(head=start, tail=end, unit='kWh', diff=False)
        df = df.tz_convert('Europe/Brussels')
        df = df.resample('MS')
        df = df.diff().dropna()
        df = df[df>0]
        if df.empty:
            continue
        yield df
        
def gas_data_generator2():
    # Simple roughly correct monthly data generator
    # Roughly-correct means that the gas consumption between two counter values
    # right before and right after a month-transition are attributed to the new month.  
    # However, it is robust and does not need data beyond the last month
    for gas_sensor in hp.get_sensors(sensortype='gas'):
        df = gas_sensor.get_data(head=start, tail=end, unit='kWh', diff=False)
        df = df.tz_convert('Europe/Brussels')
        df = df.resample('MS').last()
        df = df.diff().dropna()
        df = df[df>0]
        if df.empty:
            continue
        yield df
        
def gas_data_generator3():
    # More complicated but most correct correct monthly data generator
    # The difference with the previous is that this generator interpolates
    # at month-transitions in order to estimate the exact counter value at 00:00:00
    # whereas the previous attributed all gas consumption at month-transitions to the 
    # new month
    # Drawbacks: very slow (due to the two reindex() calls) and if there would be no 
    # data after the end of the last month or before beginning of first month, 
    # interpolation can't be made, and the entire last (or first) month has no data
    
    for gas_sensor in hp.get_sensors(sensortype='gas'):
        df = gas_sensor.get_data(head=start, tail=end, unit='kWh', diff=False)
        df = df.tz_convert('Europe/Brussels')
        newindex = df.resample('MS').first().index
        df = df.reindex(df.index.union(newindex))
        df = df.interpolate(method='time')
        df = df.reindex(newindex)
        df = df.diff()
        df = df.shift(-1).dropna()
        df = df[df>0]
        if df.empty:
            continue
        yield df
        
def gas_data_generator4():
    # Preferred method: as accurate as 3, and faster
    # Daily approach, obtain fully correct daily data.
    # To be aggregated to monthly or weekly or ...
    
    for gas_sensor in hp.get_sensors(sensortype='gas'):
        df = gas_sensor.get_data(head=start, tail=end, resample='day', unit='kWh', diff=False, tz='Europe/Brussels')
        df = df.diff().shift(-1).dropna()
        if df.empty:
            continue
        yield df

Let's have a peek



In [ ]:

    
gas_data1 = gas_data_generator1()
gas_data2 = gas_data_generator2()
gas_data3 = gas_data_generator3()
gas_data4 = gas_data_generator4()



In [ ]:

    
peek1 = next(gas_data1)
peek2 = next(gas_data2)
peek3 = next(gas_data3)
peek4 = next(gas_data4)

plt.figure()
plt.plot(peek1, label='1')
plt.plot(peek2, label='2')
plt.plot(peek3, label='3')
plt.plot(peek4.resample('MS').sum(), label='4')
plt.legend()



In [ ]:

    
print(peek3 - peek4.resample('MS').sum())



In [ ]:

    
%timeit(next(gas_data1))
%timeit(next(gas_data2))
%timeit(next(gas_data3))
%timeit(next(gas_data4))

Weather Data

Run this block to download the weather data and save it to a pickle. This is a large request, and you can only do 2 or 3 of these per day before your credit with Forecast.io runs out!

TODO: Use the caching library for this.

To get the data run the cell below



In [ ]:

    
from opengrid.library import forecastwrapper
weather = forecastwrapper.Weather(location='Ukkel, Belgium', start=start, end=end)
weather_data = weather.days().resample('MS').sum()



In [ ]:

    
weather_data['heatingDegreeDays16.5'].plot()

Put data together

We have defined an OpenGrid analysis as a class that takes a single DataFrame as input, so we'll create that dataframe.

I wrote a generator that uses our previously defined generator so you can generate while you generate.



In [ ]:

    
def analysis_data_generator():
    gas_data = gas_data_generator()
    for gas_df in gas_data:
        df = pd.concat([gas_df, weather_data['heatingDegreeDays16.5']], axis=1).dropna()
        df.columns = ['gas', 'degreedays']
        yield df

Let's have another peek



In [ ]:

    
analysis_data = analysis_data_generator()



In [ ]:

    
peek = next(analysis_data)
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
for axis, column, color in zip([ax1, ax2], peek.columns, ['b', 'r']):
    axis.plot_date(peek.index, peek[column], '-', color=color, label=column)
plt.legend()

Run Regression Analysis



In [ ]:

    
analysis_data = analysis_data_generator()
for data in analysis_data:    
    try:
        analysis = linearregression.LinearRegression(independent=data.degreedays, dependent=data.gas)
    except ValueError as e:
        print(e)

    fig = analysis.plot()
    fig.show()



In [ ]:

    
analysis_data = analysis_data_generator()
for data in analysis_data:
    try:
        analysis = linearregression.LinearRegression3(independent=data.degreedays, dependent=data.gas,
                                                      breakpoint=60, percentage=0.5)
    except ValueError as e:
        print(e)
    fig = analysis.plot()
    fig.show()



In [ ]: