In [10]:
import sys
import os
sys.path.append('../../') # or non-Unix equivalent (add wikienergy/ to path)
from disaggregator import linregress as lng
from disaggregator import GreenButtonDatasetAdapter as gbda
from disaggregator import weather
import pandas as pd
import numpy as np
import datetime
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
In [9]:
def get_temperatures_from_pickle(date_start,date_end):
with open('temp_oakpark_20120101_20140730.pkl','rb') as f:
temps_df=pickle.load(f)
temps_series = pd.Series(temps_df['temp'],index=temps_df.index)
return temps_series
def get_temperatures_from_wunderground(date_start,date_end):
api_key='API-Key'
city='Oak Park'
state='IL'
weather_df=weather.get_weather_data_as_df_from_zipcode(api_key,'18976',start_date,end_date)['temp']
weather_df=weather.get_weather_data_as_df(api_key,city,state,start_date,end_date)
temps_series=weather_df['temp'].resample('D',how='mean')
return temps_series
def xml_file_to_pandas_series(filepath,sample_rate='D',cal_heat_temp_range=range(50,60),cal_cool_temp_range=range(60,70)):
with open(filepath,'rb') as f:
xml_house = f.read()
trace = gbda.get_trace(xml_house)
trace = trace.resample(sample_rate,method='sum')
trace_series = trace.series
return trace_series
In [48]:
start_date= datetime.datetime(2012,07,01).date()
end_date= datetime.datetime(2012,07,20).date()
#temps_series_orig=get_temperatures_from_wunderground(start_date,end_date)
temps_series_orig=get_temperatures_from_pickle(start_date,end_date)
df = pd.DataFrame(temps_series_orig).resample('H',how='mean')
temps_series=weather._remove_low_outliers_df(df,'temp')['temp']
filepath='sample.xml'
trace_series = xml_file_to_pandas_series(filepath)
In this case, the daily energy usage is aligned with average temperature for each day. This pair is then run through an Ordinary Least Squares method for estimating the unknown parameters of a linear regression model. Two distinct regressions are calculated: one for heating (when electric heating and the HVAC blower is typically used), and one for cooling (when air conditioning is typically used). Each regression is repeated for a range of setpoint temperatures, which represent the temperature outside that typically leads to the HVAC system to turn on. The default cooling setpoints are from 60 to 70 degrees, and the default heating setpoints are 50 to 60 degrees.
The regressions with the smallest adjusted r2 error are chosen for each case (heating and cooling). These regressions, along with the r2 error, the setpoints that yielded the best r2, and several other parameters, are returned in the results_dict dictionary. This dictionary is used in conjunction with the trace and temperature series to then predict the total energy usage for a given day based solely on the temperature for a house. The intercept of the linear regressions are then shifted to effectively have them read an energy value of 0 at their chosen set points. This new linear regression is then used to predict the energy usage associated with the HVAC signal.
There is a single method that takes in a trace series and temperature series and returns the total usage series that is aligned with the available temperature data, the disaggregated air signal, and the difference between the predicted total usage and the actual total usage. In order to show some more information about this data, the two component functions were used instead.
The sensitivity score is essentially the slope of the linear regression used for prediction. The certainty score is the number of times that the actual total energy is within 5 kWh of the predicted total energy usage. This is a metric that indicates how well correlated a consumer's energy usage is to temperature.
In [43]:
#[total_series,air_series,diff_series] = lng.run_regressions_and_predict(trace_series,temps_series)
results_dict=lng.run_regressions(trace_series,temps_series)
[total_series,air_series,diff_series]=lng.predict_from_regressions(trace_series,temps_series,results_dict)
In [44]:
print "Sensitivity to Heat: " + "%.2f" % float(slope_cdd/float(1000))+'kWh/degree'
print "Sensitivity to Cold: %.2f" % float(slope_hdd/float(1000))+'kWh/degree'
perc_correlation=len([val for val in diff_series if abs(val) < 5000])/float(len(diff_series))*100
print "Certainty Score: " + str(perc_correlation)
In [45]:
import matplotlib.pyplot as plt
%matplotlib inline
house_num=0
results_list={}
results_list[0]=results_dict
slope_hdd = results_list[house_num]['slope_hdd']
intercept_hdd = results_list[house_num]['intercept_hdd']
best_hdd_temp = results_list[house_num]['best_hdd_temp']
r2_cdd = results_list[house_num]['best_r2_adj_cdd']
slope_cdd = results_list[house_num]['slope_cdd']
intercept_cdd = results_list[house_num]['intercept_cdd']
best_cdd_temp = results_list[house_num]['best_cdd_temp']
r2_hdd = results_list[house_num]['best_r2_adj_hdd']
df_trace = pd.DataFrame(trace_series,columns=['kwh'])
df_trace = df_trace.sort_index()
df_trace = df_trace
df_temps = pd.DataFrame(temps_series,columns=['temp'])
temps = temps_series.resample('24H', fill_method='ffill')
outlier_thresh=np.mean(temps)-(5*np.std(temps))
df_sub = pd.merge(df_trace,df_temps,left_index=True,right_index=True)
percent_daily=[]
total_daily=[]
pred_air_daily=[]
cool_temps=[]
cool_vals=[]
heat_temps=[]
heat_vals=[]
cool_pred_vals=[]
heat_pred_vals=[]
if(intercept_cdd):
intercept_cdd_new=best_cdd_temp*slope_cdd+intercept_cdd
if(intercept_hdd):
intercept_hdd_new=best_hdd_temp*slope_hdd+intercept_hdd
for i,val in enumerate(df_sub['kwh']):
use_kwh_per_day=float(val)
if(df_sub['temp'][i]>best_cdd_temp):
pred_air_kwh_per_day=df_sub['temp'][i]*slope_cdd+intercept_cdd-intercept_cdd_new
pred_total_kwh_per_day=df_sub['temp'][i]*slope_cdd+intercept_cdd
cool_temps.append(df_sub['temp'][i])
cool_vals.append(df_sub['kwh'][i])
cool_pred_vals.append(pred_total_kwh_per_day)
elif(df_sub['temp'][i]<best_hdd_temp):
pred_air_kwh_per_day=df_sub['temp'][i]*slope_hdd+intercept_hdd-intercept_hdd_new
pred_total_kwh_per_day=df_sub['temp'][i]*slope_hdd+intercept_hdd
heat_temps.append(df_sub['temp'][i])
heat_vals.append(df_sub['kwh'][i])
heat_pred_vals.append(pred_total_kwh_per_day)
pred_total_daily.append(pred_total_kwh_per_day)
pred_air_daily.append(pred_air_kwh_per_day)
total_daily.append(use_kwh_per_day)
plt.plot(df_sub['kwh'].index,total_daily,'r')
plt.plot(df_sub['kwh'].index,pred_air_daily)
plt.xlabel('Time')
plt.ylabel('Wh/day')
plt.legend(['Total use', 'Predicted HVAC use'],prop={'size':'8'})
plt.figure()
plt.plot(cool_temps,cool_vals,'.r')
plt.plot(heat_temps,heat_vals,'.b')
x=np.arange(best_cdd_temp,100,.2)
y=(x)*slope_cdd+intercept_cdd
plt.plot(x,y,'k')
x2=np.arange(10,best_hdd_temp,.2)
y2=(x2)*slope_hdd+intercept_hdd
plt.plot(x2,y2,'k')
Out[45]: