In [1]:
import pandas as pd
import urllib2,urllib
import json
import datetime
import numpy as np
from copy import deepcopy
import pylab
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from scipy import stats
import sys
sys.path.append('../../') # or non-Unix equivalent (add wikienergy/ to path)
from disaggregator import weather
from disaggregator import utils
from disaggregator import PecanStreetDatasetAdapter as psda
from disaggregator import linregress as lng
from disaggregator import appliance as app
In [2]:
#Load Datasets
devices_types={}
devices_types_unsampled={}
db_url='postgresql://USERNAME:PASSWORD@db.wiki-energy.org:5432/postgres'
psda.set_url(db_url)
schema = 'curated'
tables= psda.get_table_names(schema)
print tables
In [44]:
table = [u'group2_disaggregated_2013_05',
         u'group2_disaggregated_2013_06',
         u'group2_disaggregated_2013_07',
         u'group2_disaggregated_2013_08',
         u'group2_disaggregated_2013_09']
In [45]:
#Gets the dataids that have real values for a given device
ids_for_devices={}
ids_device_name='air1'
ids_for_devices[ids_device_name]=psda.get_dataids_with_real_values(schema,table[4],ids_device_name)
In [46]:
num_houses=10
device_name='air1'
devices_types_unsampled[device_name]=psda.generate_instances_for_appliance_by_dataids(schema,table,device_name,ids_for_devices[ids_device_name][:num_houses])
device_name='use'
devices_types_unsampled[device_name]=psda.generate_instances_for_appliance_by_dataids(schema,table,device_name,ids_for_devices[ids_device_name][:num_houses])
In [4]:
#pecan_temps=weather.get_weather_data_as_df('1d83c5de274645d4','78739','TX', '20130501', '20130930')
import pickle
with open('../docs/tutorials/temp_austin_may_sept_2013.pkl','rb') as f:
    pecan_temps=pickle.load(f)
In [63]:
temps_series=pecan_temps['temp'].resample('24H',how='mean')
In [65]:
num_hours=24
pecan_temps_resampled=pecan_temps.resample(str(num_hours)+'H', fill_method='ffill')
pecan_temps_resampled['temps']=pecan_temps_resampled['temp'] # the degree-day helpers below expect a 'temps' column
devices_types_actual_types=turn_instance_list_dict_into_type(devices_types_unsampled)
devices_types=resample_and_split(devices_types_actual_types,str(num_hours)+'H','D',True,True)
house_energy_dict=turn_type_dict_into_df_dict(devices_types)
In [66]:
print house_energy_dict.keys()
In [98]:
house_num=94
trace_series=house_energy_dict[house_num]['kwh']
real_air_series=house_energy_dict[house_num]['air_kwh']
[total_series,air_series,diff_series]=lng.run_regressions_and_predict(trace_series,temps_series,json=False)
In [99]:
plt.plot(air_series)
plt.plot(real_air_series)
plt.plot(total_series,'k')
Out[99]:
In [71]:
house_id=624
date_start='2013-05-01'
date_end='2013-09-30'
hours_offset=0
In [72]:
cal_temps=pecan_temps_resampled[date_start:date_end]
cal_temps=remove_outliers(cal_temps,-200,num_hours)
cal_elec=house_energy_dict[house_id][date_start:date_end]
df_daily_pred,daily_perc_act,daily_perc_pred,slope,intercept,cdd_temp=get_best_lin_regress(cal_elec,cal_temps)
In [73]:
plot_df=pd.concat([cal_temps['temp'],cal_elec['air_kwh'],cal_elec['kwh']],join='outer',axis=1)
plt.plot(plot_df['temp'],plot_df['air_kwh'],'.')
plt.plot(plot_df['temp'],plot_df['kwh'],'.')
x=np.arange(60,100,(40)/float(len(plot_df['temp'])))
y=(x-cdd_temp)*slope+intercept
plt.plot(x,y,'k')
y2=(x-cdd_temp)*slope
y2=[0 if i<cdd_temp else y2[a] for a,i in enumerate(x)]
plt.plot(x,y2,'c')
Out[73]:
In [ ]:
#DAILY PREDICTED
In [85]:
plt.plot(daily_perc_pred)
plt.plot(plot_df['air_kwh'])
Out[85]:
In [81]:
date_start_plot='2013-05-01'
date_end_plot='2013-09-01'
plot_lin_regress(daily_perc_act,daily_perc_pred,date_start_plot,date_end_plot)
In [381]:
num_hours=1
devices_types_actual_types=turn_instance_list_dict_into_type(devices_types_unsampled)
devices_types=resample_and_split(devices_types_actual_types,str(num_hours)+'H','D',True,True)
house_energy_dict=turn_type_dict_into_df_dict(devices_types)
pecan_temps_resampled=pecan_temps.resample(str(num_hours)+'H', fill_method='ffill')
pecan_temps_resampled['temps']=pecan_temps_resampled['temp']
In [382]:
cal_temps=pecan_temps_resampled[date_start:date_end]
cal_temps=remove_outliers(cal_temps,-200,num_hours)
cal_elec=house_energy_dict[house_id][date_start:date_end]
cal_elec.index=cal_elec.index-pd.DateOffset(hours=hours_offset)
#df_hourly_pred,hourly_perc_act,hourly_perc_pred=get_best_lin_regress(cal_elec,cal_temps)
df_hourly_pred=df_daily_pred
hourly_perc_act=daily_perc_act
hourly_perc_pred=daily_perc_pred
In [383]:
plot_lin_regress(hourly_perc_act,hourly_perc_pred,date_start_plot,date_end_plot)
df=combine_daily_and_hourly_lin_regress(hourly_perc_act,hourly_perc_pred,df_daily_pred,date_start,date_end)
hourly_perc_pred_new=df['hourly_normalized'].fillna(0)
plt.figure()
print
plot_lin_regress(hourly_perc_act,hourly_perc_pred_new,date_start_plot,date_end_plot)
In [9]:
def index_df_by_date(df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df.index = df.index.snap() # snap() returns a new index, so reassign it
In [10]:
def get_station_id_from_zip_code(zip_code):
    [lat,lng]=get_lat_lng_from_zip_code(zip_code)
    station_id=get_station_id_from_lat_lng(lat,lng)
    return station_id
In [11]:
def get_station_id_from_lat_lng(lat,lng):
    f = urllib2.urlopen('http://developer.nrel.gov/api/solar/data_query/v1.json?api_key=API_KEY&lat='+str(lat)+'&lon='+str(lng))
    json_string = f.read()
    parsed_json = json.loads(json_string)
    station_id_unicode=parsed_json['outputs']['tmy3']['id']
    station_id=int(str(station_id_unicode).split('-')[1])
    return station_id
In [12]:
def get_lat_lng_from_zip_code(zip_code):
    zip_code=zip_code.replace(' ','+')
    zip_code=zip_code.replace(',','%2C')
    f = urllib2.urlopen('https://maps.googleapis.com/maps/api/geocode/json?address='+zip_code+'&key=API_KEY')
    json_string = f.read()
    parsed_json_lat_lng = json.loads(json_string)
    lat=parsed_json_lat_lng['results'][0]['geometry']['location']['lat']
    lng=parsed_json_lat_lng['results'][0]['geometry']['location']['lng']
    return [lat,lng]
In [13]:
def get_hdd(ref_temp,df):
    df['hdd']=ref_temp-df.temps
    df.loc[df.hdd<0,'hdd']=0
    df['hdd_cum']=df.hdd.cumsum()
    return df

def get_cdd(ref_temp,df):
    df['cdd']=df.temps-ref_temp
    df.loc[df.cdd<0,'cdd']=0
    df['cdd_cum']=df.cdd.cumsum()
    return df

def degree_day_regression(df, x_opt='both'):
    '''Runs the weather normalization regression on energy use data.
    --------------
    df: dataframe that includes
        use per day (use_per_day)
        heating degree days per day (hdd_per_day)
        cooling degree days per day (cdd_per_day)
    ---------------
    x_opt: options for the regression function
        'hdd': run regression with just heating degree days
        'cdd': run regression with just cooling degree days
        'both' (default): run regression with both heating and cooling degree days
    '''
    #select model based on options
    if x_opt == 'hdd':
        covar = {'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1], results.beta[0], results.std_err[0], results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err', 'HDD', 'HDD_std_err', 'R2', 'R2_adj', 'N_reads'])
    elif x_opt == 'cdd':
        covar = {'CDD': df.cdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1], results.beta[0], results.std_err[0], results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err', 'CDD', 'CDD_std_err', 'R2', 'R2_adj', 'N_reads'])
    elif x_opt == 'both':
        covar = {'CDD': df.cdd_per_day, 'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[2], results.std_err[2], results.beta[0], results.std_err[0], results.beta[1], results.std_err[1], results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err', 'CDD', 'CDD_std_err', 'HDD', 'HDD_std_err', 'R2', 'R2_adj', 'N_reads'])
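A minimal sketch of calling degree_day_regression on its own, assuming a toy DataFrame with the columns the function expects (the 65°F set point and the temperature and use numbers are invented for illustration):
toy=pd.DataFrame({'temps':[70.,80.,90.,95.]},index=pd.date_range('2013-05-01',periods=4,freq='D'))
toy=get_cdd(65,toy)                       # adds 'cdd' and 'cdd_cum' columns
toy['use_per_day']=5+1.5*toy['cdd']       # synthetic daily kWh
toy['cdd_per_day']=toy['cdd']
print degree_day_regression(toy,x_opt='cdd')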
In [14]:
from datetime import datetime, timedelta, date
#This function returns a JSON string of hourly weather observations.
#Pass in the API key, city, state, and desired dates as strings; the date format is YYYYMMDD.
#Pass None for end_date to fetch a single day.
def get_weather_data(api,city,state,start_date,end_date):
    def parse_observations(parsed_json,objects_list):
        #Flatten each observation into an ordered dictionary and append it
        for i in parsed_json['history']['observations']:
            d = collections.OrderedDict()
            d['date'] = i['date']['mon'] + '/' + i['date']['mday'] + '/' + i['date']['year']
            d['time'] = i['date']['pretty'][0:8]
            d['temp'] = i['tempi']
            d['conds'] = i['conds']
            d['wdire'] = i['wdire']
            d['wdird'] = i['wdird']
            d['hail'] = i['hail']
            d['thunder'] = i['thunder']
            d['pressurei'] = i['pressurei']
            d['snow'] = i['snow']
            d['pressurem'] = i['pressurem']
            d['fog'] = i['fog']
            d['tornado'] = i['tornado']
            d['hum'] = i['hum']
            d['tempi'] = i['tempi']
            d['tempm'] = i['tempm']
            d['dewptm'] = i['dewptm']
            d['dewpti'] = i['dewpti']
            d['rain'] = i['rain']
            d['visim'] = i['visi']
            d['wspdi'] = i['wspdi']
            d['wspdm'] = i['wspdm']
            objects_list.append(d)
    if(start_date is not None and end_date is not None):
        #format our date structure to pass to our http request
        date_format = "%Y%m%d"
        a = datetime.strptime(start_date, date_format)
        b = datetime.strptime(end_date, date_format)
        #get number of days from start_date to end_date
        delta = b - a
        num_days = delta.days
        objects_list = []
        #query the api once per day in the requested period
        for day in range(0,num_days + 1):
            #count from start_date to end_date
            dates = a + timedelta(days=day)
            #format our str with our date_format
            formatted_dates = datetime.strftime(dates, date_format)
            query = 'http://api.wunderground.com/api/'+ api +'/history_'+formatted_dates+'/q/'+state+'/'+city+'.json'
            f = urllib2.urlopen(query)
            #read the query result as a json string and parse/load it
            json_string = f.read()
            parsed_json = json.loads(json_string)
            parse_observations(parsed_json,objects_list)
        #dump the list of dictionaries into a json object
        j = json.dumps(objects_list)
        return j
    #If we just need the data for ONE day (pass None for end_date):
    if(end_date is None):
        f = urllib2.urlopen('http://api.wunderground.com/api/'+ api +'/history_'+start_date+'/q/'+state+'/'+city+'.json')
        json_string = f.read()
        parsed_json = json.loads(json_string)
        objects_list = []
        parse_observations(parsed_json,objects_list)
        j = json.dumps(objects_list)
        return j
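A hedged usage sketch (the key string, city, and dates below are placeholders; a valid Weather Underground API key would be required):
raw_json=get_weather_data('YOUR_API_KEY','Austin','TX','20130501','20130502')
observations=json.loads(raw_json)
print observations[0]['date'], observations[0]['temp']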
In [15]:
def combine_date_time_and_index(temp_df):
    for i,date in enumerate(temp_df['date']):
        hour_min=temp_df['time'][i].split(':')
        hour=int(hour_min[0])
        min_ampm=hour_min[1].split(' ')
        minute=min_ampm[0]
        #convert 12-hour clock to 24-hour clock (12 AM -> 0, 12 PM -> 12)
        if('PM' in min_ampm[1] and hour!=12):
            hour=hour+12
        elif('AM' in min_ampm[1] and hour==12):
            hour=0
        temp_df['date'][i]=date.replace(hour=hour,minute=int(minute))
    index_df_by_date(temp_df)
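A quick standalone sanity check of the 12-hour to 24-hour conversion above (the times are made up; this mirrors the logic rather than calling the function):
for t in ['12:53 AM','1:53 AM','12:53 PM','1:53 PM']:
    hour,rest=t.split(':')
    minute,ampm=rest.split(' ')
    hour=int(hour)
    if 'PM' in ampm and hour!=12:
        hour=hour+12
    elif 'AM' in ampm and hour==12:
        hour=0
    print t,'->','%02d:%s'%(hour,minute)
# expected output: 00:53, 01:53, 12:53, 13:53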
In [16]:
#Resamples and/or splits the data
def resample_and_split(devices_types_unsampled_test,sample_rate,length,sample,split):
    devices_types_test={}
    devices_types_unsplit_test={}
    for key in devices_types_unsampled_test:
        if(sample):
            devices_types_unsplit_test[key]=devices_types_unsampled_test[key].resample(sample_rate)
        else:
            devices_types_unsplit_test[key]=devices_types_unsampled_test[key]
        if(split):
            devices_types_test[key]=devices_types_unsplit_test[key].split_by(length)
        else:
            devices_types_test[key]=devices_types_unsplit_test[key]
        print "Resampled " + str(key)
    return devices_types_test
In [17]:
#Turns each list of instances into an ApplianceType
def turn_instance_list_dict_into_type(devices_types_unsampled):
    devices_types_actual_types={}
    for key in devices_types_unsampled:
        metadata=devices_types_unsampled[key][0].metadata
        devices_types_actual_types[key]=app.ApplianceType(devices_types_unsampled[key],metadata)
        print "Made type for " + str(key)
    return devices_types_actual_types
In [18]:
def turn_type_dict_into_df_dict(devices_types):
    house_energy_dict={}
    for i,instance in enumerate(devices_types['use'].instances):
        daily_energy_total={}
        daily_energy_air={}
        for j,trace in enumerate(instance.traces):
            #multiply average kW by 24 to get kWh per day
            for k,val in enumerate(trace.series):
                daily_energy_total[trace.series.index[k]]=trace.series[k]*24
            air1_trace=devices_types['air1'].instances[i].traces[j]
            for k,val in enumerate(air1_trace.series):
                daily_energy_air[air1_trace.series.index[k]]=air1_trace.series[k]*24
        df=pd.DataFrame.from_dict(daily_energy_total,orient='index')
        df_air=pd.DataFrame.from_dict(daily_energy_air,orient='index')
        df=df.sort_index()
        df_air=df_air.sort_index()
        df['kwh']=df[0]
        df['air_kwh']=df_air[0]
        df.drop(0, axis=1, inplace=True)
        house_energy_dict[instance.metadata['dataid']]=df
    return house_energy_dict
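As a worked example of the ×24 conversion: a day whose resampled trace averages 0.5 kW corresponds to 0.5*24 = 12 kWh for that day (assuming, as the 24H resampling above suggests, that each trace value is an average power in kW).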
In [79]:
def get_best_lin_regress(cal_elec,cal_temps):
    #Grid-search CDD/HDD set points and keep the model with the best adjusted R^2
    cal_hdd_temp_range=range(45,70)
    cal_cdd_temp_range=range(45,70)
    model_type=['cdd']
    a=[]
    c=[]
    for type_of_dd in model_type:
        best_r2_adj=0
        for cdd_temp in cal_cdd_temp_range:
            cal_temps=get_cdd(cdd_temp,cal_temps)
            for hdd_temp in cal_hdd_temp_range:
                cal_temps=get_hdd(hdd_temp,cal_temps)
                cal_elec['use_per_day']=cal_elec['kwh']
                cal_all=pd.merge(cal_elec,cal_temps,left_index=True,right_index=True)
                if type_of_dd in ('hdd','both'):
                    cal_all['hdd_per_day']=cal_all['hdd']
                if type_of_dd in ('cdd','both'):
                    cal_all['cdd_per_day']=cal_all['cdd']
                results=degree_day_regression(cal_all,type_of_dd)
                r2_adj=np.array(results['R2_adj'])[0]
                if(r2_adj>best_r2_adj):
                    best_cdd_temp=cdd_temp
                    best_hdd_temp=hdd_temp
                    best_r2_adj=r2_adj
                    slope_cdd=np.array(results['CDD'])[0]
                    intercept=np.array(results['intercept'])[0]
                    cal_all_best=cal_all
                a.append(r2_adj)
            c.append(a)
            a=[]
    print "Best CDD set point: " + str(best_cdd_temp)
    print "Best r2_adj: " + str(best_r2_adj)
    print "Best CDD slope: " + str(slope_cdd)
    print "Best intercept: " + str(intercept)
    cal_all_test=cal_all_best
    error=[]
    pred_air_daily={}
    daily_proportion_act=[]
    daily_proportion_pred=[]
    time=[]
    for i,val in enumerate(cal_all_test['cdd']):
        real_air_kwh_per_day=float(cal_all_test['air_kwh'][i])
        use_kwh_per_day=float(cal_all_test['kwh'][i])
        #Predicted AC energy from the CDD model, clipped to [0, total use]
        pred_air_kwh_per_day=val*slope_cdd
        if(pred_air_kwh_per_day<0):
            pred_air_kwh_per_day=0
        if(pred_air_kwh_per_day>use_kwh_per_day):
            pred_air_kwh_per_day=use_kwh_per_day
        error.append(real_air_kwh_per_day-pred_air_kwh_per_day)
        daily_proportion_act.append(real_air_kwh_per_day)
        daily_proportion_pred.append(pred_air_kwh_per_day)
        pred_air_daily[cal_all_test['cdd'].index[i]]=100*pred_air_kwh_per_day/use_kwh_per_day
        time.append(cal_all_test['kwh'].index[i])
    daily_perc_act=pd.Series(daily_proportion_act,index=time)
    daily_perc_pred=pd.Series(daily_proportion_pred,index=time)
    df_daily=pd.DataFrame.from_dict(pred_air_daily,orient='index')
    return [df_daily,daily_perc_act,daily_perc_pred,slope_cdd,intercept,best_cdd_temp]
In [20]:
def plot_lin_regress(perc_act,perc_pred,date_start,date_end):
    perc_act_subset=perc_act[date_start:date_end]
    plt.plot(perc_act_subset.index,perc_act_subset)
    perc_pred_subset=perc_pred[date_start:date_end]
    plt.plot(perc_pred_subset.index,perc_pred_subset,'r')
    print "Average Error: " + str(sum(perc_act_subset-perc_pred_subset)/len(perc_pred_subset))
    print "Estimated Total %: " + str(sum(perc_pred_subset)/len(perc_pred_subset[perc_pred_subset!=0]))
    print "Actual Total %: " + str(sum(perc_act_subset)/len(perc_act_subset[perc_act_subset!=0]))
In [21]:
def combine_daily_and_hourly_lin_regress(hourly_perc_act,hourly_perc_pred,df_daily_pred,date_start,date_end):
    hourly_perc_act_subset=hourly_perc_act[date_start:date_end]
    hourly_perc_pred_subset=hourly_perc_pred[date_start:date_end]
    pred_air_daily_temp=[]
    time_temp=[]
    mean_pred_air_daily_new=[]
    mean_pred_air_daily_temp=hourly_perc_pred_subset.resample('D',how='mean')
    for i,val in enumerate(hourly_perc_pred_subset):
        pred_air_daily_temp.append(df_daily_pred[str(hourly_perc_pred_subset.index[i].date())][0][0])
        time_temp.append(hourly_perc_pred_subset.index[i])
        mean_pred_air_daily_new.append(mean_pred_air_daily_temp[str(hourly_perc_pred_subset.index[i].date())])
    hourly_series=pd.Series(pred_air_daily_temp,index=time_temp)
    hourly_mean_series=pd.Series(mean_pred_air_daily_new,index=time_temp)
    d = {'daily_avg_regress' : hourly_series,'hourly_avg': hourly_perc_pred_subset,'daily_mean':hourly_mean_series}
    df=pd.DataFrame(d)
    #rescale each hour by its share of the daily mean, capping at 100%
    df['hourly_normalized']=df['hourly_avg']/df['daily_mean']*df['daily_avg_regress']
    df.loc[df['hourly_normalized']>100,'hourly_normalized'] = 100
    return df
In [22]:
#Outlier Removal: replace implausible temperature readings with a neighboring sample
def remove_outliers(cal_temps,threshold,num_hours):
    outliers=cal_temps['temp'][(cal_temps['temps'] < threshold)].index
    for a,i in enumerate(outliers):
        try:
            cal_temps['temp'][i]=cal_temps['temp'][i-pd.DateOffset(hours=num_hours)]
        except KeyError:
            cal_temps['temp'][i]=cal_temps['temp'][i+pd.DateOffset(hours=num_hours)]
    print 'Replaced ' + str(len(outliers)) + ' outliers.'
    return cal_temps