Energy Data to Realized Savings

  • Run the function-definition cells at the bottom of this notebook (the Functions section) before running any other cells.

In [17]:
import pandas as pd
import urllib
import urllib2
import json
import datetime
import numpy as np
from copy import deepcopy
import pylab
import matplotlib.pyplot as plt
%matplotlib inline

Load Data from CSV Files

These CSV loads are for testing only. In the finished tool, the temperature data will come from an API, and the gas/electric usage data as well as the zip code will come from the user.


In [18]:
cal_temps_text = pd.read_csv('../../data/cal_complete_temps.csv')
index_df_by_date(cal_temps_text)
cal_temps_text = cal_temps_text.sort_index()




In [19]:
cal_gas_text = pd.read_csv('../../data/calt_gas_data.csv')
index_df_by_date(cal_gas_text)
cal_gas_text = cal_gas_text.sort_index()




In [20]:
cal_elec_text = pd.read_csv('../../data/cal_elec_data.csv')
index_df_by_date(cal_elec_text)
cal_elec_text = cal_elec_text.sort_index()




In [21]:
cal_prism_text = pd.read_csv('../../data/caltest_prism_run_elec.csv')
print cal_prism_text.columns
index_df_by_date(cal_prism_text)
cal_prism_text = cal_prism_text.sort_index()


Index([u'account_id', u'kwh', u'iou', u'date_num', u'bill_start_date', u'bill_end_date', u'bill_length', u'iou_id', u'site_number', u'ca_climate_zone', u'zipcode', u'wthrfile', u'weather_station', u'retrofit_end_date', u'complete_date', u'station_id', u'station_number', u'isvalid', u'station_name', u'station', u'stationnum', u'clizn', u'elevation', u'hdd_start', u'cdd_start', u'date_merge', u'hdd_end', u'cdd_end', u'hdd', u'cdd', u'hdd60pd', u'cdd65pd', u'retrofit_end_date_new', u'pre', u'date', u'upd', u'days'], dtype='object')


In [22]:
#This allows us to see all of the available account numbers we can test with.
cal_elec_text['account_number'].unique()


Out[22]:
array(['4GY9LD', '6033367454', '6383593286', '8WZD1Z', '9264652903',
       '9608731398', '9IC5HT', 'E3RCZ8', 'FBDDJ3', 'GRUZBH', 'NKP4ZU',
       'NP7PRY', 'OVYM2X', 'S812VL', 'SEHDPS1526388512', 'SEHDPS1526388932'], dtype=object)

In [30]:
#Currently, we take an account number and look up that account number's electricity usage
#in the cal_elec and cal_prism dataframes. Using the cal_prism dataframe, we find that account
#number's corresponding zip code. We then take the zip code and use Google's API to find the
#corresponding lat/lng. We take the lat/lng and use NREL's API to find the corresponding weather
#station. We then use that station_id to find the corresponding temperatures for that place.

#account number -> zip code using cal_prism -> lat/lng using google -> station_id using NREL ->
#temperature using cal_temps


account_number='OVYM2X'
cal_elec=deepcopy(cal_elec_text.loc[cal_elec_text['account_number'] == account_number])
cal_gas=deepcopy(cal_gas_text.loc[cal_gas_text['account_number'] == account_number])
cal_prism=deepcopy(cal_prism_text.loc[cal_prism_text['account_id'] == account_number])

#Input parameters (these will eventually come from the front-end website)
zip_code=str(cal_prism['zipcode'].iloc[0])
pre_date=pd.to_datetime('1/10/2010')
post_date=pd.to_datetime('1/10/2010')
cal_hdd_temp_range=range(50,65)
cal_cdd_temp_range=range(50,65)
model_type=['both']

In [31]:
station_id=get_station_id_from_zip_code(zip_code)
cal_temps=deepcopy(cal_temps_text.loc[cal_temps_text['usaf'] == station_id])

Set Point Search Loop

- Currently this runs only on electricity usage; replacing cal_elec with cal_gas would do the same for gas. A minimal sketch of the gas prep follows.
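The sketch below shows the per-day prep the loop in the next cell would need for gas. It assumes cal_gas carries a usage column named 'therms' and a 'datenum' column like cal_elec's; both column names are assumptions, so adjust them to match calt_gas_data.csv.

In [ ]:
#Hypothetical gas variant of the per-day prep used in the electricity loop below.
#The 'therms' and 'datenum' column names are assumptions about calt_gas_data.csv.
cal_gas['use_per_day']=cal_gas['therms']/cal_gas['datenum'].diff()
cal_gas['num_days']=cal_gas['datenum'].diff()
cal_all_gas=pd.merge(cal_gas,cal_temps,left_index=True,right_index=True)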


In [32]:
#Grid search over candidate CDD/HDD set point temperatures; keep the combination
#with the highest adjusted R^2 from the pre-period regression.
a=[]
c=[]  #c collects one row of adjusted R^2 values per cdd_temp, used for the heatmap below
for type_of_dd in model_type:
    best_r2_adj=0
    for cdd_temp in cal_cdd_temp_range:
        cal_temps=get_cdd(cdd_temp,cal_temps)
        for hdd_temp in cal_hdd_temp_range:
            cal_temps=get_hdd(hdd_temp,cal_temps)
            cal_elec['use_per_day']=cal_elec['kwh']/cal_elec['datenum'].diff()
            cal_elec['num_days']=cal_elec['datenum'].diff()
            cal_all=pd.merge(cal_elec,cal_temps,left_index=True,right_index=True)
            if type_of_dd in ('hdd','both'):
                cal_all['hdd_per_day']=cal_all['hdd_cum'].diff()/cal_all['datenum'].diff()
            if type_of_dd in ('cdd','both'):
                cal_all['cdd_per_day']=cal_all['cdd_cum'].diff()/cal_all['datenum'].diff()
            cal_all_pre=cal_all.sort_index()[:pre_date]
            cal_all_post=cal_all.sort_index()[post_date:]
            results=degree_day_regression(cal_all_pre,type_of_dd)
            r2_adj=np.array(results['R2_adj'])[0]
            
            if(r2_adj>best_r2_adj):
                best_cdd_temp=cdd_temp
                best_hdd_temp=hdd_temp
                best_r2_adj=r2_adj
                slope_hdd=np.array(results['HDD'])[0]
                slope_cdd=np.array(results['CDD'])[0]
                intercept=np.array(results['intercept'])[0]
            a.append(r2_adj)
        c.append(a)
        a=[]
            #print str(cdd_temp)+"," +str(hdd_temp)+" "+ str(r2_adj)
print "Best CDD set point:" + str(best_cdd_temp)
print "BEST HDD set point:" + str(best_hdd_temp)
print "BEST r2_adj:" + str(best_r2_adj)
print "Best HDD slope:"+str(slope_hdd)
print "Best CDD slope:"+str(slope_cdd)
print "Best intercept:"+str(intercept)


Best CDD set point:64
BEST HDD set point:52
BEST r2_adj:0.746580860892
Best HDD slope:0.363093394954
Best CDD slope:-0.408097252833
Best intercept:15.6376619454
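As a quick sanity check on the coefficients printed above: predicted daily use is intercept + slope_hdd*hdd_per_day + slope_cdd*cdd_per_day. The degree-day values below are made up for illustration; only the coefficients come from the run above.

In [ ]:
#Predicted kWh/day from the fitted degree-day model (illustrative degree-day inputs, not real data).
hdd_per_day_example=5.0
cdd_per_day_example=0.0
predicted_use=intercept+slope_hdd*hdd_per_day_example+slope_cdd*cdd_per_day_example
print predicted_use  #roughly 15.64 + 0.363*5 = 17.45 kWh/day with the coefficients above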

In [45]:
#Scatter of daily use vs. temperature, with the fitted CDD (x1) and HDD (x2) model lines overlaid.
plt.plot(cal_all['temps'],cal_all['use_per_day'],'.')
x1=np.arange(60,100,(36)/float(len(cal_temps['temps'])))
x2=np.arange(30,60,(22)/float(len(cal_temps['temps'])))
y=(x1-best_cdd_temp)*slope_cdd+intercept
y2=(x2-best_hdd_temp)*slope_hdd+intercept
plt.plot(x1,y,'k')
plt.plot(x2,y2,'b')


Out[45]:
[<matplotlib.lines.Line2D at 0x7f4750eecc50>]

In [ ]:
daily_savings=[]
num_post_days_sum=0
for i in range(len(cal_all_post)):
    cdd_post_day=cal_all_post['cdd_per_day'].iloc[i]
    hdd_post_day=cal_all_post['hdd_per_day'].iloc[i]
    use_post_day=cal_all_post['use_per_day'].iloc[i]
    num_post_days=cal_all_post['num_days'].iloc[i]
    num_post_days_sum=num_post_days_sum+num_post_days
    #Use per day predicted by the pre-period model; savings are valued at an assumed flat rate of $0.15/kWh
    kwh_per_day=(hdd_post_day*slope_hdd+cdd_post_day*slope_cdd+intercept)
    daily_savings.append((kwh_per_day-use_post_day)*0.15*num_post_days)
print 'You have an estimated savings of $'+str(sum(daily_savings)/num_post_days_sum*365)+" per year!"
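For reference, the per-read arithmetic in the loop above, with illustrative numbers (the 30-day bill and the kWh/day figures are made up; $0.15/kWh is the flat rate assumed in the code):

In [ ]:
#Example read: the model predicts 17.0 kWh/day and the meter shows 15.0 kWh/day over a 30-day bill.
predicted_kwh_per_day=17.0
actual_kwh_per_day=15.0
bill_days=30
savings_for_read=(predicted_kwh_per_day-actual_kwh_per_day)*0.15*bill_days
print savings_for_read  #(17.0-15.0)*0.15*30 = $9.00 for this read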

In [ ]:
daily_savings=[]
num_pre_days_sum=0
for i in range(len(cal_all_pre)):
    cdd_pre_day=cal_all_pre['cdd_per_day'].iloc[i]
    hdd_pre_day=cal_all_pre['hdd_per_day'].iloc[i]
    use_pre_day=cal_all_pre['use_per_day'].iloc[i]
    num_pre_days=cal_all_pre['num_days'].iloc[i]
    #x != x is only true for NaN, so these checks zero out the NaNs left by the first .diff() row
    if(num_pre_days != num_pre_days):
        num_pre_days=0
    if(use_pre_day != use_pre_day):
        use_pre_day=0
    num_pre_days_sum=num_pre_days_sum+num_pre_days
    kwh_per_day=(hdd_pre_day*slope_hdd+cdd_pre_day*slope_cdd+intercept)
    if(kwh_per_day != kwh_per_day):
        kwh_per_day=0
    daily_savings.append((kwh_per_day-use_pre_day)*0.15*num_pre_days)
print 'You have an estimated savings of $'+str(sum(daily_savings)/num_pre_days_sum*365)+" per year!"

In [20]:
#This plots the adjusted R^2 over the grid of HDD and CDD set points so we can better visualize where the best set point temperatures are.
plt.clf()
pylab.rcParams['figure.figsize'] = 12, 9
cal_hdd_min=cal_hdd_temp_range[0]
cal_hdd_max=cal_hdd_temp_range[-1]
cal_cdd_min=cal_cdd_temp_range[0]
cal_cdd_max=cal_cdd_temp_range[-1]
#Rows of c vary with cdd_temp and columns with hdd_temp, so HDD runs along x and CDD along y.
extent = [cal_hdd_min,cal_hdd_max,cal_cdd_min,cal_cdd_max]
plt.imshow(c,extent=extent,origin='lower',cmap='Blues',interpolation='none')
plt.xlabel('HDD Set Point Temperature')
plt.ylabel('CDD Set Point Temperature')
plt.show()


Functions


In [3]:
def index_df_by_date(df):
    #Parse the 'date' column and make it the dataframe's index, in place.
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df.index = df.index.snap() # snap timestamps to the nearest frequency
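A quick illustration with toy data (not from the CSVs): once 'date' is the index, the label-based date slicing used in the set point loop works directly.

In [ ]:
#Toy dataframe to show date indexing and slicing by a date label.
toy = pd.DataFrame({'date': ['2010-01-01', '2010-02-01', '2010-03-01'], 'kwh': [300, 280, 260]})
index_df_by_date(toy)
print toy.sort_index()[:pd.to_datetime('2010-02-01')]  #rows on or before the cutoff date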

In [4]:
def get_station_id_from_zip_code(zip_code):
    [lat,lng]=get_lat_lng_from_zip_code(zip_code)
    station_id=get_station_id_from_lat_lng(lat,lng)
    return station_id
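Example call, assuming the API_KEY placeholders in the two helpers below are replaced with real NREL and Google Maps keys (the zip code here is just an example):

In [ ]:
#Look up the closest TMY3 station id for a zip code (requires working API keys and a network connection).
print get_station_id_from_zip_code('94720')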

In [5]:
def get_station_id_from_lat_lng(lat,lng):
    #Query NREL's solar data API for the closest TMY3 station and keep the numeric part of its id.
    f = urllib2.urlopen('http://developer.nrel.gov/api/solar/data_query/v1.json?api_key=API_KEY&lat='+str(lat)+'&lon='+str(lng))
    json_string = f.read()
    parsed_json = json.loads(json_string)
    station_id_unicode=parsed_json['outputs']['tmy3']['id']
    station_id=int(str(station_id_unicode).split('-')[1])
    return station_id

In [6]:
def get_lat_lng_from_zip_code(zip_code):
    #Geocode the zip code with the Google Maps Geocoding API and return [lat, lng].
    zip_code=urllib.quote_plus(zip_code)
    f = urllib2.urlopen('https://maps.googleapis.com/maps/api/geocode/json?address='+zip_code+'&key=API_KEY')
    json_string = f.read()
    parsed_json_lat_lng = json.loads(json_string)
    lat=parsed_json_lat_lng['results'][0]['geometry']['location']['lat']
    lng=parsed_json_lat_lng['results'][0]['geometry']['location']['lng']
    return [lat,lng]

In [7]:
def get_hdd(ref_temp,df):
    #Heating degree days: how far the temperature falls below the reference, clipped at zero.
    df['hdd']=ref_temp-df.temps
    df.loc[df.hdd<0,'hdd']=0
    df['hdd_cum']=df.hdd.cumsum()
    return df

def get_cdd(ref_temp,df):
    #Cooling degree days: how far the temperature rises above the reference, clipped at zero.
    df['cdd']=df.temps-ref_temp
    df.loc[df.cdd<0,'cdd']=0
    df['cdd_cum']=df.cdd.cumsum()
    return df
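A small worked example of the degree-day helpers (toy temperatures; in the notebook, df is cal_temps, which already has a 'temps' column):

In [ ]:
#With a 65F reference, temps of [60, 70, 65] give hdd = [5, 0, 0] and cdd = [0, 5, 0].
toy_temps = pd.DataFrame({'temps': [60.0, 70.0, 65.0]})
toy_temps = get_hdd(65, toy_temps)
toy_temps = get_cdd(65, toy_temps)
print toy_temps[['temps','hdd','cdd','hdd_cum','cdd_cum']]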

In [8]:
def degree_day_regression(df, x_opt='both'):

    '''Run the weather normalization regression on energy use data.
        --------------
        df: dataframe that includes
            use per day (use_per_day)
            heating degree days per day (hdd_per_day)
            cooling degree days per day (cdd_per_day)
        ---------------
        x_opt: options for the regression function
            'hdd': run regression with just heating degree days
            'cdd': run regression with just cooling degree days
            'both' (default): run regression with both heating and cooling degree days
    '''
    
    #select model based on options
    if x_opt == 'hdd':
        covar = {'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x = covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1], results.beta[0], results.std_err[0], results.r2, results.r2_adj, results.nobs ]], 
             columns = ['intercept', 'intercept_std_err', 'HDD', 'HDD_std_err', 'R2', 'R2_adj','N_reads'])
    
    elif x_opt == 'cdd':
        covar = {'CDD': df.cdd_per_day}
        results = pd.ols(y=df.use_per_day, x = covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1], results.beta[0], results.std_err[0], results.r2, results.r2_adj, results.nobs]], 
             columns = ['intercept', 'intercept_std_err', 'CDD', 'CDD_std_err', 'R2', 'R2_adj','N_reads'])

    
    elif x_opt == 'both':
        covar = {'CDD': df.cdd_per_day, 'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x = covar)
        return pd.DataFrame([[results.beta[2], results.std_err[2], results.beta[0], results.std_err[0], results.beta[1], results.std_err[1], results.r2, results.r2_adj, results.nobs]], 
             columns = ['intercept', 'intercept_std_err', 'CDD', 'CDD_std_err', 'HDD','HDD_std_err', 'R2', 'R2_adj','N_reads'])
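pd.ols comes from the pandas 0.x era and was removed in later releases. If this notebook is ever run against a newer stack, a rough statsmodels equivalent of the 'both' branch might look like the sketch below; it is not a drop-in replacement, just an outline that mirrors the dataframe returned above.

In [ ]:
import statsmodels.api as sm

def degree_day_regression_sm(df):
    #OLS of use per day on HDD and CDD per day with an intercept (statsmodels version of the 'both' case).
    X = sm.add_constant(df[['cdd_per_day','hdd_per_day']].rename(
        columns={'cdd_per_day':'CDD','hdd_per_day':'HDD'}))
    results = sm.OLS(df['use_per_day'], X, missing='drop').fit()
    return pd.DataFrame([[results.params['const'], results.bse['const'],
                          results.params['CDD'], results.bse['CDD'],
                          results.params['HDD'], results.bse['HDD'],
                          results.rsquared, results.rsquared_adj, int(results.nobs)]],
        columns=['intercept','intercept_std_err','CDD','CDD_std_err',
                 'HDD','HDD_std_err','R2','R2_adj','N_reads'])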
