In [1]:
%matplotlib inline
import pymc
from pymc import MCMC
from pymc.Matplot import plot
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import urllib2,urllib
import json
import datetime
from copy import deepcopy
import pylab
import os
from scipy import stats
loaded_data = False
In [2]:
# run this cell to generate fake data
def create_regression_data(x,a,b,sd):
y = a + b * x + np.random.normal(0,sd,(x.shape[0],))
return y
n = 200
early_temps = np.random.normal(40,20,(n/2,))
late_temps = np.random.normal(40,20,(n/2,))
temp_data = np.concatenate([early_temps,late_temps],axis=0)
early_energy = create_regression_data(early_temps,0,.9,10)
late_energy = create_regression_data(late_temps,20,.3,10)
energy_data = np.concatenate([early_energy,late_energy],axis=0)
print temp_data.shape
print energy_data.shape
print n
In [5]:
# run this cell to use real data
# data directory
data_dir = os.path.join(os.pardir,os.pardir,'data','weather_norm')
# function to reindex dataframe
def index_df_by_date(df):
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df.index.snap() # snap to nearest frequency
df.sort_index()
# functions for finding the right weather station given a zip code
def get_station_id_from_zip_code(zip_code):
[lat,lng]=get_lat_lng_from_zip_code(zip_code)
station_id=get_station_id_from_lat_lng(lat,lng)
return station_id
def get_station_id_from_lat_lng(lat,lng):
f = urllib2.urlopen('http://developer.nrel.gov/api/solar/data_query/v1.json?api_key=API_KEY&lat='+str(lat)+'&lon='+str(lng))
json_string = f.read()
parsed_json = json.loads(json_string)
station_id_unicode=parsed_json['outputs']['tmy3']['id']
station_id=int(str.split(str(station_id_unicode),'-')[1])
return station_id
def get_lat_lng_from_zip_code(zip_code):
zip_code=zip_code.replace(' ','+')
zip_code=zip_code.replace(',','%2C')
f = urllib2.urlopen('https://maps.googleapis.com/maps/api/geocode/json?address='+zip_code+'&key=API_KEY')
json_string = f.read()
parsed_json_lat_lng = json.loads(json_string)
lat=parsed_json_lat_lng['results'][0]['geometry']['location']['lat']
lng=parsed_json_lat_lng['results'][0]['geometry']['location']['lng']
return [lat,lng]
# read data
if not loaded_data:
cal_temps_text = pd.read_csv(os.path.join(data_dir,'cal_complete_temps.csv'))
cal_elec_text = pd.read_csv(os.path.join(data_dir,'cal_elec_data.csv'))
cal_prism_text = pd.read_csv(os.path.join(data_dir,'caltest_prism_run_elec.csv'))
index_df_by_date(cal_temps_text)
index_df_by_date(cal_elec_text)
index_df_by_date(cal_prism_text)
loaded_data = True
# get a particular account number
account = 7
account_number = cal_elec_text['account_number'].unique()[account]
# get the data for that account
cal_elec=deepcopy(cal_elec_text.loc[cal_elec_text['account_number'] == account_number])
cal_prism=deepcopy(cal_prism_text.loc[cal_prism_text['account_id'] == account_number])
zip_code=str(cal_prism['zipcode'][0])
station_id=get_station_id_from_zip_code(zip_code)
cal_temps=deepcopy(cal_temps_text.loc[cal_temps_text['usaf'] == station_id])
# normalize on number of days
cal_elec['use_per_day']=cal_elec['kwh']/cal_elec['datenum'].diff()
# merge electricity and temperature datasets
cal_all=pd.merge(cal_elec,cal_temps,left_index=True,right_index=True)
temp_data = np.array(cal_all.temps.fillna(cal_all.temps.mean()))
energy_data = np.array(cal_all.use_per_day.fillna(cal_all.use_per_day.mean()))
n = temp_data.shape[0]
print temp_data.shape
print energy_data.shape
print n
In [6]:
# plot the data
plt.plot(np.array([temp_data,energy_data]).T,'.')
plt.show()
plt.scatter(temp_data,energy_data)
plt.show()
In [7]:
# determine logical priors
slope, intercept, r_value, p_value, std_err = stats.linregress(temp_data,energy_data)
print slope
print intercept
print r_value
print p_value
print std_err
In [8]:
switchpoint = pymc.DiscreteUniform('switchpoint', lower=0, upper=n)
early_intercept = pymc.Normal('early_intercept', mu=slope, tau=2)
late_intercept = pymc.Normal('late_intercept', mu=slope, tau=2)
early_slope = pymc.Normal('early_slope', mu=intercept, tau=1)
late_slope = pymc.Normal('late_slope', mu=intercept, tau=1)
early_sd = pymc.Uniform('early_sd', lower=0, upper=20)
late_sd = pymc.Uniform('late_sd', lower=0, upper=20)
@pymc.deterministic(plot=False)
def intercept_params(s=switchpoint, e=early_intercept, l=late_intercept):
''' Concatenate intercepts '''
out = np.empty(len(temp_data))
out[:s] = e
out[s:] = l
return out
@pymc.deterministic(plot=False)
def slope_params(s=switchpoint, e=early_slope, l=late_slope):
''' Concatenate slopes '''
out = np.empty(len(temp_data))
out[:s] = e
out[s:] = l
return out
@pymc.deterministic(plot=False)
def sd_params(s=switchpoint, e=early_sd, l=late_sd):
''' Concatenate slopes '''
out = np.empty(len(temp_data))
out[:s] = e
out[s:] = l
return out
@pymc.deterministic(plot=False)
def energy_est(slope=slope_params, intercept=intercept_params, x=temp_data):
'''linear regression estimate of parameters'''
out = np.empty(len(temp_data))
return intercept + (slope * x)
likelihood = pymc.Normal('likelihood', mu=energy_est, tau=sd_params, value=energy_data, observed=True)
estimate = pymc.Normal('estimate', mu=energy_est, tau=sd_params)
model = pymc.Model([estimate])
M = MCMC(model)
In [9]:
M.sample(10000,5000,50)
In [10]:
plot(M)
In [ ]: