In [1]:
%matplotlib inline
import pymc
from pymc import MCMC
from pymc.Matplot import plot
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import urllib2,urllib
import json
import datetime
from copy import deepcopy
import pylab
import os
from scipy import stats

loaded_data = False

In [2]:
# run this cell to generate fake data

def create_regression_data(x,a,b,sd):
    y = a + b * x + np.random.normal(0,sd,(x.shape[0],))
    return y

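# two regimes with the same temperature distribution: the early half responds
# steeply to temperature (intercept 0, slope 0.9) and the late half more
# flatly (intercept 20, slope 0.3), with a known change point at n/2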
n = 200
early_temps = np.random.normal(40,20,(n/2,))
late_temps = np.random.normal(40,20,(n/2,))
temp_data = np.concatenate([early_temps,late_temps],axis=0)

early_energy = create_regression_data(early_temps,0,.9,10)
late_energy = create_regression_data(late_temps,20,.3,10)
energy_data = np.concatenate([early_energy,late_energy],axis=0)
print temp_data.shape
print energy_data.shape
print n


(200,)
(200,)
200

In [5]:
# run this cell to use real data

# data directory
data_dir = os.path.join(os.pardir,os.pardir,'data','weather_norm')

# function to reindex a dataframe by its date column (modifies df in place)
def index_df_by_date(df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df.index = df.index.snap()  # snap to the nearest regular frequency
    df.sort_index(inplace=True)
 
# functions for finding the right weather station given a zip code
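# (the zip code is geocoded to lat/lng with the Google geocoding API, then the
#  nearest TMY3 station id is looked up via the NREL solar data query API)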
def get_station_id_from_zip_code(zip_code):
    [lat,lng]=get_lat_lng_from_zip_code(zip_code)
    station_id=get_station_id_from_lat_lng(lat,lng)
    return station_id

def get_station_id_from_lat_lng(lat,lng):
    f = urllib2.urlopen('http://developer.nrel.gov/api/solar/data_query/v1.json?api_key=API_KEY&lat='+str(lat)+'&lon='+str(lng))
    json_string = f.read()
    parsed_json = json.loads(json_string)
    station_id_unicode = parsed_json['outputs']['tmy3']['id']
    station_id = int(str(station_id_unicode).split('-')[1])
    return station_id

def get_lat_lng_from_zip_code(zip_code):
    zip_code = urllib.quote_plus(zip_code)  # URL-encode spaces, commas, etc.
    f = urllib2.urlopen('https://maps.googleapis.com/maps/api/geocode/json?address='+zip_code+'&key=API_KEY')
    json_string = f.read()
    parsed_json_lat_lng = json.loads(json_string)
    lat=parsed_json_lat_lng['results'][0]['geometry']['location']['lat']
    lng=parsed_json_lat_lng['results'][0]['geometry']['location']['lng']
    return [lat,lng]
    
# read data
if not loaded_data:
    cal_temps_text = pd.read_csv(os.path.join(data_dir,'cal_complete_temps.csv'))
    cal_elec_text = pd.read_csv(os.path.join(data_dir,'cal_elec_data.csv'))
    cal_prism_text = pd.read_csv(os.path.join(data_dir,'caltest_prism_run_elec.csv'))
    index_df_by_date(cal_temps_text)
    index_df_by_date(cal_elec_text)
    index_df_by_date(cal_prism_text)
    loaded_data = True

# get a particular account number
account = 7
account_number = cal_elec_text['account_number'].unique()[account]

# get the data for that account
cal_elec = deepcopy(cal_elec_text.loc[cal_elec_text['account_number'] == account_number])
cal_prism = deepcopy(cal_prism_text.loc[cal_prism_text['account_id'] == account_number])
zip_code = str(cal_prism['zipcode'].iloc[0])
station_id = get_station_id_from_zip_code(zip_code)
cal_temps = deepcopy(cal_temps_text.loc[cal_temps_text['usaf'] == station_id])

# normalize usage by billing-period length: datenum.diff() gives the number of
# days between consecutive readings
cal_elec['use_per_day'] = cal_elec['kwh'] / cal_elec['datenum'].diff()

# merge electricity and temperature datasets on their shared date index
cal_all = pd.merge(cal_elec, cal_temps, left_index=True, right_index=True)

temp_data = np.array(cal_all.temps.fillna(cal_all.temps.mean()))
energy_data = np.array(cal_all.use_per_day.fillna(cal_all.use_per_day.mean()))

n = temp_data.shape[0]

print temp_data.shape
print energy_data.shape
print n


(40,)
(40,)
40

In [6]:
# plot the data: first both series against observation index, then energy use
# against temperature
plt.plot(np.array([temp_data, energy_data]).T, '.')
plt.show()
plt.scatter(temp_data, energy_data)
plt.show()



In [7]:
# determine reasonable priors: fit an ordinary least-squares line to the full
# dataset and use it to center the slope and intercept priors below
slope, intercept, r_value, p_value, std_err = stats.linregress(temp_data,energy_data)
print slope
print intercept
print r_value
print p_value
print std_err


-0.113007449295
19.7422271855
-0.363518927036
0.0211274203432
0.0469798448881

In [8]:
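# change-point regression: a discrete switchpoint splits the observations into
# an "early" and a "late" regime, each with its own intercept, slope, and noise
# level; the intercept and slope priors are centered on the OLS fit above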
switchpoint = pymc.DiscreteUniform('switchpoint', lower=0, upper=n)
early_intercept = pymc.Normal('early_intercept', mu=intercept, tau=2)
late_intercept = pymc.Normal('late_intercept', mu=intercept, tau=2)
early_slope = pymc.Normal('early_slope', mu=slope, tau=1)
late_slope = pymc.Normal('late_slope', mu=slope, tau=1)
early_sd = pymc.Uniform('early_sd', lower=0, upper=20)
late_sd = pymc.Uniform('late_sd', lower=0, upper=20)

@pymc.deterministic(plot=False)
def intercept_params(s=switchpoint, e=early_intercept, l=late_intercept):
    ''' Concatenate intercepts '''
    out = np.empty(len(temp_data))
    out[:s] = e
    out[s:] = l
    return out

@pymc.deterministic(plot=False)
def slope_params(s=switchpoint, e=early_slope, l=late_slope):
    ''' Concatenate slopes '''
    out = np.empty(len(temp_data))
    out[:s] = e
    out[s:] = l
    return out

@pymc.deterministic(plot=False)
def sd_params(s=switchpoint, e=early_sd, l=late_sd):
    ''' Concatenate standard deviations '''
    out = np.empty(len(temp_data))
    out[:s] = e
    out[s:] = l
    return out

@pymc.deterministic(plot=False)
def energy_est(slope=slope_params, intercept=intercept_params, x=temp_data):
    '''linear regression estimate of energy use'''
    return intercept + (slope * x)

# pymc.Normal is parameterized by precision (tau = 1/sd**2), so convert the
# regime standard deviations to precisions
tau_params = pymc.Lambda('tau_params', lambda sd=sd_params: 1.0 / sd ** 2)

likelihood = pymc.Normal('likelihood', mu=energy_est, tau=tau_params, value=energy_data, observed=True)
estimate = pymc.Normal('estimate', mu=energy_est, tau=tau_params)

# include the observed likelihood so that sampling is conditioned on the data
model = pymc.Model([likelihood, estimate])
M = MCMC(model)

In [9]:
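# draw 10,000 iterations, discard the first 5,000 as burn-in, and keep every
# 50th sample thereafter (100 retained draws per traced variable)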
M.sample(10000,5000,50)


 [-----------------100%-----------------] 10000 of 10000 complete in 577.5 sec

In [10]:
plot(M)


Plotting early_intercept
Plotting late_sd
Plotting late_intercept
Plotting early_slope
Plotting switchpoint
Plotting early_sd
Plotting late_slope

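In [ ]:
# A minimal follow-up sketch (not part of the original run): M.trace(name)[:]
# returns the retained posterior draws for a traced variable, which can be
# summarized directly.
sp_samples = M.trace('switchpoint')[:]
print 'posterior median switchpoint:', np.median(sp_samples)
print 'posterior mean early slope:', M.trace('early_slope')[:].mean()
print 'posterior mean late slope:', M.trace('late_slope')[:].mean()
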
In [ ]: