In [1]:
import pandas as pd
import urllib2,urllib
import json
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import json
import urllib2
import numpy as np
import pandas as pd
import collections
from scipy import stats
import os
from disaggregator import weather
from disaggregator import utils
from disaggregator import appliance as app
from disaggregator import OakParkDatasetAdapter as opda
from disaggregator import GreenButtonDatasetAdapter as gbda
from disaggregator import linregress as lng
import xml.etree.ElementTree as ET
import random
from datetime import datetime, timedelta, date
import pylab
import pickle
# Default size (width, height) applied to figures created after this point.
pylab.rcParams['figure.figsize'] = 12,9
In [2]:
#oak_park_temps = weather.get_weather_data_as_df('API_KEY','60302','IL', '20140301', '20140730')
#with open('temp_oakpark_20140301_20140801.pkl','w') as f:
# pickle.dump(oak_park_temps,f)
#import pickle
#with open('temp_oakpark_20120101_20140730.pkl','rb') as f:
# oak_park_temps_df=pickle.load(f)
In [4]:
# Load the cached temperature DataFrame, average it into hourly bins and
# run it through the project's low-outlier filter.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
import pickle

with open('../../docs/tutorials/temp_oakpark_20120101_20140730.pkl','rb') as temps_file:
    temps_df = pickle.load(temps_file)

# Re-wrap the 'temp' column on the frame's own index before resampling.
temps_series_orig = pd.Series(temps_df['temp'], index=temps_df.index)

# Legacy pandas resample API: 'H' = hourly bins, aggregated with the mean.
df = pd.DataFrame(temps_series_orig).resample('H', how='mean')

# presumably drops implausibly low readings from the 'temp' column -- confirm
# against disaggregator.weather._remove_low_outliers_df.
temps_series = weather._remove_low_outliers_df(df, 'temp')['temp']
In [5]:
# Build the list of XML input files found in the data directory.
#
# * Only names that END in '.xml' are kept (a substring test would also pick
#   up names such as 'house.xml.bak').
# * The listing is sorted because os.listdir returns entries in arbitrary
#   order; later cells index and slice this list, so a stable order is needed
#   for reproducible runs.
XML_DIR = "../../data/non-duplicates/xml/"
oak_park_list = sorted(
    os.path.join(XML_DIR, filename)
    for filename in os.listdir(XML_DIR)
    if filename.endswith('.xml')
)
# Show the first path as a quick sanity check (single-argument print works on
# both Python 2 and Python 3).
print(oak_park_list[0])
In [7]:
# Run the temperature regressions on the first N_HOUSES_TO_FIT files of
# oak_park_list and collect one result dict per file in results_list.
# Each file is read as raw bytes, turned into a trace, resampled to daily
# sums and regressed against temps_series (with plotting enabled).
N_HOUSES_TO_FIT = 1  # raise this to fit more of oak_park_list

results_list = []
for filepath in oak_park_list[:N_HOUSES_TO_FIT]:
    with open(filepath, 'rb') as f:
        xml_house = f.read()
    trace = gbda.get_trace(xml_house)
    trace = trace.resample('D', method='sum')
    results_dict = lng.run_regressions(trace.series, temps_series, plot=True)
    # Echo the file that was just processed.
    print(filepath)
    results_list.append(results_dict)
In [241]:
# Gather the per-file regression results into flat lists and plot them.
#
# Keys are looked up directly by value. (Comparing keys with `is` tests
# object identity and only works when the interpreter happens to intern both
# strings.)
#
# Collected lists:
#   best_cdd_temps / best_hdd_temps : values stored under 'best_cdd_temp' /
#                                     'best_hdd_temp', appended whenever the
#                                     key is present
#   r2_adj_cdd / r2_adj_hdd         : 'best_r2_adj_*' values greater than -inf
#   slope_cdd / slope_hdd           : 'slope_*' values that are not None
best_stuff = {}
best_cdd_temps = []
best_hdd_temps = []
r2_adj_cdd = []
r2_adj_hdd = []
slope_hdd = []
slope_cdd = []

NEG_INF = float("-inf")

print(results_list[0].keys())
for result_dict in results_list:
    if 'best_cdd_temp' in result_dict:
        best_cdd_temps.append(result_dict['best_cdd_temp'])
    if 'best_hdd_temp' in result_dict:
        best_hdd_temps.append(result_dict['best_hdd_temp'])

    # A missing or None entry is skipped, as is the -inf sentinel.
    r2_cdd = result_dict.get('best_r2_adj_cdd')
    if r2_cdd is not None and r2_cdd > NEG_INF:
        r2_adj_cdd.append(r2_cdd)
    r2_hdd = result_dict.get('best_r2_adj_hdd')
    if r2_hdd is not None and r2_hdd > NEG_INF:
        r2_adj_hdd.append(r2_hdd)

    hdd_slope = result_dict.get('slope_hdd')
    if hdd_slope is not None:
        slope_hdd.append(hdd_slope)
    cdd_slope = result_dict.get('slope_cdd')
    if cdd_slope is not None:
        slope_cdd.append(cdd_slope)

# Slope histograms, both drawn on the same axes. min()/max() raise ValueError
# on an empty list, so each plot is skipped when there is nothing to show.
bins = 10
if slope_hdd:
    plt.hist(slope_hdd, bins, range=[min(slope_hdd), max(slope_hdd)])
if slope_cdd:
    plt.hist(slope_cdd, bins, range=[min(slope_cdd), max(slope_cdd)])
if slope_hdd and slope_cdd:
    # x axis runs from the smallest hdd slope to the largest cdd slope.
    plt.xlim([min(slope_hdd), max(slope_cdd)])

# Adjusted R^2 histograms on a new figure, overlaid semi-transparently.
plt.figure()
bins = 20
plt.hist(r2_adj_hdd, bins, range=[0, 1], alpha=0.5)
plt.hist(r2_adj_cdd, bins, range=[0, 1], alpha=0.5)
plt.xlim([0, 1])
print(np.mean(r2_adj_cdd))
In [232]:
# For every file in oak_park_list: read the XML, build a trace, resample it
# to daily sums, run the regression-and-predict step against temps_series and
# keep the third returned series in diff_list (one entry per file, in
# oak_park_list order).
#
# total_series, air_series and diff_series stay bound to the values from the
# final iteration after the loop; later cells read diff_series directly, so
# these names are kept.
diff_list = []
for filepath in oak_park_list:
    with open(filepath, 'rb') as f:
        xml_house = f.read()
    trace = gbda.get_trace(xml_house)
    trace = trace.resample('D', method='sum')
    total_series, air_series, diff_series = lng.run_regressions_and_predict(
        trace.series, temps_series)
    diff_list.append(diff_series)
In [277]:
# For each series in diff_list, compute the fraction of its values whose
# absolute value is below 5000 and append that fraction to diffs_perc.
# After the loop, `count`, `a`, `temp` and `sureness` hold the values for the
# last series only.
a = 0
count = 0
diffs_perc = []
for diff_series in diff_list:
    # One True/False flag per value: is |value| under the 5000 threshold?
    below_threshold = [abs(val) < 5000 for val in diff_series]
    a = len(below_threshold)               # number of values seen
    count = below_threshold.count(True)    # number of values under threshold
    temp = count/float(a)
    diffs_perc.append(temp)
    # Bucket the fraction into a coarse confidence label.
    sureness = ('not sure' if temp < .33
                else 'somewhat sure' if temp < .66
                else 'fairly sure')
In [287]:
# Ratio of `count` to `a` as a float, using whatever values the most recently
# run cell left in those two names (float() forces true division on Python 2).
count/float(a)
Out[287]:
In [288]:
# Percentage (0-100) of the values in the currently bound diff_series whose
# absolute value is below 5000.
small_diffs = [val for val in diff_series if abs(val) < 5000]
len(small_diffs)/float(len(diff_series))*100
Out[288]:
In [290]:
# Largest (signed) value in the currently bound diff_series; no abs() is applied.
max(diff_series)
Out[290]:
In [ ]:
# Count the values in the currently bound diff_series whose absolute value is
# below 5000.
count = len([val for val in diff_series if abs(val) < 5000])
In [272]:
# Histogram of the per-series fractions in diffs_perc: 20 bins over [0, 1],
# drawn half-transparent on a fresh figure.
plt.figure()
bins = 20
plt.hist(diffs_perc, bins=bins, range=(0, 1), alpha=0.5)
plt.xlim((0, 1))
Out[272]:
In [275]:
# Scatter (dot markers) of diffs_perc on x against slope_cdd on y.
# NOTE(review): diffs_perc gets one entry per series in diff_list (built from
# all of oak_park_list), while slope_cdd is filled from results_list (built
# from oak_park_list[:1]) and skips None slopes -- the two sequences may not
# have the same length or line up by file; confirm before trusting this plot.
plt.plot(diffs_perc,slope_cdd,'.')
Out[275]:
In [ ]: