In [3]:
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import clear_output
import os
import glob
from datetime import datetime, timedelta
import bokeh
In [3]:
data_path = "/Users/toni/data/prop-39-school/prop39-original/"
out_path = "/Users/toni/data/prop-39-school/prop39-processed/"
In [281]:
# get different data files by type
kwh_files = glob.glob(data_path + '/*/*/*ELECTRIC*INTERVAL*.csv') + \
glob.glob(data_path + '/*/*/*Electrical*INTERVAL*.csv') + \
glob.glob(data_path + '/*/*ELECTRIC*INTERVAL*.csv') + \
glob.glob(data_path + '/*/*/*/*ELECTRIC*INTERVAL*.csv')
gas_files = glob.glob(data_path + '/*/*/*GAS*INTERVAL*.csv') + \
glob.glob(data_path + '/*/*GAS*INTERVAL*.csv') + \
glob.glob(data_path + '/*/*/*/*GAS*INTERVAL*.csv')
kwh_bill_files = glob.glob(data_path + '/*/*/*ELECTRIC*BILL*.csv') + \
glob.glob(data_path + '/*/*/*Electrical*BILL*.csv') + \
glob.glob(data_path + '/*/*ELECTRIC*BILL*.csv') + \
glob.glob(data_path + '/*/*/*/*ELECTRIC*BILL*.csv')
gas_bill_files = glob.glob(data_path + '/*/*/*GAS*BILL*.csv') + \
glob.glob(data_path + '/*/*GAS*BILL*.csv') + \
glob.glob(data_path + '/*/*/*/*GAS*BILL*.csv')
len(kwh_files), len(gas_files), len(kwh_bill_files), len(gas_bill_files)
Out[281]:
In [90]:
def parse_path(filename):
basefile = os.path.basename(filename)
utilityName = f.replace(data_path, "").split("/")[0]
info = basefile.split("_")
if len(info) ==6:
siteId, _, _, serviceType, _, dataType = info
else:
# ['30736353030244', '2012-2013', 'ELECTRIC', '20150804.xml', 'INTERVAL.csv']
siteId, _, serviceType, _, dataType = info
dataType = dataType.split(".")[0]
return siteId, utilityName, serviceType, dataType
In [91]:
print kwh_files[0]
parse_path(kwh_files[0])
Out[91]:
In [92]:
print kwh_files[4000]
parse_path(kwh_files[0])
Out[92]:
In [74]:
print gas_files[0]
parse_path(gas_files[0])
Out[74]:
In [75]:
print kwh_bill_files[0]
parse_path(kwh_bill_files[0])
Out[75]:
In [76]:
print gas_bill_files[0]
parse_path(gas_bill_files[0])
Out[76]:
In [140]:
files_df = pd.DataFrame([list(parse_path(f)) + [f] for f in kwh_files + kwh_bill_files + gas_files + gas_bill_files], \
columns=["Id", "Utility", "ServiceType", "DataType", "DataFile"])
files_df.head()
Out[140]:
In [141]:
files_df.shape
Out[141]:
In [142]:
files_df.set_index(["Id", "Utility", "ServiceType", "DataType"], inplace=True)
In [192]:
def parse_kwh_interval(filename):
# some files are zero size
import os
statinfo = os.stat(filename)
if statinfo.st_size==0:
return None
# some files have different number of columns for the first few rows
i = 0
ok = False
while not ok:
try:
kwh = pd.read_csv(filename, skiprows=i, header=None)
ok = True
except:
i += 1
if i>0:
print "%s: skipped %d lines"%(filename, i)
# all good, format data into canonical
kwh.rename(columns={0:"id", 1:"date"}, inplace=True)
# date is in Unix timestamp format, convert to ISO format
kwh['date'] = kwh['date'].apply(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
agreement_id = kwh['id'].iloc[0]
kwh.drop("id", axis=1, inplace=True)
kwh.set_index(["date"], inplace=True)
kwh = kwh.iloc[:,:96]
return kwh
In [57]:
kwh = parse_kwh_interval(kwh_files[0])
kwh.head()
Out[57]:
In [59]:
# It looks like this is 15-minute interval data?
plt.figure(figsize=(8,4))
plt.plot(kwh.max().values, label="max", lw=2)
plt.plot(kwh.std().values, label="std", lw=2)
plt.plot(kwh.mean().values, label="avg", lw=2)
idx = np.random.choice(kwh.index.values, 5)
for i in idx:
plt.plot(kwh.ix[i].values[0], label=i)
plt.ylabel("kWh")
plt.legend()
plt.figure(figsize=(8,6))
plt.imshow(kwh.iloc[:,:96].values, interpolation='nearest', aspect='auto')
Out[59]:
In [143]:
idx = (files_df.index.get_level_values('ServiceType')=='ELECTRIC') & (files_df.index.get_level_values('DataType')=='INTERVAL')
keys= files_df.index[idx].unique()
print len(keys)
In [274]:
def parse_kwh_bill(filename):
# some files are zero size
import os
statinfo = os.stat(filename)
if statinfo.st_size==0:
return None
fields = ["cds_code", "start_time", "agreement", "generation", "rate_schedule_id", \
"n_days", "last_period", "consumption_total"]
bill = pd.read_csv(filename, usecols=fields, engine='python')
bill['start_time'] = bill['start_time'].apply(lambda x: np.nan if x in ['false', 'true'] or \
type(x)==float or (type(x)==str and x.replace(".","").isdigit()) \
else datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
bill.dropna(inplace=True)
bill.set_index("start_time", inplace=True)
return bill
In [283]:
parse_kwh_bill(kwh_bill_files[0])
Out[283]:
In [288]:
from collections import Counter
Counter([os.path.getsize(f) for f in gas_files])
Out[288]:
In [289]:
from collections import Counter
Counter([os.path.getsize(f) for f in gas_bill_files])
Out[289]:
In [291]:
df = pd.read_csv(gas_bill_files[0])
df.head()
Out[291]:
In [292]:
keys[0]
Out[292]:
In [197]:
keys = files_df.index.unique()
for i,key in enumerate(keys):
curId, utilityName, serviceType, dataType = key
clear_output(wait=True)
files = files_df.ix[key]['DataFile'].values
print key, ":", len(files), "files (%d/%d)"%(i,len(keys))
# read data, stitch together
if serviceType in ['ELECTRIC', 'Electricity'] and dataType == 'INTERVAL':
func = parse_kwh_interval
elif serviceType in ['ELECTRIC', 'Electricity'] and dataType == 'BILL':
func = parse_kwh_bill
kwh = [func(f) for f in files]
if len([x for x in kwh if x is None])==len(kwh):
print "%s: no actual data!"
continue
kwh = pd.concat(kwh, axis=0).sort_index()
# save data to disc
cur_dir = out_path + "/" + utilityName + "/"
filename= "%s_%s_%s_%s.csv"%(curId, utilityName, serviceType, dataType)
try:
os.makedirs(cur_dir)
except:
pass
kwh.to_csv(cur_dir + filename)