In [3]:
from matplotlib import pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

from IPython.display import clear_output

import os
import glob
from datetime import datetime, timedelta
import bokeh

Organize the files by type


In [3]:
data_path = "/Users/toni/data/prop-39-school/prop39-original/"
out_path  = "/Users/toni/data/prop-39-school/prop39-processed/"

In [281]:
# get different data files by type

kwh_files = glob.glob(data_path + '/*/*/*ELECTRIC*INTERVAL*.csv') + \
            glob.glob(data_path + '/*/*/*Electrical*INTERVAL*.csv') + \
            glob.glob(data_path + '/*/*ELECTRIC*INTERVAL*.csv') + \
            glob.glob(data_path + '/*/*/*/*ELECTRIC*INTERVAL*.csv')
gas_files = glob.glob(data_path + '/*/*/*GAS*INTERVAL*.csv') + \
            glob.glob(data_path + '/*/*GAS*INTERVAL*.csv') + \
            glob.glob(data_path + '/*/*/*/*GAS*INTERVAL*.csv')
kwh_bill_files = glob.glob(data_path + '/*/*/*ELECTRIC*BILL*.csv') + \
            glob.glob(data_path + '/*/*/*Electrical*BILL*.csv') + \
            glob.glob(data_path + '/*/*ELECTRIC*BILL*.csv') + \
            glob.glob(data_path + '/*/*/*/*ELECTRIC*BILL*.csv')
gas_bill_files = glob.glob(data_path + '/*/*/*GAS*BILL*.csv') + \
            glob.glob(data_path + '/*/*GAS*BILL*.csv') + \
            glob.glob(data_path + '/*/*/*/*GAS*BILL*.csv')
        
len(kwh_files), len(gas_files), len(kwh_bill_files), len(gas_bill_files)


Out[281]:
(9504, 2451, 9504, 2451)

In [90]:
def parse_path(filename):
    basefile = os.path.basename(filename)
    utilityName = f.replace(data_path, "").split("/")[0]
    info = basefile.split("_")
    if len(info) ==6:
        siteId, _, _, serviceType, _, dataType = info
    else:
        # ['30736353030244', '2012-2013', 'ELECTRIC', '20150804.xml', 'INTERVAL.csv']
        siteId, _, serviceType, _, dataType = info
    dataType = dataType.split(".")[0]
    return siteId, utilityName, serviceType, dataType

In [91]:
print kwh_files[0]
parse_path(kwh_files[0])


/Users/toni/data/prop-39-school/prop39-original/SCE/2012-2013/10620260000000_2012-2013_SCE_ELECTRIC_20150722.xml_INTERVAL.csv
Out[91]:
('10620260000000', 'SDGE', 'ELECTRIC', 'INTERVAL')

In [92]:
print kwh_files[4000]
parse_path(kwh_files[0])


/Users/toni/data/prop-39-school/prop39-original/PGE/2013-2014/Electricity/44697996108138_20132014_PacificGasElectric_ELECTRIC_20151127.xml_INTERVAL.csv
Out[92]:
('10620260000000', 'SDGE', 'ELECTRIC', 'INTERVAL')

In [74]:
print gas_files[0]
parse_path(gas_files[0])


/Users/toni/data/prop-39-school/prop39-original/PGE/2012-2013/Gas/01612590100123_20122013_PacificGasElectric_GAS_20151104.xml_INTERVAL.csv
Out[74]:
('01612590100123', 'PacificGasElectric', 'GAS', 'INTERVAL')

In [75]:
print kwh_bill_files[0]
parse_path(kwh_bill_files[0])


/Users/toni/data/prop-39-school/prop39-original/SCE/2012-2013/10620260000000_2012-2013_SCE_ELECTRIC_20150722.xml_BILL.csv
Out[75]:
('10620260000000', 'SCE', 'ELECTRIC', 'BILL')

In [76]:
print gas_bill_files[0]
parse_path(gas_bill_files[0])


/Users/toni/data/prop-39-school/prop39-original/PGE/2012-2013/Gas/01612590100123_20122013_PacificGasElectric_GAS_20151104.xml_BILL.csv
Out[76]:
('01612590100123', 'PacificGasElectric', 'GAS', 'BILL')

In [140]:
files_df = pd.DataFrame([list(parse_path(f)) + [f] for f in kwh_files + kwh_bill_files + gas_files + gas_bill_files], \
                            columns=["Id", "Utility", "ServiceType", "DataType", "DataFile"])
files_df.head()


Out[140]:
Id Utility ServiceType DataType DataFile
0 10620260000000 SCE ELECTRIC INTERVAL /Users/toni/data/prop-39-school/prop39-origina...
1 10620266005763 SCE ELECTRIC INTERVAL /Users/toni/data/prop-39-school/prop39-origina...
2 15634040000000 SCE ELECTRIC INTERVAL /Users/toni/data/prop-39-school/prop39-origina...
3 15634040106575 SCE ELECTRIC INTERVAL /Users/toni/data/prop-39-school/prop39-origina...
4 15634040112185 SCE ELECTRIC INTERVAL /Users/toni/data/prop-39-school/prop39-origina...

In [141]:
files_df.shape


Out[141]:
(19850, 5)

In [142]:
files_df.set_index(["Id", "Utility", "ServiceType", "DataType"], inplace=True)

Parse data

kWh iterval data


In [192]:
def parse_kwh_interval(filename):
    # some files are zero size
    import os
    statinfo = os.stat(filename)
    if statinfo.st_size==0:
        return None
    
    # some files have different number of columns for the first few rows
    i = 0
    ok = False
    while not ok:
        try:
            kwh = pd.read_csv(filename, skiprows=i, header=None)
            ok = True
        except:
            i += 1
    if i>0:
        print "%s: skipped %d lines"%(filename, i)
        
    # all good, format data into canonical
    kwh.rename(columns={0:"id", 1:"date"}, inplace=True)
    # date is in Unix timestamp format, convert to ISO format
    kwh['date'] = kwh['date'].apply(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
    agreement_id = kwh['id'].iloc[0]
    kwh.drop("id", axis=1, inplace=True)
    kwh.set_index(["date"], inplace=True)
    kwh = kwh.iloc[:,:96]
    
    return kwh

In [57]:
kwh = parse_kwh_interval(kwh_files[0])
kwh.head()


15634040000000_2012-2013_SCE_ELECTRIC_20150722.xml_INTERVAL.csv
Out[57]:
2 3 4 5 6 7 8 9 10 11 ... 88 89 90 91 92 93 94 95 96 97
date
2012-06-30 0.72 4.32 3.06 0.66 0.60 0.66 0.66 0.72 0.72 0.60 ... 0.66 0.60 0.66 0.66 0.66 0.66 0.60 0.66 0.60 0.66
2012-07-01 0.66 0.72 0.66 0.66 0.66 0.72 0.66 0.66 0.72 0.66 ... 0.66 0.60 0.66 0.66 0.66 0.60 0.72 0.66 0.60 0.66
2012-07-02 0.72 0.72 0.66 0.66 0.66 0.66 0.66 0.66 0.72 0.72 ... 3.60 3.18 3.66 2.88 3.36 3.00 1.44 0.72 0.66 0.66
2012-07-03 0.72 0.66 0.66 0.72 0.66 0.78 0.66 0.72 0.66 0.78 ... 0.66 0.60 0.72 0.66 0.66 0.72 0.66 0.66 0.66 0.66
2012-07-04 0.66 0.72 0.66 0.72 0.72 0.78 0.66 0.72 0.66 0.66 ... 3.12 3.24 3.66 3.00 2.04 0.78 0.78 0.78 0.78 0.84

5 rows × 96 columns


In [59]:
# It looks like this is 15-minute interval data?

plt.figure(figsize=(8,4))
plt.plot(kwh.max().values, label="max", lw=2)
plt.plot(kwh.std().values, label="std", lw=2)
plt.plot(kwh.mean().values, label="avg", lw=2)

idx = np.random.choice(kwh.index.values, 5)
for i in idx:
    plt.plot(kwh.ix[i].values[0], label=i)
    
plt.ylabel("kWh")
plt.legend()

plt.figure(figsize=(8,6))
plt.imshow(kwh.iloc[:,:96].values, interpolation='nearest', aspect='auto')


Out[59]:
<matplotlib.image.AxesImage at 0x10e805310>

In [143]:
idx = (files_df.index.get_level_values('ServiceType')=='ELECTRIC') & (files_df.index.get_level_values('DataType')=='INTERVAL')
keys= files_df.index[idx].unique()
print len(keys)


4095

kWh billing data


In [274]:
def parse_kwh_bill(filename):
    # some files are zero size
    import os
    statinfo = os.stat(filename)
    if statinfo.st_size==0:
        return None
    
    fields = ["cds_code", "start_time", "agreement", "generation", "rate_schedule_id", \
              "n_days", "last_period", "consumption_total"]
    bill = pd.read_csv(filename, usecols=fields, engine='python')
    bill['start_time'] = bill['start_time'].apply(lambda x: np.nan if x in ['false', 'true'] or \
                                                      type(x)==float or (type(x)==str and x.replace(".","").isdigit()) \
                                                            else datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
    bill.dropna(inplace=True)
    bill.set_index("start_time", inplace=True)
    
    return bill

In [283]:
parse_kwh_bill(kwh_bill_files[0])


Out[283]:
cds_code agreement rate_schedule_id n_days generation last_period consumption_total
start_time
2012-06-11 10620260000000 3449927 GS-1 30 False 21.99 0.0
2012-07-11 10620260000000 3449927 GS-1 30 False 23.10 5.0
2012-08-10 10620260000000 3449927 GS-1 32 False 30.69 36.0
2012-09-11 10620260000000 3449927 GS-1 33 False 34.19 58.0
2012-10-14 10620260000000 3449927 GS-1 30 False 28.02 42.0
2012-11-13 10620260000000 3449927 GS-1 30 False 30.39 60.0
2012-12-13 10620260000000 3449927 GS-1 32 False 32.28 56.0
2013-01-14 10620260000000 3449927 GS-1 29 False 33.00 70.0
2013-02-12 10620260000000 3449927 GS-1 30 False 32.67 62.0
2013-03-14 10620260000000 3449927 GS-1 31 False 36.05 78.0
2013-04-14 10620260000000 3449927 GS-1 31 False 36.53 78.0
2013-05-15 10620260000000 3449927 GS-1 32 False 35.45 59.0
2013-06-16 10620260000000 3449927 GS-1 30 False 36.41 68.0

Gas interval data

All gas interval data files are empty!


In [288]:
from collections import Counter
Counter([os.path.getsize(f) for f in gas_files])


Out[288]:
Counter({0: 2451})

Gas billing data

All gas billing data files are empty!


In [289]:
from collections import Counter
Counter([os.path.getsize(f) for f in gas_bill_files])


Out[289]:
Counter({304: 2451})

In [291]:
df = pd.read_csv(gas_bill_files[0])
df.head()


Out[291]:
customer_name customer_city customer_zip customer_account lea_customer cds_code school_site_name school_city school_site_zip agreement ... generation start_time last_period consumption_total on_peak semi_peak off_peak on_peak_demand semi_peak_demand off_peak_demand

0 rows × 24 columns


In [292]:
keys[0]


Out[292]:
('01612590100123', 'PGE', 'GAS', 'INTERVAL')

Process all the files


In [197]:
keys = files_df.index.unique()

for i,key in enumerate(keys):
    curId, utilityName, serviceType, dataType = key
    clear_output(wait=True)
    files = files_df.ix[key]['DataFile'].values
    print key, ":", len(files), "files (%d/%d)"%(i,len(keys))
    
    # read data, stitch together
    if serviceType in ['ELECTRIC', 'Electricity'] and dataType == 'INTERVAL':
        func = parse_kwh_interval
    elif serviceType in ['ELECTRIC', 'Electricity'] and dataType == 'BILL':
        func = parse_kwh_bill
    kwh = [func(f) for f in files]
    if len([x for x in kwh if x is None])==len(kwh):
        print "%s: no actual data!"
        continue
    kwh = pd.concat(kwh, axis=0).sort_index()
    
    # save data to disc
    cur_dir = out_path + "/" + utilityName + "/"
    filename= "%s_%s_%s_%s.csv"%(curId, utilityName, serviceType, dataType)
    try:
        os.makedirs(cur_dir)
    except:
        pass
    kwh.to_csv(cur_dir + filename)


('37683380111898', 'SDGE', 'ELECTRIC', 'INTERVAL') : 1 files (3401/4095)