In [1]:
%matplotlib inline
from matplotlib import pylab as plt
import matplotlib.dates as mdates
plt.rcParams['figure.figsize'] = (15.0, 8.0)
import pandas as pd
import seaborn as sns

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot


Using TensorFlow backend.
Couldn't import dot_parser, loading of dot files will not be possible.

In [2]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [3]:
from bokeh.charts import TimeSeries, output_file, show
from bokeh.io import output_notebook
output_notebook()


Loading BokehJS ...

Data Preparation

Eleven months data from one bed one bath apartment unit in San Jose, CA region was picked for this experiment. The electricity consumption is recorded in 15 minutes interval by the energy supply company. The raw data contains information such as type, date, start time, end time, usage, units, cost and notes fields. The start time and end time is the measurement interval. In this data, the interval is 15 minutes. The usage in 15 minutes interval is provided in kWh unit, and the cost for the consumption is presented in the dollar. Before we deep dive into the data, some quick feature engineering steps are done to enrich the data with more features.


In [4]:
data = pd.read_csv("D202.csv")
data.head(2)


Out[4]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES
0 Electric usage 11/1/16 0:00 0:14 0.07 kWh $0.01 NaN
1 Electric usage 11/1/16 0:15 0:29 0.05 kWh $0.01 NaN

Creating Date and Time Filed


In [5]:
data["DATE_TIME"] = pd.to_datetime(data.DATE + " " + data["END TIME"])

Working Day or Not


In [6]:
data["DAY_TYPE"] = data.DATE_TIME.apply(lambda x: 1 if x.dayofweek > 5 else 0  )

Finding Fedaral Holidays


In [7]:
cal = calendar()
holidays = cal.holidays(start = data.DATE_TIME.min(), end = data.DATE_TIME.max())
data["IS_HOLIDAY"] = data.DATE_TIME.isin(holidays)

In [8]:
data.head(3)


Out[8]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY
0 Electric usage 11/1/16 0:00 0:14 0.07 kWh $0.01 NaN 2016-11-01 00:14:00 0 False
1 Electric usage 11/1/16 0:15 0:29 0.05 kWh $0.01 NaN 2016-11-01 00:29:00 0 False
2 Electric usage 11/1/16 0:30 0:44 0.06 kWh $0.01 NaN 2016-11-01 00:44:00 0 False

Previous Five Observations


In [9]:
for obs in range(1,6):
    data["T_" + str(obs)] = data.USAGE.shift(obs)

In [10]:
data.fillna(0.00,inplace=True)
data.head(10)


Out[10]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5
0 Electric usage 11/1/16 0:00 0:14 0.07 kWh $0.01 0 2016-11-01 00:14:00 0 False 0.00 0.00 0.00 0.00 0.00
1 Electric usage 11/1/16 0:15 0:29 0.05 kWh $0.01 0 2016-11-01 00:29:00 0 False 0.07 0.00 0.00 0.00 0.00
2 Electric usage 11/1/16 0:30 0:44 0.06 kWh $0.01 0 2016-11-01 00:44:00 0 False 0.05 0.07 0.00 0.00 0.00
3 Electric usage 11/1/16 0:45 0:59 0.05 kWh $0.01 0 2016-11-01 00:59:00 0 False 0.06 0.05 0.07 0.00 0.00
4 Electric usage 11/1/16 1:00 1:14 0.03 kWh $0.01 0 2016-11-01 01:14:00 0 False 0.05 0.06 0.05 0.07 0.00
5 Electric usage 11/1/16 1:15 1:29 0.03 kWh $0.01 0 2016-11-01 01:29:00 0 False 0.03 0.05 0.06 0.05 0.07
6 Electric usage 11/1/16 1:30 1:44 0.03 kWh $0.01 0 2016-11-01 01:44:00 0 False 0.03 0.03 0.05 0.06 0.05
7 Electric usage 11/1/16 1:45 1:59 0.06 kWh $0.01 0 2016-11-01 01:59:00 0 False 0.03 0.03 0.03 0.05 0.06
8 Electric usage 11/1/16 2:00 2:14 0.04 kWh $0.01 0 2016-11-01 02:14:00 0 False 0.06 0.03 0.03 0.03 0.05
9 Electric usage 11/1/16 2:15 2:29 0.03 kWh $0.01 0 2016-11-01 02:29:00 0 False 0.04 0.06 0.03 0.03 0.03

In [11]:
data.IS_HOLIDAY = data.IS_HOLIDAY.astype("int")

In [12]:
data.head(2)


Out[12]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5
0 Electric usage 11/1/16 0:00 0:14 0.07 kWh $0.01 0 2016-11-01 00:14:00 0 0 0.00 0.0 0.0 0.0 0.0
1 Electric usage 11/1/16 0:15 0:29 0.05 kWh $0.01 0 2016-11-01 00:29:00 0 0 0.07 0.0 0.0 0.0 0.0

Clean Data


In [13]:
clean_data = data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]

In [14]:
clean_data.head(2)


Out[14]:
DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5 USAGE
0 0 0 0.00 0.0 0.0 0.0 0.0 0.07
1 0 0 0.07 0.0 0.0 0.0 0.0 0.05

In [ ]:

Let's Explore


In [15]:
all_show = TimeSeries(data,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(all_show)


/Users/jagan/anaconda2/lib/python2.7/site-packages/bokeh/core/json_encoder.py:33: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):

In [ ]:

A Week ! Yes X'Mas Week


In [16]:
xmask = (data.DATE_TIME >= pd.to_datetime("12/20/2016")) & (data.DATE_TIME <= pd.to_datetime("12/27/2016"))

In [17]:
xmas_week = data.loc[xmask]

In [18]:
xmas_show = TimeSeries(xmas_week,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(xmas_show)


A Day ! New Year 2017


In [19]:
dmask = (data.DATE_TIME >= pd.to_datetime("01/01/2017")) & (data.DATE_TIME < pd.to_datetime("01/02/2017"))
nyd = data.loc[dmask]

In [20]:
nyd_show = TimeSeries(nyd,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(nyd_show)


Train and Test Data


In [21]:
training_data = data[data.DATE_TIME < pd.to_datetime("08/01/2017")]

In [22]:
val_mask = (data.DATE_TIME >= pd.to_datetime("08/01/2017")) & (data.DATE_TIME < pd.to_datetime("09/01/2017"))
val_data = data.loc[val_mask]

In [23]:
test_data = data[data.DATE_TIME >= pd.to_datetime("09/01/2017")]

In [24]:
training_data.tail(3)


Out[24]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5
26006 Electric usage 7/31/17 23:15 23:29 0.12 kWh $0.02 0 2017-07-31 23:29:00 0 0 0.15 0.10 0.27 0.44 0.12
26007 Electric usage 7/31/17 23:30 23:44 0.11 kWh $0.02 0 2017-07-31 23:44:00 0 0 0.12 0.15 0.10 0.27 0.44
26008 Electric usage 7/31/17 23:45 23:59 0.10 kWh $0.02 0 2017-07-31 23:59:00 0 0 0.11 0.12 0.15 0.10 0.27

In [25]:
test_data.head(2)


Out[25]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5
28984 Electric usage 9/1/17 0:00 0:14 0.03 kWh $0.01 0 2017-09-01 00:14:00 0 0 0.03 0.03 0.04 0.04 0.05
28985 Electric usage 9/1/17 0:15 0:29 0.04 kWh $0.01 0 2017-09-01 00:29:00 0 0 0.03 0.03 0.03 0.04 0.04

In [26]:
clean_train = training_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_test = test_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_val = val_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]

In [27]:
clean_train.head(2)


Out[27]:
DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5 USAGE
0 0 0 0.00 0.0 0.0 0.0 0.0 0.07
1 0 0 0.07 0.0 0.0 0.0 0.0 0.05

In [28]:
clean_test.head(2)


Out[28]:
DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5 USAGE
28984 0 0 0.03 0.03 0.04 0.04 0.05 0.03
28985 0 0 0.03 0.03 0.03 0.04 0.04 0.04

In [29]:
clean_val.head(3)


Out[29]:
DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5 USAGE
26009 0 0 0.1 0.11 0.12 0.15 0.10 0.10
26010 0 0 0.1 0.10 0.11 0.12 0.15 0.10
26011 0 0 0.1 0.10 0.10 0.11 0.12 0.08

Let's Model and Predict


In [30]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

In [31]:
X_train,y_train = clean_train.drop(["USAGE"],axis=1),clean_train.USAGE
X_test,y_test = clean_test.drop(["USAGE"],axis=1),clean_test.USAGE
X_val,y_val = clean_val.drop(["USAGE"],axis=1),clean_val.USAGE

In [32]:
scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))
rfr  = RandomForestRegressor(random_state=2017,verbose=2,n_jobs=5)

In [33]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_valid_scaled = scaler.fit_transform(X_val)

In [ ]:


In [34]:
rfr.fit(X_train_scaled,y_train)


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.1s finished
Out[34]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=5,
           oob_score=False, random_state=2017, verbose=2, warm_start=False)

In [35]:
rfr.score(X_val,y_val)


[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished
Out[35]:
0.26352935135691857

In [36]:
rfr.score(X_test,y_test)


[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished
Out[36]:
0.32707009777378615

In [37]:
test_data["RF_PREDICTED"] = rfr.predict(X_test_scaled)


[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished
/Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [38]:
test_data.head(5)


Out[38]:
TYPE DATE START TIME END TIME USAGE UNITS COST NOTES DATE_TIME DAY_TYPE IS_HOLIDAY T_1 T_2 T_3 T_4 T_5 RF_PREDICTED
28984 Electric usage 9/1/17 0:00 0:14 0.03 kWh $0.01 0 2017-09-01 00:14:00 0 0 0.03 0.03 0.04 0.04 0.05 0.021306
28985 Electric usage 9/1/17 0:15 0:29 0.04 kWh $0.01 0 2017-09-01 00:29:00 0 0 0.03 0.03 0.03 0.04 0.04 0.060113
28986 Electric usage 9/1/17 0:30 0:44 0.06 kWh $0.01 0 2017-09-01 00:44:00 0 0 0.04 0.03 0.03 0.03 0.04 0.048081
28987 Electric usage 9/1/17 0:45 0:59 0.03 kWh $0.01 0 2017-09-01 00:59:00 0 0 0.06 0.04 0.03 0.03 0.03 0.061000
28988 Electric usage 9/1/17 1:00 1:14 0.03 kWh $0.01 0 2017-09-01 01:14:00 0 0 0.03 0.06 0.04 0.03 0.03 0.028246

Prediction with Random Forest Model in Test Data


In [39]:
pred_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=800, plot_height=350)
show(pred_show)


Prediction Single Day in Test Data


In [40]:
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]

sep_30rf = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=900, plot_height=350)
show(sep_30rf)


LSTM Modelling


In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

LSTM Model


In [42]:
model_k = Sequential()
model_k.add(LSTM(1, input_shape=(1,7)))
model_k.add(Dense(1))
model_k.compile(loss='mean_squared_error', optimizer='adam')

In [43]:
SVG(model_to_dot(model_k).create(prog='dot', format='svg'))


Out[43]:
G 120866850192 lstm_1_input: InputLayer 120866849360 lstm_1: LSTM 120866850192->120866849360 120866849680 dense_1: Dense 120866849360->120866849680

Reshape the data to 3D


In [44]:
X_t_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))

In [45]:
X_val_resaped = X_valid_scaled.reshape((X_valid_scaled.shape[0], 1, X_valid_scaled.shape[1]))
Fit the Model

In [46]:
history = model_k.fit(X_t_reshaped, y_train, validation_data=(X_val_resaped, y_val),\
epochs=10, batch_size=96, verbose=2)


Train on 26009 samples, validate on 2975 samples
Epoch 1/10
2s - loss: 0.0757 - val_loss: 0.0165
Epoch 2/10
1s - loss: 0.0605 - val_loss: 0.0191
Epoch 3/10
1s - loss: 0.0506 - val_loss: 0.0166
Epoch 4/10
1s - loss: 0.0421 - val_loss: 0.0207
Epoch 5/10
1s - loss: 0.0370 - val_loss: 0.0267
Epoch 6/10
1s - loss: 0.0339 - val_loss: 0.0307
Epoch 7/10
1s - loss: 0.0321 - val_loss: 0.0357
Epoch 8/10
1s - loss: 0.0309 - val_loss: 0.0394
Epoch 9/10
1s - loss: 0.0303 - val_loss: 0.0396
Epoch 10/10
1s - loss: 0.0298 - val_loss: 0.0413

In [47]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()


Out[47]:
<matplotlib.legend.Legend at 0x1c2e8c6c90>

In [48]:
X_te_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [49]:
res = model_k.predict(X_te_reshaped)

In [50]:
test_data["DL_PRED"] = res


/Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

LSTM Prediction on Test Data


In [51]:
keras_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(keras_show)


A Day on LSTM Predcted Result


In [52]:
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]

sep_30 = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(sep_30)


RMSE Value of Random Forest and LSTM


In [53]:
from numpy import sqrt
sqrt(mean_squared_error(test_data.USAGE,test_data.DL_PRED))


Out[53]:
0.16743906314666301

In [54]:
sqrt(mean_squared_error(test_data.USAGE,test_data.RF_PREDICTED))


Out[54]:
0.18412616700088405