In [1]:

    
%matplotlib inline
from matplotlib import pylab as plt
import matplotlib.dates as mdates
plt.rcParams['figure.figsize'] = (15.0, 8.0)
import pandas as pd
import seaborn as sns

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot









    



Using TensorFlow backend.






    



Couldn't import dot_parser, loading of dot files will not be possible.



In [2]:

    
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar



In [3]:

    
from bokeh.charts import TimeSeries, output_file, show
from bokeh.io import output_notebook
output_notebook()









    





    
        
        Loading BokehJS ...

Data Preparation

Eleven months data from one bed one bath apartment unit in San Jose, CA region was picked for this experiment. The electricity consumption is recorded in 15 minutes interval by the energy supply company. The raw data contains information such as type, date, start time, end time, usage, units, cost and notes fields. The start time and end time is the measurement interval. In this data, the interval is 15 minutes. The usage in 15 minutes interval is provided in kWh unit, and the cost for the consumption is presented in the dollar. Before we deep dive into the data, some quick feature engineering steps are done to enrich the data with more features.



In [4]:

    
data = pd.read_csv("D202.csv")
data.head(2)









    Out[4]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
    
  
  
    
      0
      Electric usage
      11/1/16
      0:00
      0:14
      0.07
      kWh
      $0.01
      NaN
    
    
      1
      Electric usage
      11/1/16
      0:15
      0:29
      0.05
      kWh
      $0.01
      NaN

Creating Date and Time Filed



In [5]:

    
data["DATE_TIME"] = pd.to_datetime(data.DATE + " " + data["END TIME"])

Working Day or Not



In [6]:

    
data["DAY_TYPE"] = data.DATE_TIME.apply(lambda x: 1 if x.dayofweek > 5 else 0  )

Finding Fedaral Holidays



In [7]:

    
cal = calendar()
holidays = cal.holidays(start = data.DATE_TIME.min(), end = data.DATE_TIME.max())
data["IS_HOLIDAY"] = data.DATE_TIME.isin(holidays)



In [8]:

    
data.head(3)









    Out[8]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
    
  
  
    
      0
      Electric usage
      11/1/16
      0:00
      0:14
      0.07
      kWh
      $0.01
      NaN
      2016-11-01 00:14:00
      0
      False
    
    
      1
      Electric usage
      11/1/16
      0:15
      0:29
      0.05
      kWh
      $0.01
      NaN
      2016-11-01 00:29:00
      0
      False
    
    
      2
      Electric usage
      11/1/16
      0:30
      0:44
      0.06
      kWh
      $0.01
      NaN
      2016-11-01 00:44:00
      0
      False

Previous Five Observations



In [9]:

    
for obs in range(1,6):
    data["T_" + str(obs)] = data.USAGE.shift(obs)



In [10]:

    
data.fillna(0.00,inplace=True)
data.head(10)









    Out[10]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
      T_1
      T_2
      T_3
      T_4
      T_5
    
  
  
    
      0
      Electric usage
      11/1/16
      0:00
      0:14
      0.07
      kWh
      $0.01
      0
      2016-11-01 00:14:00
      0
      False
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      1
      Electric usage
      11/1/16
      0:15
      0:29
      0.05
      kWh
      $0.01
      0
      2016-11-01 00:29:00
      0
      False
      0.07
      0.00
      0.00
      0.00
      0.00
    
    
      2
      Electric usage
      11/1/16
      0:30
      0:44
      0.06
      kWh
      $0.01
      0
      2016-11-01 00:44:00
      0
      False
      0.05
      0.07
      0.00
      0.00
      0.00
    
    
      3
      Electric usage
      11/1/16
      0:45
      0:59
      0.05
      kWh
      $0.01
      0
      2016-11-01 00:59:00
      0
      False
      0.06
      0.05
      0.07
      0.00
      0.00
    
    
      4
      Electric usage
      11/1/16
      1:00
      1:14
      0.03
      kWh
      $0.01
      0
      2016-11-01 01:14:00
      0
      False
      0.05
      0.06
      0.05
      0.07
      0.00
    
    
      5
      Electric usage
      11/1/16
      1:15
      1:29
      0.03
      kWh
      $0.01
      0
      2016-11-01 01:29:00
      0
      False
      0.03
      0.05
      0.06
      0.05
      0.07
    
    
      6
      Electric usage
      11/1/16
      1:30
      1:44
      0.03
      kWh
      $0.01
      0
      2016-11-01 01:44:00
      0
      False
      0.03
      0.03
      0.05
      0.06
      0.05
    
    
      7
      Electric usage
      11/1/16
      1:45
      1:59
      0.06
      kWh
      $0.01
      0
      2016-11-01 01:59:00
      0
      False
      0.03
      0.03
      0.03
      0.05
      0.06
    
    
      8
      Electric usage
      11/1/16
      2:00
      2:14
      0.04
      kWh
      $0.01
      0
      2016-11-01 02:14:00
      0
      False
      0.06
      0.03
      0.03
      0.03
      0.05
    
    
      9
      Electric usage
      11/1/16
      2:15
      2:29
      0.03
      kWh
      $0.01
      0
      2016-11-01 02:29:00
      0
      False
      0.04
      0.06
      0.03
      0.03
      0.03



In [11]:

    
data.IS_HOLIDAY = data.IS_HOLIDAY.astype("int")



In [12]:

    
data.head(2)









    Out[12]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
      T_1
      T_2
      T_3
      T_4
      T_5
    
  
  
    
      0
      Electric usage
      11/1/16
      0:00
      0:14
      0.07
      kWh
      $0.01
      0
      2016-11-01 00:14:00
      0
      0
      0.00
      0.0
      0.0
      0.0
      0.0
    
    
      1
      Electric usage
      11/1/16
      0:15
      0:29
      0.05
      kWh
      $0.01
      0
      2016-11-01 00:29:00
      0
      0
      0.07
      0.0
      0.0
      0.0
      0.0

Clean Data



In [13]:

    
clean_data = data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]



In [14]:

    
clean_data.head(2)



In [ ]:

Let's Explore



In [15]:

    
all_show = TimeSeries(data,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(all_show)









    



/Users/jagan/anaconda2/lib/python2.7/site-packages/bokeh/core/json_encoder.py:33: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):



In [ ]:

A Week ! Yes X'Mas Week



In [16]:

    
xmask = (data.DATE_TIME >= pd.to_datetime("12/20/2016")) & (data.DATE_TIME <= pd.to_datetime("12/27/2016"))



In [17]:

    
xmas_week = data.loc[xmask]



In [18]:

    
xmas_show = TimeSeries(xmas_week,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(xmas_show)

A Day ! New Year 2017



In [19]:

    
dmask = (data.DATE_TIME >= pd.to_datetime("01/01/2017")) & (data.DATE_TIME < pd.to_datetime("01/02/2017"))
nyd = data.loc[dmask]



In [20]:

    
nyd_show = TimeSeries(nyd,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(nyd_show)

Train and Test Data



In [21]:

    
training_data = data[data.DATE_TIME < pd.to_datetime("08/01/2017")]



In [22]:

    
val_mask = (data.DATE_TIME >= pd.to_datetime("08/01/2017")) & (data.DATE_TIME < pd.to_datetime("09/01/2017"))
val_data = data.loc[val_mask]



In [23]:

    
test_data = data[data.DATE_TIME >= pd.to_datetime("09/01/2017")]



In [24]:

    
training_data.tail(3)









    Out[24]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
      T_1
      T_2
      T_3
      T_4
      T_5
    
  
  
    
      26006
      Electric usage
      7/31/17
      23:15
      23:29
      0.12
      kWh
      $0.02
      0
      2017-07-31 23:29:00
      0
      0
      0.15
      0.10
      0.27
      0.44
      0.12
    
    
      26007
      Electric usage
      7/31/17
      23:30
      23:44
      0.11
      kWh
      $0.02
      0
      2017-07-31 23:44:00
      0
      0
      0.12
      0.15
      0.10
      0.27
      0.44
    
    
      26008
      Electric usage
      7/31/17
      23:45
      23:59
      0.10
      kWh
      $0.02
      0
      2017-07-31 23:59:00
      0
      0
      0.11
      0.12
      0.15
      0.10
      0.27



In [25]:

    
test_data.head(2)









    Out[25]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
      T_1
      T_2
      T_3
      T_4
      T_5
    
  
  
    
      28984
      Electric usage
      9/1/17
      0:00
      0:14
      0.03
      kWh
      $0.01
      0
      2017-09-01 00:14:00
      0
      0
      0.03
      0.03
      0.04
      0.04
      0.05
    
    
      28985
      Electric usage
      9/1/17
      0:15
      0:29
      0.04
      kWh
      $0.01
      0
      2017-09-01 00:29:00
      0
      0
      0.03
      0.03
      0.03
      0.04
      0.04



In [26]:

    
clean_train = training_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_test = test_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_val = val_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]



In [27]:

    
clean_train.head(2)



In [28]:

    
clean_test.head(2)



In [29]:

    
clean_val.head(3)

Let's Model and Predict



In [30]:

    
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error



In [31]:

    
X_train,y_train = clean_train.drop(["USAGE"],axis=1),clean_train.USAGE
X_test,y_test = clean_test.drop(["USAGE"],axis=1),clean_test.USAGE
X_val,y_val = clean_val.drop(["USAGE"],axis=1),clean_val.USAGE



In [32]:

    
scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))
rfr  = RandomForestRegressor(random_state=2017,verbose=2,n_jobs=5)



In [33]:

    
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_valid_scaled = scaler.fit_transform(X_val)



In [ ]:



In [34]:

    
rfr.fit(X_train_scaled,y_train)









    



building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10






    



[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.1s finished






    Out[34]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=5,
           oob_score=False, random_state=2017, verbose=2, warm_start=False)



In [35]:

    
rfr.score(X_val,y_val)









    



[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished






    Out[35]:





0.26352935135691857



In [36]:

    
rfr.score(X_test,y_test)









    



[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished






    Out[36]:





0.32707009777378615



In [37]:

    
test_data["RF_PREDICTED"] = rfr.predict(X_test_scaled)









    



[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.0s finished
/Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [38]:

    
test_data.head(5)









    Out[38]:







  
    
      
      TYPE
      DATE
      START TIME
      END TIME
      USAGE
      UNITS
      COST
      NOTES
      DATE_TIME
      DAY_TYPE
      IS_HOLIDAY
      T_1
      T_2
      T_3
      T_4
      T_5
      RF_PREDICTED
    
  
  
    
      28984
      Electric usage
      9/1/17
      0:00
      0:14
      0.03
      kWh
      $0.01
      0
      2017-09-01 00:14:00
      0
      0
      0.03
      0.03
      0.04
      0.04
      0.05
      0.021306
    
    
      28985
      Electric usage
      9/1/17
      0:15
      0:29
      0.04
      kWh
      $0.01
      0
      2017-09-01 00:29:00
      0
      0
      0.03
      0.03
      0.03
      0.04
      0.04
      0.060113
    
    
      28986
      Electric usage
      9/1/17
      0:30
      0:44
      0.06
      kWh
      $0.01
      0
      2017-09-01 00:44:00
      0
      0
      0.04
      0.03
      0.03
      0.03
      0.04
      0.048081
    
    
      28987
      Electric usage
      9/1/17
      0:45
      0:59
      0.03
      kWh
      $0.01
      0
      2017-09-01 00:59:00
      0
      0
      0.06
      0.04
      0.03
      0.03
      0.03
      0.061000
    
    
      28988
      Electric usage
      9/1/17
      1:00
      1:14
      0.03
      kWh
      $0.01
      0
      2017-09-01 01:14:00
      0
      0
      0.03
      0.06
      0.04
      0.03
      0.03
      0.028246

Prediction with Random Forest Model in Test Data



In [39]:

    
pred_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=800, plot_height=350)
show(pred_show)

Prediction Single Day in Test Data



In [40]:

    
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]

sep_30rf = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=900, plot_height=350)
show(sep_30rf)

LSTM Modelling



In [41]:

    
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

LSTM Model



In [42]:

    
model_k = Sequential()
model_k.add(LSTM(1, input_shape=(1,7)))
model_k.add(Dense(1))
model_k.compile(loss='mean_squared_error', optimizer='adam')



In [43]:

    
SVG(model_to_dot(model_k).create(prog='dot', format='svg'))









    Out[43]:

Reshape the data to 3D



In [44]:

    
X_t_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))



In [45]:

    
X_val_resaped = X_valid_scaled.reshape((X_valid_scaled.shape[0], 1, X_valid_scaled.shape[1]))

Fit the Model



In [46]:

    
history = model_k.fit(X_t_reshaped, y_train, validation_data=(X_val_resaped, y_val),\
epochs=10, batch_size=96, verbose=2)









    



Train on 26009 samples, validate on 2975 samples
Epoch 1/10
2s - loss: 0.0757 - val_loss: 0.0165
Epoch 2/10
1s - loss: 0.0605 - val_loss: 0.0191
Epoch 3/10
1s - loss: 0.0506 - val_loss: 0.0166
Epoch 4/10
1s - loss: 0.0421 - val_loss: 0.0207
Epoch 5/10
1s - loss: 0.0370 - val_loss: 0.0267
Epoch 6/10
1s - loss: 0.0339 - val_loss: 0.0307
Epoch 7/10
1s - loss: 0.0321 - val_loss: 0.0357
Epoch 8/10
1s - loss: 0.0309 - val_loss: 0.0394
Epoch 9/10
1s - loss: 0.0303 - val_loss: 0.0396
Epoch 10/10
1s - loss: 0.0298 - val_loss: 0.0413



In [47]:

    
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()









    Out[47]:





<matplotlib.legend.Legend at 0x1c2e8c6c90>



In [48]:

    
X_te_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))



In [49]:

    
res = model_k.predict(X_te_reshaped)



In [50]:

    
test_data["DL_PRED"] = res









    



/Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

LSTM Prediction on Test Data



In [51]:

    
keras_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(keras_show)

A Day on LSTM Predcted Result



In [52]:

    
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]

sep_30 = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(sep_30)

RMSE Value of Random Forest and LSTM



In [53]:

    
from numpy import sqrt
sqrt(mean_squared_error(test_data.USAGE,test_data.DL_PRED))









    Out[53]:





0.16743906314666301



In [54]:

    
sqrt(mean_squared_error(test_data.USAGE,test_data.RF_PREDICTED))









    Out[54]:





0.18412616700088405

	TYPE	DATE	START TIME	END TIME	USAGE	UNITS	COST	NOTES
0	Electric usage	11/1/16	0:00	0:14	0.07	kWh	$0.01	NaN
1	Electric usage	11/1/16	0:15	0:29	0.05	kWh	$0.01	NaN

	TYPE	DATE	START TIME	END TIME	USAGE	UNITS	COST	DATE_TIME	T_1	T_2	T_3	T_4	T_5
26006	Electric usage	7/31/17	23:15	23:29	0.12	kWh	$0.02	2017-07-31 23:29:00	0.15	0.10	0.27	0.44	0.12
26007	Electric usage	7/31/17	23:30	23:44	0.11	kWh	$0.02	2017-07-31 23:44:00	0.12	0.15	0.10	0.27	0.44
26008	Electric usage	7/31/17	23:45	23:59	0.10	kWh	$0.02	2017-07-31 23:59:00	0.11	0.12	0.15	0.10	0.27

	TYPE	DATE	START TIME	END TIME	USAGE	UNITS	COST	NOTES	DATE_TIME	DAY_TYPE	IS_HOLIDAY	T_1	T_2	T_3	T_4	T_5
28984	Electric usage	9/1/17	0:00	0:14	0.03	kWh	$0.01	0	2017-09-01 00:14:00	0	0	0.03	0.03	0.04	0.04	0.05
28985	Electric usage	9/1/17	0:15	0:29	0.04	kWh	$0.01	0	2017-09-01 00:29:00	0	0	0.03	0.03	0.03	0.04	0.04

	T_1	T_2	T_3	T_4	T_5	USAGE
26009	0.1	0.11	0.12	0.15	0.10	0.10
26010	0.1	0.10	0.11	0.12	0.15	0.10
26011	0.1	0.10	0.10	0.11	0.12	0.08