In [1]:
%matplotlib inline
from matplotlib import pylab as plt
import matplotlib.dates as mdates
plt.rcParams['figure.figsize'] = (15.0, 8.0)
import pandas as pd
import seaborn as sns
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
In [2]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
In [3]:
# Interactive plotting with Bokeh.
# NOTE(review): `bokeh.charts` (TimeSeries) was removed from modern Bokeh
# (>= 0.13) — pin an old bokeh version or port to bokeh.plotting to keep
# these cells runnable.  TODO confirm the installed bokeh version.
from bokeh.charts import TimeSeries, output_file, show
from bokeh.io import output_notebook
output_notebook()  # render Bokeh plots inline in the notebook
Eleven months of data from a one-bed, one-bath apartment unit in the San Jose, CA region was picked for this experiment. The electricity consumption is recorded at 15-minute intervals by the energy supply company. The raw data contains fields such as type, date, start time, end time, usage, units, cost, and notes. The start time and end time define the measurement interval; in this data, the interval is 15 minutes. The usage in each 15-minute interval is provided in kWh, and the cost of that consumption is given in dollars. Before we dive deep into the data, some quick feature-engineering steps are done to enrich the data with more features.
In [4]:
# Load the raw 15-minute usage export (type, date, start/end time, usage,
# units, cost, notes).  NOTE(review): hard-coded relative filename — assumes
# D202.csv sits next to the notebook.
data = pd.read_csv("D202.csv")
data.head(2)
Out[4]:
In [5]:
data["DATE_TIME"] = pd.to_datetime(data.DATE + " " + data["END TIME"])
In [6]:
data["DAY_TYPE"] = data.DATE_TIME.apply(lambda x: 1 if x.dayofweek > 5 else 0 )
In [7]:
# Flag US federal holidays.  The calendar returns midnight-normalized dates,
# while DATE_TIME carries intraday times, so compare the normalized
# (midnight) timestamp — the original `DATE_TIME.isin(holidays)` could only
# match a reading that fell exactly at 00:00 on a holiday.
cal = calendar()
holidays = cal.holidays(start = data.DATE_TIME.min(), end = data.DATE_TIME.max())
data["IS_HOLIDAY"] = data.DATE_TIME.dt.normalize().isin(holidays)
In [8]:
data.head(3)
Out[8]:
In [9]:
# Autoregressive lag features: T_k is USAGE shifted back k intervals
# (15 minutes each), so T_1 … T_5 cover the preceding 75 minutes.
for lag in range(1, 6):
    data["T_" + str(lag)] = data["USAGE"].shift(lag)
In [10]:
# Replace the NaNs introduced by the lag shift (first five rows) with 0.0.
# Assignment form instead of `inplace=True`: behaviorally identical, but
# clearer lineage and no reliance on in-place mutation.
data = data.fillna(0.00)
data.head(10)
Out[10]:
In [11]:
data.IS_HOLIDAY = data.IS_HOLIDAY.astype("int")
In [12]:
data.head(2)
Out[12]:
In [13]:
clean_data = data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
In [14]:
clean_data.head(2)
Out[14]:
In [ ]:
In [15]:
# Interactive time series of the full eleven months of usage.
# NOTE(review): TimeSeries comes from the removed bokeh.charts API (see imports).
all_show = TimeSeries(data,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(all_show)
In [ ]:
In [16]:
xmask = (data.DATE_TIME >= pd.to_datetime("12/20/2016")) & (data.DATE_TIME <= pd.to_datetime("12/27/2016"))
In [17]:
xmas_week = data.loc[xmask]
In [18]:
# Usage over Christmas week — holiday consumption pattern.
xmas_show = TimeSeries(xmas_week,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(xmas_show)
In [19]:
# New Year's Day 2017 — half-open interval: Jan 1 00:00 up to (not including) Jan 2.
dmask = (data.DATE_TIME >= pd.to_datetime("01/01/2017")) & (data.DATE_TIME < pd.to_datetime("01/02/2017"))
nyd = data.loc[dmask]
In [20]:
# Usage on New Year's Day only.
nyd_show = TimeSeries(nyd,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(nyd_show)
In [21]:
training_data = data[data.DATE_TIME < pd.to_datetime("08/01/2017")]
In [22]:
# August 2017 serves as the validation month (half-open interval).
val_mask = (data.DATE_TIME >= pd.to_datetime("08/01/2017")) & (data.DATE_TIME < pd.to_datetime("09/01/2017"))
val_data = data.loc[val_mask]
In [23]:
test_data = data[data.DATE_TIME >= pd.to_datetime("09/01/2017")]
In [24]:
training_data.tail(3)
Out[24]:
In [25]:
test_data.head(2)
Out[25]:
In [26]:
# Project each split down to model features + target, using one shared list
# so the three frames are guaranteed to carry identical columns.
model_cols = ['DAY_TYPE', 'IS_HOLIDAY', 'T_1', 'T_2', 'T_3', 'T_4', 'T_5', 'USAGE']
clean_train = training_data[model_cols]
clean_test = test_data[model_cols]
clean_val = val_data[model_cols]
In [27]:
clean_train.head(2)
Out[27]:
In [28]:
clean_test.head(2)
Out[28]:
In [29]:
clean_val.head(3)
Out[29]:
In [30]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
In [31]:
# Separate predictors (X) from the USAGE target series (y) for each split.
X_train, y_train = clean_train.drop(columns=["USAGE"]), clean_train["USAGE"]
X_test, y_test = clean_test.drop(columns=["USAGE"]), clean_test["USAGE"]
X_val, y_val = clean_val.drop(columns=["USAGE"]), clean_val["USAGE"]
In [32]:
# Zero-mean / unit-variance scaling, shared by the random forest and the LSTM.
scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))
# Random-forest baseline; fixed random_state for reproducibility, 5 worker jobs.
rfr = RandomForestRegressor(random_state=2017,verbose=2,n_jobs=5)
In [33]:
# Fit the scaler on the TRAINING split only, then apply the same learned
# mean/variance to validation and test.  The original called fit_transform
# on every split, re-fitting the scaler per split — a train/test leakage bug
# that also puts the three splits on inconsistent scales.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_valid_scaled = scaler.transform(X_val)
In [ ]:
In [34]:
rfr.fit(X_train_scaled,y_train)
Out[34]:
In [35]:
rfr.score(X_val,y_val)
Out[35]:
In [36]:
rfr.score(X_test,y_test)
Out[36]:
In [37]:
test_data["RF_PREDICTED"] = rfr.predict(X_test_scaled)
In [38]:
test_data.head(5)
Out[38]:
In [39]:
# Actual vs. random-forest-predicted usage over the whole test month.
pred_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=800, plot_height=350)
show(pred_show)
In [40]:
# Zoom into the final day of the test period (Sept 30) to inspect fit quality.
sep_30m = test_data.loc[test_data["DATE_TIME"] >= pd.to_datetime("09/30/2017")]
sep_30rf = TimeSeries(sep_30m, x="DATE_TIME", y=["USAGE", "RF_PREDICTED"],
                      legend=True, plot_width=900, plot_height=350)
show(sep_30rf)
In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
In [42]:
# Minimal LSTM: one LSTM unit over a window of 1 timestep with 7 features
# (DAY_TYPE, IS_HOLIDAY, T_1..T_5), followed by a dense scalar output,
# trained with MSE + Adam.
# NOTE(review): with input_shape=(1,7) the recurrence never unrolls past one
# step — the lag features carry the temporal context instead of the LSTM.
model_k = Sequential()
model_k.add(LSTM(1, input_shape=(1,7)))
model_k.add(Dense(1))
model_k.compile(loss='mean_squared_error', optimizer='adam')
In [43]:
SVG(model_to_dot(model_k).create(prog='dot', format='svg'))
Out[43]:
In [44]:
X_t_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
In [45]:
X_val_resaped = X_valid_scaled.reshape((X_valid_scaled.shape[0], 1, X_valid_scaled.shape[1]))
In [46]:
# Train for 10 epochs; batch_size=96 equals one day of 15-minute intervals.
history = model_k.fit(X_t_reshaped, y_train, validation_data=(X_val_resaped, y_val),\
epochs=10, batch_size=96, verbose=2)
In [47]:
# Learning curves for the Keras model.  The second series is the VALIDATION
# loss (it comes from validation_data) — the original legend labelled it
# "test", which misrepresents what is plotted.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
Out[47]:
In [48]:
X_te_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
In [49]:
res = model_k.predict(X_te_reshaped)
In [50]:
test_data["DL_PRED"] = res
In [51]:
# Actual usage vs. both models' predictions over the test month.
keras_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(keras_show)
In [52]:
# Sept 30 close-up with both models' predictions overlaid on actual usage.
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]
sep_30 = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(sep_30)
In [53]:
# RMSE of the LSTM predictions on the test set.
# NOTE(review): this import belongs in the top import cell.
from numpy import sqrt
sqrt(mean_squared_error(test_data.USAGE,test_data.DL_PRED))
Out[53]:
In [54]:
sqrt(mean_squared_error(test_data.USAGE,test_data.RF_PREDICTED))
Out[54]: