In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import itertools
import pickle
# import openpyxl as px
# from pyexcel_xls import get_data
%matplotlib inline
First, read the CSV data file and parse the 'Timestamp' column into datetime values.
In [33]:
# Parser for a single timestamp string written as '%Y-%m-%d %H:%M:%S'.
# `pd.datetime` has been removed from pandas, so the standard-library
# datetime class (imported above as `dt`) is used instead.
dateparse = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

# Read the raw file, then convert the first column to datetimes explicitly.
# This replaces read_csv's deprecated `date_parser` argument and works the
# same way on old and new pandas releases.
CSVdata = pd.read_csv('Building Electrical.csv')
timestamp_column = CSVdata.columns[0]
CSVdata[timestamp_column] = pd.to_datetime(
    CSVdata[timestamp_column], format='%Y-%m-%d %H:%M:%S'
)
In [34]:
# Second copy of the file, kept with 'Timestamp' as an ordinary column so
# that calendar features can be derived from it.
data = pd.read_csv('Building Electrical.csv')
# Explicit conversion instead of read_csv's deprecated `date_parser` hook.
data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%Y-%m-%d %H:%M:%S')

# Calendar features used by the group-by cells further down.
data['Hour'] = data['Timestamp'].dt.hour
data['Date'] = data['Timestamp'].dt.date
data['Date1'] = data['Timestamp'].dt.date

# `Series.convert_objects` no longer exists in pandas. pd.to_numeric with
# errors='coerce' is its replacement: entries that cannot be parsed as
# numbers become NaN.
data['Porter Hall Electric Real Power'] = pd.to_numeric(
    data['Porter Hall Electric Real Power'], errors='coerce'
)

# Preview only the first rows rather than rendering the whole frame.
data.head()
Out[34]:
Now set 'Timestamp' as the index of CSVdata.
In [35]:
# Use the timestamps as the row index. Rebinding the result (instead of
# inplace=True) and checking that the column is still present keeps this cell
# safe to run more than once: a second run is simply a no-op.
if 'Timestamp' in CSVdata.columns:
    CSVdata = CSVdata.set_index('Timestamp')

# Preview only the first rows rather than rendering the whole frame.
CSVdata.head()
Out[35]:
Because we are not going to use the Baker Hall data, we drop the Baker Hall consumption column.
In [36]:
# Baker Hall is not analysed. errors='ignore' turns the drop into a no-op when
# the column is already gone, so re-running the cell does not raise.
CSVdata = CSVdata.drop('Baker Hall Electric Real Power', axis=1, errors='ignore')

# `Series.convert_objects` no longer exists in pandas. pd.to_numeric with
# errors='coerce' is its replacement: entries that cannot be parsed as
# numbers become NaN.
CSVdata['Porter Hall Electric Real Power'] = pd.to_numeric(
    CSVdata['Porter Hall Electric Real Power'], errors='coerce'
)
Because the dataset is too large, we resample it to a 5-minute period.
In [37]:
# Average the readings into 5-minute bins. '5min' is the supported spelling
# of the offset; the single-letter alias 'T' is deprecated in recent pandas.
resampled_data = CSVdata.resample('5min').mean()

# Preview only the first rows rather than rendering the whole frame.
resampled_data.head()
Out[37]:
There are some null values in the dataset; we use the interpolate method to fill them.
In [38]:
# Fill missing values with DataFrame.interpolate() (default settings), then
# count how many nulls are still left across every column of the result.
filled_data = resampled_data.interpolate()
remaining_nulls = filled_data.isnull().sum().sum()
remaining_nulls
Out[38]:
Now we plot the daily electricity consumption of Porter Hall and Hunt Library from the resampled, interpolated dataset.
In [39]:
# Porter Hall: one line over the full interpolated time series.
fig1, porter_ax = plt.subplots(figsize=(10, 5))
porter_ax.plot(filled_data['Porter Hall Electric Real Power'])
porter_ax.set_title('Porter Hall daily electricity consumption')
plt.show()

# Hunt Library: same layout on its own figure.
fig2, hunt_ax = plt.subplots(figsize=(10, 5))
hunt_ax.set_title('Hunt Library daily electricity consumption')
hunt_ax.plot(filled_data['Hunt Library Real Power'])
plt.show()
Now we use the data grouped by Hour to plot the hourly consumption of Porter Hall and Hunt Library.
In [40]:
# Average every numeric column per hour of the day. numeric_only=True is
# required because `data` also carries the object-typed 'Date'/'Date1' columns
# (plain datetime.date values), which recent pandas refuses to average.
data_groupbyHour = data.groupby('Hour').mean(numeric_only=True)
data_groupbyHour
Out[40]:
In [41]:
# One figure per building showing the mean power for each hour of the day.
# The y-axis label is the column name itself, which also corrects the
# misspelled 'Hunt Library Real Powe' label.
for hourly_column, hourly_title in [
    ('Porter Hall Electric Real Power', 'Porter Hall hourly consumption'),
    ('Hunt Library Real Power', 'Hunt Library hourly consumption'),
]:
    plt.plot(data_groupbyHour[hourly_column])
    plt.title(hourly_title)
    plt.xlabel('Hour')
    plt.ylabel(hourly_column)
    plt.show()
We plot the hourly consumption of both datasets in one figure in order to compare the trends in electricity consumption.
In [42]:
# Both buildings on one figure: Porter Hall on the left y-axis, Hunt Library
# on a twinned right y-axis that shares the same hour axis.
fig6, ax1 = plt.subplots()
porter_line, = ax1.plot(
    data_groupbyHour['Porter Hall Electric Real Power'],
    color='b',
    label='Porter Hall',
)
ax1.set_ylabel('Porter Hall Electric Real Power')
ax1.set_xlabel('Hour')

ax2 = ax1.twinx()
hunt_line, = ax2.plot(
    data_groupbyHour['Hunt Library Real Power'],
    color='r',
    label='Hunt Library',
)
ax2.set_ylabel('Hunt Library Real Power')

# The two lines live on different axes and need labels to appear in a legend;
# a bare plt.legend() finds no labelled artists and draws nothing. Passing
# both handles explicitly produces one combined legend (drawn on the top axes).
ax2.legend(handles=[porter_line, hunt_line])
plt.show()
Now we use the data grouped by date to plot the daily consumption of Porter Hall and Hunt Library.
In [72]:
# Average every numeric column per calendar date.
# - 'DayOfYear' is derived here because the regression cells read
#   data_groupbyDate['DayOfYear']; computing it in place means this cell does
#   not depend on a later cell having been run first.
# - numeric_only=True skips the object-typed 'Date1' column, which recent
#   pandas refuses to average.
data_groupbyDate = (
    data
    .assign(DayOfYear=data['Timestamp'].dt.dayofyear)
    .groupby('Date')
    .mean(numeric_only=True)
)
data_groupbyDate
Out[72]:
In [44]:
# Porter Hall: per-date mean power.
fig3, porter_daily_ax = plt.subplots(figsize=(12, 5))
porter_daily_ax.plot(data_groupbyDate['Porter Hall Electric Real Power'])
porter_daily_ax.set_title('Porter Hall daily consumption')
porter_daily_ax.set_ylabel('Porter Hall Electric Real Power')
plt.show()

# Hunt Library: per-date mean power, on its own figure.
fig4, hunt_daily_ax = plt.subplots(figsize=(12, 5))
hunt_daily_ax.set_title('Hunt Library daily consumption')
hunt_daily_ax.plot(data_groupbyDate['Hunt Library Real Power'])
hunt_daily_ax.set_ylabel('Hunt Library Electric Real Power')
plt.show()
We plot the daily consumption of both datasets in one figure in order to compare the trends in electricity consumption.
In [45]:
# Both buildings on one figure: Porter Hall on the left y-axis, Hunt Library
# on a twinned right y-axis that shares the same date axis.
fig5, ax1 = plt.subplots(figsize=(12, 5))
porter_daily_line, = ax1.plot(
    data_groupbyDate['Porter Hall Electric Real Power'],
    color='b',
    label='Porter Hall',
)
ax1.set_ylabel('Porter Hall daily consumption')
ax1.set_xlabel('Date')

ax2 = ax1.twinx()
hunt_daily_line, = ax2.plot(
    data_groupbyDate['Hunt Library Real Power'],
    color='r',
    label='Hunt Library',
)
ax2.set_ylabel('Hunt Library daily consumption')

# The two lines live on different axes and need labels to appear in a legend;
# a bare plt.legend() finds no labelled artists and draws nothing. Passing
# both handles explicitly produces one combined legend (drawn on the top axes).
ax2.legend(handles=[porter_daily_line, hunt_daily_line])
plt.show()
Now we are going to plot heat maps of the electricity consumption of both datasets.
In [46]:
# Tag every reading with its day of the year, then build one table per
# building: rows are days of the year, columns are hours, and each cell is the
# mean power for that (day, hour) pair.
data['DayOfYear'] = data['Timestamp'].dt.dayofyear
readings_by_day_and_hour = data.groupby(['DayOfYear', 'Hour'])
loadCurves1 = readings_by_day_and_hour['Porter Hall Electric Real Power'].mean().unstack()
loadCurves2 = readings_by_day_and_hour['Hunt Library Real Power'].mean().unstack()
In [47]:
import matplotlib.colors as clrs

# Render the Porter Hall day-by-hour table as an image; aspect='auto' lets the
# image stretch to the figure instead of forcing square cells.
porter_heatmap = plt.imshow(loadCurves1, aspect='auto', cmap='summer')
plt.title('Heatmap of Porter Hall Electric Consumption')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of Year')
# Colour scale for the image drawn above.
plt.colorbar(porter_heatmap)
Out[47]:
In [48]:
# Render the Hunt Library day-by-hour table as an image; aspect='auto' lets
# the image stretch to the figure instead of forcing square cells.
hunt_heatmap = plt.imshow(loadCurves2, aspect='auto', cmap='summer')
plt.title('Heatmap of Hunt Library Electric Consumption')
plt.xlabel('Hour of the Day')
plt.ylabel('Day of Year')
# Colour scale for the image drawn above.
plt.colorbar(hunt_heatmap)
Out[48]:
In [71]:
# Display the per-date averages again; the regression cells below read their
# inputs from this frame.
data_groupbyDate
Out[71]:
Now we use a regression tree to analyze the data.
In [81]:
def plot_regdataOfPorter():
    """Scatter Porter Hall's per-date mean power against day of year.

    Reads the module-level ``data_groupbyDate`` frame and draws the points
    with the 'rd' format (red, thin-diamond markers) on the current
    matplotlib axes. Returns nothing.
    """
    current_axes = plt.gca()
    current_axes.plot(
        data_groupbyDate['DayOfYear'],
        data_groupbyDate['Porter Hall Electric Real Power'],
        'rd',
    )
    current_axes.set_xlabel('DayOfYear')
    current_axes.set_ylabel('Porter Hall Electric Real Power')
def plot_regdataOfHunt():
    """Scatter Hunt Library's per-date mean power against day of year.

    Reads the module-level ``data_groupbyDate`` frame and draws the points
    with the 'rd' format (red, thin-diamond markers) on the current
    matplotlib axes. Returns nothing.
    """
    current_axes = plt.gca()
    current_axes.plot(
        data_groupbyDate['DayOfYear'],
        data_groupbyDate['Hunt Library Real Power'],
        'rd',
    )
    current_axes.set_xlabel('DayOfYear')
    current_axes.set_ylabel('Hunt Library Real Power')
In [85]:
from sklearn import tree

# Day of year is the single predictor; Porter Hall's per-date mean power is
# the target.
day_of_year = data_groupbyDate['DayOfYear']
y = data_groupbyDate['Porter Hall Electric Real Power']

# 100 evenly spaced query points over [min, max) for drawing the fitted curve.
# np.linspace always returns exactly the requested number of samples, whereas
# np.arange with a fractional step can return one element more or fewer
# because of floating-point rounding, which would break a fixed reshape(100, 1).
xrange = np.linspace(day_of_year.min(), day_of_year.max(), 100, endpoint=False).reshape(-1, 1)

# scikit-learn expects a 2-D feature array. Indexing a Series with [:, None]
# is not supported by current pandas, so reshape the underlying NumPy array.
x = day_of_year.values.reshape(-1, 1)

reg = tree.DecisionTreeRegressor() # Default parameters, though you can tweak these!
reg.fit(x, y)

# Observed points first, then the tree's prediction as a dashed blue line.
plot_regdataOfPorter()
plt.title('Regression of Porter Hall Electric Consumption')
plt.plot(xrange, reg.predict(xrange), 'b--', linewidth=3)
plt.show()
In [86]:
# Score of the fitted tree on the same (x, y) it was trained on; for
# scikit-learn regressors, score() reports the coefficient of determination R^2.
print(reg.score(x,y))
In [87]:
# Day of year is the single predictor; Hunt Library's per-date mean power is
# the target.
day_of_year = data_groupbyDate['DayOfYear']
y = data_groupbyDate['Hunt Library Real Power']

# 100 evenly spaced query points over [min, max) for drawing the fitted curve.
# np.linspace always returns exactly the requested number of samples, whereas
# np.arange with a fractional step can return one element more or fewer
# because of floating-point rounding, which would break a fixed reshape(100, 1).
xrange = np.linspace(day_of_year.min(), day_of_year.max(), 100, endpoint=False).reshape(-1, 1)

# scikit-learn expects a 2-D feature array. Indexing a Series with [:, None]
# is not supported by current pandas, so reshape the underlying NumPy array.
x = day_of_year.values.reshape(-1, 1)

reg = tree.DecisionTreeRegressor() # Default parameters, though you can tweak these!
reg.fit(x, y)

# Observed points first, then the tree's prediction as a dashed blue line.
plot_regdataOfHunt()
plt.title('Regression of Hunt Library Consumption')
plt.plot(xrange, reg.predict(xrange), 'b--', linewidth=3)
plt.show()
In [88]:
# Score of the fitted tree on the same (x, y) it was trained on; for
# scikit-learn regressors, score() reports the coefficient of determination R^2.
print(reg.score(x,y))
In [ ]:
In [ ]: