In [1]:
# Data Manipulation
import psycopg2
import os
import pandas as pd
import numpy as np
import datetime, time

# Exploratory Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
import graphlab as gl
gl.canvas.set_target('ipynb')
sns.set_style('white')

# Interactive Graphics

# To communicate with Plotly's server, sign in with credentials file
import plotly.plotly as py

# Useful Python/Plotly tools
import plotly.tools as tls

# Graph objects to piece together plots
from plotly.graph_objs import *

%matplotlib inline


[INFO] GraphLab Create v1.8.3 started. Logging: /tmp/graphlab_server_1457473672.log

In [2]:
# Setup connection to Pecan Street Dataport
try:
    conn = psycopg2.connect("dbname='postgres' user='2c53Epq3kSyQ' host='dataport.pecanstreet.org' port='5434' password=''")
except:
    "Error: Check there aren't any open connections in notebook or pgAdmin"

In [209]:
conn.close()

In [ ]:
# psql -h dataport.pecanstreet.org -p 5434 -U 2c53Epq3kSyQ -W '' postgres

In [69]:
cursor = conn.cursor()

In [78]:
cursor.execute("SELECT dataid, localhour, SUM(use) FROM university.electricity_egauge_hours GROUP BY dataid, localhour")
for row in cursor:
    print row
    if row == None:
        break

In [38]:
electricity_df = pd.read_sql("SELECT localhour, SUM(use) AS usage, SUM(air1) AS cooling, SUM(furnace1) AS heating, \
                             SUM(car1) AS electric_vehicle \
                             FROM university.electricity_egauge_hours \
                             WHERE dataid = 114 AND use > 0  \
                             AND localhour BETWEEN '2013-10-16 00:00:00'::timestamp AND \
                             '2016-02-26 08:00:00'::timestamp \
                             GROUP BY dataid, localhour \
                             ORDER BY localhour", conn)

In [39]:
electricity_df['localhour'] = electricity_df.localhour.apply(pd.to_datetime)

In [56]:
electricity_df.set_index('localhour', inplace=True)

In [40]:
electricity_df.fillna(value=0.0, inplace=True)

In [12]:
# Min: 2013-10-16 00:00:00
# Max: 2016-02-26 08:00:00
# Count: 20,721
electricity_df.tail()


Out[12]:
usage cooling heating electric_vehicle
localhour
2016-02-26 10:00:00 3.634383 0.000000 0.087900 3.357417
2016-02-26 11:00:00 0.876583 0.000000 0.106583 0.581233
2016-02-26 12:00:00 0.349783 0.000000 0.094233 0.000000
2016-02-26 13:00:00 1.110150 0.000167 0.445933 0.000000
2016-02-26 14:00:00 1.182817 0.000867 0.510467 0.000000

In [13]:
electricity_df_nocar = electricity_df[['usage', 'cooling', 'heating']]

In [14]:
electricity_df_car = electricity_df[['usage','electric_vehicle']]

In [15]:
electricity_df_nocar.plot(figsize=(18,9), title="Pecan Street Household 114 Hourly Energy Consumption")
sns.despine();


Turns out house 114 has an electric car!


In [16]:
electricity_df_car.plot(figsize=(18,9), title="Pecan Street Household 114 Hourly Energy Consumption")
sns.despine();



In [ ]:
# Geohash: 30.292432 -97.699662 Austin, TX

In [17]:
from bokeh.io import output_file, output_notebook
from bokeh.plotting import show
from bokeh.models import GMapPlot, GMapOptions, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool

In [18]:
map_options = GMapOptions(lat=30.292432, lng=-97.699662, map_type="roadmap", zoom=11)

In [19]:
plot = GMapPlot(x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options, title="Austin, TX")

In [20]:
plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())

In [21]:
output_notebook()
# output_file("gmap_plot.html")


BokehJS successfully loaded.

In [22]:
show(plot);



In [67]:
weather_df = pd.read_sql("SELECT localhour, temperature \
                             FROM university.weather \
                             WHERE localhour BETWEEN '2013-10-16 00:00:00'::timestamp AND \
                             '2016-02-26 08:00:00'::timestamp \
                             AND latitude = 30.292432 \
                             ORDER BY localhour", conn)

In [68]:
weather_df['localhour'] = weather_df.localhour.apply(pd.to_datetime)

In [61]:
weather_df.set_index('localhour', inplace=True)

In [69]:
weather_df.fillna(value=0.0, inplace=True)

In [24]:
# Count: 20,673
weather_df.count()


Out[24]:
temperature    20673
dtype: int64

In [31]:
print "Austin had a temperate range of {} degrees between 2013 and 2016".format(weather_df.temperature.max() - weather_df.temperature.min())
weather_df.plot(figsize=(16,8), title='Austin, TX Hourly Weather')
plt.ylabel('Temperature (F)')
plt.axhline(weather_df.temperature.max(), c='r')
plt.axhline(weather_df.temperature.min(), c='blue')
sns.despine();


Austin had a temperate range of 82.16 degrees between 2013 and 2016

Plotly


In [5]:
tls.set_credentials_file(stream_ids=[
        "bcjrhlt0lz",
        "pkkb5aq85l",
        "c5ygaf48l0",
        "l3nh9ls79j"
    ])

In [7]:
stream_ids = tls.get_credentials_file()['stream_ids']
stream_ids


Out[7]:
[u'bcjrhlt0lz', u'pkkb5aq85l', u'c5ygaf48l0', u'l3nh9ls79j']

In [9]:
# help(Stream)

In [329]:
electricity = Stream(
    token="c5ygaf48l0",  # (!) link stream id to 'token' key
    maxpoints=500      # (!) keep a max of 80 pts on screen
)
weather = Stream(
    token="l3nh9ls79j",  # (!) link stream id to 'token' key
    maxpoints=500      # (!) keep a max of 80 pts on screen
)

In [330]:
trace1 = Scatter(x=[], y=[], mode='lines+markers', stream=electricity, name='electricity usage')
trace2 = Scatter(x=[], y=[], mode='lines+markers', stream=weather, name='weather')
data = Data([trace1,trace2])

In [331]:
# Add title to layout object
layout = Layout(title='Pecan Street Sensor Data')

# Make a figure object
fig = Figure(data=data, layout=layout)

# (@) Send fig to Plotly, initialize streaming plot, open new tab
py.iplot(fig, filename='Pecan Street Streaming Electricity Usage')


Out[331]:

In [332]:
# (@) Make instance of the Stream link object, 
#     with same stream id as Stream id object
s1 = py.Stream("c5ygaf48l0")
s2 = py.Stream("l3nh9ls79j")

# (@) Open the stream
s1.open()
s2.open()

In [84]:
i = 0    # a counter
k = 5    # some shape parameter
k2 = 10
N = 200  # number of points to be plotted

# Delay start of stream by 5 sec (time to switch tabs)


while i<N:
    i += 1   # add to counter

    # Current time on x-axis, random numbers on y-axis
    x = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    y = (np.cos(k*i/50.)*np.cos(i/50.)+np.random.randn(1))[0]
    
    x2 = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    y2 = (np.cos(k2*i/50.)*np.cos(i/50.)+np.random.randn(1))[0]
    
    # (-) Both x and y are numbers (i.e. not lists nor arrays)

    # (@) write to Plotly stream!
    s1.write(dict(x=x, y=y))
    s2.write(dict(x=x2, y=y2))

    # (!) Write numbers to stream to append current data on plot,
    #     write lists to overwrite existing data on plot (more in 7.2).

    time.sleep(0.08)  # (!) plot a point every 80 ms, for smoother plotting

# (@) Close the stream when done plotting
s1.close()
s2.close()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-84-c92c8cfc121a> in <module>()
     20 
     21     # (@) write to Plotly stream!
---> 22     s1.write(dict(x=x, y=y))
     23     s2.write(dict(x=x2, y=y2))
     24 

NameError: name 's1' is not defined

In [213]:
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')


Out[213]:
'2016-03-03 23:29:26.873469'

In [269]:
# tls.embed('streaming-demos','12')

In [204]:
class PlotlyStream(object):
    
    def __init__(self, electricity_data, weather_data, stream_tokens):
        self.electricity = electricity_data
        self.weather = weather_data
        self.tokens = stream_tokens
    
    def create_plot(self, chart_title, x_title, y_title, maxpoints):
        """Method to generate Plotly plot in notebook for rendering streaming data"""
        
        e_stream = Stream(token= self.tokens[0], maxpoints= maxpoints)
        trace1 = Scatter(x=[], y=[], mode='lines+markers', stream = e_stream, name='Usage')
        
#         w_stream = Stream(token= self.tokens[1], maxpoints= maxpoints)
#         trace2 = Scatter(x=[], y=[], mode='lines+markers', stream = w_stream, yaxis='y2', name='Temp')
        
#         data = Data([trace1, trace2])
        data = Data([trace1])
        
        # Initialize layout object
        layout = Layout(title= chart_title, 
                        showlegend=True,
                        xaxis= dict(title= x_title,
                                    autorange=True,
                                    range= [self.electricity.index.min(),self.electricity.index.max()],
                                    ticks='outside',
                                    type='date'
                                  ),
                        yaxis= dict(title = y_title,
                                    autorange=True,
                                    range=[self.electricity.min(),self.electricity.index.max()],
                                  ),
#                         yaxis2 = dict(title = y2_title,
#                                       range=[self.weather.min(), self.weather.max()],
#                                       overlaying='y',
#                                       side='right'
#                                      ),
                        hovermode='closest'
                       )
        
        # Create figure object
        fig = Figure(data=data, layout=layout)
        
        # (@) Send fig to Plotly, initialize streaming plot, open new tab
        return py.iplot(fig, filename='Pecan Street Streaming Electricity Usage')
    
    def plot_stream(self, plot_freq=0.2, start_delay=0.1):
        """Method to write data to Plotly servers to render on graph"""
        
        s1 = py.Stream(self.tokens[0])
#         s2 = py.Stream(self.tokens[1])
        
        s1.open()
#         s2.open()
        
        counter = 0
        N = 1000
        
        # Create small delay before plotting begins
#         time.sleep(start_delay)
        
        electricity = self.electricity.iterrows()
#         weather = self.weather.iterrows()
        
        while counter < N:
            counter += 1
            
            timestamp1, usage = electricity.next()
#             timestamp2, temperature = weather.next()
            
            # .strftime('%Y-%m-%d %H.%f')
            
#             times = []
#             usages = []
#             temperatures = []
            
            x1 = timestamp1.strftime('%Y-%m-%d %H')
            y1 = usage
            
#             x2 = timestamp2.strftime('%Y-%m-%d %H:%M:%S.%f')
#             y2 = temperature
            
            s1.write(dict(x=x1, y=y1))
#             time.sleep(plot_freq)
#             s2.write(dict(x=x2, y=y2))
            time.sleep(plot_freq)
        
        s1.close()
#         s2.close()

In [205]:
PecanStreet = PlotlyStream(electricity_df_usage, weather_df, ["pkkb5aq85l","c5ygaf48l0"])

In [206]:
PecanStreet.create_plot("Pecan Street Household 114 Electricity Usage", 
                        "Time (Hours)", 
                        "KwH (Kilowats per Hour)",
                        100)


Out[206]:

In [207]:
PecanStreet.plot_stream()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-207-c495658bc04b> in <module>()
----> 1 PecanStreet.plot_stream()

<ipython-input-204-603d6d4678ca> in plot_stream(self, plot_freq, start_delay)
     84 #             time.sleep(plot_freq)
     85 #             s2.write(dict(x=x2, y=y2))
---> 86             time.sleep(plot_freq)
     87 
     88         s1.close()

KeyboardInterrupt: 

In [43]:
electricity_df_usage = electricity_df[electricity_df.columns[:2]]

In [44]:
electricity_df_usage.values


Out[44]:
array([[Timestamp('2013-10-16 05:00:00'), 0.59295],
       [Timestamp('2013-10-16 06:00:00'), 0.35618333333333335],
       [Timestamp('2013-10-16 07:00:00'), 2.6811833333333333],
       ..., 
       [Timestamp('2016-02-26 12:00:00'), 0.34978333333333333],
       [Timestamp('2016-02-26 13:00:00'), 1.11015],
       [Timestamp('2016-02-26 14:00:00'), 1.1828166666666666]], dtype=object)

GraphLab (to the rescue)


In [45]:
electricity_sf = gl.SFrame(data=electricity_df_usage)
electricity_sf.tail()


Out[45]:
localhour usage
2016-02-26 05:00:00+00:00 0.444633333333
2016-02-26 06:00:00+00:00 0.3029
2016-02-26 07:00:00+00:00 0.794866666667
2016-02-26 08:00:00+00:00 3.63953333333
2016-02-26 09:00:00+00:00 3.6399
2016-02-26 10:00:00+00:00 3.63438333333
2016-02-26 11:00:00+00:00 0.876583333333
2016-02-26 12:00:00+00:00 0.349783333333
2016-02-26 13:00:00+00:00 1.11015
2016-02-26 14:00:00+00:00 1.18281666667
[10 rows x 2 columns]

In [46]:
electricity_sf.show(view='Line Chart')



In [ ]:
#### Bayesian Changepoints https://dato.com/learn/userguide/anomaly_detection/bayesian_changepoints.html

In [47]:
model = gl.anomaly_detection.bayesian_changepoints.create(electricity_sf, feature='usage', lag=7)

In [48]:
model.summary()


Class                                   : BayesianChangepointsModel

Schema
------
Number of examples                      : 20722
Feature name                            : usage
Expected run length                     : 250.0
Lag observations                        : 7

Training summary
----------------
Total training time (seconds)           : 1.8781

Accessible fields
-----------------
scores                                  : Changepoint score for each row in the input dataset.

In [49]:
scores = model['scores']

In [50]:
scores.head()


Out[50]:
row_id changepoint_score usage model_update_time
0 0.00363362045466 0.59295 2016-03-08
14:05:39.910733 ...
1 0.0059748827345 0.356183333333 2016-03-08
14:05:39.910733 ...
2 0.00753854265712 2.68118333333 2016-03-08
14:05:39.910733 ...
3 1.6488818848e-05 3.62593333333 2016-03-08
14:05:39.910733 ...
4 2.6755134654e-06 3.60446666667 2016-03-08
14:05:39.910733 ...
5 0.000108351636069 1.81925 2016-03-08
14:05:39.910733 ...
6 0.896932118078 0.260316666667 2016-03-08
14:05:39.910733 ...
7 0.0828438352306 0.468833333333 2016-03-08
14:05:39.910733 ...
8 0.00545402438539 0.518583333333 2016-03-08
14:05:39.910733 ...
9 0.00106904279919 0.33455 2016-03-08
14:05:39.910733 ...
[10 rows x 4 columns]


In [51]:
sketch = scores['changepoint_score'].sketch_summary()
threshold = sketch.quantile(0.995)
changepoints = scores[scores['changepoint_score'] > threshold]
changepoints.print_rows(4, max_row_width=100, max_column_width=30)


+--------+-------------------+----------------+----------------------------+
| row_id | changepoint_score |     usage      |     model_update_time      |
+--------+-------------------+----------------+----------------------------+
|   78   |   0.945461592791  |     0.1915     | 2016-03-08 14:05:39.910733 |
|  150   |   0.931435273294  | 0.239916666667 | 2016-03-08 14:05:39.910733 |
|  246   |   0.915574037723  |     0.2979     | 2016-03-08 14:05:39.910733 |
|  366   |   0.914159413573  | 0.234483333333 | 2016-03-08 14:05:39.910733 |
+--------+-------------------+----------------+----------------------------+
[107 rows x 4 columns]


In [52]:
electricity_df['changepoint_score'] = scores['changepoint_score']

In [53]:
electricity_df.changepoint_score = electricity_df.changepoint_score.astype('float64')

In [54]:
def plot_changepoints(electricity_df, weather_df, quantile=0.995):
    """Function to plot changepoints on time series"""
    
    # Referenced: http://matplotlib.org/examples/api/two_scales.html
    fig, ax1 = plt.subplots(figsize=(24,10))
    ax1.plot(electricity_df.index, electricity_df.usage)
    plt.plot(electricity_df.index, electricity_df.electric_vehicle)
    ax1.set_xlabel('Time (h)')
    # Make the y-axis label and tick labels match the line color.
    ax1.set_ylabel('KwH (Kilowats Per Hour)', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')
    changepoint_ts = electricity_df.index[electricity_df.changepoint_score > electricity_df.changepoint_score.quantile(quantile)]
    for changepoint in changepoint_ts:
        plt.axvline(changepoint, c='r')
    plt.legend()

    ax2 = ax1.twinx()
    ax2.plot(weather_df.index, weather_df.temperature, 'm--')
    ax2.set_ylabel('Temperature (F)', color='m')
    for ytick in ax2.get_yticklabels():
        ytick.set_color('m')
    plt.show()

In [57]:
plot_changepoints(electricity_df['2014-08-18'], weather_df['2014-08-18']);



In [223]:
plot_changepoints(electricity_df['2015'], weather_df['2015']);



In [222]:
plot_changepoints(electricity_df['2016'], weather_df['2016']);



In [70]:
weather_sf = gl.SFrame(data=weather_df)

In [71]:
weather_sf.head()


Out[71]:
localhour temperature
2013-10-16 05:00:00+00:00 59.44
2013-10-16 06:00:00+00:00 57.53
2013-10-16 07:00:00+00:00 57.53
2013-10-16 08:00:00+00:00 56.74
2013-10-16 09:00:00+00:00 56.07
2013-10-16 10:00:00+00:00 55.76
2013-10-16 11:00:00+00:00 55.42
2013-10-16 12:00:00+00:00 54.07
2013-10-16 13:00:00+00:00 54.51
2013-10-16 14:00:00+00:00 55.08
[10 rows x 2 columns]


In [72]:
weather_ts = gl.TimeSeries(data=weather_sf, index='localhour')

In [73]:
weather_ts.head()


Out[73]:
localhour temperature
2013-10-16 05:00:00+00:00 59.44
2013-10-16 06:00:00+00:00 57.53
2013-10-16 07:00:00+00:00 57.53
2013-10-16 08:00:00+00:00 56.74
2013-10-16 09:00:00+00:00 56.07
2013-10-16 10:00:00+00:00 55.76
2013-10-16 11:00:00+00:00 55.42
2013-10-16 12:00:00+00:00 54.07
2013-10-16 13:00:00+00:00 54.51
2013-10-16 14:00:00+00:00 55.08
[10 rows x 2 columns]


In [74]:
weather_sf.show(view='Line Chart')



In [ ]: