In [1]:
##### Dining Room #####

# In this notebook, we use thingflow to read in some previously
# captured lux (light level) sensor data. We use a sliding-window
# transducer to smooth the data and then convert both the raw and
# smoothed data to Pandas "Series" values. These can then be
# plotted directly in Jupyter.

# Some initial setup
import pandas as pd
# needed to get the graphs to show up
%matplotlib inline
import matplotlib.style
# To work around a bug in pandas, see
# http://stackoverflow.com/questions/33995707/attributeerror-unknown-property-color-cycle
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt

print("pandas version=%s"% pd.__version__)
print("matplotlib version=%s" % matplotlib.__version__)


pandas version=0.18.1
matplotlib version=1.5.1
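
As an aside before reading real data: the pipeline below produces sensor events and converts them to a Pandas Series. Here is a minimal sketch of that conversion, assuming SensorEvent behaves like a namedtuple with fields (sensor_id, ts, val), as the keyword construction later in the notebook suggests; PandasSeriesWriter does the real work:

from collections import namedtuple
import pandas as pd

Event = namedtuple('Event', ['sensor_id', 'ts', 'val'])  # stand-in for SensorEvent
fake_events = [Event('lux-1', 1467000000 + 60*i, v)      # one sample per minute
               for i, v in enumerate([10, 12, 11])]
fake_series = pd.Series([e.val for e in fake_events],
                        index=pd.to_datetime([e.ts for e in fake_events], unit='s'))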

In [2]:
# Make sure we can see the thingflow package
import sys
import os.path
thingflow_path = os.path.abspath('../../../thingflow-python')
assert os.path.exists(thingflow_path)
assert os.path.exists(os.path.join(thingflow_path, 'thingflow'))
if thingflow_path not in sys.path:
    sys.path.append(thingflow_path)
import thingflow.base

In [3]:
# Some setup for our analysis
import asyncio
from pytz import timezone
from thingflow.base import Filter, OutputThing, Scheduler, SensorEvent, \
                           filtermethod
from thingflow.adapters.csv import CsvReader
from thingflow.adapters.pandas import PandasSeriesWriter
from thingflow.filters.transducer import SensorSlidingMean
# Importing these modules registers the combinator methods
# (passthrough, select, etc.) used in the dataflow below
import thingflow.filters.combinators
import thingflow.filters.select

PDT = timezone('America/Los_Angeles')
scheduler = Scheduler(asyncio.get_event_loop())

In [4]:
# Now, read in the CSV file of lux values and build the
# dataflow to process the data
import math
from lux_analysis import SensorSlidingMeanPassNaNs, fill_in_missing_times, CaptureNaNIndexes

DIR = '../data_files/'
ROOM = 'dining-room'
reader = CsvReader(DIR+ROOM+'.csv')


# The raw writer captures the raw CSV data
raw_series_writer = PandasSeriesWriter(tz=PDT)
# The smoothed writer captures the data after it has passed through a
# sliding-mean transducer. We average over the last 5 samples.
smoothed_series_writer = PandasSeriesWriter(tz=PDT)

# We build and maintain a list of the NaN indexes, so that we can
# update them at will. This is needed because the clustering algorithm
# doesn't handle NaNs.
capture_nan_indexes = CaptureNaNIndexes()

# The smoothed data coming out of the sliding window mean is floating
# point. We round everything to the nearest integer, since that is the
# accuracy we started with.
def round_event_val(x):
    if not math.isnan(x.val):
        return SensorEvent(ts=x.ts, sensor_id=x.sensor_id,
                           val=int(round(x.val)))
    else:
        return x

# Now we can put all the processing steps together
reader.fill_in_missing_times()\
      .passthrough(raw_series_writer)\
      .transduce(SensorSlidingMeanPassNaNs(5))\
      .select(round_event_val)\
      .passthrough(smoothed_series_writer)\
      .passthrough(capture_nan_indexes)\
      .output_count()
reader.print_downstream()
    
# run it!
scheduler.schedule_recurring(reader)
scheduler.run_forever()
print("Ran the stream")


***** Dump of all paths from CsvReader(../data_files/dining-room.csv) *****
  CsvReader(../data_files/dining-room.csv) => fill_in_missing_times() => <thingflow.adapters.pandas.PandasSeriesWriter object at 0x114b26b70>
  CsvReader(../data_files/dining-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => <thingflow.adapters.pandas.PandasSeriesWriter object at 0x11603e550>
  CsvReader(../data_files/dining-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => <lux_analysis.CaptureNaNIndexes object at 0x11603e5f8>
  CsvReader(../data_files/dining-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => OutputCount()
****************************************************
Found 22004 missing samples
Found 6 missing samples
Found 318 missing samples
************************************
*      15816 events processed      *
************************************
No more active schedules, will exit event loop
Ran the stream
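
SensorSlidingMeanPassNaNs comes from the local lux_analysis module and its implementation is not shown in this notebook. A minimal sketch of the behavior we assume here (a rolling mean over the last 5 samples that forwards NaN gap markers untouched), working on plain numbers rather than sensor events:

import math
from collections import deque

def sliding_mean_pass_nans(values, window=5):
    # Rolling mean over the last `window` samples; NaNs pass through unchanged.
    buf = deque(maxlen=window)
    out = []
    for v in values:
        if math.isnan(v):
            out.append(v)  # forward the gap marker without disturbing the window
        else:
            buf.append(v)
            out.append(sum(buf) / len(buf))
    return out

# e.g. sliding_mean_pass_nans([1, 2, float('nan'), 3]) -> [1.0, 1.5, nan, 2.0]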

In [5]:
# Graph the full raw series
raw_series_writer.result.plot(figsize=(15,10))


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x105eb4518>

In [6]:
# Graph the first day of the raw data (the first 1440 one-minute samples)
raw_series_writer.result[:1440].plot(figsize=(15,10))


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x118b12320>

In [7]:
# Graph the smoothed series
smoothed_series_writer.result.plot(figsize=(15,10))


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x118b05f60>

In [8]:
# Graph the first day of the smoothed data
smoothed_series_writer.result[:1440].plot(figsize=(15,10))


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x1187a3dd8>

In [9]:
# Now, let's try clustering the data
CLUSTER_LABELS = 4 # how many clusters to create
import numpy as np
from sklearn.cluster import KMeans
# create an array that just has the values, no timestamps
npa = np.array(smoothed_series_writer.result).reshape(-1,1)
capture_nan_indexes.replace_nans(npa, 0.0)
kmeans = KMeans(CLUSTER_LABELS)
kmeans.fit(npa)
# print the raw clustered data
print("kmeans labels: " + str(kmeans.labels_))
from numpy import NaN
labels_with_nans = capture_nan_indexes.new_array_replace_nans(kmeans.labels_, NaN)
labels_against_time = pd.Series(labels_with_nans, index=smoothed_series_writer.index)
labels_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the top of the graph a little higher so the 3.0 values are visible


replacing index 2572 with 0.0
replacing index 7659 with 0.0
replacing index 7661 with 0.0
kmeans labels: [3 3 3 ..., 1 1 2]
Out[9]:
(0, 3.5)
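
CaptureNaNIndexes also lives in lux_analysis; in the dataflow above it records the positions of the NaN gap markers as events stream past. A hypothetical stand-in showing the semantics assumed here (the names and details are illustrative, not the real implementation):

import numpy as np

class NaNIndexRecorder:
    # Hypothetical stand-in for lux_analysis.CaptureNaNIndexes.
    def __init__(self):
        self.indexes = []
    def record(self, values):
        self.indexes = [i for i, v in enumerate(values) if np.isnan(v)]
    def replace_nans(self, arr, fill):
        for i in self.indexes:
            arr[i] = fill  # KMeans cannot handle NaN inputs
    def new_array_replace_nans(self, labels, fill):
        out = np.array(labels, dtype=float)
        out[self.indexes] = fill  # reinstate the gaps in the label array
        return out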

In [10]:
# Unfortunately, KMeans assigns arbitrary label numbers to the groups. We want
# the integer label to increase with the lux value so that the graphs are
# easier to interpret. To do this, we map 0 to the lowest lux level, 3 to
# the highest, and so on. In cluster_mapping, the index is the cluster number
# and the value is the one we will graph.
label_totals = [0]*CLUSTER_LABELS
label_counts = [0]*CLUSTER_LABELS
for i in range(len(labels_with_nans)):
    l = labels_with_nans[i]
    if not math.isnan(l):
        lux = smoothed_series_writer.result[i]
        label_totals[l] += lux
        label_counts[l] += 1
averages = [(label_totals[i]/label_counts[i], i) for i in range(CLUSTER_LABELS)]
averages.sort()
print("averages = %s" % averages)
def pivot(array):
    # Invert a permutation: result[array[i]] = i
    result = [-1]*len(array)
    for i in range(len(array)):
        result[array[i]] = i
    return result
cluster_mapping = pivot([i for (avg, i) in averages])
print("cluster_mapping = %s" % cluster_mapping)
kmeans_lux = [cluster_mapping[l] if not math.isnan(l) else NaN for l in labels_with_nans]
# Now, we can convert it back to a Pandas series by using our original timestamp
# array with the clustered data.
clusters_against_time = pd.Series(kmeans_lux, index=smoothed_series_writer.index)
clusters_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the top of the graph a little higher so the 3.0 values are visible


averages = [(4.701115963124697, 0), (36.386980108499095, 3), (89.049347702779357, 2), (210.86758732737613, 1)]
cluster_mapping = [0, 3, 2, 1]
Out[10]:
(0, 3.5)
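
To see why cluster_mapping comes out as it does: the sorted averages list the cluster ids in ascending-lux order, [0, 3, 2, 1], and pivot inverts that permutation so that cluster_mapping[cluster_id] is the cluster's brightness rank. In this run the permutation happens to be its own inverse, which is why the mapping equals the order list:

order = [i for (avg, i) in averages]  # cluster ids by ascending mean lux: [0, 3, 2, 1]
mapping = pivot(order)                # invert: mapping[cluster_id] = brightness rank
assert mapping[1] == 3                # cluster 1 has the highest mean lux (210.9)
assert mapping == [0, 3, 2, 1]        # self-inverse permutation, same as order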

In [11]:
# Let's look at the first day of data to get a closer view
clusters_against_time[:1440].plot(figsize=(15,10))
pylab.ylim([0,3.5]) # force the top of the graph a little higher so the 3.0 values are visible


Out[11]:
(0, 3.5)

In [12]:
# Now, let's map this to on-off values.
# We pick an "on threshold" based on the values we see in the above graph,
# trying to account for ambient light.
ON_THRESHOLD = 2

def on_off_mapping(v):
    if math.isnan(v):
        return NaN
    elif v>=ON_THRESHOLD:
        return 1
    else:
        return 0

on_off_series = pd.Series([on_off_mapping(l) for l in kmeans_lux], index=smoothed_series_writer.index)
on_off_series.plot(figsize=(15,10))
pylab.ylim([0,1.5])


Out[12]:
(0, 1.5)

In [13]:
# Graph the first day of the data to get a closer view
on_off_series[:1440].plot(figsize=(15,10))
pylab.ylim([0,1.5])


Out[13]:
(0, 1.5)

In [15]:
# Import our code that will analyze the data.
import numpy as np
from numpy import float64
import datetime
from lux_analysis import time_of_day_to_zone, dt_to_minutes, NUM_ZONES, get_sunrise_sunset
SUNRISE = 351 # time in minutes since midnight on July 2 (5:51 am)
SUNSET = 1232 # time in minutes since midnight on July 2 (8:32 pm)
(sunrise, sunset) = get_sunrise_sunset(2016, 7, 2)
assert SUNRISE==sunrise
assert SUNSET==sunset

# We divide a day into "zones" based on time relative to sunrise/sunset.
# Let's plot our zones for a day
zone_test_index = [datetime.datetime(year=2016, month=7, day=2, hour=h, minute=m)
                   for h in range(0, 24) for m in range(0, 60)]
zone_test_data = [time_of_day_to_zone(dt_to_minutes(dt), SUNRISE, SUNSET) for dt in zone_test_index]
zone_test_series = pd.Series(zone_test_data, index=zone_test_index)
zone_test_series.plot(figsize=(15,10))
pylab.ylim([0, NUM_ZONES])


Out[15]:
(0, 4)
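
time_of_day_to_zone and its exact boundaries live in lux_analysis and are not reproduced here. Purely for illustration, a function of the same shape that splits the day into NUM_ZONES=4 buckets anchored at sunrise and sunset (the real boundaries may well differ; compare with the plot above):

def zone_sketch(minutes, sunrise, sunset):
    # Hypothetical zone boundaries, for illustration only.
    if minutes < sunrise:
        return 0  # night, before dawn
    elif minutes < (sunrise + sunset) // 2:
        return 1  # morning daylight
    elif minutes < sunset:
        return 2  # afternoon daylight
    else:
        return 3  # evening, after sunset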

In [16]:
# Build some histograms of the data
from lux_analysis import build_length_histogram_data

length_data = build_length_histogram_data(on_off_series, smoothed_series_writer.index)

# look at the distribution of lengths
from itertools import chain
all_on_lengths = list(chain.from_iterable(length_data.on_lengths))
plt.hist(all_on_lengths)


initial state is WAITING
changing to RECORDING_STATE(1) at 2016-07-01 20:51:52.137163-07:00
ON sequence zone 2, length 30
OFF sequence zone 2, length 632
ON sequence zone 1, length 185
OFF sequence zone 1, length 428
ON sequence zone 1, length 29
OFF sequence zone 1, length 17
ON sequence zone 1, length 1
OFF sequence zone 1, length 780
ON sequence zone 1, length 184
changing to NAN_STATE at 2016-07-03 15:36:04.724411-07:00
changing to WAITING_STATE at 2016-07-18 22:20:57.302003-07:00
changing to RECORDING_STATE(1) at 2016-07-19 07:48:21.244623-07:00
ON sequence zone 1, length 196
OFF sequence zone 1, length 420
ON sequence zone 1, length 37
OFF sequence zone 1, length 187
ON sequence zone 3, length 38
OFF sequence zone 3, length 562
ON sequence zone 1, length 196
OFF sequence zone 1, length 419
ON sequence zone 1, length 36
OFF sequence zone 1, length 154
ON sequence zone 2, length 8
OFF sequence zone 2, length 711
ON sequence zone 1, length 107
OFF sequence zone 1, length 3
ON sequence zone 1, length 4
OFF sequence zone 1, length 416
ON sequence zone 1, length 37
OFF sequence zone 1, length 84
ON sequence zone 2, length 128
OFF sequence zone 3, length 574
ON sequence zone 1, length 198
changing to NAN_STATE at 2016-07-22 11:10:32.100944-07:00
changing to WAITING_STATE at 2016-07-22 11:16:35.056434-07:00
changing to NAN_STATE at 2016-07-22 11:17:35.056434-07:00
changing to WAITING_STATE at 2016-07-22 16:36:29.326077-07:00
changing to RECORDING_STATE(1) at 2016-07-22 16:42:29.618677-07:00
ON sequence zone 1, length 33
OFF sequence zone 1, length 47
ON sequence zone 1, length 55
OFF sequence zone 1, length 63
ON sequence zone 2, length 57
OFF sequence zone 2, length 651
ON sequence zone 1, length 197
OFF sequence zone 1, length 419
ON sequence zone 1, length 25
OFF sequence zone 1, length 1
ON sequence zone 1, length 11
OFF sequence zone 1, length 80
ON sequence zone 2, length 80
OFF sequence zone 2, length 627
ON sequence zone 1, length 198
OFF sequence zone 1, length 416
ON sequence zone 1, length 32
OFF sequence zone 1, length 9
ON sequence zone 1, length 222
OFF sequence zone 3, length 561
ON sequence zone 1, length 201
OFF sequence zone 1, length 414
ON sequence zone 1, length 23
OFF sequence zone 1, length 806
ON sequence zone 1, length 195
OFF sequence zone 1, length 416
ON sequence zone 1, length 22
OFF sequence zone 1, length 14
ON sequence zone 1, length 1
OFF sequence zone 1, length 790
ON sequence zone 1, length 200
OFF sequence zone 1, length 412
ON sequence zone 1, length 4
OFF sequence zone 1, length 11
ON sequence zone 1, length 1
OFF sequence zone 1, length 831
Out[16]:
(array([ 9.,  9.,  2.,  1.,  1.,  1.,  0.,  0.,  8.,  3.]),
 array([   1. ,   23.1,   45.2,   67.3,   89.4,  111.5,  133.6,  155.7,
         177.8,  199.9,  222. ]),
 <a list of 10 Patch objects>)

In [17]:
# Show the histogram for all off lengths
all_off_lengths = list(chain.from_iterable(length_data.off_lengths))
plt.hist(all_off_lengths)


Out[17]:
(array([ 9.,  2.,  1.,  0.,  2.,  7.,  3.,  3.,  1.,  4.]),
 array([   1.,   84.,  167.,  250.,  333.,  416.,  499.,  582.,  665.,
         748.,  831.]),
 <a list of 10 Patch objects>)

In [18]:
from hmmlearn.hmm import MultinomialHMM as MHMM
from lux_analysis import HmmScanner

# Scan all the samples and produce subsequences of samples for each
# zone, suitable for the hmm fit() method.
scanner = HmmScanner()
scanner.process_samples(on_off_series, smoothed_series_writer.index)

# train hmms by zone
hmm_by_zone = []
for z in range(NUM_ZONES):
    num_states = 5  # fixed; could also be scaled with the amount of training data
    hmm = MHMM(n_components=num_states)
    data = np.array(scanner.samples_by_zone[z]).reshape(-1,1)
    lengths = np.array(scanner.lengths_by_zone[z])
    print("zone %s: %s samples in %s subsequences" % (z, len(data), len(lengths)))
    hmm.fit(data, lengths)
    hmm_by_zone.append(hmm)

def predict_hmm_by_zone(dt_series):
    # Walk the datetimes, group consecutive runs that fall in the same
    # zone, and sample each run from that zone's trained HMM.
    predictions = []
    last_zone = None
    last_cnt = None
    for dt in dt_series:
        (sunrise, sunset) = get_sunrise_sunset(dt.year, dt.month, dt.day)
        zone = time_of_day_to_zone(dt_to_minutes(dt), sunrise, sunset)
        if zone != last_zone:
            if last_cnt is not None:
                print("add %d events for zone %d on day %d/%d" % (last_cnt, last_zone, dt.month, dt.day))
                (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
                predictions.extend([x[0] for x in samples])
            last_cnt = 1
        else:
            last_cnt += 1
        last_zone = zone
    if last_zone is not None:
        print("add %d events for zone %d" % (last_cnt, last_zone))
        (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
        predictions.extend([x[0] for x in samples])
    assert len(predictions)==len(dt_series)
    return pd.Series(predictions, index=dt_series)

# Generate some predictions for July 31, 2016
predict_dts_short = [datetime.datetime(year=2016, month=7, day=31, hour=h, minute=m)
                     for h in range(0, 24) for m in range(0, 60)]
short_predictions = predict_hmm_by_zone(predict_dts_short)
def plot(predictions):
    predictions.plot(figsize=(15,10))
    pylab.ylim([0, 1.5])
plot(short_predictions)


zone 0: 4349 samples in 12 subsequences
zone 1: 8721 samples in 14 subsequences
zone 2: 1012 samples in 11 subsequences
zone 3: 1731 samples in 12 subsequences
add 371 events for zone 0 on day 7/31
add 817 events for zone 1 on day 7/31
add 103 events for zone 2 on day 7/31
add 149 events for zone 3
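
The fit(X, lengths) call above uses hmmlearn's convention for multiple training sequences: all subsequences are concatenated into a single (n_samples, 1) column, and lengths records the size of each subsequence so that transitions are not learned across subsequence boundaries. A tiny self-contained example of the same call shape (using the 2016-era hmmlearn API, as in this notebook):

import numpy as np
from hmmlearn.hmm import MultinomialHMM

# Two short on/off subsequences, concatenated as hmmlearn expects.
X = np.array([0, 0, 1, 1, 0, 1]).reshape(-1, 1)
lengths = [4, 2]  # first four samples form one sequence, last two another
toy_hmm = MultinomialHMM(n_components=2)
toy_hmm.fit(X, lengths)
samples, states = toy_hmm.sample(10)  # draw 10 new on/off samples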

In [19]:
# Now, let's look at some predictions over a week
predict_dts_long = [datetime.datetime(year=2016, month=8, day=d, hour=h, minute=m)
                    for d in range(1, 8) for h in range(0, 24) for m in range(0, 60)]
long_predictions = predict_hmm_by_zone(predict_dts_long)
plot(long_predictions)


add 372 events for zone 0 on day 8/1
add 815 events for zone 1 on day 8/1
add 104 events for zone 2 on day 8/1
add 149 events for zone 3 on day 8/2
add 373 events for zone 0 on day 8/2
add 813 events for zone 1 on day 8/2
add 105 events for zone 2 on day 8/2
add 149 events for zone 3 on day 8/3
add 374 events for zone 0 on day 8/3
add 811 events for zone 1 on day 8/3
add 106 events for zone 2 on day 8/3
add 149 events for zone 3 on day 8/4
add 375 events for zone 0 on day 8/4
add 809 events for zone 1 on day 8/4
add 107 events for zone 2 on day 8/4
add 149 events for zone 3 on day 8/5
add 375 events for zone 0 on day 8/5
add 808 events for zone 1 on day 8/5
add 108 events for zone 2 on day 8/5
add 149 events for zone 3 on day 8/6
add 376 events for zone 0 on day 8/6
add 806 events for zone 1 on day 8/6
add 109 events for zone 2 on day 8/6
add 149 events for zone 3 on day 8/7
add 377 events for zone 0 on day 8/7
add 804 events for zone 1 on day 8/7
add 110 events for zone 2 on day 8/7
add 149 events for zone 3

In [20]:
# save the trained HMM to a file
from sklearn.externals import joblib
import os.path
if not os.path.exists('saved_hmms'):
    os.mkdir('saved_hmms')
    print("created saved_hmms direectory")
for zone in range(NUM_ZONES):
    fname = 'saved_hmms/hmm-%s-zone-%d.pkl' % (ROOM, zone)
    joblib.dump(hmm_by_zone[zone], fname)
    print("save hmm to %s" % fname)


saved hmm to saved_hmms/hmm-dining-room-zone-0.pkl
saved hmm to saved_hmms/hmm-dining-room-zone-1.pkl
saved hmm to saved_hmms/hmm-dining-room-zone-2.pkl
saved hmm to saved_hmms/hmm-dining-room-zone-3.pkl
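
To reuse the trained models later, they can be loaded back with joblib, for example:

from sklearn.externals import joblib
restored_hmms = [joblib.load('saved_hmms/hmm-%s-zone-%d.pkl' % (ROOM, zone))
                 for zone in range(NUM_ZONES)]
# Each restored model supports .sample(n), just like hmm_by_zone[zone] above.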