In [1]:
##### Back room #####

# In this notebook, we use thingflow to read in some previously
# captured lux (light level) sensor data. We use a sliding window
# transducer to smooth the data, and then convert both the raw data and
# the smoothed data to Pandas Series values. These can then be
# plotted directly in Jupyter.

# Some initial setup
import pandas as pd
# needed to get the graphs to show up
%matplotlib inline
import matplotlib.style
# To work around a bug in pandas, see
# http://stackoverflow.com/questions/33995707/attributeerror-unknown-property-color-cycle
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt

print("pandas version=%s"% pd.__version__)
print("matplotlib version=%s" % matplotlib.__version__)


pandas version=0.18.1
matplotlib version=1.5.1

In [2]:
# Make sure we can see the thingflow package
import sys
import os.path
thingflow_path = os.path.abspath('../../../thingflow-python')
assert os.path.exists(thingflow_path)
assert os.path.exists(os.path.join(thingflow_path, 'thingflow'))
if thingflow_path not in sys.path:
    sys.path.append(thingflow_path)
import thingflow.base

In [3]:
# Some setup for our analysis
import asyncio
from pytz import timezone
from thingflow.base import Filter,OutputThing,Scheduler,SensorEvent,\
                           filtermethod
from thingflow.adapters.csv import CsvReader
from thingflow.adapters.pandas import PandasSeriesWriter
from thingflow.filters.transducer import SensorSlidingMean
import thingflow.filters.combinators
import thingflow.filters.select

PDT=timezone('America/Los_Angeles')
scheduler = Scheduler(asyncio.get_event_loop())

In [4]:
# Now, read in the CSV file of lux values and build the
# dataflow to process the data
import math
from lux_analysis import SensorSlidingMeanPassNaNs, fill_in_missing_times, CaptureNaNIndexes

DIR = '../data_files/'
ROOM = 'back-room'
reader = CsvReader(DIR+ROOM+'.csv')


# The raw writer captures the raw data from the CSV file
raw_series_writer = PandasSeriesWriter(tz=PDT)
# The smoothed writer captures the data after it has passed through a
# sliding mean transducer. We average over the last 5 samples.
smoothed_series_writer = PandasSeriesWriter(tz=PDT)

# We build and maintain a list of the NaN indexes, so that we can
# patch those entries later. This is needed because the clustering
# algorithm doesn't handle NaNs.
capture_nan_indexes = CaptureNaNIndexes()

# The smoothed data coming out of the sliding window mean is floating
# point. We round everything to the nearest integer, since that's the
# accuracy we started with.
def round_event_val(x):
    if not math.isnan(x.val):
        return SensorEvent(ts=x.ts, sensor_id=x.sensor_id,
                           val=int(round(x.val)))
    else:
        return x

# Now we can put all the processing steps together
reader.fill_in_missing_times()\
      .passthrough(raw_series_writer)\
      .transduce(SensorSlidingMeanPassNaNs(5))\
      .select(round_event_val)\
      .passthrough(smoothed_series_writer)\
      .passthrough(capture_nan_indexes).output_count()
reader.print_downstream()
    
# run it!
scheduler.schedule_recurring(reader)
scheduler.run_forever()
print("Ran the stream")


***** Dump of all paths from CsvReader(../data_files/back-room.csv) *****
  CsvReader(../data_files/back-room.csv) => fill_in_missing_times() => <thingflow.adapters.pandas.PandasSeriesWriter object at 0x105fcdfd0>
  CsvReader(../data_files/back-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => <thingflow.adapters.pandas.PandasSeriesWriter object at 0x11613e550>
  CsvReader(../data_files/back-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => <lux_analysis.CaptureNaNIndexes object at 0x11613e5f8>
  CsvReader(../data_files/back-room.csv) => fill_in_missing_times() => transduce(SensorSlidingMean(5)) => select => OutputCount()
**************************************************
Found 22004 missing samples
Found 631 missing samples
Found 5 missing samples
************************************
*      12714 events processed      *
************************************
No more active schedules, will exit event loop
Ran the stream
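
The smoothing above relies on SensorSlidingMeanPassNaNs, which comes from the
local lux_analysis module and is not shown in this notebook. As a rough sketch
of the idea (an assumption about the implementation, not the actual
lux_analysis code), a NaN-passing sliding mean can be written against
thingflow's transducer protocol, where step() is called once per event and
complete() at the end of the stream:

# A minimal sketch of a NaN-passing sliding mean. The real
# SensorSlidingMeanPassNaNs may differ; SensorSlidingMean (without the NaN
# handling) ships with thingflow in thingflow.filters.transducer.
import math
from collections import deque
from thingflow.base import SensorEvent

class SlidingMeanPassNaNsSketch:
    def __init__(self, history_samples):
        # keep only the most recent history_samples values
        self.history = deque(maxlen=history_samples)
    def step(self, event):
        if math.isnan(event.val):
            return event  # pass gaps through untouched
        self.history.append(event.val)
        mean = sum(self.history)/len(self.history)
        return SensorEvent(ts=event.ts, sensor_id=event.sensor_id, val=mean)
    def complete(self):
        return None  # no pending state to flush at end of stream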

In [5]:
# Graph the full raw series
raw_series_writer.result.plot(figsize=(15,10))


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x105facd68>

In [6]:
# Graph the first day (the first 1440 one-minute samples) of the raw data
raw_series_writer.result[:1440].plot(figsize=(15,10))


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x116456dd8>

In [7]:
# Graph the smoothed series
smoothed_series_writer.result.plot(figsize=(15,10))


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x118adfdd8>

In [8]:
# Graph the first day of the smoothed data
smoothed_series_writer.result[:1440].plot(figsize=(15,10))


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x119c95ac8>

In [9]:
# Now, let's try clustering the data
CLUSTER_LABELS = 4 # how many groups will we create
import numpy as np
from sklearn.cluster import KMeans
# create an array that just has the values, no timestamps
npa = np.array(smoothed_series_writer.result).reshape(-1,1)
capture_nan_indexes.replace_nans(npa, 0.0)
kmeans = KMeans(CLUSTER_LABELS)
kmeans.fit(npa)
# print the raw clustered data
print("kmeans labels: " + str(kmeans.labels_))
from numpy import NaN
labels_with_nans = capture_nan_indexes.new_array_replace_nans(kmeans.labels_, NaN)
labels_against_time = pd.Series(labels_with_nans, index=smoothed_series_writer.index)
labels_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines


replacing index 1541 with 0.0
replacing index 1543 with 0.0
replacing index 5406 with 0.0
kmeans labels: [3 3 3 ..., 0 0 0]
Out[9]:
(0, 3.5)
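
CaptureNaNIndexes also comes from lux_analysis and is not shown here. Judging
only from how it is used in this notebook (so this is an inferred sketch, not
the real class), it sits on the stream via passthrough(), records the position
of every NaN event, and can later patch those positions in a numpy array:

# Inferred sketch of CaptureNaNIndexes, based solely on its usage above.
import math

class CaptureNaNIndexesSketch:
    def __init__(self):
        self.position = 0
        self.nan_indexes = []
    # InputThing callbacks: attached via passthrough(), this object only
    # observes the stream; it never forwards events itself.
    def on_next(self, event):
        if math.isnan(event.val):
            self.nan_indexes.append(self.position)
        self.position += 1
    def on_completed(self):
        pass
    def on_error(self, e):
        pass
    def replace_nans(self, array, value):
        # patch the NaN slots in place (needed before KMeans, which
        # cannot handle NaNs)
        for i in self.nan_indexes:
            print("replacing index %d with %s" % (i, value))
            array[i] = value
    def new_array_replace_nans(self, array, value):
        # return a float copy with value written at the recorded slots
        new = array.astype(float)
        for i in self.nan_indexes:
            new[i] = value
        return new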

In [10]:
# Unfortunately, the clustering assigns arbitrary label numbers. We want the
# integer label to be ordered by lux value so that the graphs are
# easier to interpret. To do this, we map 0 to the lowest lux level, 3 to
# the highest, etc. In cluster_mapping, the index is the cluster number and
# the value is the label we will graph.
label_totals = [0]*CLUSTER_LABELS
label_counts = [0]*CLUSTER_LABELS
for i in range(len(labels_with_nans)):
    l = labels_with_nans[i]
    if not math.isnan(l):
        l = int(l)  # labels_with_nans holds floats (to allow NaNs); cast back
        lux = smoothed_series_writer.result[i]
        label_totals[l] += lux
        label_counts[l] += 1
averages = [(label_totals[i]/label_counts[i], i) for i in range(CLUSTER_LABELS)]
averages.sort()
print("averages = %s" % averages)
def pivot(array):
    # Invert a permutation: result[array[i]] = i
    result = [-1]*len(array)
    for i in range(len(array)):
        result[array[i]] = i
    return result
cluster_mapping = pivot([i for (avg, i) in averages])
print("cluster_mapping = %s" % cluster_mapping)
kmeans_lux = [cluster_mapping[int(l)] if not math.isnan(l) else NaN for l in labels_with_nans]
# Now, we can convert it back to a Pandas series by using our original timestamp
# array with the clustered data.
clusters_against_time = pd.Series(kmeans_lux, index=smoothed_series_writer.index)
clusters_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines


averages = [(2.2691604858647749, 0), (16.773250239693194, 3), (52.665806451612902, 2), (165.24497257769653, 1)]
cluster_mapping = [0, 3, 2, 1]
Out[10]:
(0, 3.5)
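
To make the mapping concrete: sorting the clusters by average lux gave the
order [0, 3, 2, 1], and pivot inverts that permutation so we can go from a
cluster number to its lux rank. A quick check against the output above:

# pivot applied to this run's sort order. This particular permutation is
# its own inverse (0 and 2 are fixed points, 1 and 3 swap), which is why
# cluster_mapping equals the input here.
assert pivot([0, 3, 2, 1]) == [0, 3, 2, 1]
# A non-self-inverse case for contrast:
assert pivot([2, 0, 1]) == [1, 2, 0]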

In [11]:
# Let's look at the first day of data to get a closer view
clusters_against_time[:1440].plot(figsize=(15,10))
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines


Out[11]:
(0, 3.5)

In [12]:
# Now, let's map this to on-off values.
# We pick an "on threshold" based on the values we see in the above graph,
# trying to account for ambient light.
ON_THRESHOLD = 1

def on_off_mapping(v):
    if math.isnan(v):
        return NaN
    elif v>=ON_THRESHOLD:
        return 1
    else:
        return 0

on_off_series = pd.Series([on_off_mapping(l) for l in kmeans_lux], index=smoothed_series_writer.index)
on_off_series.plot(figsize=(15,10))
pylab.ylim([0,1.5])


Out[12]:
(0, 1.5)

In [13]:
# Graph the first day of the data to get a closer view
on_off_series[:1440].plot(figsize=(15,10))
pylab.ylim([0,1.5])


Out[13]:
(0, 1.5)

In [15]:
# Import our code that will analyze the data.
import numpy as np
import datetime
from lux_analysis import time_of_day_to_zone, dt_to_minutes, NUM_ZONES, get_sunrise_sunset
SUNRISE = 351 # time in minutes since midnight on July 2 (5:51 am)
SUNSET = 1232 # time in minutes since midnight on July 2 (8:32 pm)
(sunrise, sunset) = get_sunrise_sunset(2016, 7, 2)
assert SUNRISE==sunrise
assert SUNSET==sunset

# We divide a day into "zones" based on time relative to sunrise/sunset.
# Let's plot our zones for a day
zone_test_index = [datetime.datetime(year=2016, month=7, day=2, hour=h, minute=m)
                   for h in range(0, 24) for m in range(0, 60)]
zone_test_data = [time_of_day_to_zone(dt_to_minutes(dt), SUNRISE, SUNSET) for dt in zone_test_index]
zone_test_series = pd.Series(zone_test_data, index=zone_test_index)
zone_test_series.plot(figsize=(15,10))
pylab.ylim([0, NUM_ZONES])


Out[15]:
(0, 4)
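
The zone boundaries themselves live in lux_analysis's time_of_day_to_zone and
are not reproduced in this notebook. The idea is just to carve the day into
NUM_ZONES coarse bands relative to sunrise and sunset; a purely illustrative
version (the cutoffs below are invented, not the lux_analysis values) might
look like:

# Illustrative only: the real boundaries are in lux_analysis.
def zone_sketch(minutes, sunrise, sunset):
    if minutes < sunrise:
        return 0   # night, before dawn
    elif minutes < sunset:
        return 1   # daylight
    elif minutes < sunset + 120:   # the 120-minute dusk window is invented
        return 2   # dusk / early evening
    else:
        return 3   # late evening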

In [16]:
# Build some histograms of the data
from lux_analysis import build_length_histogram_data

length_data = build_length_histogram_data(on_off_series, smoothed_series_writer.index)

# Look at the distribution of ON-sequence lengths
from itertools import chain
all_on_lengths = list(chain.from_iterable(length_data.on_lengths))
plt.hist(all_on_lengths)


initial state is WAITING
changing to RECORDING_STATE(0) at 2016-07-02 10:53:59.282483-07:00
OFF sequence zone 1, length 45
ON sequence zone 1, length 8
OFF sequence zone 1, length 12
ON sequence zone 1, length 2
OFF sequence zone 1, length 20
ON sequence zone 1, length 3
OFF sequence zone 1, length 4
ON sequence zone 1, length 2
OFF sequence zone 1, length 24
ON sequence zone 1, length 1
OFF sequence zone 1, length 82
ON sequence zone 1, length 174
OFF sequence zone 1, length 26
ON sequence zone 1, length 48
OFF sequence zone 1, length 114
ON sequence zone 3, length 74
OFF sequence zone 3, length 802
ON sequence zone 1, length 3
OFF sequence zone 1, length 26
ON sequence zone 1, length 1
OFF sequence zone 1, length 27
ON sequence zone 1, length 4
OFF sequence zone 1, length 7
changing to NAN_STATE at 2016-07-03 15:35:58.385299-07:00
changing to WAITING_STATE at 2016-07-18 22:20:56.341119-07:00
changing to NAN_STATE at 2016-07-18 22:21:56.341119-07:00
changing to WAITING_STATE at 2016-07-19 08:53:40.890873-07:00
changing to RECORDING_STATE(1) at 2016-07-19 14:42:55.426597-07:00
ON sequence zone 1, length 9
OFF sequence zone 1, length 4
ON sequence zone 1, length 162
OFF sequence zone 1, length 152
ON sequence zone 2, length 84
OFF sequence zone 3, length 516
ON sequence zone 1, length 1
OFF sequence zone 1, length 324
ON sequence zone 1, length 8
OFF sequence zone 1, length 4
ON sequence zone 1, length 157
OFF sequence zone 1, length 150
ON sequence zone 2, length 23
OFF sequence zone 2, length 5
ON sequence zone 2, length 4
OFF sequence zone 3, length 901
ON sequence zone 1, length 8
OFF sequence zone 1, length 2
ON sequence zone 1, length 153
OFF sequence zone 1, length 124
ON sequence zone 2, length 140
OFF sequence zone 3, length 518
ON sequence zone 1, length 5
OFF sequence zone 1, length 33
ON sequence zone 1, length 2
OFF sequence zone 1, length 20
ON sequence zone 1, length 8
OFF sequence zone 1, length 21
ON sequence zone 1, length 5
changing to NAN_STATE at 2016-07-22 11:10:43.856991-07:00
changing to WAITING_STATE at 2016-07-22 11:16:36.013828-07:00
changing to RECORDING_STATE(1) at 2016-07-22 13:57:20.172464-07:00
ON sequence zone 1, length 1
OFF sequence zone 1, length 21
ON sequence zone 1, length 1
OFF sequence zone 1, length 26
ON sequence zone 1, length 7
OFF sequence zone 1, length 5
ON sequence zone 1, length 150
OFF sequence zone 1, length 151
ON sequence zone 2, length 287
OFF sequence zone 0, length 369
ON sequence zone 1, length 3
OFF sequence zone 1, length 261
ON sequence zone 1, length 10
OFF sequence zone 1, length 4
ON sequence zone 1, length 158
OFF sequence zone 1, length 21
ON sequence zone 1, length 251
OFF sequence zone 3, length 532
ON sequence zone 1, length 3
OFF sequence zone 1, length 178
ON sequence zone 1, length 5
OFF sequence zone 1, length 28
ON sequence zone 1, length 225
OFF sequence zone 1, length 170
ON sequence zone 2, length 77
OFF sequence zone 3, length 535
ON sequence zone 1, length 6
OFF sequence zone 1, length 43
ON sequence zone 1, length 34
OFF sequence zone 1, length 213
ON sequence zone 1, length 8
OFF sequence zone 1, length 5
ON sequence zone 1, length 158
OFF sequence zone 1, length 12
ON sequence zone 1, length 3
OFF sequence zone 1, length 109
ON sequence zone 2, length 76
OFF sequence zone 3, length 609
ON sequence zone 1, length 30
OFF sequence zone 1, length 120
ON sequence zone 1, length 35
OFF sequence zone 1, length 5
ON sequence zone 1, length 9
OFF sequence zone 1, length 38
ON sequence zone 1, length 18
OFF sequence zone 1, length 7
ON sequence zone 1, length 166
OFF sequence zone 1, length 1
ON sequence zone 1, length 6
OFF sequence zone 1, length 8
ON sequence zone 1, length 7
OFF sequence zone 1, length 127
ON sequence zone 2, length 86
OFF sequence zone 3, length 535
ON sequence zone 1, length 3
OFF sequence zone 1, length 13
ON sequence zone 1, length 6
OFF sequence zone 1, length 169
ON sequence zone 1, length 45
OFF sequence zone 1, length 4
ON sequence zone 1, length 58
OFF sequence zone 1, length 3
ON sequence zone 1, length 169
OFF sequence zone 1, length 5
ON sequence zone 1, length 8
OFF sequence zone 1, length 6
ON sequence zone 1, length 7
OFF sequence zone 1, length 64
ON sequence zone 2, length 156
OFF sequence zone 3, length 93
ON sequence zone 0, length 14
Out[16]:
(array([ 37.,   6.,   5.,   0.,   1.,   9.,   1.,   1.,   1.,   1.]),
 array([   1. ,   29.6,   58.2,   86.8,  115.4,  144. ,  172.6,  201.2,
         229.8,  258.4,  287. ]),
 <a list of 10 Patch objects>)

In [17]:
# Show the histogram for all OFF-sequence lengths
all_off_lengths = list(chain.from_iterable(length_data.off_lengths))
plt.hist(all_off_lengths)


Out[17]:
(array([ 37.,  12.,   2.,   1.,   1.,   5.,   1.,   0.,   1.,   1.]),
 array([   1.,   91.,  181.,  271.,  361.,  451.,  541.,  631.,  721.,
         811.,  901.]),
 <a list of 10 Patch objects>)

In [20]:
from hmmlearn.hmm import MultinomialHMM as MHMM
from lux_analysis import HmmScanner

# Scan all the samples and produce subsequences of samples for each
# zone, suitable for the hmm fit() method.
scanner = HmmScanner()
scanner.process_samples(on_off_series, smoothed_series_writer.index)

# train hmms by zone
hmm_by_zone = []
for z in range(NUM_ZONES):
    num_states = 5  # fixed number of hidden states per zone
    hmm = MHMM(n_components=num_states)
    data = np.array(scanner.samples_by_zone[z]).reshape(-1,1)
    lengths = np.array(scanner.lengths_by_zone[z])
    print("zone %s: %s samples in %s subsequences" % (z, len(data), len(lengths)))
    hmm.fit(data, lengths)
    hmm_by_zone.append(hmm)

def predict_hmm_by_zone(dt_series):
    # Walk the timestamps in order; for each maximal run of consecutive
    # minutes that fall in the same zone, sample that many on/off events
    # from that zone's trained HMM.
    predictions = []
    last_zone = None
    last_cnt = None
    for dt in dt_series:
        (sunrise, sunset) = get_sunrise_sunset(dt.year, dt.month, dt.day)
        zone = time_of_day_to_zone(dt_to_minutes(dt), sunrise, sunset)
        if zone != last_zone:
            if last_cnt is not None:
                print("add %d events for zone %d on day %d/%d" % (last_cnt, last_zone, dt.month, dt.day))
                (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
                predictions.extend([x[0] for x in samples])
            last_cnt = 1
        else:
            last_cnt += 1
        last_zone = zone
    if last_zone is not None:
        print("add %d events for zone %d" % (last_cnt, last_zone))
        (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
        predictions.extend([x[0] for x in samples])
    assert len(predictions)==len(dt_series)
    return pd.Series(predictions, index=dt_series)

# Generate some predictions for July 31, 2016
predict_dts_short = [datetime.datetime(year=2016, month=7, day=31, hour=h, minute=m)
                     for h in range(0, 24) for m in range(0, 60)]
short_predictions = predict_hmm_by_zone(predict_dts_short)
def plot(predictions):
    predictions.plot(figsize=(15,10))
    pylab.ylim([0, 1.5])
plot(short_predictions)


zone 0: 3163 samples in 10 subsequences
zone 1: 7443 samples in 13 subsequences
zone 2: 821 samples in 10 subsequences
zone 3: 1284 samples in 11 subsequences
add 371 events for zone 0 on day 7/31
add 817 events for zone 1 on day 7/31
add 103 events for zone 2 on day 7/31
add 149 events for zone 3
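
A note on the data layout: hmmlearn's fit(X, lengths) convention, which
HmmScanner's output is shaped for, concatenates all subsequences into a single
column vector and passes the per-subsequence lengths separately. A toy
illustration (the variable names here are just for this example):

# Two observation sequences, [0, 1, 1] and [1, 0], stacked into one column.
toy_X = np.array([0, 1, 1, 1, 0]).reshape(-1, 1)
toy_lengths = [3, 2]
toy_hmm = MHMM(n_components=2)
toy_hmm.fit(toy_X, toy_lengths)
toy_samples, toy_states = toy_hmm.sample(10)   # draw a fresh 10-step sequence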

In [21]:
# Now, let's look at some predictions over a week
predict_dts_long = [datetime.datetime(year=2016, month=8, day=d, hour=h, minute=m)
                    for d in range(1, 8) for h in range(0, 24) for m in range(0, 60)]
long_predictions = predict_hmm_by_zone(predict_dts_long)
plot(long_predictions)


add 372 events for zone 0 on day 8/1
add 815 events for zone 1 on day 8/1
add 104 events for zone 2 on day 8/1
add 149 events for zone 3 on day 8/2
add 373 events for zone 0 on day 8/2
add 813 events for zone 1 on day 8/2
add 105 events for zone 2 on day 8/2
add 149 events for zone 3 on day 8/3
add 374 events for zone 0 on day 8/3
add 811 events for zone 1 on day 8/3
add 106 events for zone 2 on day 8/3
add 149 events for zone 3 on day 8/4
add 375 events for zone 0 on day 8/4
add 809 events for zone 1 on day 8/4
add 107 events for zone 2 on day 8/4
add 149 events for zone 3 on day 8/5
add 375 events for zone 0 on day 8/5
add 808 events for zone 1 on day 8/5
add 108 events for zone 2 on day 8/5
add 149 events for zone 3 on day 8/6
add 376 events for zone 0 on day 8/6
add 806 events for zone 1 on day 8/6
add 109 events for zone 2 on day 8/6
add 149 events for zone 3 on day 8/7
add 377 events for zone 0 on day 8/7
add 804 events for zone 1 on day 8/7
add 110 events for zone 2 on day 8/7
add 149 events for zone 3

In [22]:
# Save the trained HMMs to files, one per zone
from sklearn.externals import joblib
import os.path
if not os.path.exists('saved_hmms'):
    os.mkdir('saved_hmms')
    print("created saved_hmms direectory")
for zone in range(NUM_ZONES):
    fname = 'saved_hmms/hmm-%s-zone-%d.pkl' % (ROOM, zone)
    joblib.dump(hmm_by_zone[zone], fname)
    print("save hmm to %s" % fname)


save hmm to saved_hmms/hmm-back-room-zone-0.pkl
save hmm to saved_hmms/hmm-back-room-zone-1.pkl
save hmm to saved_hmms/hmm-back-room-zone-2.pkl
save hmm to saved_hmms/hmm-back-room-zone-3.pkl
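
When the models are needed again (for example, to drive a simulator), they can
be reloaded with joblib.load. A sketch, assuming the same sklearn and hmmlearn
versions are available at unpickling time:

# Reload the saved per-zone HMMs
from sklearn.externals import joblib
loaded_hmms = [joblib.load('saved_hmms/hmm-%s-zone-%d.pkl' % (ROOM, zone))
               for zone in range(NUM_ZONES)]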