In [1]:
#### Front Room ####
#
#### In this notebook, we use thingflow to read in some previously
# captured lux (light level) sensor data. We use a sliding window
# transducer to smooth the data and then convert both the raw data and
# the smoothed data to Pandas "Series" data values. These can then be
# plotted directly in Jupyter.
# Some initial setup
import pandas as pd
# needed to get the graphs to show up
%matplotlib inline
import matplotlib.style
# To work around a bug in pandas, see
# http://stackoverflow.com/questions/33995707/attributeerror-unknown-property-color-cycle
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt
print("pandas version=%s"% pd.__version__)
print("matplotlib version=%s" % matplotlib.__version__)
In [2]:
# Make sure we can see the thingflow package
import sys
import os.path
antpath=os.path.abspath('../../../thingflow-python')
assert os.path.exists(antpath)
assert os.path.exists(os.path.join(antpath, 'thingflow'))
if antpath not in sys.path:
    sys.path.append(antpath)
import thingflow.base
In [3]:
# Some setup for our analysis
import asyncio
from pytz import timezone
from thingflow.base import Filter,OutputThing,Scheduler,SensorEvent,\
filtermethod
from thingflow.adapters.csv import CsvReader
from thingflow.adapters.pandas import PandasSeriesWriter
from thingflow.filters.transducer import SensorSlidingMean
import thingflow.filters.combinators
import thingflow.filters.select
PDT=timezone('America/Los_Angeles')
scheduler = Scheduler(asyncio.get_event_loop())
In [4]:
# Now, read in the spreadsheet of lux values and build the
# dataflow to process the data
import math
from lux_analysis import SensorSlidingMeanPassNaNs, fill_in_missing_times, CaptureNaNIndexes
DIR = '../data_files/'
ROOM = 'front-room'
reader = CsvReader(DIR+ROOM+'.csv')
# The raw writer captures the raw spreadsheet data
raw_series_writer = PandasSeriesWriter(tz=PDT)
# The smoothed writer captures the data that was passed through a
# sliding average transducer. We average over the last 5 samples.
smoothed_series_writer = PandasSeriesWriter(tz=PDT)
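# A minimal, framework-free sketch of what the smoothing is assumed to do
# (the real SensorSlidingMeanPassNaNs in lux_analysis is a ThingFlow
# transducer, not this function): average the last N readings, passing
# NaN readings through unchanged.
from collections import deque
def sliding_mean_sketch(values, window=5):
    history = deque(maxlen=window)
    out = []
    for v in values:
        if math.isnan(v):
            out.append(v)  # NaNs are passed through untouched
        else:
            history.append(v)
            out.append(sum(history) / len(history))
    return out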
# We build and maintain a list of the NaN indexes so that we can
# replace them later. This is needed because the clustering algorithm
# doesn't handle NaN values.
capture_nan_indexes = CaptureNaNIndexes()
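# Rough idea of what CaptureNaNIndexes provides (an assumption about the
# lux_analysis helper, not its actual code): remember which positions held
# NaN so that we can overwrite them in a numpy array later (replace_nans)
# or re-insert NaNs into a derived array (new_array_replace_nans).
def find_nan_indexes_sketch(values):
    return [i for (i, v) in enumerate(values) if math.isnan(v)]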
# The smoothed data coming out of the sliding window mean is floating
# point. We round everything to the nearest integer, since that is the
# accuracy we started with.
def round_event_val(x):
    if not math.isnan(x.val):
        return SensorEvent(ts=x.ts, sensor_id=x.sensor_id,
                           val=int(round(x.val)))
    else:
        return x
# Now we can put all the processing steps together
reader.fill_in_missing_times()\
.passthrough(raw_series_writer)\
.transduce(SensorSlidingMeanPassNaNs(5)).select(round_event_val).passthrough(smoothed_series_writer)\
.passthrough(capture_nan_indexes).output_count()
reader.print_downstream()
# run it!
scheduler.schedule_recurring(reader)
scheduler.run_forever()
print("Ran the stream")
In [5]:
# graph the full raw series
raw_series_writer.result.plot(figsize=(15,10))
Out[5]:
In [6]:
# Graph one day of the raw data (the first 1440 one-minute samples)
raw_series_writer.result[:1440].plot(figsize=(15,10))
Out[6]:
In [7]:
# graph the smoothed series
smoothed_series_writer.result.plot(figsize=(15,10))
Out[7]:
In [8]:
# Graph one day of the smoothed data (the first 1440 samples)
smoothed_series_writer.result[:1440].plot(figsize=(15,10))
Out[8]:
In [9]:
# Now, let's try clustering the data
CLUSTER_LABELS = 4 # how many clusters we will create
import numpy as np
from sklearn.cluster import KMeans
# create an array that just has the values, no timestamps
npa = np.array(smoothed_series_writer.result).reshape(-1,1)
capture_nan_indexes.replace_nans(npa, 0.0)
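# Sanity check: scikit-learn's KMeans cannot handle NaN inputs, so make
# sure none are left after the replacement above.
assert not np.isnan(npa).any()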
kmeans = KMeans(CLUSTER_LABELS)
kmeans.fit(npa)
# print the raw cluster labels
print("kmeans labels: " + str(kmeans.labels_))
from numpy import nan as NaN  # use the public numpy name rather than numpy.core.numeric
labels_with_nans = capture_nan_indexes.new_array_replace_nans(kmeans.labels_, NaN)
labels_against_time = pd.Series(labels_with_nans, index=smoothed_series_writer.index)
labels_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines
Out[9]:
In [10]:
# Unfortunately, the clustering assigns arbitrary label numbers to the groups.
# We want the integer label to be proportional to the lux level so that the
# graphs are easier to interpret. To do this, we map 0 to the lowest lux
# level, 3 to the highest, and so on. In cluster_mapping, the index is the
# cluster number and the value is the one we will graph.
# Compute the average lux level for each cluster so that we can order
# the clusters from darkest to brightest.
label_totals = [0]*CLUSTER_LABELS
label_counts = [0]*CLUSTER_LABELS
for i in range(len(labels_with_nans)):
    l = labels_with_nans[i]
    if not math.isnan(l):
        lux = smoothed_series_writer.result[i]
        label_totals[l] += lux
        label_counts[l] += 1
averages = [(label_totals[i]/label_counts[i], i) for i in range(CLUSTER_LABELS)]
averages.sort()
print("averages = %s" % averages)
def pivot(array):
    result = [-1]*len(array)
    for i in range(len(array)):
        result[array[i]] = i
    return result
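# Quick worked example of pivot() (a hypothetical label ordering, for
# illustration only): if the clusters sorted by increasing average lux come
# out as [2, 0, 3, 1], then pivot([2, 0, 3, 1]) == [1, 3, 0, 2], i.e.
# cluster 2 plots as 0 (darkest) and cluster 1 plots as 3 (brightest).
assert pivot([2, 0, 3, 1]) == [1, 3, 0, 2]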
cluster_mapping = pivot([i for (avg, i) in averages])
print("cluster_mapping = %s" % cluster_mapping)
kmeans_lux = [cluster_mapping[l] if not math.isnan(l) else NaN for l in labels_with_nans]
# Now, we can convert it back to a Pandas series by using our original timestamp
# array with the clustered data.
clusters_against_time = pd.Series(kmeans_lux, index=smoothed_series_writer.index)
clusters_against_time.plot(figsize=(15,10))
import pylab
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines
Out[10]:
In [11]:
# Let's look at one day of data (the first 1440 samples) to get a closer view
clusters_against_time[:1440].plot(figsize=(15,10))
pylab.ylim([0,3.5]) # force the graph top a little higher so that we can see the 3.0 lines
Out[11]:
In [12]:
# Now, let's map this to on-off values
# We pick an "on threshold" based on the values we see in the above graph,
# trying to account for ambient light.
ON_THRESHOLD = 1
def on_off_mapping(v):
    if math.isnan(v):
        return NaN
    elif v>=ON_THRESHOLD:
        return 1
    else:
        return 0
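# quick sanity check of the mapping (any cluster level >= 1 counts as "on")
assert on_off_mapping(0) == 0 and on_off_mapping(3) == 1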
on_off_series = pd.Series([on_off_mapping(l) for l in kmeans_lux], index=smoothed_series_writer.index)
on_off_series.plot(figsize=(15,10))
pylab.ylim([0,1.5])
Out[12]:
In [13]:
# Graph one day of the data (the first 1440 samples) to get a closer view
on_off_series[:1440].plot(figsize=(15,10))
pylab.ylim([0,1.5])
Out[13]:
In [14]:
# Import our code that will analyze the data.
import numpy as np
from numpy import float64
import datetime
from lux_analysis import time_of_day_to_zone, dt_to_minutes, NUM_ZONES, get_sunrise_sunset
SUNRISE = 351 # time in minutes since midnight on July 2 (5:51 am)
SUNSET = 1232 # time in minutes since midnight on July 2 (8:32 pm)
(sunrise, sunset) = get_sunrise_sunset(2016, 7, 2)
assert SUNRISE==sunrise
assert SUNSET==sunset
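# dt_to_minutes is assumed to reduce a datetime to minutes past midnight;
# a hypothetical equivalent (illustration only, not the lux_analysis code):
def minutes_past_midnight_sketch(dt):
    return dt.hour * 60 + dt.minute
# 351 minutes past midnight is 5:51 am, 1232 is 8:32 pm
assert minutes_past_midnight_sketch(datetime.datetime(2016, 7, 2, 5, 51)) == SUNRISE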
# We divide a day into "zones" based on time relative to sunrise/sunset.
# Let's plot our zones for a day
zone_test_index = [datetime.datetime(year=2016, month=7, day=2, hour=h, minute=m)
for (h, m) in [(h, m) for h in range(0, 24) for m in range(0, 60)]]
zone_test_data = [time_of_day_to_zone(dt_to_minutes(dt), SUNRISE, SUNSET) for dt in zone_test_index]
zone_test_series = pd.Series(zone_test_data, index=zone_test_index)
zone_test_series.plot(figsize=(15,10))
pylab.ylim([0, NUM_ZONES])
Out[14]:
In [15]:
# Build some histograms of the data
from lux_analysis import build_length_histogram_data
length_data = build_length_histogram_data(on_off_series, smoothed_series_writer.index)
# look at the distribution of the "on" interval lengths
# (on_lengths is a list of lists, so flatten it before plotting)
from itertools import chain
all_on_lengths = list(chain.from_iterable(length_data.on_lengths))
plt.hist(all_on_lengths)
Out[15]:
In [16]:
# Show the histogram for all off lengths
all_off_lengths = list(chain.from_iterable(length_data.off_lengths))
plt.hist(all_off_lengths)
Out[16]:
In [17]:
from hmmlearn.hmm import MultinomialHMM as MHMM
from lux_analysis import HmmScanner
# Scan all the samples and produce subsequences of samples for each
# zone, suitable for the hmm fit() method.
scanner = HmmScanner()
scanner.process_samples(on_off_series, smoothed_series_writer.index)
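# hmmlearn's fit() expects all training subsequences concatenated into a
# single (n_samples, 1) column plus a "lengths" list giving the size of
# each subsequence. For instance, two sequences [0,0,1] and [1,1] would be
# passed like this (illustration only):
example_X = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
example_lengths = [3, 2]  # first subsequence has 3 samples, second has 2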
# train hmms by zone
hmm_by_zone = []
for z in range(NUM_ZONES):
    num_states = 5 #max(3, int(1+len(trainer.on_lengths[z])/2))
    #print("%d states for zone %d" % (num_states, z))
    hmm = MHMM(n_components=num_states)
    data = np.array(scanner.samples_by_zone[z]).reshape(-1,1)
    lengths = np.array(scanner.lengths_by_zone[z])
    print("zone %s: %s samples in %s subsequences" % (z, len(data), len(lengths)))
    hmm.fit(data, lengths)
    hmm_by_zone.append(hmm)
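# MultinomialHMM models discrete emission symbols, which matches our 0/1
# on-off samples. As a quick sanity check, look at the emission
# probabilities learned for one zone:
print("zone 0 emission probabilities:\n%s" % hmm_by_zone[0].emissionprob_)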
def predict_hmm_by_zone(dt_series):
    # For each run of consecutive timestamps that fall in the same zone,
    # sample that many on/off values from that zone's trained HMM.
    predictions = []
    last_zone = None
    last_cnt = None
    for dt in dt_series:
        (sunrise, sunset) = get_sunrise_sunset(dt.year, dt.month, dt.day)
        zone = time_of_day_to_zone(dt_to_minutes(dt), sunrise, sunset)
        if zone != last_zone:
            if last_cnt is not None:
                print("add %d events for zone %d on day %d/%d" % (last_cnt, last_zone, dt.month, dt.day))
                (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
                predictions.extend([x[0] for x in samples])
            last_cnt = 1
        else:
            last_cnt += 1
        last_zone = zone
    if last_zone is not None:
        print("add %d events for zone %d" % (last_cnt, last_zone))
        (samples, states) = hmm_by_zone[last_zone].sample(last_cnt)
        predictions.extend([x[0] for x in samples])
    assert len(predictions)==len(dt_series)
    return pd.Series(predictions, index=dt_series)
# generate some predictions for July 31, 2016
predict_dts_short = [datetime.datetime(year=2016, month=7, day=31, hour=h, minute=m)
for (h, m) in [(h, m) for h in range(0, 24) for m in range(0, 60)]]
short_predictions = predict_hmm_by_zone(predict_dts_short)
def plot(predictions):
    predictions.plot(figsize=(15,10))
    pylab.ylim([0, 1.5])
plot(short_predictions)
In [18]:
# Now, let's look at some predictions over a week
predict_dts_long = [datetime.datetime(year=2016, month=8, day=d, hour=h, minute=m)
for (d, h, m) in [(d, h, m) for d in range(1, 8) for h in range(0, 24)
for m in range(0, 60)]]
long_predictions = predict_hmm_by_zone(predict_dts_long)
plot(long_predictions)
In [19]:
# save the trained HMM to a file
try:
    from sklearn.externals import joblib  # older scikit-learn versions
except ImportError:
    import joblib  # sklearn.externals.joblib was removed in newer versions
import os.path
if not os.path.exists('saved_hmms'):
    os.mkdir('saved_hmms')
    print("created saved_hmms directory")
for zone in range(NUM_ZONES):
    fname = 'saved_hmms/hmm-%s-zone-%d.pkl' % (ROOM, zone)
    joblib.dump(hmm_by_zone[zone], fname)
    print("saved hmm to %s" % fname)