In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from collections import OrderedDict
import itertools
from copy import deepcopy
import pandas as pd
import matplotlib.pylab as pylab
import os
%matplotlib inline


/usr/local/lib/python2.7/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [2]:
#Produces arrays of arrays for HMM
def values_to_array(values):
    a=[]
    X=[]
    for i in values:
        a.append(i)
        X.append(a)
        a=[]
    return np.array(X)

In [3]:
#Takes the JSON files and reloads them back into python
def open_instance_15min(device_type, instance_name,filename):
    with open("Devices/{0}/{1}/{2}".format(device_type,instance_name,filename)) as f:
        return pd.DataFrame(json.load(f)['time_15'])

In [4]:
def return_sorting_mapping(means):
    means_copy = deepcopy(means)
    # Sorting 
    means_copy = np.sort(means_copy, axis = 0)  
    # Finding mapping
    mapping = {}
    for i, val in enumerate(means_copy):
        assert val==means[np.where(val==means)[0]]
        mapping[i] = np.where(val==means)[0][0]
    return mapping

In [5]:
def sort_startprob(mapping, startprob):
    """ Sort the startprob according to power means; as returned by mapping
    """
    num_elements = len(startprob)
    new_startprob = np.zeros(num_elements)
    for i in xrange(len(startprob)):
        new_startprob[i] = startprob[mapping[i]]
    return new_startprob

In [6]:
def sort_covars(mapping, covars):
    num_elements = len(covars)
    new_covars = np.zeros_like(covars)
    for i in xrange(len(covars)):
        new_covars[i] = covars[mapping[i]]
    return new_covars

In [7]:
def sort_transition_matrix(mapping, A):
    """ Sorts the transition matrix according to power means; as returned by mapping
    """
    num_elements = len(A)
    A_new = np.zeros((num_elements, num_elements))
    for i in range(num_elements):
        for j in range(num_elements):
            A_new[i,j] = A[mapping[i], mapping[j]]   
    return A_new

In [8]:
def sort_learnt_parameters(startprob, means, covars, transmat):
    mapping = return_sorting_mapping(means)
    means_new = np.sort(means, axis = 0)
    startprob_new = sort_startprob(mapping, startprob)
    covars_new = sort_covars(mapping, covars)
    transmat_new = sort_transition_matrix(mapping, transmat)
    assert np.shape(means_new) == np.shape(means)
    assert np.shape(startprob_new) == np.shape(startprob)
    assert np.shape(transmat_new) == np.shape(transmat)
    
    return [startprob_new, means_new, covars_new, transmat_new]

In [9]:
device={}
models={}
pi=OrderedDict()
a=OrderedDict()
mean=OrderedDict()
cov=OrderedDict()
model=OrderedDict()
sorted_model=OrderedDict()
power=OrderedDict()
state=OrderedDict()


device_name='Refrigerator'
directory= os.getcwd()+'/Devices/'+device_name+'/'
pi_prior=np.array([0.5,0.5])
a_prior=np.array([[0.95,0.05],[0.05,0.95]])
mean_prior=np.array([[0],[120]])
cov_prior=np.tile(np.identity(1), (2, 1, 1))
model['all']=hmm.GaussianHMM(pi_prior.size, "full", pi_prior,a_prior)
model['all'].means=mean_prior  
model['all'].covars=cov_prior
for i,instance_name in enumerate(os.listdir(directory)):
    device[instance_name] = [open_instance_15min(device_name,instance_name,filename)for filename in os.listdir(directory+'/'+instance_name)]
    pi[instance_name]=pi_prior
    a[instance_name]=a_prior
    mean[instance_name]=mean_prior
    cov[instance_name]=cov_prior
    model[instance_name]=hmm.GaussianHMM(pi[instance_name].size, "full", pi[instance_name],a[instance_name])
    model[instance_name].means_ = mean[instance_name]
    model[instance_name].covars_ = cov[instance_name]
    for l,trace in enumerate(device[instance_name][1:]):
        trace_values=values_to_array(trace["values"].values)
        model[instance_name].fit([trace_values])
        startprob, means, covars, transmat = sort_learnt_parameters(model[instance_name].startprob_, model[instance_name].means_, model[instance_name].covars_ , model[instance_name].transmat_) 
        model[instance_name]=hmm.GaussianHMM(startprob.size, "full", startprob, transmat)
        model[instance_name].means_ = means
        model[instance_name].covars_ = covars
        model['all'].fit([trace_values])
        startprob, means, covars, transmat = sort_learnt_parameters(model['all'].startprob_, model['all'].means_, model['all'].covars_ , model['all'].transmat_) 
        model['all']=hmm.GaussianHMM(startprob.size, "full", startprob, transmat)
        model['all'].means_ = means
        model['all'].covars_ = covars

In [10]:
model.keys()


Out[10]:
['all',
 '76C07F',
 'D331DA',
 '98C08A',
 'Refrigerator',
 'D32131',
 'B83B9E',
 '599393',
 'B7E6F4']

In [11]:
other_instance='B7E6F4'
test=values_to_array(device[instance_name][0]['values'].values)
print instance_name+" model using test "+instance_name+" data: " + str(model[instance_name].score(test))
print other_instance+" model  using "+instance_name+ " test data: " + str(model[other_instance].score(test))
print "All model using "+instance_name+" test data: " + str(model['all'].score(test))
power_s, state = model[instance_name].sample(96)
print
print instance_name+" model using samples from model: " + str(model[instance_name].score(power_s))
trained=values_to_array(device[instance_name][1]['values'].values)
print instance_name+" model using training data from model: " + str(model[instance_name].score(trained))


B7E6F4 model using test B7E6F4 data: -132.826128467
B7E6F4 model  using B7E6F4 test data: -132.826128467
All model using B7E6F4 test data: -132.826128467

B7E6F4 model using samples from model: -139.67649982
B7E6F4 model using training data from model: -124.106316699

In [12]:
dfs_test={}
avg_prob={}
for key in model:
    if(key!='all'):
        test=values_to_array(device[key][0]['values'].values)
        a=[]
        for key2 in model:
            val=model[key2].score(test)
            a.append([key,key2,val])
        dfs_test[key] = pd.DataFrame(data=a,columns=['Test_Instance','Model_Instance','Value'])
        print dfs_test[key].sort('Value',ascending=False)
        print


  Test_Instance Model_Instance       Value
1        76C07F         76C07F -132.845789
4        76C07F   Refrigerator -162.433736
8        76C07F         B7E6F4 -187.569344
0        76C07F            all -187.569344
7        76C07F         599393 -272.547281
6        76C07F         B83B9E -288.619679
5        76C07F         D32131 -407.931817
2        76C07F         D331DA -491.894891
3        76C07F         98C08A -565.008965

  Test_Instance Model_Instance       Value
2        D331DA         D331DA -427.359291
1        D331DA         76C07F -473.288866
5        D331DA         D32131 -480.483261
4        D331DA   Refrigerator -519.805504
7        D331DA         599393 -530.810339
8        D331DA         B7E6F4 -634.840923
0        D331DA            all -634.840923
6        D331DA         B83B9E -919.641820
3        D331DA         98C08A -984.270831

  Test_Instance Model_Instance        Value
4        98C08A   Refrigerator  -381.073139
7        98C08A         599393  -404.520675
3        98C08A         98C08A  -434.450893
2        98C08A         D331DA  -701.940640
5        98C08A         D32131  -754.250777
1        98C08A         76C07F  -902.806754
0        98C08A            all -2321.134474
8        98C08A         B7E6F4 -2321.134474
6        98C08A         B83B9E -4807.538204

  Test_Instance Model_Instance        Value
4  Refrigerator   Refrigerator  -149.379426
1  Refrigerator         76C07F  -255.505381
7  Refrigerator         599393  -258.730785
5  Refrigerator         D32131  -459.451345
3  Refrigerator         98C08A  -528.730092
2  Refrigerator         D331DA  -542.514052
0  Refrigerator            all  -627.245992
8  Refrigerator         B7E6F4  -627.245992
6  Refrigerator         B83B9E -1281.418289

  Test_Instance Model_Instance       Value
4        D32131   Refrigerator -215.263255
1        D32131         76C07F -239.030432
7        D32131         599393 -280.552798
5        D32131         D32131 -329.696305
2        D32131         D331DA -381.921304
0        D32131            all -425.016597
8        D32131         B7E6F4 -425.016597
3        D32131         98C08A -533.339744
6        D32131         B83B9E -762.013843

  Test_Instance Model_Instance       Value
6        B83B9E         B83B9E  -22.240970
8        B83B9E         B7E6F4  -36.753878
0        B83B9E            all  -36.753878
1        B83B9E         76C07F  -65.087631
4        B83B9E   Refrigerator -104.470790
7        B83B9E         599393 -237.236131
3        B83B9E         98C08A -417.105914
5        B83B9E         D32131 -433.651417
2        B83B9E         D331DA -498.277939

  Test_Instance Model_Instance        Value
4        599393   Refrigerator  -161.814692
1        599393         76C07F  -254.620520
7        599393         599393  -269.480604
5        599393         D32131  -461.180392
3        599393         98C08A  -525.123521
2        599393         D331DA  -534.335557
0        599393            all  -594.564736
8        599393         B7E6F4  -594.564736
6        599393         B83B9E -1198.308340

  Test_Instance Model_Instance       Value
8        B7E6F4         B7E6F4 -132.826128
0        B7E6F4            all -132.826128
6        B7E6F4         B83B9E -134.600902
1        B7E6F4         76C07F -176.122617
4        B7E6F4   Refrigerator -217.278358
7        B7E6F4         599393 -326.402303
5        B7E6F4         D32131 -449.222948
3        B7E6F4         98C08A -452.755094
2        B7E6F4         D331DA -487.705797


In [28]:
#Looking at each model, averaging how well they do for each test case and ranking the models
dfs_model={}
for key in model:
    a=[]
    for key2 in model:
        if(key2!='all'):
            test=values_to_array(device[key2][0]['values'].values)
            val=model[key].score(test)
            a.append([key2,key,val])  
            dfs_model[key] = pd.DataFrame(data=a,columns=['Test_Instance','Model_Instance','Value'])
    #print dfs_model[key].sort('Value',ascending=False)
b=[]   
for key in dfs_model:
    sum=0
    for row in dfs_model[key].iterrows():
        sum=sum+row[1]['Value']
    b.append([key,sum/len(dfs_model[key].index)])
avg_model = pd.DataFrame(data=b,columns=['Model_Instance','Avg Probability'])
print avg_model.sort('Avg Probability',ascending=False)


  Model_Instance  Avg Probability
8   Refrigerator      -238.939863
2         76C07F      -312.413499
4         599393      -322.535114
3         D32131      -471.983533
6         D331DA      -508.243684
0         98C08A      -555.098132
1            all      -619.994009
7         B7E6F4      -619.994009
5         B83B9E     -1176.797756

In [26]:
print dfs_model['Refrigerator'].sort('Value',ascending=False)


  Test_Instance Model_Instance       Value
5        B83B9E   Refrigerator -104.470790
3  Refrigerator   Refrigerator -149.379426
6        599393   Refrigerator -161.814692
0        76C07F   Refrigerator -162.433736
4        D32131   Refrigerator -215.263255
7        B7E6F4   Refrigerator -217.278358
2        98C08A   Refrigerator -381.073139
1        D331DA   Refrigerator -519.805504

In [22]:
#Enter device name you would like to look at below
instance_name='76C07F'
test=values_to_array(device[instance_name][0]['values'].values)
A=50*model[instance_name].predict(test)
A_ag=50*model['all'].predict(test)
A_best=50*model['Refrigerator'].predict(test)
B=test
pylab.rcParams['figure.figsize'] = 16, 12
plt.plot(B,'r')
plt.plot(A_best,'g')

In [23]:
pylab.rcParams['figure.figsize'] = 16, 12
plt.plot(B,'r')
plt.plot(A_best,'g')


Out[23]:
[<matplotlib.lines.Line2D at 0x7f3ef333fe90>]

In [ ]: