In [2]:
from sklearn import svm
import pandas as pd
import sys
import pickle
import os
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
sys.path.append('../../')
import disaggregator.utils as utils
import matplotlib.pyplot as plt
import itertools
from sklearn.decomposition import PCA
import fold
from disaggregator import appliance
from disaggregator import evaluation_metrics as evm
import disaggregator as da
from scipy.interpolate import interp1d


/Users/sabina/anaconda/lib/python2.7/site-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [3]:
# Hot-reload the project modules after edits (Python 2 builtin `reload`).
# The bare last expression makes the notebook display evm's module repr.
reload(utils)
reload(appliance)
reload(fold)
reload(evm)


Out[3]:
<module 'disaggregator.evaluation_metrics' from '../../disaggregator/evaluation_metrics.pyc'>

In [5]:
# Load the validated whole-home electricity-use traces for May 2014.
# Fix: use a `with` block so the file handle is closed instead of leaked
# by the inline open().
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — fine for local project data, but do not point this at
# downloaded files.
with open(os.path.join('../../', 'data/use_validated_05_2014.p'), 'rb') as f:
    use_traces = pickle.load(f)

In [6]:
# Load the validated EV ("car1") charging traces for May 2014.
# Fix: close the file via `with` (the original leaked the handle and had a
# redundant extra pair of parentheses around open()).
with open(os.path.join('../../', 'data/car1_validated_05_2014.p'), 'rb') as f:
    ev_traces = pickle.load(f)

In [36]:
#test = [utils.split_trace_into_rate(x,'D') for x in ev_traces]

In [7]:
# Number of whole-home traces loaded (rendered as 200 in Out[7]).
len(use_traces)


Out[7]:
200

In [8]:
# Number of EV traces loaded (rendered as 47 in Out[8]).
len(ev_traces)


Out[8]:
47

In [9]:
# Collect the dataid of every home that has an EV trace.
car_ids = []
for car_trace in ev_traces:
    car_ids.append(car_trace.metadata['dataid'])

In [13]:
# dataids for every home in the use dataset.
# NOTE(review): `all_ids` is not referenced anywhere later in this
# notebook — confirm it is still needed.
all_ids = [use.metadata['dataid'] for use in use_traces]

In [10]:
# Partition homes into EV owners and non-owners by dataid membership,
# preserving the original ordering of use_traces in each list.
X_cars = [trace for trace in use_traces
          if trace.metadata['dataid'] in car_ids]
X_without_cars = [trace for trace in use_traces
                  if trace.metadata['dataid'] not in car_ids]

In [11]:
# Take as many non-EV homes as there are EV homes (47) so the two classes
# are balanced.  Fix: a slice already returns a new list, so the wrapping
# identity comprehension was redundant.
equal_class_size = X_without_cars[:47]

In [65]:
# Resample every trace to 1-minute frequency; per the author's note this
# also drops the None gaps.
# NOTE(review): assumes .resample('1T') returns a new trace object —
# confirm against disaggregator's trace implementation.
class_cars = [x.resample('1T') for x in X_cars]
class_no_cars = [x.resample('1T') for x in equal_class_size]

In [66]:
# Split each EV home's minute-level trace into a list of per-day traces.
class_cars = [x.split_by('D') for x in class_cars]

In [67]:
# Split each non-EV home's trace into per-day traces (mirrors the cell above).
class_no_cars = [home_trace.split_by('D') for home_trace in class_no_cars]

In [68]:
# Keep only the first 30 days per home so every home contributes the same
# number of day-samples.
class_cars = [home_days[0:30] for home_days in class_cars]
class_no_cars = [home_days[0:30] for home_days in class_no_cars]

In [69]:
# Record [home, day] positions in the car class whose day-trace is shorter
# or longer than a full day (1440 minutes).
indices = [[hidx, didx]
           for hidx, home in enumerate(class_cars)
           for didx, day in enumerate(home)
           if len(day.series) != 1440]

In [125]:
def fix_one(inds):
    home_index=inds[0]
    day_index=inds[1]
    day = class_cars[home_index][day_index].series.index[0].day
    month = class_cars[home_index][day_index].series.index[0].month
    year = class_cars[home_index][day_index].series.index[0].year
    date = '{}/{}/{}'.format(month,day,year)
    print date
    rng = pd.date_range(date, periods = 1440, freq='T')
    temp = class_cars[home_index][day_index].series.reindex(rng)
    temp = temp.astype(float)
    temp = temp.interpolate()
    temp_app = appliance.ApplianceTrace(temp,class_cars[home_index][day_index].metadata)
    return temp_app

In [71]:
# Replace each incomplete car-class day with its interpolated repair.
# NOTE(review): this calls `fix`, but the day-repair helper defined above
# is named `fix_one`; under the later `fix` (the fold scaler) this call
# would crash.  Confirm which definition was live when this cell ran.
for i in indices:
    class_cars[i[0]][i[1]]=fix(i)

In [29]:
# Record [home, day] positions of incomplete days in the NO-car class.
# BUG FIX: the original appended to `indices` (the car-class list), so
# `indices_2` stayed empty and the In[31] repair loop below did nothing.
indices_2 = []
for hidx, home in enumerate(class_no_cars):
    for didx, day in enumerate(home):
        if len(day.series) != 1440:
            indices_2.append([hidx, didx])

In [31]:
# Replace each incomplete no-car day with its interpolated repair.
# NOTE(review): same name mismatch as above — `fix` vs `fix_one`.  Also,
# because In[29] appended to `indices` instead of `indices_2`, this loop
# iterated over an empty list as written.
for i in indices_2:
    class_no_cars[i[0]][i[1]]=fix(i)

In [74]:
for home in class_cars:
    for day in home:
        if len(day.series)!=1440:
            print 'oops'

In [87]:
# Flatten the per-home day lists into one flat list of day-traces per class.
# NOTE(review): `new_cars` / `new_no_cars` are defined in LATER cells
# (In[79] and In[85]) — this cell only works out of execution order and
# will fail on Restart & Run All; move it below those cells.
chain_class_cars = list(itertools.chain(*new_cars))
chain_class_no_cars = list(itertools.chain(*new_no_cars))

In [104]:
for i in chain_class_cars:
    if len(i.series)!=1440:
        print 'help'

In [79]:
# Keep only car-class homes that still have all 30 days after trimming.
new_cars = [home for home in class_cars if len(home) == 30]

In [85]:
# Drop the last 4 no-car homes so both classes keep the same home count.
# NOTE(review): magic number — confirm it equals
# len(class_no_cars) - len(new_cars) rather than hardcoding.
new_no_cars = class_no_cars[:-4]

In [90]:
# Fit a StandardScaler on all car-class day vectors (one row per day).
# NOTE(review): the UserWarning below says the input has object dtype —
# consider casting the series to float before fitting so scaling is done
# on true floating-point data.
X_car_train = [x.series for x in chain_class_cars]
scaler = preprocessing.StandardScaler().fit(X_car_train)


/Users/sabina/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:278: UserWarning: StandardScaler assumes floating point values as input, got object
  "got %s" % (estimator, X.dtype))

In [113]:
# Build 5 cross-validation folds from the two classes.
# NOTE(review): based on usage below, a = train X, b = train labels,
# c = test X, d = test labels — confirm against fold.folds' definition.
reload(fold)
a,b,c,d= fold.folds(chain_class_cars,chain_class_no_cars,5)

In [163]:
# Count the -1 (no car) and non--1 (car) labels in the first training fold.
minu = sum(1 for label in b[0] if label == -1)
plus = sum(1 for label in b[0] if label != -1)

In [121]:
#FIX ALL THE FOLDS FIRST - this makes way more sense to do it the other way but oh well for now
def fix(x_list):
    """Apply the globally fitted `scaler` to every fold in `x_list`.

    Args:
        x_list: list of folds; each fold is a sequence of trace objects
            exposing a `.series` of minute-level values.

    Returns:
        A list of scaled 2-D arrays, one per fold, in the same order.
    """
    # Fixes vs. original: removed the unused enumerate index, removed the
    # dead commented-out Imputer code, and collapsed the loop into a
    # comprehension.
    # NOTE(review): earlier cells (In[71]/In[31]) call `fix(i)` expecting a
    # day-repair helper, not this scaling function — the two uses conflict.
    return [scaler.transform([trace.series for trace in fold_traces])
            for fold_traces in x_list]

In [127]:
#X = list(itertools.chain(*X_air))+list(itertools.chain(*X_ev))

In [144]:
#set up
# Scale the training folds (output `a` of fold.folds) with the fitted scaler.
fixed_train = fix(a)
#fixed_test = fix(c)

In [148]:
# Scale the test folds (output `c` of fold.folds) with the same scaler.
fixed_test = fix(c)

In [203]:
# RBF-kernel SVM with a hand-picked gamma; all other params left at
# sklearn defaults (C=1.0, per Out[204] below).
clf = svm.SVC(gamma = .1)

In [204]:
# Train on the first fold's scaled features and labels.
clf.fit(fixed_train[0],b[0])


Out[204]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [205]:
# Predict labels for the first (scaled) test fold.
predictions = clf.predict(fixed_test[0])

In [206]:
# Confusion statistics for the first fold.
# NOTE(review): the `_neg` variant presumably treats -1 as the negative
# class — confirm against disaggregator.evaluation_metrics.
stats = evm.get_positive_negative_stats_neg(d[0], predictions)

In [207]:
# Render the confusion table for the first fold (shown below).
print(evm.get_table_of_confusion(stats))


+----------+------------+------------+
|          |   Positive |   Negative |
+==========+============+============+
| Positive |         20 |         22 |
+----------+------------+------------+
| Negative |        238 |        236 |
+----------+------------+------------+

In [208]:
# Accuracy and F1 for the first fold (0.496 / 0.133 below — near chance;
# the gamma sweep further down does much better).
print(evm.get_accuracy(stats))
print(evm.get_f1_score(stats))


0.496124031008
0.133333333333

In [115]:
# NOTE(review): duplicate of the fit in In[204].  The stale Out[115] below
# shows coef0=2, which no SVC constructed in this notebook sets — an
# out-of-order execution relic; delete this cell or re-run top-to-bottom.
clf.fit(fixed_train[0],b[0])


Out[115]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=2, degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [151]:
#cross validate!!!
def test_with_diff_params(g, n_folds=5):
    """Cross-validate an RBF SVM at gamma=g over the prepared folds.

    Relies on the module-level `fixed_train`, `b`, `fixed_test`, `d`
    produced by fold.folds and fix() above.

    Args:
        g: RBF kernel gamma for the SVC.
        n_folds: number of folds to evaluate (default 5, matching the
            original hard-coded value).

    Returns:
        (errors, stats_all): per-fold error rates (1 - accuracy) and the
        per-fold confusion statistics, in fold order.
    """
    clf = svm.SVC(gamma=g)
    errors = []
    stats_all = []
    for i in range(n_folds):
        clf.fit(fixed_train[i], b[i])
        predictions = clf.predict(fixed_test[i])
        stats = evm.get_positive_negative_stats_neg(d[i], predictions)
        errors.append(1 - float(evm.get_accuracy(stats)))
        stats_all.append(stats)
    return errors, stats_all

In [148]:
# Accumulators for the gamma sweep: gamma index -> per-fold stats / errors.
st = {}
err = {}

In [178]:
# Sweep gamma over 1/14000 .. 10/14000, keyed by the integer numerator.
for gam in range(1, 11):
    errs, st_all = test_with_diff_params(gam / 14000.0)
    st[gam] = st_all
    err[gam] = errs

In [177]:
# Mean cross-validation error for each swept gamma.
# NOTE(review): iterating err.values() loses the gamma each mean belongs
# to (and dict order is not guaranteed on Python 2) — consider iterating
# sorted(err.items()) and printing the key alongside the mean.
for e in err.values():
    print np.array(e).mean()


0.146774193548
0.131720430108
0.124327956989
0.11935483871
0.116532258065
0.114247311828
0.112903225806
0.1125
0.111962365591
0.110349462366