In [2]:
from sklearn import svm
import pandas as pd
import sys
import pickle
import os
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
sys.path.append('../../')
import disaggregator.utils as utils
import matplotlib.pyplot as plt
import itertools
from sklearn.decomposition import PCA
import fold
from disaggregator import appliance
from disaggregator import evaluation_metrics as evm
import disaggregator as da
from scipy.interpolate import interp1d


/Users/sabina/anaconda/lib/python2.7/site-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [3]:
# Hot-reload the project modules after edits (Python 2 builtin `reload`).
# The bare last expression makes the notebook display evm's module repr.
reload(utils)
reload(appliance)
reload(fold)
reload(evm)


Out[3]:
<module 'disaggregator.evaluation_metrics' from '../../disaggregator/evaluation_metrics.pyc'>

In [5]:
# Load the validated whole-home electricity-use traces for May 2014.
# Fix: use a `with` block so the file handle is closed instead of leaked
# by the inline open().
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — fine for local project data, but do not point this at
# downloaded files.
with open(os.path.join('../../', 'data/use_validated_05_2014.p'), 'rb') as f:
    use_traces = pickle.load(f)

In [6]:
# Load the validated EV ("car1") charging traces for May 2014.
# Fix: close the file via `with` (the original leaked the handle and had a
# redundant extra pair of parentheses around open()).
with open(os.path.join('../../', 'data/car1_validated_05_2014.p'), 'rb') as f:
    ev_traces = pickle.load(f)

In [36]:
#test = [utils.split_trace_into_rate(x,'D') for x in ev_traces]

In [7]:
# Number of whole-home traces loaded (rendered as 200 in Out[7]).
len(use_traces)


Out[7]:
200

In [8]:
# Number of EV traces loaded (rendered as 47 in Out[8]).
len(ev_traces)


Out[8]:
47

In [9]:
# Collect the dataid of every home that has an EV trace.
car_ids = []
for car_trace in ev_traces:
    car_ids.append(car_trace.metadata['dataid'])

In [13]:
# dataids for every home in the use dataset.
# NOTE(review): `all_ids` is not referenced anywhere later in this
# notebook — confirm it is still needed.
all_ids = [use.metadata['dataid'] for use in use_traces]

In [10]:
# Partition homes into EV owners and non-owners by dataid membership,
# preserving the original ordering of use_traces in each list.
X_cars = [trace for trace in use_traces
          if trace.metadata['dataid'] in car_ids]
X_without_cars = [trace for trace in use_traces
                  if trace.metadata['dataid'] not in car_ids]

In [11]:
# Take as many non-EV homes as there are EV homes (47) so the two classes
# are balanced.  Fix: a slice already returns a new list, so the wrapping
# identity comprehension was redundant.
equal_class_size = X_without_cars[:47]

In [65]:
# Resample every trace to 1-minute frequency; per the author's note this
# also drops the None gaps.
# NOTE(review): assumes .resample('1T') returns a new trace object —
# confirm against disaggregator's trace implementation.
class_cars = [x.resample('1T') for x in X_cars]
class_no_cars = [x.resample('1T') for x in equal_class_size]

In [66]:
# Split each EV home's minute-level trace into a list of per-day traces.
class_cars = [x.split_by('D') for x in class_cars]

In [67]:
# Split each non-EV home's trace into per-day traces (mirrors the cell above).
class_no_cars = [home_trace.split_by('D') for home_trace in class_no_cars]

In [68]:
# Keep only the first 30 days per home so every home contributes the same
# number of day-samples.
class_cars = [home_days[0:30] for home_days in class_cars]
class_no_cars = [home_days[0:30] for home_days in class_no_cars]

In [69]:
# Record [home, day] positions in the car class whose day-trace is shorter
# or longer than a full day (1440 minutes).
indices = [[hidx, didx]
           for hidx, home in enumerate(class_cars)
           for didx, day in enumerate(home)
           if len(day.series) != 1440]

In [125]:
def fix_one(inds):
    home_index=inds[0]
    day_index=inds[1]
    day = class_cars[home_index][day_index].series.index[0].day
    month = class_cars[home_index][day_index].series.index[0].month
    year = class_cars[home_index][day_index].series.index[0].year
    date = '{}/{}/{}'.format(month,day,year)
    print date
    rng = pd.date_range(date, periods = 1440, freq='T')
    temp = class_cars[home_index][day_index].series.reindex(rng)
    temp = temp.astype(float)
    temp = temp.interpolate()
    temp_app = appliance.ApplianceTrace(temp,class_cars[home_index][day_index].metadata)
    return temp_app

In [71]:
# Replace each incomplete car-class day with its interpolated repair.
# NOTE(review): this calls `fix`, but the day-repair helper defined above
# is named `fix_one`; under the later `fix` (the fold scaler) this call
# would crash.  Confirm which definition was live when this cell ran.
for i in indices:
    class_cars[i[0]][i[1]]=fix(i)

In [29]:
# Record [home, day] positions of incomplete days in the NO-car class.
# BUG FIX: the original appended to `indices` (the car-class list), so
# `indices_2` stayed empty and the In[31] repair loop below did nothing.
indices_2 = []
for hidx, home in enumerate(class_no_cars):
    for didx, day in enumerate(home):
        if len(day.series) != 1440:
            indices_2.append([hidx, didx])

In [31]:
# Replace each incomplete no-car day with its interpolated repair.
# NOTE(review): same name mismatch as above — `fix` vs `fix_one`.  Also,
# because In[29] appended to `indices` instead of `indices_2`, this loop
# iterated over an empty list as written.
for i in indices_2:
    class_no_cars[i[0]][i[1]]=fix(i)

In [74]:
for home in class_cars:
    for day in home:
        if len(day.series)!=1440:
            print 'oops'

In [87]:
# Flatten the per-home day lists into one flat list of day-traces per class.
# NOTE(review): `new_cars` / `new_no_cars` are defined in LATER cells
# (In[79] and In[85]) — this cell only works out of execution order and
# will fail on Restart & Run All; move it below those cells.
chain_class_cars = list(itertools.chain(*new_cars))
chain_class_no_cars = list(itertools.chain(*new_no_cars))

In [104]:
for i in chain_class_cars:
    if len(i.series)!=1440:
        print 'help'

In [79]:
# Keep only car-class homes that still have all 30 days after trimming.
new_cars = [home for home in class_cars if len(home) == 30]

In [85]:
# Drop the last 4 no-car homes so both classes keep the same home count.
# NOTE(review): magic number — confirm it equals
# len(class_no_cars) - len(new_cars) rather than hardcoding.
new_no_cars = class_no_cars[:-4]

In [90]:
# Fit a StandardScaler on all car-class day vectors (one row per day).
# NOTE(review): the UserWarning below says the input has object dtype —
# consider casting the series to float before fitting so scaling is done
# on true floating-point data.
X_car_train = [x.series for x in chain_class_cars]
scaler = preprocessing.StandardScaler().fit(X_car_train)


/Users/sabina/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:278: UserWarning: StandardScaler assumes floating point values as input, got object
  "got %s" % (estimator, X.dtype))

In [113]:
# Build 5 cross-validation folds from the two classes.
# NOTE(review): based on usage below, a = train X, b = train labels,
# c = test X, d = test labels — confirm against fold.folds' definition.
reload(fold)
a,b,c,d= fold.folds(chain_class_cars,chain_class_no_cars,5)

In [163]:
# Count the -1 (no car) and non--1 (car) labels in the first training fold.
minu = sum(1 for label in b[0] if label == -1)
plus = sum(1 for label in b[0] if label != -1)

In [121]:
#FIX ALL THE FOLDS FIRST - this makes way more sense to do it the other way but oh well for now
def fix(x_list):
    """Apply the globally fitted `scaler` to every fold in `x_list`.

    Args:
        x_list: list of folds; each fold is a sequence of trace objects
            exposing a `.series` of minute-level values.

    Returns:
        A list of scaled 2-D arrays, one per fold, in the same order.
    """
    # Fixes vs. original: removed the unused enumerate index, removed the
    # dead commented-out Imputer code, and collapsed the loop into a
    # comprehension.
    # NOTE(review): earlier cells (In[71]/In[31]) call `fix(i)` expecting a
    # day-repair helper, not this scaling function — the two uses conflict.
    return [scaler.transform([trace.series for trace in fold_traces])
            for fold_traces in x_list]

In [127]:
#X = list(itertools.chain(*X_air))+list(itertools.chain(*X_ev))

In [144]:
#set up
# Scale the training folds (output `a` of fold.folds) with the fitted scaler.
fixed_train = fix(a)
#fixed_test = fix(c)

In [148]:
# Scale the test folds (output `c` of fold.folds) with the same scaler.
fixed_test = fix(c)

In [203]:
# RBF-kernel SVM with a hand-picked gamma; all other params left at
# sklearn defaults (C=1.0, per Out[204] below).
clf = svm.SVC(gamma = .1)

In [204]:
# Train on the first fold's scaled features and labels.
clf.fit(fixed_train[0],b[0])


Out[204]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [205]:
# Predict labels for the first (scaled) test fold.
predictions = clf.predict(fixed_test[0])

In [206]:
# Confusion statistics for the first fold.
# NOTE(review): the `_neg` variant presumably treats -1 as the negative
# class — confirm against disaggregator.evaluation_metrics.
stats = evm.get_positive_negative_stats_neg(d[0], predictions)

In [207]:
# Render the confusion table for the first fold (shown below).
print(evm.get_table_of_confusion(stats))


+----------+------------+------------+
|          |   Positive |   Negative |
+==========+============+============+
| Positive |         20 |         22 |
+----------+------------+------------+
| Negative |        238 |        236 |
+----------+------------+------------+

In [208]:
# Accuracy and F1 for the first fold (0.496 / 0.133 below — near chance;
# the gamma sweep further down does much better).
print(evm.get_accuracy(stats))
print(evm.get_f1_score(stats))


0.496124031008
0.133333333333

In [115]:
# NOTE(review): duplicate of the fit in In[204].  The stale Out[115] below
# shows coef0=2, which no SVC constructed in this notebook sets — an
# out-of-order execution relic; delete this cell or re-run top-to-bottom.
clf.fit(fixed_train[0],b[0])


Out[115]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=2, degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [151]:
#cross validate!!!
def test_with_diff_params(g, n_folds=5):
    """Cross-validate an RBF SVM at gamma=g over the prepared folds.

    Relies on the module-level `fixed_train`, `b`, `fixed_test`, `d`
    produced by fold.folds and fix() above.

    Args:
        g: RBF kernel gamma for the SVC.
        n_folds: number of folds to evaluate (default 5, matching the
            original hard-coded value).

    Returns:
        (errors, stats_all): per-fold error rates (1 - accuracy) and the
        per-fold confusion statistics, in fold order.
    """
    clf = svm.SVC(gamma=g)
    errors = []
    stats_all = []
    for i in range(n_folds):
        clf.fit(fixed_train[i], b[i])
        predictions = clf.predict(fixed_test[i])
        stats = evm.get_positive_negative_stats_neg(d[i], predictions)
        errors.append(1 - float(evm.get_accuracy(stats)))
        stats_all.append(stats)
    return errors, stats_all

In [148]:
# Accumulators for the gamma sweep: gamma index -> per-fold stats / errors.
st = {}
err = {}

In [178]:
# Sweep gamma over 1/14000 .. 10/14000, keyed by the integer numerator.
for gam in range(1, 11):
    errs, st_all = test_with_diff_params(gam / 14000.0)
    st[gam] = st_all
    err[gam] = errs

In [177]:
# Mean cross-validation error for each swept gamma.
# NOTE(review): iterating err.values() loses the gamma each mean belongs
# to (and dict order is not guaranteed on Python 2) — consider iterating
# sorted(err.items()) and printing the key alongside the mean.
for e in err.values():
    print np.array(e).mean()


0.146774193548
0.131720430108
0.124327956989
0.11935483871
0.116532258065
0.114247311828
0.112903225806
0.1125
0.111962365591
0.110349462366