Load the data, clean it, combine the separate series into one table, and split it into training and test sets.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#outdoor air temperature
oa = pd.read_csv("../data/oa_temp_utc_f.csv");
oa.columns = ['time', 'oa']
oa.set_index("time", drop = True, inplace = True);

oa.index = pd.to_datetime(oa.index)
oa = oa.replace('/', np.nan)
oa['oa'] = oa['oa'].astype(float)
oa = oa.interpolate(method = "time")

#relative humidity
rh = pd.read_csv("../data/rh_utc_perct.csv");
rh.columns = ['time', 'rh']
rh.set_index("time", drop = True, inplace = True);

rh.index = pd.to_datetime(rh.index)
rh = rh.replace('/', np.nan)
rh['rh'] = rh['rh'].astype(float)
rh = rh.interpolate(method = "time")

#solar radiation
sr = pd.read_csv("../data/total_solar_utc_btuh-ft2.csv");
sr.columns = ['time', 'sr'];
sr.set_index('time', drop = True, inplace = True);

sr.index = pd.to_datetime(sr.index)
sr = sr.replace('/', np.nan)
sr['sr'] = sr['sr'].astype(float)
sr = sr.interpolate(method = "time")
#wind speed
ws = pd.read_csv("../data/wind_speed_utc_mph.csv");
ws.columns = ['time', 'ws'];
ws.set_index('time', drop = True, inplace = True);

ws.index = pd.to_datetime(ws.index)
ws = ws.replace('/', np.nan)
ws['ws'] = ws['ws'].astype(float)
ws = ws.interpolate(method = "time")
#damper position of room 107n
dp = pd.read_csv("../data/107n_damper_utc_0to7or10.csv");
dp.columns = ['time', 'dp'];
dp.set_index('time', drop = True, inplace = True); 

dp.index = pd.to_datetime(dp.index)
dp = dp.replace('/', np.nan)
dp['dp'] = dp['dp'].astype(float)
dp = dp.interpolate(method = "time")
#some damper data is on a 0-10 scale and some on a 0-7 scale, so rescale the 0-7 portion to 0-10
dpUpper = dp.loc['2014-06-26 00:10:00':'2015-02-02 14:40:00',:];
dpLower = dp.loc['2015-02-02 14:50:00':,:]
dpLower = dpLower.multiply(10.0/7.0, axis='columns');
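#e.g. a full-open reading of 7.0 on the 0-7 scale maps to 7.0*10.0/7.0 = 10.0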

frames = [dpUpper, dpLower]
dp = pd.concat(frames)

#supply air temperature of room 107n
st = pd.read_csv("../data/107n_vavtemp_utc_f.csv");
st.columns = ['time', 'st'];
st.set_index('time', drop = True, inplace = True);

st.index = pd.to_datetime(st.index)
st = st.replace('/', np.nan)
st['st'] = st['st'].astype(float)
st = st.interpolate(method = "time");

#indoor air temperature of 107n
at = pd.read_csv("../data/107n_temp_utc_f.csv");
at.columns = ['time', 'at'];
at.set_index('time', drop = True, inplace = True);

at.index = pd.to_datetime(at.index)
at = at.replace('/', np.nan)
at['at'] = at['at'].astype(float)
at = at.interpolate(method = "time");

#merge all series together, then convert the original UTC timestamps to local (US/Eastern) time
allDataRaw = (oa.merge(rh,how = 'inner',left_index = True, right_index = True)
              .merge(sr,how = 'inner',left_index = True, right_index = True)
              .merge(ws,how = 'inner',left_index = True, right_index = True)
              .merge(dp,how = 'inner',left_index = True, right_index = True)
              .merge(st,how = 'inner',left_index = True, right_index = True)
              .merge(at,how = 'inner',left_index = True, right_index = True));
import pytz
eastern = pytz.timezone('US/Eastern');
origIndex = allDataRaw.index;
newTimeIndex = origIndex.tz_localize(pytz.utc).tz_convert(eastern);
allDataRaw.index = newTimeIndex;

#add month, weekday, hour of day information
allDataRaw['month'] = allDataRaw.index.month;
allDataRaw['weekday'] = allDataRaw.index.weekday;
allDataRaw['hour'] = allDataRaw.index.hour;

#separate one year for training, one year for testing
#(.copy() avoids pandas' SettingWithCopyWarning in the next cell)
train = allDataRaw.loc['2014-11-01 00:00:00':'2015-11-01 00:00:00',:].copy();
test = allDataRaw.loc['2015-11-01 00:10:00':'2016-11-01 00:10:00',:].copy();
train


Out[1]:
oa rh sr ws dp st at month weekday hour
time
2014-11-01 00:00:00-04:00 42.700000 86.589980 0.704026 0.000000 0.000000 62.440970 68.200000 11 5 0
2014-11-01 00:10:00-04:00 42.710010 86.600000 1.244994 0.000000 0.000000 62.341324 68.100000 11 5 0
2014-11-01 00:20:00-04:00 42.800000 86.600000 1.115994 0.000000 0.000000 63.057070 68.100000 11 5 0
2014-11-01 00:30:00-04:00 42.900000 86.500000 0.900000 0.062532 0.000000 64.773110 68.100000 11 5 0
2014-11-01 00:40:00-04:00 42.900000 86.410010 0.560000 0.000000 0.000000 66.950250 67.946335 11 5 0
2014-11-01 00:50:00-04:00 42.900000 86.500000 0.584016 0.000000 0.000000 68.192020 67.900000 11 5 0
2014-11-01 01:00:00-04:00 43.000000 86.510025 1.250954 0.000000 0.000000 68.516280 67.900000 11 5 1
2014-11-01 01:10:00-04:00 42.989980 86.510020 0.489018 1.250000 0.000000 68.658485 67.900000 11 5 1
2014-11-01 01:20:00-04:00 42.900000 86.600000 0.605010 1.124740 0.000000 68.700000 67.900000 11 5 1
2014-11-01 01:30:00-04:00 42.900000 86.500000 1.080000 0.125264 0.000000 68.758470 67.900000 11 5 1
2014-11-01 01:40:00-04:00 42.800000 86.720010 0.644996 0.000000 0.000000 68.741170 67.900000 11 5 1
2014-11-01 01:50:00-04:00 42.900000 86.900000 0.550990 0.000000 0.000000 68.700000 67.900000 11 5 1
2014-11-01 02:00:00-04:00 42.889980 86.700000 0.760000 0.000000 0.000000 68.758446 67.900000 11 5 2
2014-11-01 02:10:00-04:00 42.700000 86.410030 0.830000 0.000000 0.000000 68.741540 67.900000 11 5 2
2014-11-01 02:20:00-04:00 42.700000 86.410000 0.944005 0.000000 0.000000 68.700000 67.900000 11 5 2
2014-11-01 02:30:00-04:00 42.700000 86.400000 0.650981 0.000000 0.000000 68.700000 67.900000 11 5 2
2014-11-01 02:40:00-04:00 42.600000 86.500000 0.764005 0.000000 0.000000 68.641610 67.900000 11 5 2
2014-11-01 02:50:00-04:00 42.600000 86.500000 0.700000 0.000000 0.000000 68.600000 67.900000 11 5 2
2014-11-01 03:00:00-04:00 42.600000 86.189990 0.614994 0.000000 0.000000 68.600000 67.900000 11 5 3
2014-11-01 03:10:00-04:00 42.600000 86.000000 0.640979 0.000000 0.000000 68.600000 67.900000 11 5 3
2014-11-01 03:20:00-04:00 42.500000 85.300000 0.980000 0.000000 0.000000 68.600000 67.900000 11 5 3
2014-11-01 03:30:00-04:00 42.500000 85.289990 0.899011 0.000000 0.000000 68.600000 67.900000 11 5 3
2014-11-01 03:40:00-04:00 42.400000 85.210020 1.314989 0.000000 10.000000 68.659294 67.900000 11 5 3
2014-11-01 03:50:00-04:00 42.300000 85.100000 0.610000 0.000000 10.000000 68.640690 67.900000 11 5 3
2014-11-01 04:00:00-04:00 42.110000 85.100000 0.860003 0.000000 10.000000 68.600000 67.900000 11 5 4
2014-11-01 04:10:00-04:00 42.200000 85.110020 1.399976 0.563868 10.000000 68.541730 67.900000 11 5 4
2014-11-01 04:20:00-04:00 42.200000 84.989990 0.570008 0.000000 10.000000 68.500000 67.900000 11 5 4
2014-11-01 04:30:00-04:00 42.100000 84.689990 0.790976 0.000000 10.000000 68.441360 67.900000 11 5 4
2014-11-01 04:40:00-04:00 42.100000 84.600000 0.660000 0.000000 10.000000 68.458694 67.900000 11 5 4
2014-11-01 04:50:00-04:00 42.110012 82.689980 0.715007 0.000000 10.000000 68.440674 67.900000 11 5 4
... ... ... ... ... ... ... ... ... ... ...
2015-10-31 19:10:00-04:00 56.484997 44.500000 0.800000 1.062479 0.000000 72.188810 68.700000 10 5 19
2015-10-31 19:20:00-04:00 56.385000 44.515003 0.800000 0.843843 0.000000 72.088700 68.700000 10 5 19
2015-10-31 19:30:00-04:00 56.315002 44.984997 0.781005 3.218771 0.000000 71.988945 68.700000 10 5 19
2015-10-31 19:40:00-04:00 56.500000 45.000000 1.162498 1.062441 0.000000 71.888540 68.700000 10 5 19
2015-10-31 19:50:00-04:00 56.500000 45.115000 1.108995 0.656396 0.000000 71.788830 68.700000 10 5 19
2015-10-31 20:00:00-04:00 56.784996 45.184998 0.968506 4.125077 0.000000 71.688830 68.600000 10 5 20
2015-10-31 20:10:00-04:00 56.700000 45.200000 0.828998 1.625041 0.000000 71.600000 68.600000 10 5 20
2015-10-31 20:20:00-04:00 56.800000 45.100000 0.961005 0.000000 0.000000 71.588870 68.600000 10 5 20
2015-10-31 20:30:00-04:00 56.800000 45.100000 1.320000 2.345005 0.000000 71.489150 68.600000 10 5 20
2015-10-31 20:40:00-04:00 56.800000 45.100000 0.750000 3.125000 0.428571 71.400000 68.600000 10 5 20
2015-10-31 20:50:00-04:00 56.800000 45.200000 0.801494 0.656385 0.428571 71.389150 68.600000 10 5 20
2015-10-31 21:00:00-04:00 56.700000 45.315002 1.040000 0.281312 0.428571 71.289000 68.500000 10 5 21
2015-10-31 21:10:00-04:00 56.500000 45.800000 0.563551 0.000000 0.428571 71.200000 68.500000 10 5 21
2015-10-31 21:20:00-04:00 56.300000 46.300000 0.955003 0.000000 0.000000 71.189200 68.500000 10 5 21
2015-10-31 21:30:00-04:00 56.200000 46.415005 0.882498 0.000000 0.000000 71.089480 68.500000 10 5 21
2015-10-31 21:40:00-04:00 56.100000 47.584995 0.928993 0.000000 0.000000 71.000000 68.500000 10 5 21
2015-10-31 21:50:00-04:00 56.400000 47.000000 0.728039 0.000000 0.000000 70.988650 68.500000 10 5 21
2015-10-31 22:00:00-04:00 56.084995 48.175015 0.982498 0.000000 0.000000 70.888560 68.492330 10 5 22
2015-10-31 22:10:00-04:00 55.824993 54.635014 1.020000 0.000000 0.000000 70.800000 68.400000 10 5 22
2015-10-31 22:20:00-04:00 54.613335 58.473330 0.301331 0.000000 0.000000 70.777040 68.400000 10 5 22
2015-10-31 22:30:00-04:00 53.700000 61.900000 0.560000 0.000000 0.000000 70.577410 68.400000 10 5 22
2015-10-31 22:40:00-04:00 53.059990 66.266680 0.902003 0.000000 0.000000 70.388985 68.400000 10 5 22
2015-10-31 22:50:00-04:00 53.213337 65.259995 1.043337 0.000000 0.000000 70.277630 68.400000 10 5 22
2015-10-31 23:00:00-04:00 52.586662 69.153340 1.073332 0.000000 0.000000 70.088684 68.400000 10 5 23
2015-10-31 23:10:00-04:00 52.513336 67.446655 0.921330 0.000000 0.000000 69.988830 68.400000 10 5 23
2015-10-31 23:20:00-04:00 52.100000 72.040050 0.568689 0.000000 0.000000 69.888830 68.392650 10 5 23
2015-10-31 23:30:00-04:00 52.213337 70.606650 0.731331 0.000000 0.000000 69.777630 68.300000 10 5 23
2015-10-31 23:40:00-04:00 51.944275 70.700000 0.577379 5.717234 0.000000 69.588670 68.300000 10 5 23
2015-10-31 23:50:00-04:00 54.015003 65.539990 0.666001 6.968762 0.000000 69.488686 68.300000 10 5 23
2015-11-01 00:00:00-04:00 53.200000 65.200000 0.840000 0.127669 0.000000 69.388830 68.300000 11 6 0

52561 rows × 10 columns
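
All seven series above follow one identical read/clean/interpolate pattern, so a small helper could replace the repetition (a sketch only; loadSeries is not part of the original notebook, and the example path mirrors the cells above):

def loadSeries(path, name):
    #read one series, index it by time, convert the '/' missing-value
    #markers to NaN, and fill gaps by time-based interpolation
    df = pd.read_csv(path);
    df.columns = ['time', name];
    df.set_index('time', drop = True, inplace = True);
    df.index = pd.to_datetime(df.index);
    df = df.replace('/', np.nan);
    df[name] = df[name].astype(float);
    return df.interpolate(method = "time");

#e.g. oa = loadSeries("../data/oa_temp_utc_f.csv", 'oa')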


In [2]:
#Move at (the target y) to be the last column of each dataframe
at = train['at'];
train.drop(labels=['at'], axis=1,inplace = True)
train.insert(len(train.columns), 'at', at)

at = test['at'];
test.drop(labels=['at'], axis=1,inplace = True)
test.insert(len(test.columns), 'at', at)
test


Out[2]:
oa rh sr ws dp st month weekday hour at
time
2015-11-01 00:10:00-04:00 55.811670 57.576660 1.297827 1.322937 0.0 69.288820 11 6 0 68.292000
2015-11-01 00:20:00-04:00 56.600000 54.911670 0.565834 3.864502 0.0 69.188680 11 6 0 68.200000
2015-11-01 00:30:00-04:00 56.611477 54.788520 0.654262 7.161926 0.0 69.088810 11 6 0 68.200000
2015-11-01 00:40:00-04:00 58.188522 54.500000 0.586068 8.391342 0.0 69.011314 11 6 0 68.208000
2015-11-01 00:50:00-04:00 60.210003 48.429990 0.995001 3.937762 0.0 69.100000 11 6 0 68.200000
2015-11-01 01:00:00-04:00 61.089996 44.900000 0.920994 4.249959 0.0 69.111040 11 6 1 68.200000
2015-11-01 01:10:00-04:00 60.576656 43.900000 0.465836 2.020896 0.0 69.222664 11 6 1 68.200000
2015-11-01 01:20:00-04:00 59.188330 44.123337 0.890000 5.697937 0.0 69.388670 11 6 1 68.200000
2015-11-01 01:30:00-04:00 58.076660 47.135010 0.840000 4.739687 0.0 69.288690 11 6 1 68.200000
2015-11-01 01:40:00-04:00 57.700000 47.435013 0.990503 7.500000 0.0 69.188810 11 6 1 68.200000
2015-11-01 01:50:00-04:00 57.900000 47.088330 0.905834 5.479141 0.0 69.097170 11 6 1 68.200000
2015-11-01 01:00:00-05:00 57.900000 47.000000 1.063660 10.520546 0.0 69.072174 11 6 1 68.200000
2015-11-01 01:10:00-05:00 57.888332 48.811670 0.454165 7.176813 0.0 69.047190 11 6 1 68.200000
2015-11-01 01:20:00-05:00 57.611668 49.176662 0.532170 3.937438 0.0 69.022200 11 6 1 68.200000
2015-11-01 01:30:00-05:00 57.600000 49.551690 0.913664 4.302073 0.0 68.997210 11 6 1 68.200000
2015-11-01 01:40:00-05:00 57.600000 53.300000 0.554165 6.729125 0.0 68.972220 11 6 1 68.200000
2015-11-01 01:50:00-05:00 57.488330 53.300000 0.565835 4.781188 0.0 68.947230 11 6 1 68.200000
2015-11-01 02:00:00-05:00 57.288330 54.400000 0.620502 7.750124 0.0 68.922240 11 6 2 68.200000
2015-11-01 02:10:00-05:00 56.300000 60.035010 1.199497 14.156187 0.0 68.888960 11 6 2 68.200000
2015-11-01 02:20:00-05:00 55.700000 63.646680 1.224165 7.614438 0.0 68.800000 11 6 2 68.191990
2015-11-01 02:30:00-05:00 55.400000 66.223335 0.280000 5.958255 0.0 68.778040 11 6 2 68.100000
2015-11-01 02:40:00-05:00 54.900000 68.111670 0.620503 3.604125 0.0 68.589130 11 6 2 68.100000
2015-11-01 02:50:00-05:00 54.600000 69.400000 0.735333 4.781219 0.0 68.500000 11 6 2 68.100000
2015-11-01 03:00:00-05:00 54.300000 70.746670 1.146336 2.427073 0.0 68.477740 11 6 3 68.100000
2015-11-01 03:10:00-05:00 54.000000 73.311676 1.437827 5.958250 0.0 68.289116 11 6 3 68.100000
2015-11-01 03:20:00-05:00 53.976660 74.011670 0.620503 1.656191 0.0 68.200000 11 6 3 68.100000
2015-11-01 03:30:00-05:00 53.800000 74.711670 1.181670 6.437375 0.0 68.188700 11 6 3 68.100000
2015-11-01 03:40:00-05:00 53.810444 74.910446 0.655377 7.994441 0.0 68.088960 11 6 3 68.100000
2015-11-01 03:50:00-05:00 53.700000 75.822960 0.654261 6.444604 0.0 67.977450 11 6 3 68.100000
2015-11-01 04:00:00-05:00 53.800000 76.600000 0.894005 1.125375 0.0 67.789130 11 6 4 68.100000
... ... ... ... ... ... ... ... ... ... ...
2016-10-31 19:20:00-04:00 48.200000 56.900000 0.410000 0.000000 0.0 69.572000 10 0 19 71.600000
2016-10-31 19:30:00-04:00 48.243324 54.510070 0.539344 0.000000 0.0 69.307680 10 0 19 71.599330
2016-10-31 19:40:00-04:00 47.600000 56.156677 0.600650 0.000000 0.0 69.007180 10 0 19 71.499330
2016-10-31 19:50:00-04:00 47.100000 57.500000 0.278342 0.000000 0.0 68.772606 10 0 19 71.400000
2016-10-31 20:00:00-04:00 46.800000 57.556683 0.510000 0.000000 0.0 68.508400 10 0 20 71.300000
2016-10-31 20:10:00-04:00 46.900000 57.543320 0.211011 0.000000 0.0 68.336160 10 0 20 71.200000
2016-10-31 20:20:00-04:00 46.656670 58.656670 0.630661 0.000000 0.0 68.175970 10 0 20 71.200000
2016-10-31 20:30:00-04:00 46.800000 58.513350 0.150000 0.000000 0.0 67.977390 10 0 20 71.100000
2016-10-31 20:40:00-04:00 46.756683 58.626736 0.459357 0.000000 0.0 67.774060 10 0 20 71.099335
2016-10-31 20:50:00-04:00 46.500000 59.113350 0.600657 0.708423 0.0 67.571690 10 0 20 71.000000
2016-10-31 21:00:00-04:00 46.400000 58.640068 0.397329 0.000000 0.0 67.438240 10 0 21 70.900000
2016-10-31 21:10:00-04:00 46.300000 58.643314 0.418982 0.000000 0.0 67.275300 10 0 21 70.899330
2016-10-31 21:20:00-04:00 46.100000 58.943314 0.202675 0.000000 0.0 67.137276 10 0 21 70.800000
2016-10-31 21:30:00-04:00 45.943330 58.886658 0.291003 0.000000 0.0 67.036450 10 0 21 70.700000
2016-10-31 21:40:00-04:00 44.743324 61.413350 0.370000 0.000000 0.0 66.871980 10 0 21 70.699326
2016-10-31 21:50:00-04:00 44.600000 61.500000 0.357326 0.000000 0.0 66.673386 10 0 21 70.600000
2016-10-31 22:00:00-04:00 44.943314 60.200000 0.108982 0.000000 0.0 66.536150 10 0 22 70.599330
2016-10-31 22:10:00-04:00 43.943317 64.256680 0.132031 0.000000 0.0 66.376480 10 0 22 70.500000
2016-10-31 22:20:00-04:00 43.243317 66.000000 0.551659 0.000000 0.0 66.242240 10 0 22 70.400000
2016-10-31 22:30:00-04:00 43.300000 66.400000 0.227995 0.000000 0.0 66.143050 10 0 22 70.400000
2016-10-31 22:40:00-04:00 43.044983 63.944984 0.452492 0.000000 0.0 65.983696 10 0 22 70.400000
2016-10-31 22:50:00-04:00 43.800000 61.200000 0.209043 0.000000 0.0 65.776405 10 0 22 70.400000
2016-10-31 23:00:00-04:00 43.779984 62.795036 0.402498 0.000000 0.0 65.700000 10 0 23 70.300000
2016-10-31 23:10:00-04:00 42.955940 66.000000 0.147970 0.000000 0.0 65.637310 10 0 23 70.300000
2016-10-31 23:20:00-04:00 43.800000 62.000000 0.280975 0.000000 0.0 65.473580 10 0 23 70.200000
2016-10-31 23:30:00-04:00 44.100000 61.655754 0.402302 0.000000 0.0 65.336876 10 0 23 70.199326
2016-10-31 23:40:00-04:00 43.456690 64.373240 0.650000 0.000000 0.0 65.238236 10 0 23 70.100000
2016-10-31 23:50:00-04:00 43.686653 64.026690 0.477331 0.000000 0.0 65.138695 10 0 23 70.000000
2016-11-01 00:00:00-04:00 43.375790 64.059960 0.214004 0.000000 0.0 65.040290 11 1 0 70.000000
2016-11-01 00:10:00-04:00 42.733345 66.400000 0.534995 0.000000 0.0 64.941410 11 1 0 70.000000

52705 rows × 10 columns


In [3]:
#The error metric
def chouErrorMetric(pred, real):
    """
    Reference:
    Jui Sheng Chou and Dac Khuong Bui. Modeling heating and cooling
    loads by artificial intelligence for energy-efficient building design.
    Energy and Buildings, 82:437–446, 2014.
    
    pred: numpy array of the predicted value
    real: numpy array of the real value
    """
    n = pred.shape[0];
    rmse = (
            (np.sum((pred-real)**2.0)/n)**0.5
            );
    
    mae = (
            np.sum(np.absolute((pred-real)))/n
            );
    
    r = (
         (np.sum(np.multiply(real,pred))*n 
        - np.sum(real)*np.sum(pred))/
        (np.sqrt(n*np.sum(real**2)-(np.sum(real)**2))*
         np.sqrt(n*np.sum(pred**2)-(np.sum(pred)**2)))
            );
    
    r_normal = (1-r)/2.0 #map r from [-1,1] to [0,1]; 0 (i.e. r=1) is best
    errorList = [rmse, mae, r_normal];
    print ("error list",errorList);
    avgError = sum(errorList)/3.0;
    return avgError,errorList;
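
As a quick sanity check of the metric (a hypothetical toy example, not from the recorded run), a perfect prediction yields rmse = 0, mae = 0, and r = 1, so r_normal = 0 and the averaged error is 0:

pred = np.array([1.0, 2.0, 3.0]);
real = np.array([1.0, 2.0, 3.0]);
avgError, errorList = chouErrorMetric(pred, real);#prints "error list [0.0, 0.0, 0.0]"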

In [4]:
#min-max normalization
def normalize(data):
    """
    min max normalization
    data: np array, n*(m+1), n is sample number,
          m is feature number, plus 1 for the true value
    return: (np array 1*(m+1) for the max-min for each column,
             np array 1*(m+1) for the minimum value for each colmn)
    """
    colNum = data.shape[1];
    division = [];
    minVList = [];
    for i in range(colNum):
        col = data[:, i];
        maxV = (np.max(col));
        minV = (np.min(col));
        division.append(maxV-minV);
        minVList.append(minV);
        data[:,i] = data[:,i] - minV;

    division = np.array(division);
    minVList = np.array(minVList);
    return (np.true_divide(data, division[None,:]), division, minVList);
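
The returned ranges and minima exist so that predictions can be mapped back to physical units later; the inverse transform (a one-line sketch mirroring the testRest_norm*normDivision_test[-1]+normMinList_test[-1] expressions used below; denormalize is not defined in the original notebook) is:

def denormalize(dataNorm, division, minVList):
    #invert the column-wise min-max scaling
    return dataNorm*division[None,:] + minVList[None,:];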

In [5]:
#Lazy learning

In [6]:
#lazy learning with non-weighted average kernel
class lazyLearning():
    
    def __init__(self, train, k):

        """
        perform lazy learning with a uniform (non-weighted) average
        kernel and Euclidean distance.
        inputs: 
        train: numpy array, n*(m+1),  
               each row is a sample (total n samples),
               each col is a feature (total m features), 
               the last col is the real y
        k: k-nn in lazy learning
        """
        self.trainFeat = train[:,0:-1];
        self.trainY = train[:,-1];
        self.k = k;
    
    def predict(self,test):
        """
        test: numpy array 1*m, m is the number of features
        return the predicted value
        """
        eclideanDist = np.linalg.norm(self.trainFeat - test, axis = 1);
        eclideanDistSort = eclideanDist.argsort(axis = 0);
        eclideanDistSort = eclideanDistSort[0:self.k];
        knneighbours = ([self.trainY[index] for 
                         index in eclideanDistSort]);
        yt = np.mean(knneighbours);
        return yt;

#cross validation

def getCrossValidationData(trainArray, fold):
    """
    get the data list for cross validation
    trainArray: raw train data, numpy array n*(m+1)
                where n is sample size, m is feature
                size, plus 1 for the true result
    fold: fold value
    return tuple ([trainNpArray_1, trainNpArray_2,...]
                 ,[testNpArray_1, testNpArray_2,...])
    """
    trainLength = trainArray.shape[0];
    np.random.shuffle(trainArray);#shuffle the rows in place
    cvTrainList = [];
    cvTestList = [];
    for foldK in range(fold):
        foldStart = int(foldK*(trainLength)/fold);
        foldEnd = int(foldStart + trainLength/fold);
        if foldEnd > trainLength:
            foldEnd = trainLength;
        cvTest = trainArray[foldStart:foldEnd,:];
        cvTrain = np.delete(trainArray
                            ,range(foldStart, foldEnd, 1)
                            ,axis = 0);
        cvTrainList.append(cvTrain);
        cvTestList.append(cvTest);
    return (cvTrainList, cvTestList);


def crossValidateForChioces(choiceList, trainArray, mlMethod,
                            fold = 10):
    """
    Perform cross validation to select model.
    choiceList: 1D list of candidate numeric values (e.g. values of k)
    trainArray: np array n*(m+1) for the training
    mlMethod: ml class
    fold: cross validation fold
    return: cross validation error using the chouErrorMetric
            over the choice list
    """
    cvTrainList, cvTestList = (
        getCrossValidationData(trainArray, fold));
    print ('Performing cross validation to find the best k...');
    cvErrorOverK = [];
    for k in choiceList:
        print ('Testing k=', k, ' ...')
        error_k = [];
        for fold_i in range(len(cvTrainList)):
            print ('Testing k=',k,' for fold=',fold_i,' ...')
            pred_fold_i = [];
            cvTrain_i = cvTrainList[fold_i];
            cvTest_i = cvTestList[fold_i];
            model = mlMethod(cvTrain_i,k);
            for testSample_i in range(len(cvTest_i)):
                pred_fold_i.append(model
                          .predict(cvTest_i[testSample_i,0:-1]));
            pred_fold_i = np.array(pred_fold_i);
            error_fold_i = chouErrorMetric(pred_fold_i,cvTest_i[:,-1]);
            print ('Error metric for k=',k,' fold=', fold_i,
              ' is ', error_fold_i);
            error_k.append(error_fold_i[0]);
        meanError_k = np.mean(np.array(error_k));
        print ('Cross validated error metric for k=',k,
          ' is ',meanError_k);
        cvErrorOverK.append(meanError_k);
    return cvErrorOverK
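
A minimal check of the pieces above (a hypothetical toy dataset, not from the recorded run): with k = 2, the prediction for a query point is the mean y of its two nearest training rows.

toyTrain = np.array([[0.0, 0.0, 1.0],
                     [0.1, 0.0, 2.0],
                     [5.0, 5.0, 9.0]]);#two features plus y
toyModel = lazyLearning(toyTrain, 2);
print (toyModel.predict(np.array([0.0, 0.1])));#mean of 1.0 and 2.0 -> 1.5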

In [7]:
#Perform cross validation to choose K for LL-Avg
trainArray = train.as_matrix();#non-normalized data
trainArrayNorm, normDivision, normMinList = (
    normalize(train.as_matrix()));#normalized data
llKChoiceList = range(5,100,5);
cvErrorOverK = crossValidateForChioces(
                llKChoiceList
                ,trainArrayNorm
                ,lazyLearning
                ,fold = 10
                );


Performing cross validation to find the best k...
Testing k= 5  ...
Testing k= 5  for fold= 0  ...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-5911334e1fdb> in <module>()
      9                 ,trainArrayNorm
     10                 ,lazyLearning
---> 11                 ,fold = 10
     12                 );

<ipython-input-6-c3b2fcbf6a6e> in crossValidateForChioces(choiceList, trainArray, mlMethod, fold)
     90             for testSample_i in range(len(cvTest_i)):
     91                 pred_fold_i.append(model
---> 92                           .predict(cvTest_i[testSample_i,0:-1]));
     93             pred_fold_i = np.array(pred_fold_i);
     94             error_fold_i = chouErrorMetric(pred_fold_i,cvTest_i[:,-1]);

<ipython-input-6-c3b2fcbf6a6e> in predict(self, test)
     25         """
     26         eclideanDist = np.linalg.norm(self.trainFeat - test, axis = 1);
---> 27         eclideanDistSort = eclideanDist.argsort(axis = 0);
     28         eclideanDistSort = eclideanDistSort[0:self.k];
     29         knneighbours = ([self.trainY[index] for 

KeyboardInterrupt: 
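
The run above was interrupted by hand: each prediction scans all ~47,000 cross-validation training rows, so the full 10-fold sweep over 19 values of k needs millions of such scans. A tree-based neighbour search would cut this substantially; the sketch below uses scikit-learn's KNeighborsRegressor, which with uniform weights computes the same average-of-k-neighbours prediction, though it is not what the notebook actually ran:

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors = 5, weights = 'uniform',
                          algorithm = 'ball_tree');
knn.fit(trainArrayNorm[:,0:-1], trainArrayNorm[:,-1]);#build the tree once
pred = knn.predict(trainArrayNorm[0:10,0:-1]);#then each query is cheap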

In [8]:
#plot error metric over K for LL-Avg
print (cvErrorOverK)

plt.plot(llKChoiceList,cvErrorOverK,
            color="b",
            label="Lazy Learning with Uniform Average Kernel");
plt.xlabel(
            "Choice of K",
            fontsize = 14
            );
plt.ylabel(
            "Cross Validation Error (Error Metric)",
            fontsize = 14
            );
plt.legend();
plt.show();


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-5c2f52e87b47> in <module>()
      1 #plot error metric over K for LL-Avg
----> 2 print (cvErrorOverK)
      3 
      4 plt.plot(llKChoiceList,cvErrorOverK,
      5             color="b",

NameError: name 'cvErrorOverK' is not defined

In [ ]:
#LL-Avg: Test over the next year's data
testArray = test.as_matrix();
optK = llKChoiceList[np.argmin(np.array(cvErrorOverK))];

modelData = train.as_matrix();
modelDataLen = modelData.shape[0];
allData_test = np.append(modelData, testArray, 0); #model data + test array
#normalize both model data and test data
#to the same scale
allData_test,normDivision_test,normMinList_test = (
                                                normalize (allData_test));
#modelData: the dataset pool for LL
#testArray: the held-out dataset for the testing
modelData = allData_test[0:modelDataLen,:];
testArray = allData_test[modelDataLen:,:];

llTest = lazyLearning(modelData, optK);

testRest_norm = [];
for sample_i in range(len(testArray)):
    testRest_norm.append(llTest
                    .predict(testArray[sample_i,0:-1]));
testRest_norm = np.array(testRest_norm);
(testError_norm, detail) = chouErrorMetric(testRest_norm, testArray[:,-1]);
print ("Test error metric over the test data for LL-Avg is ",testError_norm,".");

#plot test data, pred vs. true (normalized)
plt.plot(testRest_norm
         ,testArray[:,-1]
         ,'.'
         ,color="b"
         ,label="Lazy Learning with Uniform Average Kernel");

plt.xlabel(
            "Predicted Value for the Test Data (Min-max Normalized)",
            fontsize = 12
            );
plt.ylabel(
            "True Value for the Test Data (Min-max Normalized)",
            fontsize = 12
            );
plt.legend();
plt.show();
#plot test data: 2000 consecutive predicted and true values from each season over the time index (unnormalized)
testRest_unNorm = testRest_norm*normDivision_test[-1]+normMinList_test[-1];
testRest_unNorm = testRest_unNorm.reshape((1,len(testRest_unNorm)));
trueRest_unNorm = testArray[:,-1]*normDivision_test[-1]+normMinList_test[-1];
trueRest_unNorm = trueRest_unNorm.reshape((1,len(trueRest_unNorm)));
testNtrueRest_unNorm = np.append(testRest_unNorm, trueRest_unNorm, axis = 0);

plt.plot(testNtrueRest_unNorm[0,0:2000]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,0:2000]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-01-01 to 2015-01-14",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Uniform Average Kernel");
plt.legend();
plt.show();

plt.plot(testNtrueRest_unNorm[0,11233:13233]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,11233:13233]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-03-20 to 2015-04-02",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Uniform Average Kernel");
plt.legend();
plt.show();

plt.plot(testNtrueRest_unNorm[0,21745:23745]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,21745:23745]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-06-01 to 2015-06-14",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Uniform Average Kernel");
plt.legend();
plt.show();
    
plt.plot(testNtrueRest_unNorm[0,42049:44049]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,42049:44049]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-10-20 to 2015-11-02",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Uniform Average Kernel");
plt.legend(loc='lower left');
plt.show();

In [ ]:
def makeLaggedData(lagOrder, rawData):
    """
    add lagged features to the original dataset
    lagOrder: 1D list of int, one lag order per column of the raw data;
        e.g. [2,1,1] means: for the first column (the first feature) of
            rawData, add the 1- and 2-step lagged values as new features;
            for the second column (the original second feature), add the
            1-step lagged value as a new feature; likewise for the third.
    rawData: numpy array, n*(m+1)
    return: new dataset with each column's lagged features inserted
            before it, and the first max(lagOrder) rows dropped
    """
    colNum = rawData.shape[1];
    rowNum = rawData.shape[0];
    autoRegNum = len(lagOrder);
    returnedData = [];
    maxRegOrder = max(lagOrder);
    for i in range(colNum):
        
        if i < autoRegNum:
            autoRegOrder = lagOrder[i];
            for j in range(0,autoRegOrder):
                appendToHead = np.zeros(j+1);
                returnedData.append(
                        np.append(appendToHead, 
                            rawData[0:rowNum-j-1,i]));
        returnedData.append(rawData[:,i]);
    
    returnedData = np.array(returnedData).transpose();
    returnedData = returnedData[maxRegOrder:,:];
    return returnedData;

#lazy learning with linear regression as the local model
class lazyLearning_localLinear():
    
    def __init__(self, train, k):

        """
        perform lazy learning with a local linear regression kernel
        and Euclidean distance.
        inputs: 
        train: numpy array, n*(m+1),  
               each row is a sample (total n samples),
               each col is a feature (total m features), 
               the last col is the real y
        k: k-nn in lazy learning
        """
        self.trainFeat = train[:,0:-1];
        self.trainY = train[:,-1];
        self.k = k;
    
    def predict(self,test):
        """
        test: numpy array 1*m, m is the number of features
        return the predicted value (scalar)
        """
        eclideanDist = np.linalg.norm(self.trainFeat - test, axis = 1);
        eclideanDistSort = eclideanDist.argsort(axis = 0);
        eclideanDistSort = eclideanDistSort[0:self.k];
        knneighboursY = np.array([self.trainY[index] for 
                         index in eclideanDistSort]).reshape(self.k,1);
        m = self.trainFeat.shape[1];
        #k*m matrix of the k nearest neighbours' features
        knneighboursX = self.trainFeat[eclideanDistSort,:];
        from sklearn import linear_model
        regr = linear_model.LinearRegression(n_jobs = -1);
        regr.fit(knneighboursX, knneighboursY)
        test = test.reshape(1,m);
        return regr.predict(test)[0,0];
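
To make the lagging concrete (a hypothetical toy array, not from the recorded run): with lagOrder = [1] on a two-column array, the 1-step lagged first column is inserted before the original, and the first max(lagOrder) row is dropped because it has no complete history.

toy = np.array([[1.0, 10.0],
                [2.0, 20.0],
                [3.0, 30.0],
                [4.0, 40.0]]);#one feature plus y
print (makeLaggedData([1], toy));
#rows: [1, 2, 20], [2, 3, 30], [3, 4, 40]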

In [ ]:
#Perform cross validation to choose k for lazy learning with
#local linear regression
trainArray = train.as_matrix();#non-normalized data
trainArrayNorm, normDivision, normMinList = (
    normalize(train.as_matrix()));#normalized data

llKChoiceList = range(100,500,100);
cvErrorOverK = crossValidateForChioces(
                llKChoiceList
                ,trainArrayNorm
                ,lazyLearning_localLinear
                ,fold = 10
                );

In [ ]:
cvErrorOverK

In [ ]:
#ll with uniform average kernel, but with lagged features
#Perform cross validation
trainArray = train.as_matrix();#non-normalized data
trainArrayNorm, normDivision, normMinList = (
    normalize(train.as_matrix()));#normalized data
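#2-step lags for the oa, sr, and at columns; 0 for all other columns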
lagOrder = [2,0,2,0,0,0,0,0,0,2];
trainArrayNormLagged = makeLaggedData(lagOrder, trainArrayNorm)
llKChoiceList = range(5,50,5);
cvErrorOverK = crossValidateForChioces(
                llKChoiceList
                ,trainArrayNormLagged
                ,lazyLearning
                ,fold = 10
                );

In [ ]:
cvErrorOverK_0 = cvErrorOverK

In [ ]:
#Test over the test dataset for LL-LR
testArray = test.as_matrix();
optK = 100;

modelData = train.as_matrix();
modelDataLen = modelData.shape[0];
allData_test = np.append(modelData, testArray, 0); #model data + test array
allData_test,normDivision_test,normMinList_test = (
                                                normalize (allData_test));
modelData = allData_test[0:modelDataLen,:];
testArray = allData_test[modelDataLen:,:];

llTest = lazyLearning_localLinear(modelData, optK);

testRest_norm = [];
for sample_i in range(len(testArray)):
    testRest_norm.append(llTest
                    .predict(testArray[sample_i,0:-1]));
testRest_norm = np.array(testRest_norm);

(testError_norm, detail) = chouErrorMetric(testRest_norm, testArray[:,-1]);
print ("Test error metric over the test data is ",testError_norm,".");
print (detail)

#plot test data, pred vs. true (normalized)
plt.plot(testRest_norm
         ,testArray[:,-1]
         ,'.'
         ,color="b"
         ,label="Lazy Learning with Linear Regression Kernel");

plt.xlabel(
            "Predicted Value for the Test Data (Min-max Normalized)",
            fontsize = 10
            );
plt.ylabel(
            "True Value for the Test Data (Min-max Normalized)",
            fontsize = 10
            );
plt.axis([0,1,0,1])
plt.legend();
plt.show();
#plot test data: 2000 consecutive predicted and true values from each season over the time index (unnormalized)
testRest_unNorm = testRest_norm*normDivision_test[-1]+normMinList_test[-1];
testRest_unNorm = testRest_unNorm.reshape((1,len(testRest_unNorm)));
trueRest_unNorm = testArray[:,-1]*normDivision_test[-1]+normMinList_test[-1];
trueRest_unNorm = trueRest_unNorm.reshape((1,len(trueRest_unNorm)));
testNtrueRest_unNorm = np.append(testRest_unNorm, trueRest_unNorm, axis = 0);

plt.plot(testNtrueRest_unNorm[0,0:2000]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,0:2000]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-01-01 to 2015-01-14",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Linear Regression Kernel");
plt.legend();
plt.show();

plt.plot(testNtrueRest_unNorm[0,11233:13233]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,11233:13233]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-03-20 to 2015-04-02",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Linear Regression Kernel");
plt.legend();
plt.show();

plt.plot(testNtrueRest_unNorm[0,21745:23745]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,21745:23745]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-06-01 to 2015-06-14",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Linear Regression Kernel");
plt.legend();
plt.show();
    
plt.plot(testNtrueRest_unNorm[0,42049:44049]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,42049:44049]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-10-20 to 2015-11-02",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("Lazy Learning with Linear Regression Kernel");
plt.legend(loc='lower left');
plt.show();

In [ ]:
#Test over the test dataset for LL-Avg-Lag
cvErrorOverK = [0.006261511172007253,
                0.0078319748087286763,
                0.0092544725368474655,
                0.010491375591088101,
                0.011582034373530298,
                0.012577092631859799,
                0.013481123234175307,
                0.014271828395324013,
                0.014997619668965751]#copied from the previous computation

testArray = test.as_matrix();
optK = llKChoiceList[np.argmin(np.array(cvErrorOverK))];

modelData = train.as_matrix();
modelDataLen = modelData.shape[0];
allData_test = np.append(modelData, testArray, 0); #model data + test array
#normalize both model data and test data to the same scale
allData_test,normDivision_test,normMinList_test = (
                                                normalize (allData_test));
lagOrder = [2,0,2,0,0,0,0,0,0,2];#2-step lags for oa, sr, and at
allData_test = makeLaggedData(lagOrder, allData_test);
#makeLaggedData drops the first max(lagOrder) rows, so shift the split
#boundary to keep the model/test partition aligned with the original split
modelData = allData_test[0:modelDataLen - max(lagOrder),:];
testArray = allData_test[modelDataLen - max(lagOrder):,:];

llTest = lazyLearning(modelData, optK);

testRest_norm = [];
#perform the test.
#Because the lagged indoor air temperatures (T_{k-1} and T_{k-2},
#where k is the time step being predicted) are inputs to the
#model, prediction is performed recursively over the test set:
#the first test sample's T_{k-1} and T_{k-2} are the real
#measured values; after that, each sample's T_{k-1} is the
#prediction made for the previous step, and its T_{k-2} is
#carried over from the previous sample's T_{k-1}, and so on
#through the whole test set.
lagOrderT = lagOrder[-1];
test_sample_i = testArray[0,0:-1];
for sample_i in range(len(testArray)):
    pred_i = llTest.predict(test_sample_i);
    testRest_norm.append(pred_i);
    if sample_i + 1 < len(testArray):
        test_sample_i_new = testArray[sample_i+1,0:-1];
        sampleFeatM = len(test_sample_i_new);
        test_sample_i_new[sampleFeatM-lagOrderT] = pred_i;
        for lagOrderT_i in range(1,lagOrderT):
            test_sample_i_new[sampleFeatM - lagOrderT 
                              + lagOrderT_i] = test_sample_i[sampleFeatM 
                                                             - lagOrderT + lagOrderT_i - 1]
        test_sample_i = test_sample_i_new;
        
testRest_norm = np.array(testRest_norm);

In [ ]:
#continue...
(testError_norm, detail) = chouErrorMetric(testRest_norm, testArray[:,-1]);
print ("Test error metric over the test data is ",testError_norm,".");
print (detail)

#plot test data, pred vs. true (normalized)
plt.plot(testRest_norm
         ,testArray[:,-1]
         ,'.'
         ,color="b"
         ,label="LL with Average Kernel - Lagged Features");
plt.axis([0,1,0,1]);

plt.xlabel(
            "Predicted Value for the Test Data (Min-max Normalized)",
            fontsize = 10
            );
plt.ylabel(
            "True Value for the Test Data (Min-max Normalized)",
            fontsize = 10
            );
plt.legend();
plt.show();
#plot test data, samples from each season
testRest_unNorm = testRest_norm*normDivision_test[-1]+normMinList_test[-1];
testRest_unNorm = testRest_unNorm.reshape((1,len(testRest_unNorm)));
trueRest_unNorm = testArray[:,-1]*normDivision_test[-1]+normMinList_test[-1];
trueRest_unNorm = trueRest_unNorm.reshape((1,len(trueRest_unNorm)));
testNtrueRest_unNorm = np.append(testRest_unNorm, trueRest_unNorm, axis = 0);

plt.plot(testNtrueRest_unNorm[0,0:2000]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,0:2000]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-01-01 to 2015-01-14",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("LL with Average Kernel - Lagged Features");
plt.legend();
plt.show();

plt.plot(testNtrueRest_unNorm[0,11000:13000]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,11000:13000]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-03-19 to 2015-04-01",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("LL with Average Kernel - Lagged Features");
plt.legend(fontsize = 12);
plt.show();

plt.plot(testNtrueRest_unNorm[0,21800:23800]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,21800:23800]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-06-02 to 2015-06-15",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("LL with Average Kernel - Lagged Features");
plt.legend();
plt.show();
    
plt.plot(testNtrueRest_unNorm[0,42000:44000]
         ,color="b"
         ,label="Predicted Value");
plt.plot(testNtrueRest_unNorm[1,42000:44000]
         ,color='r'
         ,label='True Value');
plt.axis([0,2000,50,80]);
plt.xlabel(
            "2015-10-20 to 2015-11-02",
            fontsize = 14
            );
plt.ylabel(
            "Indoor Air Temperature (F)",
            fontsize = 14
            );
plt.title("LL with Average Kernel - Lagged Features");
plt.legend(loc='lower left');
plt.show();