In [13]:
import numpy as np
import statsmodels.api as sm
import pickle
import copy
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
import pandas as pd

In [14]:
with open('../data/df_demand.pkl','rb') as f:

In [15]:
train_start_date = datetime(2011,1,1)
train_end_date = datetime(2012,12,31)

test_start_date = datetime(2013,1,1)
test_end_date = datetime(2014,1,1)

## Multiple Linear Regression (All Variables)

In [16]:
def mls(df):
df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]

x_train = df_train.drop('Demanda_shift_1',1).drop('date',1).as_matrix()
x_test = df_test.drop('Demanda_shift_1',1).drop('date',1).as_matrix()

y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)

error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))

print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(max(error))
#print res.summary()
return [mape,error]
#This shifts the demand data, so that each row has the values and then the next hours demand.
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna(subset=['Demanda_shift_1'])
mls(df)
pass

MAPE ERROR: 0.0437576648775
MAX ERROR: 0.378515466485

In [16]:

## Multiple Linear Regression with time-shifted variables

In [17]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df = copy.deepcopy(df)

mls(df)
pass

MAPE ERROR: 0.0424863467463
MAX ERROR: 0.444272017805

## Above MLR with seperate regression for peak and off-peak

In [18]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_opk = copy.deepcopy(df[df['wd_peak']==0])
df_pk = copy.deepcopy(df[df['wd_peak']==1])
print 'Off-Peak'
mls(df_opk)
print
print 'Peak'
mls(df_pk)
pass

Off-Peak
MAPE ERROR: 0.0346863032069
MAX ERROR: 0.621379430005

Peak
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

## Multiple Linear Regression with PCA

In [19]:
def get_PCA_Matrix(df_imp_stand):
all_samples = df_imp_stand.as_matrix().T
num_var = len(all_samples)
mean_vector = all_samples.mean(1)
#print('Mean Vector:\n', mean_vector)

cov_mat = np.cov(all_samples)
#print('Covariance Matrix:\n', cov_mat)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
for i in range(len(eig_val_cov)):
eigvec_cov = eig_vec_cov[:,i].reshape(1,num_var).T
#print('Eigenvalue {} from covariance matrix: {}'.format(i+1, eig_val_cov[i]))

# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
def getKey(item):
return item[0]
eig_pairs = sorted(eig_pairs,key=getKey)

eig_pairs.reverse()

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
for i in eig_pairs:
#print(i[0])
pass
matrix_w = np.hstack((eig_pairs[0][1].reshape(num_var,1), eig_pairs[1][1].reshape(num_var,1),eig_pairs[2][1].reshape(num_var,1),eig_pairs[3][1].reshape(num_var,1),eig_pairs[4][1].reshape(num_var,1)))
#print matrix_w
transformed = matrix_w.T.dot(all_samples)
return [transformed, matrix_w]

In [20]:
def mls_pca(df):

#df_stand = copy.deepcopy((df - df.mean()) / df.std())

y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

[transformed, matrix_w] = get_PCA_Matrix(df_train)

x_train = transformed.T
x_test =  matrix_w.T.dot(df_test.T).T

ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)

error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))

print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(max(error))
#print res.summary()

table = []
for key in df.drop('date',1):
if key != 'Demanda_shift_1':
corr = [np.corrcoef(df_train[key],transformed[0])[0][1],np.corrcoef(df_train[key],transformed[1])[0][1],np.corrcoef(df_train[key],transformed[2])[0][1],np.corrcoef(df_train[key],transformed[3])[0][1],np.corrcoef(df_train[key],df['Demanda_shift_1'][train_start_date:train_end_date])[0][1]]
table.append(corr)
corr_df = pd.DataFrame(np.array(table).T,columns=df_train.keys()).T
corr_df.columns = ['PCA0','PCA1','PCA2','PCA3','Demand']
corr_df.sort('PCA0',ascending=False)

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna()
mls_pca(df)

MAPE ERROR: 0.0506906718918
MAX ERROR: 0.506490500293

## MLS with PCA and time-shifted variables

In [21]:
def mls_vars_and_pca(df):

y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

[transformed, matrix_w] = get_PCA_Matrix(df_train)

x_train_pls = transformed.T
x_test_pls =  matrix_w.T.dot(df_test.T).T

df = df.dropna()
df = copy.deepcopy(df)

df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]

x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)

x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
x_test_all = np.vstack((x_test_pls.T,x_test.T)).T

ols = sm.OLS(y_train,x_train_all)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test_all)

error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))

print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(abs(max(error)))
#print res.summary()
return [mape,error]

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
mls_vars_and_pca(df)
pass

MAPE ERROR: 0.0424696972297
MAX ERROR: 0.44397743006

## MLS with time-shifted variables, seperate peaks for weekday and weekend

In [22]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

Weekday Off-Peak (10PM-4PM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Weekend Off-Peak (10PM-5PM)
MAPE ERROR: 0.0308714377455
MAX ERROR: 0.584087837849

Weekend Peak (6PM-9PM)
MAPE ERROR: 0.0234933150389
MAX ERROR: 0.107923347699

Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:0.0290449474911

## MLS with time-shifted variables and Peak/Off Peak for Weekday/Saturday/Sunday

In [23]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (9PM-9AM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==5])
print 'Sunday Off-Peak (9PM-6PM)'
[mape,error] = mls(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==5])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==6])
print 'Sunday Off-Peak (9PM-6PM)'
[mape,error] = mls(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==6])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string

Weekday Off-Peak (9PM-9AM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Sunday Off-Peak (9PM-6PM)
MAPE ERROR: 0.0303154348048
MAX ERROR: 0.430677160457

Saturday Peak (6PM-9PM)
MAPE ERROR: 0.0250413074411
MAX ERROR: 0.102384405132

Sunday Off-Peak (9PM-6PM)
MAPE ERROR: 0.0291031378679
MAX ERROR: 0.187692313247

Sunday Peak (6PM-9PM)
MAPE ERROR: 0.0224302825589
MAX ERROR: 0.0751037358445

Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:0.028780548077

## Above with PCA

In [12]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

In [ ]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

In [29]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (9PM-9AM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (9PM-6PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

Weekday Off-Peak (9PM-9AM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Weekend Off-Peak (9PM-6PM)
MAPE ERROR: 0.0308714377455
MAX ERROR: 0.584087837849

Weekend Peak (6PM-9PM)
MAPE ERROR: 0.0234933150389
MAX ERROR: 0.107923347699

Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:0.0290449474911

In [ ]: