In [2]:
import numpy as np
import statsmodels.api as sm
import pickle
import copy
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
import pandas as pd
In [3]:
with open('../data/df_demand.pkl','rb') as f:
df_orig = pickle.load(f)
In [4]:
train_start_date = datetime(2011,1,1)
train_end_date = datetime(2012,12,31)
test_start_date = datetime(2013,1,1)
test_end_date = datetime(2014,1,1)
In [6]:
def mls(df):
df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]
x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)
y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)
error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))
print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(max(error))
print res.summary()
return [mape,error]
#This shifts the demand data, so that each row has the values and then the next hours demand.
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna(subset=['Demanda_shift_1'])
mls(df)
pass
In [28]:
Out[28]:
In [39]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df = copy.deepcopy(df)
mls(df)
pass
In [43]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_opk = copy.deepcopy(df[df['wd_peak']==0])
df_pk = copy.deepcopy(df[df['wd_peak']==1])
print 'Off-Peak'
mls(df_opk)
print
print 'Peak'
mls(df_pk)
pass
In [44]:
def get_PCA_Matrix(df_imp_stand):
all_samples = df_imp_stand.as_matrix().T
num_var = len(all_samples)
mean_vector = all_samples.mean(1)
#print('Mean Vector:\n', mean_vector)
cov_mat = np.cov(all_samples)
#print('Covariance Matrix:\n', cov_mat)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
for i in range(len(eig_val_cov)):
eigvec_cov = eig_vec_cov[:,i].reshape(1,num_var).T
#print('Eigenvalue {} from covariance matrix: {}'.format(i+1, eig_val_cov[i]))
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]
# Sort the (eigenvalue, eigenvector) tuples from high to low
def getKey(item):
return item[0]
eig_pairs = sorted(eig_pairs,key=getKey)
eig_pairs.reverse()
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
for i in eig_pairs:
#print(i[0])
pass
matrix_w = np.hstack((eig_pairs[0][1].reshape(num_var,1), eig_pairs[1][1].reshape(num_var,1),eig_pairs[2][1].reshape(num_var,1),eig_pairs[3][1].reshape(num_var,1),eig_pairs[4][1].reshape(num_var,1)))
#print matrix_w
transformed = matrix_w.T.dot(all_samples)
return [transformed, matrix_w]
In [45]:
def mls_pca(df):
#df_stand = copy.deepcopy((df - df.mean()) / df.std())
y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]
df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]
[transformed, matrix_w] = get_PCA_Matrix(df_train)
x_train = transformed.T
x_test = matrix_w.T.dot(df_test.T).T
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)
error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))
print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(max(error))
#print res.summary()
table = []
for key in df.drop('date',1):
if key != 'Demanda_shift_1':
corr = [np.corrcoef(df_train[key],transformed[0])[0][1],np.corrcoef(df_train[key],transformed[1])[0][1],np.corrcoef(df_train[key],transformed[2])[0][1],np.corrcoef(df_train[key],transformed[3])[0][1],np.corrcoef(df_train[key],df['Demanda_shift_1'][train_start_date:train_end_date])[0][1]]
table.append(corr)
corr_df = pd.DataFrame(np.array(table).T,columns=df_train.keys()).T
corr_df.columns = ['PCA0','PCA1','PCA2','PCA3','Demand']
corr_df.sort('PCA0',ascending=False)
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna()
mls_pca(df)
In [47]:
def mls_vars_and_pca(df):
y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
y_test = df['Demanda_shift_1'][test_start_date:test_end_date]
df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]
[transformed, matrix_w] = get_PCA_Matrix(df_train)
x_train_pls = transformed.T
x_test_pls = matrix_w.T.dot(df_test.T).T
x_train_pls = sm.add_constant(x_train_pls, prepend=True) #add a constant
x_test_pls = sm.add_constant(x_test_pls, prepend=True) #add a constant
df = df.dropna()
df = copy.deepcopy(df)
df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]
x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
x_test_all = np.vstack((x_test_pls.T,x_test.T)).T
ols = sm.OLS(y_train,x_train_all)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test_all)
error = (y_test - pred_y)/y_test
mape = np.mean(abs(error))
print "MAPE ERROR: " + str(mape)
print "MAX ERROR: " + str(abs(max(error)))
#print res.summary()
return [mape,error]
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
mls_vars_and_pca(df)
pass
In [48]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print
df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print
mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string
In [49]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print
df_sa_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==5])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_sa_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==5])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print
df_su_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==6])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_su_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==6])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print
mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string
In [50]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)
df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls_vars_and_pca(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls_vars_and_pca(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print
df_sa_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==5])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls_vars_and_pca(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_sa_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==5])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls_vars_and_pca(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print
df_su_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==6])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls_vars_and_pca(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print
df_su_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==6])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls_vars_and_pca(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print
mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string
In [ ]: