In [13]:
import numpy as np
import statsmodels.api as sm
import pickle
import copy
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
import pandas as pd

In [14]:
with open('../data/df_demand.pkl','rb') as f:
    df_orig = pickle.load(f)

In [15]:
train_start_date = datetime(2011,1,1)
train_end_date = datetime(2012,12,31)

test_start_date = datetime(2013,1,1)
test_end_date = datetime(2014,1,1)

Multiple Linear Regression (All Variables)


In [16]:
def mls(df):
    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]

    x_train = df_train.drop('Demanda_shift_1',1).drop('date',1).as_matrix()
    x_test = df_test.drop('Demanda_shift_1',1).drop('date',1).as_matrix()

    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(max(error))
    #print res.summary()
    return [mape,error]
#This shifts the demand data, so that each row has the values and then the next hours demand.
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna(subset=['Demanda_shift_1'])
mls(df)
pass


MAPE ERROR: 0.0437576648775
MAX ERROR: 0.378515466485

In [16]:

Multiple Linear Regression with time-shifted variables


In [17]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df = copy.deepcopy(df)

mls(df)
pass


MAPE ERROR: 0.0424863467463
MAX ERROR: 0.444272017805

Above MLR with seperate regression for peak and off-peak


In [18]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_opk = copy.deepcopy(df[df['wd_peak']==0])
df_pk = copy.deepcopy(df[df['wd_peak']==1])
print 'Off-Peak'
mls(df_opk)
print
print 'Peak'
mls(df_pk)
pass


Off-Peak
MAPE ERROR: 0.0346863032069
MAX ERROR: 0.621379430005

Peak
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Multiple Linear Regression with PCA


In [19]:
def get_PCA_Matrix(df_imp_stand):
    all_samples = df_imp_stand.as_matrix().T
    num_var = len(all_samples)
    mean_vector = all_samples.mean(1)
    #print('Mean Vector:\n', mean_vector)

    cov_mat = np.cov(all_samples)
    #print('Covariance Matrix:\n', cov_mat)
    eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
    for i in range(len(eig_val_cov)):
        eigvec_cov = eig_vec_cov[:,i].reshape(1,num_var).T
        #print('Eigenvalue {} from covariance matrix: {}'.format(i+1, eig_val_cov[i]))

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    def getKey(item):
        return item[0]
    eig_pairs = sorted(eig_pairs,key=getKey)

    eig_pairs.reverse()

    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    for i in eig_pairs:
        #print(i[0])
        pass
    matrix_w = np.hstack((eig_pairs[0][1].reshape(num_var,1), eig_pairs[1][1].reshape(num_var,1),eig_pairs[2][1].reshape(num_var,1),eig_pairs[3][1].reshape(num_var,1),eig_pairs[4][1].reshape(num_var,1)))
    #print matrix_w
    transformed = matrix_w.T.dot(all_samples)
    return [transformed, matrix_w]

In [20]:
def mls_pca(df):


    #df_stand = copy.deepcopy((df - df.mean()) / df.std())


    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
    df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train = transformed.T
    x_test =  matrix_w.T.dot(df_test.T).T


    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(max(error))
    #print res.summary()

    table = []
    for key in df.drop('date',1):
        if key != 'Demanda_shift_1':
            corr = [np.corrcoef(df_train[key],transformed[0])[0][1],np.corrcoef(df_train[key],transformed[1])[0][1],np.corrcoef(df_train[key],transformed[2])[0][1],np.corrcoef(df_train[key],transformed[3])[0][1],np.corrcoef(df_train[key],df['Demanda_shift_1'][train_start_date:train_end_date])[0][1]]
            table.append(corr)
    corr_df = pd.DataFrame(np.array(table).T,columns=df_train.keys()).T
    corr_df.columns = ['PCA0','PCA1','PCA2','PCA3','Demand']
    corr_df.sort('PCA0',ascending=False)

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna()
mls_pca(df)


MAPE ERROR: 0.0506906718918
MAX ERROR: 0.506490500293

MLS with PCA and time-shifted variables


In [21]:
def mls_vars_and_pca(df):

    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
    df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train_pls = transformed.T
    x_test_pls =  matrix_w.T.dot(df_test.T).T

    x_train_pls = sm.add_constant(x_train_pls, prepend=True) #add a constant
    x_test_pls = sm.add_constant(x_test_pls, prepend=True) #add a constant

    df = df.dropna()
    df = copy.deepcopy(df)

    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]

    x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
    x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)

    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
    x_test_all = np.vstack((x_test_pls.T,x_test.T)).T

    ols = sm.OLS(y_train,x_train_all)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test_all)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(abs(max(error)))
    #print res.summary()
    return [mape,error]

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
mls_vars_and_pca(df)
pass


MAPE ERROR: 0.0424696972297
MAX ERROR: 0.44397743006

MLS with time-shifted variables, seperate peaks for weekday and weekend


In [22]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string


Weekday Off-Peak (10PM-4PM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Weekend Off-Peak (10PM-5PM)
MAPE ERROR: 0.0308714377455
MAX ERROR: 0.584087837849

Weekend Peak (6PM-9PM)
MAPE ERROR: 0.0234933150389
MAX ERROR: 0.107923347699

Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:0.0290449474911

MLS with time-shifted variables and Peak/Off Peak for Weekday/Saturday/Sunday


In [23]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (9PM-9AM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==5])
print 'Sunday Off-Peak (9PM-6PM)'
[mape,error] = mls(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==5])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_opk = copy.deepcopy(df[df['we_peak']==0][df['weekday']==6])
print 'Sunday Off-Peak (9PM-6PM)'
[mape,error] = mls(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_pk = copy.deepcopy(df[df['we_peak']==1][df['weekday']==6])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string


Weekday Off-Peak (9PM-9AM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Sunday Off-Peak (9PM-6PM)
MAPE ERROR: 0.0303154348048
MAX ERROR: 0.430677160457

Saturday Peak (6PM-9PM)
MAPE ERROR: 0.0250413074411
MAX ERROR: 0.102384405132

Sunday Off-Peak (9PM-6PM)
MAPE ERROR: 0.0291031378679
MAX ERROR: 0.187692313247

Sunday Peak (6PM-9PM)
MAPE ERROR: 0.0224302825589
MAX ERROR: 0.0751037358445

Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:0.028780548077

Above with PCA


In [12]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

Multiple Hours Ahead


In [ ]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string

In [29]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[df['wd_peak']==0][df['weekend']==0])
print 'Weekday Off-Peak (9PM-9AM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[df['wd_peak']==1][df['weekend']==0])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[df['we_peak']==0][df['weekend']==1])
print 'Weekend Off-Peak (9PM-6PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[df['we_peak']==1][df['weekend']==1])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string


Weekday Off-Peak (9PM-9AM)
MAPE ERROR: 0.0192551539812
MAX ERROR: 0.104300880708

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0368924766191
MAX ERROR: 0.210671799832

Weekend Off-Peak (9PM-6PM)
MAPE ERROR: 0.0308714377455
MAX ERROR: 0.584087837849

Weekend Peak (6PM-9PM)
MAPE ERROR: 0.0234933150389
MAX ERROR: 0.107923347699

Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:0.0290449474911

In [ ]: