In [2]:
import numpy as np
import statsmodels.api as sm
import pickle
import copy
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
import pandas as pd

In [3]:
with open('../data/df_demand.pkl','rb') as f:
    df_orig = pickle.load(f)

In [4]:
train_start_date = datetime(2011,1,1)
train_end_date = datetime(2012,12,31)

test_start_date = datetime(2013,1,1)
test_end_date = datetime(2014,1,1)
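
The date slicing used throughout (df[train_start_date:train_end_date]) assumes df_orig carries a sorted DatetimeIndex; a quick sanity check along those lines (a sketch, not part of the original run):

#Sanity check (sketch): label-based date slicing needs a sorted DatetimeIndex
assert isinstance(df_orig.index, pd.DatetimeIndex)
assert df_orig.index.is_monotonic
print df_orig.index.min(), "to", df_orig.index.max()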

Multiple Linear Regression (All Variables)


In [6]:
def mls(df):
    # Fit OLS of next-hour demand on all remaining columns over the training
    # window and report MAPE / max relative error on the test window.
    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]

    x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
    x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)

    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(max(error))
    print res.summary()
    return [mape,error]
#This shifts the demand data, so that each row has the current values plus the next hour's demand.
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna(subset=['Demanda_shift_1'])
mls(df)
pass


MAPE ERROR: 0.0490084776276
MAX ERROR: 0.50938020836
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        Demanda_shift_1   R-squared:                       0.843
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     1878.
Date:                Tue, 12 May 2015   Prob (F-statistic):               0.00
Time:                        11:58:23   Log-Likelihood:                -42433.
No. Observations:                8761   AIC:                         8.492e+04
Df Residuals:                    8735   BIC:                         8.510e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------
const           3035.0607    244.865     12.395      0.000      2555.068  3515.053
windspeedKmph      1.9092      0.781      2.445      0.015         0.378     3.440
FeelsLikeF        -1.0122      1.631     -0.620      0.535        -4.210     2.186
FeelsLikeC        -1.8965      3.298     -0.575      0.565        -8.362     4.569
DewPointC          4.5517      1.155      3.940      0.000         2.287     6.816
windspeedMiles    -0.5464      1.214     -0.450      0.653        -2.927     1.834
DewPointF          6.2177      0.744      8.362      0.000         4.760     7.675
HeatIndexF        -3.3336      1.601     -2.082      0.037        -6.473    -0.195
cloudcover        -0.1865      0.025     -7.364      0.000        -0.236    -0.137
HeatIndexC         2.1634      3.262      0.663      0.507        -4.231     8.558
precipMM          -0.2042      0.058     -3.516      0.000        -0.318    -0.090
pressure          -2.6973      0.232    -11.619      0.000        -3.152    -2.242
WindGustKmph      -1.1193      0.748     -1.497      0.134        -2.585     0.346
WindGustMiles     -0.4025      1.184     -0.340      0.734        -2.724     1.919
weatherCode       -0.0023      0.006     -0.391      0.696        -0.014     0.009
tempC             -1.4408      1.845     -0.781      0.435        -5.057     2.176
tempF             -4.3837      1.067     -4.108      0.000        -6.475    -2.292
WindChillF         2.4162      1.187      2.035      0.042         0.088     4.744
WindChillC         2.0684      1.912      1.082      0.279        -1.680     5.817
winddirDegree     -0.0189      0.007     -2.797      0.005        -0.032    -0.006
humidity          -3.2387      0.212    -15.290      0.000        -3.654    -2.823
visibility        -1.5343      0.219     -7.018      0.000        -1.963    -1.106
Demanda            0.8630      0.007    128.202      0.000         0.850     0.876
month              0.0160      0.019      0.845      0.398        -0.021     0.053
weekday           -1.4999      0.177     -8.490      0.000        -1.846    -1.154
day                0.0160      0.019      0.845      0.398        -0.021     0.053
HORA              -0.9205      0.067    -13.765      0.000        -1.052    -0.789
==============================================================================
Omnibus:                     2222.786   Durbin-Watson:                   1.418
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            74540.313
Skew:                           0.543   Prob(JB):                         0.00
Kurtosis:                      17.248   Cond. No.                     5.94e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.18e-22. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
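
The smallest-eigenvalue/condition-number warning above points at heavy multicollinearity (the Celsius/Fahrenheit pairs and the kmph/miles wind-speed columns carry essentially the same information). One hedged way to quantify it, assuming the same x_train design matrix built in mls(), is to compute variance inflation factors with statsmodels (print_vifs is a hypothetical helper; values will be very large or infinite for the redundant columns):

from statsmodels.stats.outliers_influence import variance_inflation_factor

def print_vifs(x):
    #x: regressor DataFrame including the 'const' column added by sm.add_constant
    values = np.asarray(x, dtype=float)
    for i, name in enumerate(x.columns):
        if name == 'const':
            continue
        print name, variance_inflation_factor(values, i)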


Multiple Linear Regression with time-shifted variables


In [39]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df = copy.deepcopy(df)

mls(df)
pass


MAPE ERROR: 0.0424863467463
MAX ERROR: 0.444272017805
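
The identical blocks of shift() calls in this and the following cells could be factored into a small helper; add_lags is a hypothetical name, sketched under the assumption that the rows are in chronological order:

def add_lags(frame, col, lags):
    #Return a copy of frame with col shifted back by each lag in lags
    out = copy.deepcopy(frame)
    for lag in lags:
        out['%s_back_%d' % (col, lag)] = out[col].shift(lag)
    return out

#e.g.
#df = add_lags(df_orig, 'Demanda', [1, 2, 3, 6, 12, 24])
#df = add_lags(df, 'tempF', [1, 2, 3, 6, 12, 24])
#df['Demanda_shift_1'] = df['Demanda'].shift(-1)
#df = df.dropna()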

Above MLR with separate regressions for peak and off-peak


In [43]:
df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
df_opk = copy.deepcopy(df[df['wd_peak']==0])
df_pk = copy.deepcopy(df[df['wd_peak']==1])
print 'Off-Peak'
mls(df_opk)
print
print 'Peak'
mls(df_pk)
pass


Off-Peak
MAPE ERROR: 0.0336347996493
MAX ERROR: 0.661430172163

Peak
MAPE ERROR: 0.0358466295815
MAX ERROR: 0.20790944532

Multiple Linear Regression with PCA


In [44]:
def get_PCA_Matrix(df_imp_stand):
    # Rows of all_samples are variables, columns are observations.
    all_samples = df_imp_stand.as_matrix().T
    num_var = len(all_samples)
    mean_vector = all_samples.mean(1)
    #print('Mean Vector:\n', mean_vector)

    cov_mat = np.cov(all_samples)
    #print('Covariance Matrix:\n', cov_mat)
    eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)

    # Make a list of (eigenvalue, eigenvector) tuples, sorted from high to low eigenvalue
    eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # Projection matrix built from the top five eigenvectors (num_var x 5)
    matrix_w = np.hstack([eig_pairs[i][1].reshape(num_var,1) for i in range(5)])
    #print matrix_w

    # Project the samples onto the principal components (5 x num_observations)
    transformed = matrix_w.T.dot(all_samples)
    return [transformed, matrix_w]
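
As a rough cross-check (a sketch, assuming scikit-learn is available), sklearn's PCA should recover essentially the same projection; note that it mean-centers the data before projecting, so its scores differ from the transformed values above by a per-component offset and possibly a sign flip:

from sklearn.decomposition import PCA

def get_PCA_Matrix_sklearn(df_imp_stand, n_components=5):
    X = df_imp_stand.as_matrix()          #observations x variables
    pca = PCA(n_components=n_components)
    transformed = pca.fit_transform(X).T  #n_components x observations
    matrix_w = pca.components_.T          #variables x n_components
    return [transformed, matrix_w]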

In [45]:
def mls_pca(df):
    # Like mls(), but regress next-hour demand on the first five principal
    # components of the inputs instead of on the raw columns.

    #df_stand = copy.deepcopy((df - df.mean()) / df.std())


    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
    df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train = transformed.T
    x_test =  matrix_w.T.dot(df_test.T).T


    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(max(error))
    #print res.summary()

    # Correlation of each input with the first four principal components and
    # with next-hour demand (built for interactive inspection; not returned).
    table = []
    for key in df_train.keys():
        corr = [np.corrcoef(df_train[key], transformed[i])[0][1] for i in range(4)]
        corr.append(np.corrcoef(df_train[key], y_train)[0][1])
        table.append(corr)
    corr_df = pd.DataFrame(table, index=df_train.keys(),
                           columns=['PCA0','PCA1','PCA2','PCA3','Demand'])
    corr_df = corr_df.sort('PCA0', ascending=False)

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna()
mls_pca(df)


MAPE ERROR: 0.0509779092642
MAX ERROR: 0.503135050948
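
The commented-out df_stand line in mls_pca hints at standardizing before PCA; without it the leading components are dominated by whichever columns have the largest raw variance. A hedged variant that standardizes only the regressors and leaves the target alone (a sketch; the column handling is an assumption, and it is not run here):

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df = df.dropna()

#Standardize everything except the target and the date column
feature_cols = [c for c in df.columns if c not in ('Demanda_shift_1', 'date')]
df_std = copy.deepcopy(df)
df_std[feature_cols] = (df[feature_cols] - df[feature_cols].mean()) / df[feature_cols].std()
#mls_pca(df_std)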

MLR with PCA and time-shifted variables


In [47]:
def mls_vars_and_pca(df):
    # Same as mls(), but the raw regressors are augmented with the first five
    # principal components of the training-period inputs.

    y_train = df['Demanda_shift_1'][train_start_date:train_end_date]
    y_test = df['Demanda_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('Demanda_shift_1',1).drop('date',1)[train_start_date:train_end_date]
    df_test = df.drop('Demanda_shift_1',1).drop('date',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train_pls = transformed.T
    x_test_pls =  matrix_w.T.dot(df_test.T).T

    x_train_pls = sm.add_constant(x_train_pls, prepend=True) #add a constant
    x_test_pls = sm.add_constant(x_test_pls, prepend=True) #add a constant

    df = df.dropna()
    df = copy.deepcopy(df)

    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]

    x_train = df_train.drop('Demanda_shift_1',1).drop('date',1)
    x_test = df_test.drop('Demanda_shift_1',1).drop('date',1)

    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
    x_test_all = np.vstack((x_test_pls.T,x_test.T)).T

    ols = sm.OLS(y_train,x_train_all)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test_all)

    error = (y_test - pred_y)/y_test
    mape = np.mean(abs(error))

    print "MAPE ERROR: " + str(mape)
    print "MAX ERROR: " + str(abs(max(error)))
    #print res.summary()
    return [mape,error]

df = copy.deepcopy(df_orig)
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()
mls_vars_and_pca(df)
pass


MAPE ERROR: 0.0406500593597
MAX ERROR: 0.500772273727

MLR with time-shifted variables, separate peak/off-peak regressions for weekday and weekend


In [48]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[(df['wd_peak']==0) & (df['weekend']==0)])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[(df['wd_peak']==1) & (df['weekend']==0)])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_opk = copy.deepcopy(df[(df['we_peak']==0) & (df['weekend']==1)])
print 'Weekend Off-Peak (10PM-5PM)'
[mape,error] = mls(df_we_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_we_pk = copy.deepcopy(df[(df['we_peak']==1) & (df['weekend']==1)])
print 'Weekend Peak (6PM-9PM)'
[mape,error] = mls(df_we_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:" + mape_string


Weekday Off-Peak (10PM-4PM)
MAPE ERROR: 0.0194238497949
MAX ERROR: 0.107318275998

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0358466295815
MAX ERROR: 0.20790944532

Weekend Off-Peak (10PM-5PM)
MAPE ERROR: 0.0313181767578
MAX ERROR: 0.624360882887

Weekend Peak (6PM-9PM)
MAPE ERROR: 0.0228765107372
MAX ERROR: 0.127150150161

Overall MAPE for Split Between Peak/Off Peak for Weekday/Weekend:0.0287719834384
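
The overall figure is just a length-weighted average of the per-segment MAPEs; np.average expresses the same computation more compactly (equivalent sketch, not a separate run):

overall_mape = np.average(mapes, weights=lengths)
print "Overall MAPE: " + str(overall_mape)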

MLR with time-shifted variables and Peak/Off Peak for Weekday/Saturday/Sunday


In [49]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[(df['wd_peak']==0) & (df['weekend']==0)])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[(df['wd_peak']==1) & (df['weekend']==0)])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_opk = copy.deepcopy(df[(df['we_peak']==0) & (df['weekday']==5)])
print 'Saturday Off-Peak (10PM-5PM)'
[mape,error] = mls(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_pk = copy.deepcopy(df[(df['we_peak']==1) & (df['weekday']==5)])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_opk = copy.deepcopy(df[(df['we_peak']==0) & (df['weekday']==6)])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_pk = copy.deepcopy(df[(df['we_peak']==1) & (df['weekday']==6)])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string


Weekday Off-Peak (10PM-4PM)
MAPE ERROR: 0.0194238497949
MAX ERROR: 0.107318275998

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0358466295815
MAX ERROR: 0.20790944532

Saturday Off-Peak (10PM-5PM)
MAPE ERROR: 0.0306869945146
MAX ERROR: 0.518578532105

Saturday Peak (6PM-9PM)
MAPE ERROR: 0.0246837229489
MAX ERROR: 0.111974998615

Sunday Off-Peak (10PM-5PM)
MAPE ERROR: 0.0288028062331
MAX ERROR: 0.188969721996

Sunday Peak (6PM-9PM)
MAPE ERROR: 0.0226364104772
MAX ERROR: 0.0820130824161

Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:0.0284356684524

Above with PCA


In [50]:
df = copy.deepcopy(df_orig)
mapes = []
lengths = []
df['Demanda_shift_1'] = df['Demanda'].shift(-1)
df['Demanda_back_1'] = df['Demanda'].shift(1)
df['Demanda_back_2'] = df['Demanda'].shift(2)
df['Demanda_back_3'] = df['Demanda'].shift(3)
df['Demanda_back_6'] = df['Demanda'].shift(6)
df['Demanda_back_12'] = df['Demanda'].shift(12)
df['Demanda_back_24'] = df['Demanda'].shift(24)

df['tempF_back_1'] = df['tempF'].shift(1)
df['tempF_back_2'] = df['tempF'].shift(2)
df['tempF_back_3'] = df['tempF'].shift(3)
df['tempF_back_6'] = df['tempF'].shift(6)
df['tempF_back_12'] = df['tempF'].shift(12)
df['tempF_back_24'] = df['tempF'].shift(24)
df = df.dropna()

df_wd_opk = copy.deepcopy(df[(df['wd_peak']==0) & (df['weekend']==0)])
print 'Weekday Off-Peak (10PM-4PM)'
[mape,error] = mls_vars_and_pca(df_wd_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_wd_pk = copy.deepcopy(df[(df['wd_peak']==1) & (df['weekend']==0)])
print 'Weekday Peak (9AM-9PM)'
[mape,error] = mls_vars_and_pca(df_wd_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_opk = copy.deepcopy(df[(df['we_peak']==0) & (df['weekday']==5)])
print 'Saturday Off-Peak (10PM-5PM)'
[mape,error] = mls_vars_and_pca(df_sa_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_sa_pk = copy.deepcopy(df[(df['we_peak']==1) & (df['weekday']==5)])
print 'Saturday Peak (6PM-9PM)'
[mape,error] = mls_vars_and_pca(df_sa_pk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_opk = copy.deepcopy(df[(df['we_peak']==0) & (df['weekday']==6)])
print 'Sunday Off-Peak (10PM-5PM)'
[mape,error] = mls_vars_and_pca(df_su_opk)
mapes.append(mape)
lengths.append(len(error))
print

df_su_pk = copy.deepcopy(df[(df['we_peak']==1) & (df['weekday']==6)])
print 'Sunday Peak (6PM-9PM)'
[mape,error] = mls_vars_and_pca(df_su_pk)
mapes.append(mape)
lengths.append(len(error))
print

mape_string = str(sum(np.array(mapes)*np.array(lengths))/sum(np.array(lengths)))
print "Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:" + mape_string


Weekday Off-Peak (10PM-4PM)
MAPE ERROR: 0.0197179580009
MAX ERROR: 0.10918840255

Weekday Peak (9AM-9PM)
MAPE ERROR: 0.0358466295815
MAX ERROR: 0.20790944532

Saturday Off-Peak (10PM-5PM)
MAPE ERROR: 0.0306869945146
MAX ERROR: 0.518578532105

Saturday Peak (6PM-9PM)
MAPE ERROR: 0.0246837229489
MAX ERROR: 0.111974998615

Sunday Off-Peak (10PM-5PM)
MAPE ERROR: 0.0288028062331
MAX ERROR: 0.188969721996

Sunday Peak (6PM-9PM)
MAPE ERROR: 0.0226364104772
MAX ERROR: 0.0820130824161

Overall MAPE for Split Between Peak/Off Peak for Weekday/Saturday/Sunday:0.0285320819631
