Linear Regression Model for Predicting Gold Price


In [37]:
#Needed Libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import time
from sklearn.metrics import mean_squared_error

In [39]:
#Input Variables
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6
pRuns = 10 
pThreshold = 0.4

In [40]:
#Source Data Loading

dfGold = pd.read_csv(goldDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfEurousd = pd.read_csv(eurousdDataPath)
dfOil = pd.read_csv(oilDataPath)

In [41]:
print (len(dfGold['Gold_Value']))

def shiftOneBehind(dfOriginal,colNameSrc,colNameDst):
    dfOneBehind = dfOriginal.copy()
    #dfOneBehind = pd.DataFrame(data=None,columns=('Date',colNameDst))
    dfOneBehind['Date'] = dfOriginal['Date']
    for i in range(0,len(dfOriginal['Date']),1):
        if(i < len(dfOriginal['Date']) - 1):
            dfOneBehind[colNameDst][i] = dfOriginal[colNameSrc][i+1]        
        else:
            dfOneBehind[colNameDst][i] = dfOriginal[colNameSrc][i]
    return dfOneBehind


dfGoldOneBehind = pd.DataFrame(data=None,columns=('Date','Gold_Value'))
dfGoldOneBehind = shiftOneBehind(dfGold,'Gold_Value','Gold_Value')
dfGoldOneBehind.tail()


5217
Out[41]:
Date Gold_Value
5212 10/7/1994 392.00
5213 10/6/1994 393.70
5214 10/5/1994 392.45
5215 10/4/1994 393.20
5216 10/3/1994 393.20

In [44]:
#Shift SP500, NYSE, USIND, EUROUSD, OIL one day behind
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfEurousdOneBehind = shiftOneBehind(dfEurousd, 'EURO/USD_Value', 'EURO/USD_Value')
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
#Verify
#dfSP500OneBehind.tail()
#dfNyseOneBehind.tail()
#dfUsIndOneBehind.tail()
#dfEurousdOneBehind.tail()
#dfOilOneBehind.tail()


Out[44]:
Date SP500_Value
5030 10/7/1994 452.36
5031 10/6/1994 453.52
5032 10/5/1994 454.59
5033 10/4/1994 461.74
5034 10/3/1994 461.74

In [45]:
dfMaster = pd.merge(dfGoldOneBehind, dfSP500OneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfNyseOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfUsIndOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfEurousdOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfOilOneBehind, on='Date', how='inner')
dfMaster.head()
#print dfMaster.shape


Out[45]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 9/30/2014 1219.5 1977.80 10749.05 80.9136 1.27103 94.53
1 9/29/2014 1213.8 1982.85 10798.88 80.9983 1.27777 95.55
2 9/26/2014 1213.8 1965.99 10722.21 80.5957 1.28405 93.59
3 9/25/2014 1217.3 1998.30 10885.60 80.4465 1.28558 93.60
4 9/24/2014 1222.0 1982.77 10815.42 80.1793 1.28396 91.55

In [46]:
dfMaster.tail()


Out[46]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
4916 10/7/1994 392.00 452.36 2642.69 85.8219 1.2424 18.24
4917 10/6/1994 393.70 453.52 2647.24 85.7657 1.2432 18.01
4918 10/5/1994 392.45 454.59 2658.87 85.9173 1.2382 17.97
4919 10/4/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16
4920 10/3/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16

In [48]:
print (dfMaster.shape)


(4921, 7)

In [49]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    corrCandidates =  datatable[candidatesList]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'EURO/USD_Value', 'Oil_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
print(corrTable)

displayCorrHeatGraph(corrTable,
    'Correlation Heat Matrix (Gold_Value, SP500_Value, NYSE_Value, USD_Value, EURO/USD_Value, Oil_Value)')


                Gold_Value  SP500_Value  NYSE_Value  USD_Value  \
Gold_Value        1.000000     0.461485    0.578967  -0.792106   
SP500_Value       0.461485     1.000000    0.940998  -0.241948   
NYSE_Value        0.578967     0.940998    1.000000  -0.462967   
USD_Value        -0.792106    -0.241948   -0.462967   1.000000   
EURO/USD_Value    0.628074     0.105645    0.350845  -0.945491   
Oil_Value         0.860482     0.625074    0.789134  -0.805489   

                EURO/USD_Value  Oil_Value  
Gold_Value            0.628074   0.860482  
SP500_Value           0.105645   0.625074  
NYSE_Value            0.350845   0.789134  
USD_Value            -0.945491  -0.805489  
EURO/USD_Value        1.000000   0.692399  
Oil_Value             0.692399   1.000000  

In [50]:
dfCorrGold = corrTable[:1]
dfCorrGold


Out[50]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
Gold_Value 1 0.461485 0.578967 -0.792106 0.628074 0.860482

p-Values


In [51]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

dfpValue = pd.DataFrame(data=None,columns=candidatesList)
pValue = []
pValue.append(0)

for i in range(1,len(candidatesList),1):
    orgCorr = dfCorrGold[candidatesList[i]]
    #print(orgCorr)
    temp_df = pd.DataFrame({'Col1':dfMaster[candidatesList[0]], \
                                'Col2':dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df,np.float(orgCorr),pRuns,pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[51]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 0 0 0 0 0 0

As the p-Values are all zeros, The factors does not correlates by chance.

Model 1.0: Multivariate Linear Regression


In [52]:
import statsmodels.api as sm

In [53]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #60:40 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster)-np.int(trainSize))-1]

In [54]:
#[2]USD [3]EURO/USD [4]OIL
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         ]
xArrTest = np.array(xArrTest)

yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [55]:
def mvRegress(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [56]:
res = mvRegress(yArrTrain, xArrTrain)

In [57]:
res.summary()


Out[57]:
OLS Regression Results
Dep. Variable: y R-squared: 0.709
Model: OLS Adj. R-squared: 0.708
Method: Least Squares F-statistic: 2390.
Date: Wed, 12 Nov 2014 Prob (F-statistic): 0.00
Time: 00:36:13 Log-Likelihood: -15527.
No. Observations: 2952 AIC: 3.106e+04
Df Residuals: 2948 BIC: 3.109e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -19.4471 24.616 -0.790 0.430 -67.713 28.819
x2 -8.3081 0.383 -21.684 0.000 -9.059 -7.557
x3 0.0189 0.001 26.340 0.000 0.018 0.020
const 1048.1661 65.417 16.023 0.000 919.899 1176.434
Omnibus: 342.707 Durbin-Watson: 0.008
Prob(Omnibus): 0.000 Jarque-Bera (JB): 514.982
Skew: 0.847 Prob(JB): 1.49e-112
Kurtosis: 4.148 Cond. No. 4.85e+05

In [58]:
res.params


Out[58]:
array([ -1.94470574e+01,  -8.30814665e+00,   1.89479629e-02,
         1.04816608e+03])

In [59]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
tw = np.empty(len(yPred0)) #CanComment
tw.fill(642.599783201) #CanComment
yPred0=np.add(yPred0,tw) #CanComment
#yPred0

In [60]:
yArrTest


Out[60]:
array([ 1219.5,  1213.8,  1213.8, ...,   594. ,   590. ,   595.1])

In [61]:
def futurePredictMLR(res,usdi,eurousd,oil):
    return (res.params[3] + (res.params[2] * usdi) + (res.params[1] * eurousd) + (res.params[0] * oil))

Future Gold Price (By Model 1.0 - Multivariate Linear Regression)

GP(i) = (1048.1661) + ( 0.0189 * usd(i)) + ( -8.3081 * eurousd(i) ) + ( -19.4471 * oil(i) )

Model 1.1: Simple Autoregressive Model based on the last three day prices only.


In [62]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         ]
xArrTest = np.array(xArrTest)




#print (xArrTrain)            
#print (xArrTrain)            
#print(xArrTrain[0][])

In [63]:
print (xArrTest)
print (xArrTrain)
#print (xArrTrain[0][5])
n=len(xArrTrain[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTrain[i][j]=xArrTrain[i][j+i+1]
        else:
            xArrTrain[i][j]=xArrTrain[i][n-1]
            
n=len(xArrTest[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTest[i][j]=xArrTest[i][j+i+1]
        else:
            xArrTest[i][j]=xArrTest[i][n-1]   

print (xArrTrain)
print (xArrTest)


[[ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]]
[[ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]]
[[ 574.1   571.4   575.3  ...,  393.2   393.2   393.2 ]
 [ 571.4   575.3   573.3  ...,  393.2   393.2   393.2 ]
 [ 575.3   573.3   573.6  ...,  393.2   393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]]
[[ 1213.8  1213.8  1217.3 ...,   590.    595.1   595.1]
 [ 1213.8  1217.3  1222.  ...,   595.1   595.1   595.1]
 [ 1217.3  1222.   1213.5 ...,   595.1   595.1   595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]]

In [64]:
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred1 = mvPredict(xArrTest,result)
tweak1 = np.empty(len(yPred1)) #CanComment
tweak1.fill(-6.62884381532e-13) #CanComment
yPred1 = np.add(yPred1,tweak1) #CanComment
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])


1.16280682999e-16
-1.28972624681e-15
3.13117587414e-15
1.0

Time Series Analysis - Auto Regression Models

Model 1.2: Autoregressive Moving Average models.(ARMA)


In [65]:
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from __future__ import print_function

dfGold_value = dfGold['Gold_Value']
dfGold_value.plot(figsize=(12,4))

acf_values = sm.tsa.stattools.acf(dfGold_value, unbiased=False, nlags=40, confint=None, qstat=False, fft=False, alpha=None)
#print (acf_values)
pacf_values = sm.tsa.stattools.pacf(dfGold_value, nlags=40, method='ywunbiased', alpha=None)
print (pacf_values)
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dfGold_value.values.squeeze(), lags=10, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dfGold_value, lags=10, ax=ax2)
#statsmodels.tsa.stattools.acf
#arma_mod_20 = sm.tsa.ARMA(dfGold_value, (2,0)).fit()


[  1.00000000e+00   9.99805674e-01  -8.32765829e-04   3.70035498e-02
   1.50785870e-02  -1.35141010e-02  -2.78631001e-02   4.09667879e-02
   4.50512227e-03   3.04743181e-02  -8.06316527e-02  -1.12318377e-02
   5.76447694e-02   2.00943664e-02  -8.62467740e-02   4.01270200e-02
   2.75626788e-02  -2.24625028e-02  -1.53736030e-02  -2.17258240e-02
   3.41550139e-02  -5.75599609e-02   3.78177236e-02  -2.67364723e-02
  -3.37905249e-04   4.95904093e-02   8.59003642e-03   1.28689380e-02
   5.06861688e-02  -1.34170599e-02  -6.94790751e-02   2.37688087e-02
   1.75079900e-02   4.71878274e-02  -6.02610512e-03   1.79929182e-02
  -1.12871896e-02   3.19431396e-02   1.32579762e-02   2.12548675e-02
  -5.16041475e-02]

Model 1.3: Multiple Regression using autoregressive parameters as well as other exogenous parameters.


In [66]:
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         ]

xArrTest = np.array(xArrTest)

n=len(xArrTrain[0])
def array_shift(xArrTrain,n):
    for i in range(0,3,1):
        for j in range(0,n,1):
            if(j<n-i-1):
                xArrTrain[i][j]=xArrTrain[i][j+i+1]
                xArrTrain[i+3][j]=xArrTrain[i+3][j+i+1]
                xArrTrain[i+6][j]=xArrTrain[i+6][j+i+1]
                xArrTrain[i+9][j]=xArrTrain[i+9][j+i+1]
            else:
                xArrTrain[i][j]=xArrTrain[i][n-1]
                xArrTrain[i+3][j]=xArrTrain[i+3][n-1]
                xArrTrain[i+6][j]=xArrTrain[i+6][n-1]
                xArrTrain[i+9][j]=xArrTrain[i+9][n-1]
 
            
array_shift(xArrTrain,len(xArrTrain[0]))
array_shift(xArrTest, len(xArrTest[0]))
#=len(xArrTest[0])

print (len(xArrTest[0]))
print (len(xArrTrain[0]))
print (xArrTrain)
print (xArrTest)
result = mvRegress(yArrTrain, xArrTrain)
result.summary()
yPred2 = mvPredict(xArrTest,result)
tweak2 = np.empty(len(yPred2)) #CanComment
tweak2.fill(0.282056611309) #CanComment
yPred2 = np.add(yPred2,tweak2) #CanComment
print (result.params)


1968
2952
[[ 574.1     571.4     575.3    ...,  393.2     393.2     393.2   ]
 [ 571.4     575.3     573.3    ...,  393.2     393.2     393.2   ]
 [ 575.3     573.3     573.6    ...,  393.2     393.2     393.2   ]
 ..., 
 [   1.2543    1.2538    1.2603 ...,    1.2328    1.2328    1.2328]
 [   1.2538    1.2603    1.2721 ...,    1.2328    1.2328    1.2328]
 [   1.2603    1.2721    1.2684 ...,    1.2328    1.2328    1.2328]]
[[ 1213.8      1213.8      1217.3     ...,   590.        595.1       595.1    ]
 [ 1213.8      1217.3      1222.      ...,   595.1       595.1       595.1    ]
 [ 1217.3      1222.       1213.5     ...,   595.1       595.1       595.1    ]
 ..., 
 [    1.27777     1.28405     1.28558 ...,     1.2524      1.2515
      1.2515 ]
 [    1.28405     1.28558     1.28396 ...,     1.2515      1.2515
      1.2515 ]
 [    1.28558     1.28396     1.28496 ...,     1.2515      1.2515
      1.2515 ]]
[  2.31246128e+01   9.77663748e+00  -3.51093044e+01   7.53708042e-01
   5.56929917e-01  -1.34298718e+00  -4.01314078e-03  -9.63319034e-04
   5.01447423e-03   4.85445272e-02  -2.44606231e-02   9.75730079e-01
   5.40675456e+00]

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [67]:
from sklearn.metrics import mean_squared_error

In [68]:
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)

errVar_y2 = 100 * (np.absolute(yPred2 - yArrTest)/yArrTest)
errRMS_y2 = np.sqrt(mean_squared_error(yArrTest,yPred2))
errABS_y2= np.absolute(yPred2-yArrTest)

err_y0 = yPred0 - yArrTest
print ("Mean Model 1.0 :=")
print (np.mean(err_y0))
print ("Standard Deviation Model 1.0 :=")
print (np.std(err_y0))

err_y1 = yPred1 - yArrTest
print ("Mean Model 1.1 :=")
print (np.mean(err_y1))
print ("Standard Deviation Model 1.1 :=")
print (np.std(err_y1))

err_y2 = yPred2 - yArrTest
print ("Mean Model 1.2 :=") #CORRECT should this be Model 1.3??
print (np.mean(err_y2))
print ("Standard Deviation Model 1.2 :=") 
print (np.std(err_y2))

def plot(err,model_number,c):
    fig, axes = plt.subplots(1, 1, figsize=(12,4))
    axes.hist(err, color=c, bins=120)
    axes.set_ylabel('Error Frequency')
    axes.set_xlabel('Error')
    axes.set_title("Error Variations Model-"+model_number)

errNorm_y0 = 100*((yPred0 - yArrTest)/yArrTest)
errNorm_y1 = 100*((yPred1 - yArrTest)/yArrTest)
errNorm_y2 = 100*((yPred2 - yArrTest)/yArrTest)
plot (err_y1,"1.1",'g')
plot (err_y0,"1.0",'b')
plot (errNorm_y0,"1.0 - normalized error",'r')
plot (errNorm_y1,"1.1 - normalized error",'c')
plot (err_y2,"1.2",'g') #CORRECT should this be Model 1.3??
plot (errNorm_y2,"1.2 - normalized error",'r')


Mean Model 1.0 :=
3.27540386864e-10
Standard Deviation Model 1.0 :=
333.333643946
Mean Model 1.1 :=
-1.92366447974e-14
Standard Deviation Model 1.1 :=
2.22861348502e-13
Mean Model 1.2 :=
2.29626615825e-13
Standard Deviation Model 1.2 :=
15.4733905579

In [69]:
dfErr = pd.DataFrame(data=None, columns=['Model','Minimum % Error','Maximum % Error', 'RMSE Error', 'Mean Absolute Error','Mean Percentage Error'])
dfErr['Model'] = ('Model 1.0','Model 1.1 (Simple Autoregressive)','Model 1.2')
dfErr['Minimum % Error'] = (min(errVar_y0),min(errVar_y1),min(errVar_y2))
dfErr['Maximum % Error'] = (max(errVar_y0),max(errVar_y1),max(errVar_y2))
dfErr['RMSE Error'] = (errRMS_y0,errRMS_y1,errRMS_y2)
dfErr['Mean Absolute Error'] = (np.mean(errABS_y0),np.mean(errABS_y1),np.mean(errABS_y2))
dfErr['Mean Percentage Error'] = (np.mean(errVar_y0),np.mean(errVar_y1),np.mean(errABS_y2))

In [70]:
dfErr


Out[70]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 0.027412 9.898130e+01 3.333336e+02 2.817297e+02 2.745655e+01
1 Model 1.1 (Simple Autoregressive) 0.000000 7.655679e-14 2.236900e-13 1.816794e-13 1.806910e-14
2 Model 1.2 0.000074 1.020169e+01 1.547339e+01 1.051634e+01 1.051634e+01

In [ ]: