Baseline Model - Oil and Gold

Baseline Model

  • 0.0 - Price remains constant as last month
  • 0.1 - Price is average of last k months
  • 0.2 - Price is calculated using a autoregressive model where the co-efficients are determined using PSO swarm algorithm.
  • 0.3 - Weighted Average of the last k months. Weight of the ith month in the history is (k-i+1)
  • 0.4 - Cubic Weighted Average of the last k months. Weight of the ith month in the history is (k-i+1)^3

In [35]:
#Import needed library
import numpy as np
from __future__ import print_function
import prettyplotlib as ppl
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from sklearn.metrics import mean_squared_error
from math import sqrt
from IPython.core.display import HTML
import brewer2mpl
import seaborn as sb
import scipy.stats as stats
%matplotlib inline

In [36]:
#Monthly data for past 30 years

#Oil and Gold
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CRUDE_OIL_WTI_US_ENERGY_Monthly_198601-201410.csv'
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/GOLD_Montly_198410_201410.csv'

#CPI - Inflation Adjustment
cpiDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CPI_USA_Monthly_198410_201410.csv'

In [37]:
#Inflation Adjusted
def adjustForInflation(quantity):
    return quantity * (dfCpi['CPI'][0]/dfCpi['CPI'])

Input: Select Oil or Gold (0 - Oil and 1 - Gold)


In [38]:
choice = 1
candidatesList = ['Oil_Value', 'Gold_Value']
kMonths = 3

In [39]:
#Read CSV to pandas data frame
if choice == 0:
    dfMaster = pd.read_csv(oilDataPath)
else:
    dfMaster = pd.read_csv(goldDataPath)
dfCpi = pd.read_csv(cpiDataPath)

#Adjust to Inflation
dfMaster[candidatesList[choice]] = adjustForInflation(dfMaster[candidatesList[choice]])

In [40]:
#slice only last 9 years for testing
dfMaster = dfMaster[:107]
dfMaster.head()


Out[40]:
Date Gold_Value
0 11/30/2014 1182.800000
1 10/31/2014 1164.300000
2 9/30/2014 1213.443814
3 8/31/2014 1283.534935
4 7/31/2014 1280.892486

In [41]:
#Naming the model
def getModelName(modelNo):
    if modelNo == "0.0":
        return "Price remains constant as last month"
    elif modelNo == "0.1":
        return "Price is average of last" +str(kMonths)+ " months"
    elif modelNo == "0.2":
        return "Price is calculated using a autoregressive model where the co-efficients are determined using PSO swarm algorithm."
    elif modelNo == "0.3":
        return "Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)"
    elif modelNo == "0.4":
        return "Cubic Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)^3"
    else:
        return "New Baseline Model, Yet to be defined"

In [42]:
#Model
def performPrediction(y,modelNo,k):
    n = len(y)
    pred_y = np.copy(y)
    if modelNo == "0.0":
        print ("Model 0.0")
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=y[i+1]
            else:
                pred_y[i]=y[i]
    elif modelNo == "0.1":
        print ("Model 0.1")
        for i in range(0,n,1):
            if (i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    pred_y[i]=pred_y[i] + y[i+j]
                pred_y[i]=pred_y[i]/k
            else:
                pred_y[i] = y[i]
    elif modelNo == "0.2":
        print ("Model 0.2")
        for i in range(0,n,1):
            if (i<n-k):
                pred_y[i] = 0.976*y[i+1] + 0.01373*y[i+2] + 0.1157
            else:
                pred_y[i] = y[i]
    elif modelNo == "0.3":
        print ("Model 0.3")
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    pred_y[i]=pred_y[i]+(y[i+j]*(k-j+1))
                pred_y[i]=(2*pred_y[i])/(k*(k+1))
            else:
                pred_y[i]=y[i]
    elif modelNo == "0.4":
        print ("Model 0.4")
        cubic = (k*(k+1))/2
        cubic = cubic**2
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    factor=k-j+1
                    factor=factor**3
                    pred_y[i]=pred_y[i]+(y[i+j]*factor)
                pred_y[i]=((pred_y[i])/cubic)
            else:
                pred_y[i]=y[i]
    else:
        print ("Wrong Model")
    return pred_y

Histogram Error Plot


In [43]:
#Plot the error histogram
colors = brewer2mpl.get_map('Set1', 'qualitative', 8).mpl_colors
def plotErrorHistogram(error,nBins,modelName):
    fig, axes = ppl.subplots(1, 1, figsize=(18,12))    
    axes.set_xlim(-0.3, 0.3)
    axes.set_ylim(0, 25)
    axes.set_ylabel('Error Frequency')
    axes.set_xlabel('Error')
    axes.set_title(modelName)
    sb.distplot(error, kde=False, fit=stats.norm, bins=nBins);
    ppl.hist(error, bins=nBins, color=colors[1], grid='y')
    plt.savefig(modelName+'.png')

Compute Errors


In [44]:
#Error Correction
def computeErrorCorrectionFactor(pred,yActual):
    err_corr = (np.mean((pred - yActual)))
    return err_corr

def computeCorrectedMean(newPredy,yActual):
    return np.mean((newPredy - yActual))

def computeCorrectedSD(newPredy,yActual):
    return round(np.std((newPredy-yActual)/yActual),5)

def computeNewPrediction(pred,errCorrection):
    newPredy = pred - errCorrection
    #print("Mean is " + str(computeCorrectedMean(new_pred,actual)))
    #print("Standard Deviation is " + str(computeCorrectedSD(new_pred,actual)))
    return newPredy

def computeRelativeError(newPredy, yActual):
    errRel = 100*(np.absolute(newPredy - yActual)/yActual)
    return errRel

def computerRMSError(newPredy, yActual):
    errRMSe = sqrt(mean_squared_error(yActual,newPredy))
    return round(errRMSe,2)

def computeAbsoluteError(newPredy, yActual):
    return np.absolute(newPredy - yActual)

def computeProbability(newPredy,yActual, newSD):
    pos = 0
    for i in range(0,len(newPredy)):
        if(np.absolute((newPredy[i]-yActual[i])/yActual[i]) <= np.absolute(newSD)):
            pos += 1
    conf = round((1. * pos/len(newPredy) ),5)
    return conf
    
def computeErrors(yPred, yActual,modelNo):
    errCorr = computeErrorCorrectionFactor(yPred,yActual)
    newPredy = computeNewPrediction(yPred,errCorr)
    newMean = computeCorrectedMean(newPredy,yActual)
    newSD = computeCorrectedSD(newPredy, yActual)
    errRel = computeRelativeError(newPredy, yActual)
    errRMS = computerRMSError(newPredy, yActual)
    errAbs = computeAbsoluteError(newPredy, yActual)
    prob = computeProbability(newPredy,yActual,newSD)
    #Mean Computations
    meanErrRel = np.mean(errRel)
    meanErrRel = round(meanErrRel,2)
    meanErrAbs = np.mean(errAbs)
    meanErrAbs = round(meanErrAbs,2)
    
    #Plot Relative Error Distributions
    #Uncomment the below line during the report
    plotErrorHistogram(((newPredy-yActual)/yActual),16,getModelName(modelNo))
    
    dfErr = DataFrame(data=None, columns=['Model','Mean Relative Error', 'Mean Absolute Error', \
                                         'RMS Error', 'Standard Deviation', 'Probability'])
    dfErr.loc[0,'Model'] = getModelName(modelNo)
    dfErr.loc[0,'Mean Relative Error'] = meanErrRel
    dfErr.loc[0,'Mean Absolute Error'] = meanErrAbs
    dfErr.loc[0,'RMS Error'] = errRMS
    dfErr.loc[0,'Standard Deviation'] = newSD
    dfErr.loc[0,'Probability'] = prob
    
    return dfErr

Baseline Models


In [45]:
def baselineModel(modelNo):
    x = dfMaster['Date']
    yActual = np.array(dfMaster[candidatesList[choice]])
    yPred = performPrediction(yActual,modelNo,kMonths)
    yPredNew = predictNextMonth(yActual, modelNo)
    print("The predicted value for next month is "+str(yPredNew[0]))
    return computeErrors(yPred, yActual,modelNo)

Predicting Next Month Price


In [46]:
def computePrediction(yPredNew, modelNo, k):
    if modelNo == "0.0":
        yPredNew[0] = yPredNew[1]
    elif modelNo == "0.1":
        sumi = 0.
        for i in range(1,k+1,1):
            sumi = sumi + yPredNew[i]
        yPredNew[0] = round(sumi/k,2)
    elif modelNo == "0.2":
        yPredNew[0] = 0.976*yPredNew[1] + 0.01373*yPredNew[2] + 0.1157
    elif modelNo == "0.3":
        wsum = 0.
        for i in range(0,k+1,1):
            wsum = wsum + (yPredNew[i+1] * (k-i))
        denom = (k*(k+1))/2
        yPredNew[0] = round(wsum/denom,2)
    elif modelNo == "0.4":
        csum = 0.
        for i in range(0,k+1,1):
            factor = (k-i) ** 3
            csum = csum + (yPredNew[i+1] * factor)
        denom = (k*(k+1))/2
        denom = denom ** 2
        yPredNew[0] = round(csum/denom,2)
    return yPredNew

def predictNextMonth(yActual, modelNo):
    yPredNew = np.copy(yActual)
    yPredNew = np.insert(yPredNew,0,0)
    #compute for current month
    yPredNew = computePrediction(yPredNew, modelNo, kMonths)
    yPredNew = np.insert(yPredNew,0,0)
    #compute for next month
    yPredNew = computePrediction(yPredNew, modelNo, kMonths)    
    return yPredNew

Run the model


In [47]:
#a1,a2,a3,a4,a5,a6 = baselineModel('0.0')
baselineModel('0.0')


Model 0.0
The predicted value for next month is 1182.8
Out[47]:
Model Mean Relative Error Mean Absolute Error RMS Error Standard Deviation Probability
0 Price remains constant as last month 4.61 56.11 75.36 0.06038 0.71963

In [48]:
baselineModel('0.1')


Model 0.1
The predicted value for next month is 1177.98
Out[48]:
Model Mean Relative Error Mean Absolute Error RMS Error Standard Deviation Probability
0 Price is average of last3 months 4.78 58.6 80.12 0.0646 0.71963

In [49]:
baselineModel('0.2')


Model 0.2
The predicted value for next month is 1158.77753886
Out[49]:
Model Mean Relative Error Mean Absolute Error RMS Error Standard Deviation Probability
0 Price is calculated using a autoregressive mod... 4.62 56.12 74.68 0.05992 0.72897

In [50]:
baselineModel('0.3')


Model 0.3
The predicted value for next month is 1179.19
Out[50]:
Model Mean Relative Error Mean Absolute Error RMS Error Standard Deviation Probability
0 Weighted Average of the last 3 months. Weight ... 4.54 55.68 75.46 0.06062 0.72897

In [51]:
baselineModel('0.4')


Model 0.4
The predicted value for next month is 1179.84
Out[51]:
Model Mean Relative Error Mean Absolute Error RMS Error Standard Deviation Probability
0 Cubic Weighted Average of the last 3 months. W... 4.56 55.78 73.27 0.05869 0.69159