Baseline Model - Oil and Gold

Baseline Model

0.0 - Price remains constant as last month
0.1 - Price is average of last k months
0.2 - Price is calculated using a autoregressive model where the co-efficients are determined using PSO swarm algorithm.
0.3 - Weighted Average of the last k months. Weight of the ith month in the history is (k-i+1)
0.4 - Cubic Weighted Average of the last k months. Weight of the ith month in the history is (k-i+1)^3



In [35]:

    
#Import needed library
import numpy as np
from __future__ import print_function
import prettyplotlib as ppl
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from sklearn.metrics import mean_squared_error
from math import sqrt
from IPython.core.display import HTML
import brewer2mpl
import seaborn as sb
import scipy.stats as stats
%matplotlib inline



In [36]:

    
#Monthly data for past 30 years

#Oil and Gold
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CRUDE_OIL_WTI_US_ENERGY_Monthly_198601-201410.csv'
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/GOLD_Montly_198410_201410.csv'

#CPI - Inflation Adjustment
cpiDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CPI_USA_Monthly_198410_201410.csv'



In [37]:

    
#Inflation Adjusted
def adjustForInflation(quantity):
    return quantity * (dfCpi['CPI'][0]/dfCpi['CPI'])

Input: Select Oil or Gold (0 - Oil and 1 - Gold)



In [38]:

    
choice = 1
candidatesList = ['Oil_Value', 'Gold_Value']
kMonths = 3



In [39]:

    
#Read CSV to pandas data frame
if choice == 0:
    dfMaster = pd.read_csv(oilDataPath)
else:
    dfMaster = pd.read_csv(goldDataPath)
dfCpi = pd.read_csv(cpiDataPath)

#Adjust to Inflation
dfMaster[candidatesList[choice]] = adjustForInflation(dfMaster[candidatesList[choice]])



In [40]:

    
#slice only last 9 years for testing
dfMaster = dfMaster[:107]
dfMaster.head()









    Out[40]:






  
    
      
      Date
      Gold_Value
    
  
  
    
      0
       11/30/2014
       1182.800000
    
    
      1
       10/31/2014
       1164.300000
    
    
      2
        9/30/2014
       1213.443814
    
    
      3
        8/31/2014
       1283.534935
    
    
      4
        7/31/2014
       1280.892486



In [41]:

    
#Naming the model
def getModelName(modelNo):
    if modelNo == "0.0":
        return "Price remains constant as last month"
    elif modelNo == "0.1":
        return "Price is average of last" +str(kMonths)+ " months"
    elif modelNo == "0.2":
        return "Price is calculated using a autoregressive model where the co-efficients are determined using PSO swarm algorithm."
    elif modelNo == "0.3":
        return "Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)"
    elif modelNo == "0.4":
        return "Cubic Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)^3"
    else:
        return "New Baseline Model, Yet to be defined"



In [42]:

    
#Model
def performPrediction(y,modelNo,k):
    n = len(y)
    pred_y = np.copy(y)
    if modelNo == "0.0":
        print ("Model 0.0")
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=y[i+1]
            else:
                pred_y[i]=y[i]
    elif modelNo == "0.1":
        print ("Model 0.1")
        for i in range(0,n,1):
            if (i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    pred_y[i]=pred_y[i] + y[i+j]
                pred_y[i]=pred_y[i]/k
            else:
                pred_y[i] = y[i]
    elif modelNo == "0.2":
        print ("Model 0.2")
        for i in range(0,n,1):
            if (i<n-k):
                pred_y[i] = 0.976*y[i+1] + 0.01373*y[i+2] + 0.1157
            else:
                pred_y[i] = y[i]
    elif modelNo == "0.3":
        print ("Model 0.3")
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    pred_y[i]=pred_y[i]+(y[i+j]*(k-j+1))
                pred_y[i]=(2*pred_y[i])/(k*(k+1))
            else:
                pred_y[i]=y[i]
    elif modelNo == "0.4":
        print ("Model 0.4")
        cubic = (k*(k+1))/2
        cubic = cubic**2
        for i in range(0,n,1):
            if(i<n-k):
                pred_y[i]=0
                for j in range(1,k+1,1):
                    factor=k-j+1
                    factor=factor**3
                    pred_y[i]=pred_y[i]+(y[i+j]*factor)
                pred_y[i]=((pred_y[i])/cubic)
            else:
                pred_y[i]=y[i]
    else:
        print ("Wrong Model")
    return pred_y

Histogram Error Plot



In [43]:

    
#Plot the error histogram
colors = brewer2mpl.get_map('Set1', 'qualitative', 8).mpl_colors
def plotErrorHistogram(error,nBins,modelName):
    fig, axes = ppl.subplots(1, 1, figsize=(18,12))    
    axes.set_xlim(-0.3, 0.3)
    axes.set_ylim(0, 25)
    axes.set_ylabel('Error Frequency')
    axes.set_xlabel('Error')
    axes.set_title(modelName)
    sb.distplot(error, kde=False, fit=stats.norm, bins=nBins);
    ppl.hist(error, bins=nBins, color=colors[1], grid='y')
    plt.savefig(modelName+'.png')

Compute Errors



In [44]:

    
#Error Correction
def computeErrorCorrectionFactor(pred,yActual):
    err_corr = (np.mean((pred - yActual)))
    return err_corr

def computeCorrectedMean(newPredy,yActual):
    return np.mean((newPredy - yActual))

def computeCorrectedSD(newPredy,yActual):
    return round(np.std((newPredy-yActual)/yActual),5)

def computeNewPrediction(pred,errCorrection):
    newPredy = pred - errCorrection
    #print("Mean is " + str(computeCorrectedMean(new_pred,actual)))
    #print("Standard Deviation is " + str(computeCorrectedSD(new_pred,actual)))
    return newPredy

def computeRelativeError(newPredy, yActual):
    errRel = 100*(np.absolute(newPredy - yActual)/yActual)
    return errRel

def computerRMSError(newPredy, yActual):
    errRMSe = sqrt(mean_squared_error(yActual,newPredy))
    return round(errRMSe,2)

def computeAbsoluteError(newPredy, yActual):
    return np.absolute(newPredy - yActual)

def computeProbability(newPredy,yActual, newSD):
    pos = 0
    for i in range(0,len(newPredy)):
        if(np.absolute((newPredy[i]-yActual[i])/yActual[i]) <= np.absolute(newSD)):
            pos += 1
    conf = round((1. * pos/len(newPredy) ),5)
    return conf
    
def computeErrors(yPred, yActual,modelNo):
    errCorr = computeErrorCorrectionFactor(yPred,yActual)
    newPredy = computeNewPrediction(yPred,errCorr)
    newMean = computeCorrectedMean(newPredy,yActual)
    newSD = computeCorrectedSD(newPredy, yActual)
    errRel = computeRelativeError(newPredy, yActual)
    errRMS = computerRMSError(newPredy, yActual)
    errAbs = computeAbsoluteError(newPredy, yActual)
    prob = computeProbability(newPredy,yActual,newSD)
    #Mean Computations
    meanErrRel = np.mean(errRel)
    meanErrRel = round(meanErrRel,2)
    meanErrAbs = np.mean(errAbs)
    meanErrAbs = round(meanErrAbs,2)
    
    #Plot Relative Error Distributions
    #Uncomment the below line during the report
    plotErrorHistogram(((newPredy-yActual)/yActual),16,getModelName(modelNo))
    
    dfErr = DataFrame(data=None, columns=['Model','Mean Relative Error', 'Mean Absolute Error', \
                                         'RMS Error', 'Standard Deviation', 'Probability'])
    dfErr.loc[0,'Model'] = getModelName(modelNo)
    dfErr.loc[0,'Mean Relative Error'] = meanErrRel
    dfErr.loc[0,'Mean Absolute Error'] = meanErrAbs
    dfErr.loc[0,'RMS Error'] = errRMS
    dfErr.loc[0,'Standard Deviation'] = newSD
    dfErr.loc[0,'Probability'] = prob
    
    return dfErr

Baseline Models



In [45]:

    
def baselineModel(modelNo):
    x = dfMaster['Date']
    yActual = np.array(dfMaster[candidatesList[choice]])
    yPred = performPrediction(yActual,modelNo,kMonths)
    yPredNew = predictNextMonth(yActual, modelNo)
    print("The predicted value for next month is "+str(yPredNew[0]))
    return computeErrors(yPred, yActual,modelNo)

Predicting Next Month Price



In [46]:

    
def computePrediction(yPredNew, modelNo, k):
    if modelNo == "0.0":
        yPredNew[0] = yPredNew[1]
    elif modelNo == "0.1":
        sumi = 0.
        for i in range(1,k+1,1):
            sumi = sumi + yPredNew[i]
        yPredNew[0] = round(sumi/k,2)
    elif modelNo == "0.2":
        yPredNew[0] = 0.976*yPredNew[1] + 0.01373*yPredNew[2] + 0.1157
    elif modelNo == "0.3":
        wsum = 0.
        for i in range(0,k+1,1):
            wsum = wsum + (yPredNew[i+1] * (k-i))
        denom = (k*(k+1))/2
        yPredNew[0] = round(wsum/denom,2)
    elif modelNo == "0.4":
        csum = 0.
        for i in range(0,k+1,1):
            factor = (k-i) ** 3
            csum = csum + (yPredNew[i+1] * factor)
        denom = (k*(k+1))/2
        denom = denom ** 2
        yPredNew[0] = round(csum/denom,2)
    return yPredNew

def predictNextMonth(yActual, modelNo):
    yPredNew = np.copy(yActual)
    yPredNew = np.insert(yPredNew,0,0)
    #compute for current month
    yPredNew = computePrediction(yPredNew, modelNo, kMonths)
    yPredNew = np.insert(yPredNew,0,0)
    #compute for next month
    yPredNew = computePrediction(yPredNew, modelNo, kMonths)    
    return yPredNew

Run the model



In [47]:

    
#a1,a2,a3,a4,a5,a6 = baselineModel('0.0')
baselineModel('0.0')









    



Model 0.0
The predicted value for next month is 1182.8






    Out[47]:






  
    
      
      Model
      Mean Relative Error
      Mean Absolute Error
      RMS Error
      Standard Deviation
      Probability
    
  
  
    
      0
       Price remains constant as last month
       4.61
       56.11
       75.36
       0.06038
       0.71963



In [48]:

    
baselineModel('0.1')









    



Model 0.1
The predicted value for next month is 1177.98






    Out[48]:






  
    
      
      Model
      Mean Relative Error
      Mean Absolute Error
      RMS Error
      Standard Deviation
      Probability
    
  
  
    
      0
       Price is average of last3 months
       4.78
       58.6
       80.12
       0.0646
       0.71963



In [49]:

    
baselineModel('0.2')









    



Model 0.2
The predicted value for next month is 1158.77753886






    Out[49]:






  
    
      
      Model
      Mean Relative Error
      Mean Absolute Error
      RMS Error
      Standard Deviation
      Probability
    
  
  
    
      0
       Price is calculated using a autoregressive mod...
       4.62
       56.12
       74.68
       0.05992
       0.72897



In [50]:

    
baselineModel('0.3')









    



Model 0.3
The predicted value for next month is 1179.19






    Out[50]:






  
    
      
      Model
      Mean Relative Error
      Mean Absolute Error
      RMS Error
      Standard Deviation
      Probability
    
  
  
    
      0
       Weighted Average of the last 3 months. Weight ...
       4.54
       55.68
       75.46
       0.06062
       0.72897



In [51]:

    
baselineModel('0.4')









    



Model 0.4
The predicted value for next month is 1179.84






    Out[51]:






  
    
      
      Model
      Mean Relative Error
      Mean Absolute Error
      RMS Error
      Standard Deviation
      Probability
    
  
  
    
      0
       Cubic Weighted Average of the last 3 months. W...
       4.56
       55.78
       73.27
       0.05869
       0.69159

	Date	Gold_Value
0	11/30/2014	1182.800000
1	10/31/2014	1164.300000
2	9/30/2014	1213.443814
3	8/31/2014	1283.534935
4	7/31/2014	1280.892486