In [35]:
#Import needed library
import numpy as np
from __future__ import print_function
import prettyplotlib as ppl
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from sklearn.metrics import mean_squared_error
from math import sqrt
from IPython.core.display import HTML
import brewer2mpl
import seaborn as sb
import scipy.stats as stats
%matplotlib inline
In [36]:
#Monthly data for past 30 years
#Oil and Gold
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CRUDE_OIL_WTI_US_ENERGY_Monthly_198601-201410.csv'
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/GOLD_Montly_198410_201410.csv'
#CPI - Inflation Adjustment
cpiDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CPI_USA_Monthly_198410_201410.csv'
In [37]:
#Inflation Adjusted
def adjustForInflation(quantity):
return quantity * (dfCpi['CPI'][0]/dfCpi['CPI'])
In [38]:
choice = 1
candidatesList = ['Oil_Value', 'Gold_Value']
kMonths = 3
In [39]:
#Read CSV to pandas data frame
if choice == 0:
dfMaster = pd.read_csv(oilDataPath)
else:
dfMaster = pd.read_csv(goldDataPath)
dfCpi = pd.read_csv(cpiDataPath)
#Adjust to Inflation
dfMaster[candidatesList[choice]] = adjustForInflation(dfMaster[candidatesList[choice]])
In [40]:
#slice only last 9 years for testing
dfMaster = dfMaster[:107]
dfMaster.head()
Out[40]:
In [41]:
#Naming the model
def getModelName(modelNo):
if modelNo == "0.0":
return "Price remains constant as last month"
elif modelNo == "0.1":
return "Price is average of last" +str(kMonths)+ " months"
elif modelNo == "0.2":
return "Price is calculated using a autoregressive model where the co-efficients are determined using PSO swarm algorithm."
elif modelNo == "0.3":
return "Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)"
elif modelNo == "0.4":
return "Cubic Weighted Average of the last "+str(kMonths)+" months. Weight of the ith month in the history is ("+str(kMonths)+"-i+1)^3"
else:
return "New Baseline Model, Yet to be defined"
In [42]:
#Model
def performPrediction(y,modelNo,k):
n = len(y)
pred_y = np.copy(y)
if modelNo == "0.0":
print ("Model 0.0")
for i in range(0,n,1):
if(i<n-k):
pred_y[i]=y[i+1]
else:
pred_y[i]=y[i]
elif modelNo == "0.1":
print ("Model 0.1")
for i in range(0,n,1):
if (i<n-k):
pred_y[i]=0
for j in range(1,k+1,1):
pred_y[i]=pred_y[i] + y[i+j]
pred_y[i]=pred_y[i]/k
else:
pred_y[i] = y[i]
elif modelNo == "0.2":
print ("Model 0.2")
for i in range(0,n,1):
if (i<n-k):
pred_y[i] = 0.976*y[i+1] + 0.01373*y[i+2] + 0.1157
else:
pred_y[i] = y[i]
elif modelNo == "0.3":
print ("Model 0.3")
for i in range(0,n,1):
if(i<n-k):
pred_y[i]=0
for j in range(1,k+1,1):
pred_y[i]=pred_y[i]+(y[i+j]*(k-j+1))
pred_y[i]=(2*pred_y[i])/(k*(k+1))
else:
pred_y[i]=y[i]
elif modelNo == "0.4":
print ("Model 0.4")
cubic = (k*(k+1))/2
cubic = cubic**2
for i in range(0,n,1):
if(i<n-k):
pred_y[i]=0
for j in range(1,k+1,1):
factor=k-j+1
factor=factor**3
pred_y[i]=pred_y[i]+(y[i+j]*factor)
pred_y[i]=((pred_y[i])/cubic)
else:
pred_y[i]=y[i]
else:
print ("Wrong Model")
return pred_y
In [43]:
#Plot the error histogram
colors = brewer2mpl.get_map('Set1', 'qualitative', 8).mpl_colors
def plotErrorHistogram(error,nBins,modelName):
fig, axes = ppl.subplots(1, 1, figsize=(18,12))
axes.set_xlim(-0.3, 0.3)
axes.set_ylim(0, 25)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title(modelName)
sb.distplot(error, kde=False, fit=stats.norm, bins=nBins);
ppl.hist(error, bins=nBins, color=colors[1], grid='y')
plt.savefig(modelName+'.png')
In [44]:
#Error Correction
def computeErrorCorrectionFactor(pred,yActual):
err_corr = (np.mean((pred - yActual)))
return err_corr
def computeCorrectedMean(newPredy,yActual):
return np.mean((newPredy - yActual))
def computeCorrectedSD(newPredy,yActual):
return round(np.std((newPredy-yActual)/yActual),5)
def computeNewPrediction(pred,errCorrection):
newPredy = pred - errCorrection
#print("Mean is " + str(computeCorrectedMean(new_pred,actual)))
#print("Standard Deviation is " + str(computeCorrectedSD(new_pred,actual)))
return newPredy
def computeRelativeError(newPredy, yActual):
errRel = 100*(np.absolute(newPredy - yActual)/yActual)
return errRel
def computerRMSError(newPredy, yActual):
errRMSe = sqrt(mean_squared_error(yActual,newPredy))
return round(errRMSe,2)
def computeAbsoluteError(newPredy, yActual):
return np.absolute(newPredy - yActual)
def computeProbability(newPredy,yActual, newSD):
pos = 0
for i in range(0,len(newPredy)):
if(np.absolute((newPredy[i]-yActual[i])/yActual[i]) <= np.absolute(newSD)):
pos += 1
conf = round((1. * pos/len(newPredy) ),5)
return conf
def computeErrors(yPred, yActual,modelNo):
errCorr = computeErrorCorrectionFactor(yPred,yActual)
newPredy = computeNewPrediction(yPred,errCorr)
newMean = computeCorrectedMean(newPredy,yActual)
newSD = computeCorrectedSD(newPredy, yActual)
errRel = computeRelativeError(newPredy, yActual)
errRMS = computerRMSError(newPredy, yActual)
errAbs = computeAbsoluteError(newPredy, yActual)
prob = computeProbability(newPredy,yActual,newSD)
#Mean Computations
meanErrRel = np.mean(errRel)
meanErrRel = round(meanErrRel,2)
meanErrAbs = np.mean(errAbs)
meanErrAbs = round(meanErrAbs,2)
#Plot Relative Error Distributions
#Uncomment the below line during the report
plotErrorHistogram(((newPredy-yActual)/yActual),16,getModelName(modelNo))
dfErr = DataFrame(data=None, columns=['Model','Mean Relative Error', 'Mean Absolute Error', \
'RMS Error', 'Standard Deviation', 'Probability'])
dfErr.loc[0,'Model'] = getModelName(modelNo)
dfErr.loc[0,'Mean Relative Error'] = meanErrRel
dfErr.loc[0,'Mean Absolute Error'] = meanErrAbs
dfErr.loc[0,'RMS Error'] = errRMS
dfErr.loc[0,'Standard Deviation'] = newSD
dfErr.loc[0,'Probability'] = prob
return dfErr
In [45]:
def baselineModel(modelNo):
x = dfMaster['Date']
yActual = np.array(dfMaster[candidatesList[choice]])
yPred = performPrediction(yActual,modelNo,kMonths)
yPredNew = predictNextMonth(yActual, modelNo)
print("The predicted value for next month is "+str(yPredNew[0]))
return computeErrors(yPred, yActual,modelNo)
In [46]:
def computePrediction(yPredNew, modelNo, k):
if modelNo == "0.0":
yPredNew[0] = yPredNew[1]
elif modelNo == "0.1":
sumi = 0.
for i in range(1,k+1,1):
sumi = sumi + yPredNew[i]
yPredNew[0] = round(sumi/k,2)
elif modelNo == "0.2":
yPredNew[0] = 0.976*yPredNew[1] + 0.01373*yPredNew[2] + 0.1157
elif modelNo == "0.3":
wsum = 0.
for i in range(0,k+1,1):
wsum = wsum + (yPredNew[i+1] * (k-i))
denom = (k*(k+1))/2
yPredNew[0] = round(wsum/denom,2)
elif modelNo == "0.4":
csum = 0.
for i in range(0,k+1,1):
factor = (k-i) ** 3
csum = csum + (yPredNew[i+1] * factor)
denom = (k*(k+1))/2
denom = denom ** 2
yPredNew[0] = round(csum/denom,2)
return yPredNew
def predictNextMonth(yActual, modelNo):
yPredNew = np.copy(yActual)
yPredNew = np.insert(yPredNew,0,0)
#compute for current month
yPredNew = computePrediction(yPredNew, modelNo, kMonths)
yPredNew = np.insert(yPredNew,0,0)
#compute for next month
yPredNew = computePrediction(yPredNew, modelNo, kMonths)
return yPredNew
In [47]:
#a1,a2,a3,a4,a5,a6 = baselineModel('0.0')
baselineModel('0.0')
Out[47]:
In [48]:
baselineModel('0.1')
Out[48]:
In [49]:
baselineModel('0.2')
Out[49]:
In [50]:
baselineModel('0.3')
Out[50]:
In [51]:
baselineModel('0.4')
Out[51]: