In [1]:
from pandas import Series, DataFrame
from pandas import merge#Needed Libraries
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from __future__ import print_function
In [2]:
'''
#Input Variables
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
csiDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CSI_Daily_19941003-20140930.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
'''
# The string literal above is the disabled daily-data configuration; it is never
# evaluated as code. The active inputs are the monthly CSV files below, fetched
# over HTTP. Each source URL sits next to the frame that is read from it.

# Gold (every other frame is later merged onto this one)
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/GOLD_Montly_198410_201410.csv'
dfGold = pd.read_csv(goldDataPath)

# S&P 500 index
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/YAHOO_SP500_INDEX_Monthly_198410-201410.csv'
dfSP500 = pd.read_csv(sp500DataPath)

# NYSE index
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/YAHOO_NYSE_INDEX_Monthly_198410-201410.csv'
dfNyse = pd.read_csv(nyseDataPath)

# USD index
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/USD_Index_Monthly_198410_201410.csv'
dfUsInd = pd.read_csv(usdIndexDataPath)

# EUR/USD is not part of the monthly data set, so nothing is loaded for it.
#dfEurousd = pd.read_csv(eurousdDataPath)

# CSI
csiDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CSI_Monthly_198410-201410.csv'
dfCsi = pd.read_csv(csiDataPath)

# Crude oil (WTI)
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/Monthly_30yr/CRUDE_OIL_WTI_US_ENERGY_Monthly_198601-201410.csv'
dfOil = pd.read_csv(oilDataPath)

# Cell output: last rows of the oil frame as a quick sanity check of the load.
dfOil.tail()
Out[2]:
In [3]:
# Fraction of the merged rows used to fit the models; the rest is held out.
trainingRatio = 0.6

# Inner-join every factor onto the gold frame by 'Date', so only dates present
# in all sources remain. Append a new factor frame to this tuple to add it.
dfMaster = dfGold
for dfFactor in (dfSP500, dfNyse, dfUsInd, dfCsi, dfOil):
    dfMaster = merge(dfMaster, dfFactor, on='Date', how='inner')

# Plain int(): the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
trainSize = int(np.floor(len(dfMaster) * trainingRatio))

# The training set is the LAST trainSize rows, the test set is everything before it.
# NOTE(review): the ARMA cell reverses the gold series before use, so the CSVs are
# presumably ordered newest-first, which would make "train" the oldest rows - confirm.
# The previous test slice stopped at (len - trainSize) - 1, which silently dropped the
# boundary row from both sets and, when the hold-out was empty, turned into a stop of
# -1 (nearly the whole frame, overlapping the training rows).
splitAt = len(dfMaster) - trainSize
dfMasterTrain = dfMaster[splitAt:]
dfMasterTest = dfMaster[:splitAt]

# Column names of the candidate series; index 0 is the regression target.
candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'CSI_Value', 'Oil_Value']# add factor here
In [4]:
# Cell output: first rows of the merged frame, to eyeball the joined columns.
dfMaster.head()
Out[4]:
In [5]:
def _designMatrix(x):
    """Stack predictor columns into the 2-D matrix used for both fitting and prediction.

    x is a non-empty sequence of equal-length 1-D arrays. The returned matrix has
    the columns [x[n-1], ..., x[1], x[0], 1]: the predictors in reverse order
    followed by an explicit column of ones for the intercept. This is the layout
    the previous code produced by prepending one column at a time; keeping it in
    one helper guarantees mvRegress and mvPredict can never disagree on column order.

    The earlier sm.add_constant(...) wrappers are gone on purpose: with a ones
    column already present, add_constant (has_constant='skip') returns its input
    unchanged, so they only obscured the real layout.

    Raises ValueError when x is empty (previously an opaque IndexError).
    """
    if len(x) == 0:
        raise ValueError("at least one predictor column is required")
    columns = [np.asarray(column) for column in x]
    intercept = np.ones(len(columns[0]))
    return np.column_stack(columns[::-1] + [intercept])


def mvRegress(y, x):
    """Fit an ordinary-least-squares model of y on the predictor columns in x.

    y -- 1-D array of target values
    x -- sequence of 1-D predictor arrays, each as long as y
    Returns the fitted statsmodels OLS results object.
    """
    return sm.OLS(y, _designMatrix(x)).fit()


def mvPredict(x,res):
    """Predict with a fitted model.

    x   -- sequence of 1-D predictor arrays, in the same order as given to mvRegress
    res -- object exposing predict(X), e.g. the result returned by mvRegress
    Returns whatever res.predict returns for the assembled design matrix.
    """
    return res.predict(_designMatrix(x))
In [6]:
# p_vector = [Gold, SP500, NYSE, USD_Index, CSI, Oil] -- one lag count per entry of candidatesList
def _laggedColumns(df, p_vector, columns):
    """Build the shifted predictor columns described by p_vector.

    For every position i with p_vector[i] = k > 0, k arrays are produced from
    df[columns[i]]; the m-th one (m = 1..k) satisfies
        shifted[j] = original[min(j + m, last_row)]
    i.e. the column advanced by m rows, with the final value repeated for the
    rows that would run past the end. Arrays are returned in a flat list,
    grouped by candidate and ordered by increasing m.

    NOTE(review): if the frame is ordered newest-first, row j + m is the value m
    periods earlier (a lag) - presumably the intent; confirm the row order.
    """
    shifted = []
    for position, lagCount in enumerate(p_vector):
        if int(lagCount) <= 0:
            # Unused candidate: its column is never touched, as before.
            continue
        values = np.array(df[columns[position]])
        rowIndex = np.arange(len(values))
        lastRow = len(values) - 1
        for shift in range(1, int(lagCount) + 1):
            # Clamp the source index so the tail repeats the last value.
            shifted.append(values[np.minimum(rowIndex + shift, lastRow)])
    return shifted


def model_1_0(dfMasterTrain,dfMasterTest,p_vector,candidates=None):
    """Fit the shifted-predictor regression on the training frame and predict the test frame.

    dfMasterTrain, dfMasterTest -- frames containing the candidate columns
    p_vector   -- number of shifted copies to use for each candidate column
    candidates -- column names matching p_vector; defaults to the module-level
                  candidatesList. The first name is the regression target.

    Returns (result, yPred): the object returned by mvRegress and the output of
    mvPredict for the test frame.
    """
    columns = candidatesList if candidates is None else candidates
    yArrTrain = np.array(dfMasterTrain[columns[0]])

    # Train and test features are built by the same helper so they cannot drift apart.
    xArrTrain = _laggedColumns(dfMasterTrain, p_vector, columns)
    xArrTest = _laggedColumns(dfMasterTest, p_vector, columns)

    result = mvRegress(yArrTrain, xArrTrain)
    yPred = mvPredict(xArrTest, result)
    return result,yPred
In [7]:
def plot(err,c,nb,displayTitle):
    """Draw a histogram of err on a fresh single-axes figure (figsize 12 x 4).

    err          -- values to bin
    c            -- accepted so existing calls keep working; it is not used,
                    the bars are always drawn in '#009999'
    nb           -- forwarded to hist() as the bins argument
    displayTitle -- title shown above the axes
    """
    _, histAxes = plt.subplots(1, 1, figsize=(12,4))
    histAxes.hist(err, bins=nb, color='#009999')
    # Axis captions and title are independent of each other.
    histAxes.set_title(displayTitle)
    histAxes.set_xlabel('Error')
    histAxes.set_ylabel('Error Frequency')
In [8]:
def computeErrors(yPred, yArrTest):
    """Summarise the error of yPred against yArrTest after removing the mean offset.

    yPred    -- predicted values (array-like)
    yArrTest -- actual values (array-like, same length)

    Both inputs are converted to float arrays first, so lists work and pandas
    Series are compared by position rather than by index label.

    The mean of (yPred - yArrTest) is subtracted from the predictions before any
    metric is computed. NOTE(review): that offset is estimated from the very
    targets being scored, so the reported errors are optimistic.

    Returns a 4-tuple:
        mean relative error in percent  (|error| / actual * 100, averaged)
        mean absolute error
        root-mean-square error
        array of signed relative errors (error / actual), meant for plotting
    Division is by the raw actual values; zeros in yArrTest give inf/nan.
    """
    predicted = np.asarray(yPred, dtype=float)
    actual = np.asarray(yArrTest, dtype=float)

    # Remove the systematic offset, then work from one residual vector.
    predicted = predicted - np.mean(predicted - actual)
    errPred = predicted - actual

    errABS = np.absolute(errPred)
    errRel = 100 * (errABS / actual)
    errRMSE = np.sqrt(mean_squared_error(actual, predicted))
    errPlot = errPred / actual
    return np.mean(errRel),np.mean(errABS),errRMSE,errPlot
In [9]:
def frameString(modelVector, candidatesList):
    """Return the names of the candidates that are active in modelVector.

    A candidate is active when its entry in modelVector is greater than zero.
    The matching names from candidatesList are concatenated in order, with
    ', ' inserted before a name whenever the text built so far is non-empty.
    """
    label = ''
    for position, lagCount in enumerate(modelVector):
        if not lagCount > 0:
            continue
        separator = ', ' if len(label) > 0 else ''
        label = label + separator + candidatesList[position]
    return label
In [10]:
def PredictNextMonth(p_vector, modelRes,dfTest):
    """Build the shifted predictor columns for dfTest and run them through modelRes.

    p_vector -- number of shifted copies per entry of the module-level candidatesList
    modelRes -- fitted model, passed unchanged to mvPredict
    dfTest   -- frame holding the candidate columns

    For a candidate with count k > 0, k arrays are built; the m-th (m = 1..k) is
        shifted[j] = column[min(j + m, last_row)]
    so every row takes the value m rows further down and the tail repeats the
    last value. Element 0 of the result is therefore computed from rows 1..k.
    The element-wise Python loops used before are replaced by one indexed read
    per shifted column; the values produced are the same.

    Returns the output of mvPredict for those columns.
    """
    xArrTest = []
    for position, lagCount in enumerate(p_vector):
        if int(lagCount) <= 0:
            # Inactive candidate: its column is not read at all.
            continue
        values = np.array(dfTest[candidatesList[position]])
        rowIndex = np.arange(len(values))
        lastRow = len(values) - 1
        for shift in range(1, int(lagCount) + 1):
            xArrTest.append(values[np.minimum(rowIndex + shift, lastRow)])
    return mvPredict(xArrTest,modelRes)
In [11]:
def computePrediction(modelRes, dfMasterTrain, dfMasterTest, modelVector, yPred, candidatesList):
    """Forecast the next gold value from the top of the test frame and score the model.

    modelRes       -- fitted model handed to PredictNextMonth
    dfMasterTrain  -- not used; kept so existing calls keep working
    dfMasterTest   -- test frame; must contain 'Gold_Value' (and normally 'Date')
    modelVector    -- per-candidate shift counts handed to PredictNextMonth
    yPred          -- predictions for the rows of dfMasterTest
    candidatesList -- candidate column names

    Returns (predVal, meanRelErr, conf):
        predVal    -- forecast for the row placed above the test frame, rounded to 2 dp
        meanRelErr -- mean relative error of yPred in percent, rounded to 2 dp
        conf       -- percentage of test rows whose relative error is at or below
                      that mean (a hit rate, not a statistical confidence level)
    """
    # A plain list avoids np.insert on a fixed-width string array, which would
    # truncate 'Date' if every candidate name were shorter than four characters.
    cols = ['Date'] + list(candidatesList)

    errRel = 100. * np.absolute((yPred - dfMasterTest['Gold_Value'])) / dfMasterTest['Gold_Value']

    # Duplicate the first test row on top as a placeholder. PredictNextMonth fills
    # element 0 from the rows beneath it, i.e. from the real first test rows.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    dfPred = pd.concat([dfMasterTest[:1].reindex(columns=cols), dfMasterTest], ignore_index=True)
    yPredNew = PredictNextMonth(modelVector,modelRes,dfPred)
    dfPred.loc[0,'Gold_Value']=yPredNew[0]

    # Repeat once more on top of the forecast just made.
    # NOTE(review): dfPredNext is neither returned nor stored, and the two dates
    # are hard-coded, so this second forecast currently has no visible effect.
    dfPredNext = pd.concat([dfPred[:1].reindex(columns=cols), dfPred], ignore_index=True)
    yPredNew = PredictNextMonth(modelVector,modelRes,dfPredNext)
    dfPredNext.loc[0,'Gold_Value']=yPredNew[0]
    dfPredNext.loc[0,'Date'] = '12/30/2014'
    dfPredNext.loc[1,'Date'] = '11/30/2014'

    # Share of test rows that do no worse than the average relative error.
    a = len(errRel[errRel <= np.mean(errRel)])
    b = len(errRel)
    conf = round((100. * a/b ),2)
    predVal = round(dfPred.loc[0,'Gold_Value'],2)
    meanRelErr = round(np.mean(errRel),2)
    # The message used to end in str() and therefore printed an empty confidence.
    strResult = "\nThe predicted value is "+str(predVal) + " +/- " \
        + str(meanRelErr)+"% with " + str(conf) + "% confidence \n"
    #print(strResult)
    return predVal,meanRelErr,conf
In [12]:
# Each model is a vector with one entry per candidate column, in the order
# [Gold, SP500, NYSE, USD_Index, CSI, Oil]; a non-zero entry switches that
# candidate on. The active entries are raised from 1 up to maxDays below.
inputModels = [
    np.array([1,0,1,0,0,0]),
    # Other combinations that were tried:
    #np.array([1,0,1,1,1,1]),
    #np.array([1,0,0,1,0,1]),
    #np.array([1,0,1,1,0,0]),
    #np.array([1,0,1,0,1,0]),
    #np.array([1,0,1,1,1,0]),
    #np.array([1,0,1,1,0,1]),
    #np.array([1,0,1,0,1,1]),
    #np.array([1,0,0,1,1,1]),
]
# Largest number of shifted copies per active candidate (the data rows are monthly,
# the 'Days' naming is historical).
maxDays = 10
resultColumns = ['Model', 'Days', 'Predicted_Value', 'Relative_Error(%)', 'Confidence(%)',
                 'Mean_Absolute_Error', 'Mean_RMS_Error']
resultRows = []  # one dict per (model, order); turned into dfResult once at the end

figHeight = 6
figWidth = 12
fig0, axes0 = plt.subplots(1, figsize=(figWidth,figHeight))
axes0.set_title("Gold - Autoregressive Model - Relative Error Plot")
axes0.set_ylabel("Relative Errors")
axes0.set_xlabel("No of Months")
linestyles = ['solid' , 'dashed' , 'dashdot' , 'dotted','solid' , 'dashed' , 'dashdot' , 'dotted']
colors=['c','m','y','k','r','g','b']

# The test target does not depend on the model, so read it once.
yArrTest = np.array(dfMasterTest[candidatesList[0]])

for m in range(len(inputModels)):
    # Work on a copy: the vector is bumped in place below and the entries of
    # inputModels should keep their original values.
    modelVector = inputModels[m].copy()
    # Which entries are active never changes, so the label is fixed per model.
    modelName = frameString(modelVector, candidatesList)
    nMonths = []
    relErrModel = []
    for i in range(1,maxDays+1,1):
        if(i > 1):
            # Raise every entry that is at the previous order to the current one.
            modelVector[modelVector == (i-1)] = i
        modelRes,yPred = model_1_0(dfMasterTrain,dfMasterTest,modelVector)
        meanErrRel,meanErrABS,meanErrRMSE,errPlot = computeErrors(yPred, yArrTest)
        relErrModel.append(meanErrRel)
        nMonths.append(i)
        predVal,meanRelErr,conf = computePrediction(modelRes, dfMasterTrain, dfMasterTest, modelVector, yPred, candidatesList)
        resultRows.append({
            'Model': modelName,
            'Days': i,
            'Predicted_Value': predVal,
            'Relative_Error(%)': meanRelErr,
            'Confidence(%)': conf,
            'Mean_Absolute_Error': meanErrABS,
            'Mean_RMS_Error': meanErrRMSE,
        })
        #plot(errPlot,'g',10,"Gold -" + str(i) + " - " + modelName)
    # Index styles by m (wrapping around) - m-1 made the first model use the LAST
    # style/colour. Draw on axes0 explicitly rather than on pyplot's current axes.
    axes0.plot(nMonths, relErrModel, linewidth=3,
               linestyle=linestyles[m % len(linestyles)],
               color=colors[m % len(colors)],
               label=modelName)

# Place a legend to the right of this smaller figure.
axes0.legend(bbox_to_anchor=(1.05, 1), loc=1, borderaxespad=0.)

# Build the result table in one go instead of enlarging a frame cell by cell.
dfResult = DataFrame(resultRows, columns=resultColumns)
ctr = len(dfResult)  # kept for compatibility: number of result rows written
dfResult
Out[12]:
In [13]:
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
from __future__ import print_function
In [14]:
# Gold values as a plain array in reversed row order.
# NOTE(review): the reversal presumably turns a newest-first CSV into chronological
# order, which the time-series routines below expect - confirm the CSV ordering.
dfGold_value = dfGold['Gold_Value']
npGold_value = np.array(dfGold_value)[::-1]

# Split at the same row count as the regression models.
# NOTE(review): trainSize was derived from the merged frame, while this series comes
# from the unmerged gold file; the proportions only match if both have the same length.
# int() replaces np.int, which was removed in NumPy 1.24.
npGold_ARMA_Train = npGold_value[:int(trainSize)]
npGold_ARMA_Test = npGold_value[int(trainSize):]
npOil_ARMA_Test = npGold_ARMA_Test  # legacy (misleading) name, kept for compatibility

# Autocorrelation and partial autocorrelation up to lag 10. The former
# unbiased=False argument is omitted: False is the default, and the keyword was
# renamed to 'adjusted' in newer statsmodels releases.
acf_values = sm.tsa.stattools.acf(npGold_value, nlags=10)
pacf_values = sm.tsa.stattools.pacf(npGold_value, nlags=10)
In [15]:
# Correlograms of the gold series up to lag 10, stacked in one 12 x 8 figure:
# autocorrelation in the upper panel, partial autocorrelation in the lower one.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,8))
fig = sm.graphics.tsa.plot_acf(npGold_value, ax=ax1, lags=10)
fig = sm.graphics.tsa.plot_pacf(npGold_value, ax=ax2, lags=10)
In [16]:
# Fit an ARMA(2, 0) model, i.e. AR(2), to the gold series.
# sm.tsa.ARMA was deprecated in statsmodels 0.12 and removed in 0.13; ARIMA with
# order=(p, 0, q) is the documented replacement, so it is used when available and
# the legacy class only serves as a fallback on old installations. The two
# estimators are not numerically identical, so parameters and fitted values can
# differ slightly between statsmodels versions.
# NOTE(review): the model is fitted on the WHOLE series, so the slice scored in the
# following cell is in-sample; npGold_ARMA_Train is never used.
try:
    from statsmodels.tsa.arima.model import ARIMA as _ARIMA
except ImportError:
    arma_mod20 = sm.tsa.ARMA(npGold_value, (2,0), dates=None).fit()
else:
    arma_mod20 = _ARIMA(npGold_value, order=(2, 0, 0)).fit()
print(arma_mod20.params)
print(arma_mod20.fittedvalues)
In [17]:
# Score the fitted values on the rows after the training split.
# int() replaces np.int, which was removed in NumPy 1.24.
yPredARMA = arma_mod20.fittedvalues[int(trainSize):]
y = npGold_value[int(trainSize):]
print (yPredARMA)

# Remove the mean offset before measuring the error, as computeErrors does.
# NOTE(review): the offset is estimated from the values being scored.
avg = np.mean(yPredARMA - y)
yPredARMA = yPredARMA - avg
errPredARMA = yPredARMA - y

errARMA_Var = 100 * (np.absolute(errPredARMA) / y)  # relative error in percent (not a variance)
errARMA_RMSE = np.sqrt(mean_squared_error(yPredARMA,y))
errARMA_ABS = np.absolute(errPredARMA)

print ("Mean Absolute error",np.mean(errARMA_ABS))
print ("RMSE error",errARMA_RMSE)
# Label corrected: this figure is the mean relative error, it was printed as "Mean variance".
print ("Mean relative error (%)",np.mean(errARMA_Var))
plot(errPredARMA/y,'g',10,"ARMA error plot")
In [17]: