Linear Regression Model for Predicting Gold Price


In [1]:
#Needed Libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import time
from sklearn.metrics import mean_squared_error

In [2]:
#Input Variables
# Daily close series, 1994-10-03 .. 2014-09-30, hosted in the project repo.
goldDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/GOLD_DAILY_1994-10-03_2014-09-30.csv'
sp500DataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_SP500_INDEX_DAILY_1994-10-03_2014-09-30.csv'
nyseDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/YAHOO_NYSE_INDEX_DAILY_1994-10-03_2014-09-30.csv'
usdIndexDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/USD_Index_Daily_1994-10-03_2014-09-30.csv'
eurousdDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/EUROUSD_1994-10-03_2014-09-30.csv'
oilDataPath = 'https://raw.githubusercontent.com/Sree-vathsan/CSE591-Data-Science-Project/master/regressionModel/data/CRUDE_OIL_WTI_US_ENERGY_Daily_1994-10-03_2014-09-30.csv'
trainingRatio = 0.6  # train:test split ratio (60:40)
pRuns = 10   # shuffle runs per p-value estimate
pThreshold = 0.4  # |corr| closeness threshold used by the shuffle p-value test

In [3]:
#Source Data Loading
# Each CSV appears to contain a 'Date' column plus one value column named
# <ASSET>_Value (schema inferred from the column names used below — TODO confirm).

dfGold = pd.read_csv(goldDataPath)
dfSP500 = pd.read_csv(sp500DataPath)
dfNyse = pd.read_csv(nyseDataPath)
dfUsInd = pd.read_csv(usdIndexDataPath)
dfEurousd = pd.read_csv(eurousdDataPath)
dfOil = pd.read_csv(oilDataPath)

In [4]:
print (len(dfGold['Gold_Value']))

def shiftOneBehind(dfOriginal, colNameSrc, colNameDst):
    """Return a copy of dfOriginal where colNameDst holds colNameSrc moved one
    row up: row i takes row i+1's value, and the last row keeps its own value.

    The frames here are sorted newest-first, so this pairs each date with the
    previous trading day's value.

    Fixes vs. the original: the per-row chained-indexing assignment
    (dfOneBehind[col][i] = ...) triggers SettingWithCopyWarning and is not
    guaranteed to write through on modern pandas; this version is a single
    vectorized assignment and also preserves the column's dtype.
    """
    dfOneBehind = dfOriginal.copy()
    vals = dfOriginal[colNameSrc].to_numpy()
    # vals[1:] shifted up, last element repeated; handles len 0 and 1 too.
    dfOneBehind[colNameDst] = np.concatenate([vals[1:], vals[-1:]])
    return dfOneBehind


# Build the one-day-behind gold series and spot-check the tail.
# (The empty-DataFrame placeholder that used to precede this call was dead
# code — it was overwritten on the very next line — so it has been removed.)
dfGoldOneBehind = shiftOneBehind(dfGold,'Gold_Value','Gold_Value')
dfGoldOneBehind.tail()


5217
Out[4]:
Date Gold_Value
5212 10/7/1994 392.00
5213 10/6/1994 393.70
5214 10/5/1994 392.45
5215 10/4/1994 393.20
5216 10/3/1994 393.20

In [5]:
#Shift SP500, NYSE, USIND, EUROUSD, OIL one day behind
dfSP500OneBehind = shiftOneBehind(dfSP500,'SP500_Value','SP500_Value')
dfNyseOneBehind = shiftOneBehind(dfNyse,'NYSE_Value','NYSE_Value')
dfUsIndOneBehind = shiftOneBehind(dfUsInd,'USD_Value','USD_Value')
dfEurousdOneBehind = shiftOneBehind(dfEurousd, 'EURO/USD_Value', 'EURO/USD_Value')
dfOilOneBehind = shiftOneBehind(dfOil,'Oil_Value','Oil_Value')
#Verify each tail. print() with a single argument behaves identically on
# Python 2 and 3, unlike the bare `print x` statements used originally,
# which are a SyntaxError under Python 3.
print(dfSP500OneBehind.tail())
print(dfNyseOneBehind.tail())
print(dfUsIndOneBehind.tail())
print(dfEurousdOneBehind.tail())
print(dfOilOneBehind.tail())


           Date  SP500_Value
5030  10/7/1994       452.36
5031  10/6/1994       453.52
5032  10/5/1994       454.59
5033  10/4/1994       461.74
5034  10/3/1994       461.74
           Date  NYSE_Value
5030  10/7/1994     2642.69
5031  10/6/1994     2647.24
5032  10/5/1994     2658.87
5033  10/4/1994     2695.67
5034  10/3/1994     2695.67
           Date  USD_Value
5024  10/7/1994    85.8219
5025  10/6/1994    85.7657
5026  10/5/1994    85.9173
5027  10/4/1994    85.9581
5028  10/3/1994    85.9581
           Date  EURO/USD_Value
5104  10/7/1994          1.2424
5105  10/6/1994          1.2432
5106  10/5/1994          1.2382
5107  10/4/1994          1.2328
5108  10/3/1994          1.2328
           Date  Oil_Value
5018  10/7/1994      18.24
5019  10/6/1994      18.01
5020  10/5/1994      17.97
5021  10/4/1994      18.16
5022  10/3/1994      18.16

In [6]:
# Inner-join every series on Date so only days present in ALL sources remain
# (the row count drops from 5217 gold rows to 4921 merged rows per the outputs).
dfMaster = pd.merge(dfGoldOneBehind, dfSP500OneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfNyseOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfUsIndOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfEurousdOneBehind, on='Date', how='inner')
dfMaster = pd.merge(dfMaster, dfOilOneBehind, on='Date', how='inner')
dfMaster.head()
#print dfMaster.shape


Out[6]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 9/30/2014 1219.5 1977.80 10749.05 80.9136 1.27103 94.53
1 9/29/2014 1213.8 1982.85 10798.88 80.9983 1.27777 95.55
2 9/26/2014 1213.8 1965.99 10722.21 80.5957 1.28405 93.59
3 9/25/2014 1217.3 1998.30 10885.60 80.4465 1.28558 93.60
4 9/24/2014 1222.0 1982.77 10815.42 80.1793 1.28396 91.55

In [7]:
dfMaster.tail()


Out[7]:
Date Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
4916 10/7/1994 392.00 452.36 2642.69 85.8219 1.2424 18.24
4917 10/6/1994 393.70 453.52 2647.24 85.7657 1.2432 18.01
4918 10/5/1994 392.45 454.59 2658.87 85.9173 1.2382 17.97
4919 10/4/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16
4920 10/3/1994 393.20 461.74 2695.67 85.9581 1.2328 18.16

In [8]:
# print() form works on both Python 2 and 3; the bare statement does not.
print(dfMaster.shape)


(4921, 7)

In [9]:
#Corelation Heat Matrix

def computeDataTableCorr(datatable, columnNames):
    """Return the pairwise Pearson correlation matrix over the given columns.

    Bug fix: the original body read the module-level `candidatesList` instead
    of the `columnNames` parameter, silently ignoring the argument. Callers
    happened to pass candidatesList, so observed behavior is unchanged.
    """
    corrCandidates = datatable[columnNames]
    return corrCandidates.corr()

# Plotting correlation heat graph
def displayCorrHeatGraph(cTable, title):
    #_ = pd.scatter_matrix(corrTable, diagonal='kde', figsize=(10, 10))
    plt.imshow(cTable, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(cTable)), cTable.columns, rotation=90)
    plt.yticks(range(len(cTable)), cTable.columns)
    plt.title(title)
    
# Columns considered as regression candidates; index 0 (gold) is the target.
candidatesList = ['Gold_Value', 'SP500_Value', 'NYSE_Value', 'USD_Value', 'EURO/USD_Value', 'Oil_Value']
corrTable = computeDataTableCorr(dfMaster,candidatesList)
print(corrTable)

displayCorrHeatGraph(corrTable,
    'Correlation Heat Matrix (Gold_Value, SP500_Value, NYSE_Value, USD_Value, EURO/USD_Value, Oil_Value)')


                Gold_Value  SP500_Value  NYSE_Value  USD_Value  \
Gold_Value        1.000000     0.461485    0.578967  -0.792106   
SP500_Value       0.461485     1.000000    0.940998  -0.241948   
NYSE_Value        0.578967     0.940998    1.000000  -0.462967   
USD_Value        -0.792106    -0.241948   -0.462967   1.000000   
EURO/USD_Value    0.628074     0.105645    0.350845  -0.945491   
Oil_Value         0.860482     0.625074    0.789134  -0.805489   

                EURO/USD_Value  Oil_Value  
Gold_Value            0.628074   0.860482  
SP500_Value           0.105645   0.625074  
NYSE_Value            0.350845   0.789134  
USD_Value            -0.945491  -0.805489  
EURO/USD_Value        1.000000   0.692399  
Oil_Value             0.692399   1.000000  

In [10]:
# First row of the correlation matrix: gold's correlation with every factor.
dfCorrGold = corrTable[:1]
dfCorrGold


Out[10]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
Gold_Value 1 0.461485 0.578967 -0.792106 0.628074 0.860482

p-Values


In [11]:
#pValue Test
def shuffleAndCorr(df, orgCorr, runs=10, threshold=0.3, axis=0):     
    df_sh = df.copy(deep=True)
    success_count = 0
    for _ in range(runs):
        df_sh.apply(np.random.shuffle, axis=axis)
        newCor = df_sh['Col1'].corr(df_sh['Col2'])
        if (orgCorr < 0): orgCorr = orgCorr * -1
        if (newCor < 0): newCor = newCor * -1
        #print(newCor)
        diff = abs(newCor - orgCorr)
        #print(diff)
        if (diff < threshold): success_count = success_count + 1
    p = success_count / runs
    return p

# One p-value per candidate factor; gold-vs-gold is trivially 0.
dfpValue = pd.DataFrame(data=None, columns=candidatesList)
pValue = [0]

for i in range(1, len(candidatesList)):
    # np.float was removed in NumPy 1.24, and float(Series) is deprecated;
    # dfCorrGold has exactly one row, so take the scalar explicitly.
    orgCorr = float(dfCorrGold[candidatesList[i]].iloc[0])
    temp_df = pd.DataFrame({'Col1': dfMaster[candidatesList[0]],
                            'Col2': dfMaster[candidatesList[i]]})
    pValue.append(shuffleAndCorr(temp_df, orgCorr, pRuns, pThreshold))

dfpValue.loc[0] = pValue
dfpValue


Out[11]:
Gold_Value SP500_Value NYSE_Value USD_Value EURO/USD_Value Oil_Value
0 0 0 0 0 0 0

As the p-values are all zero, the observed correlations are unlikely to have arisen by chance.

Model 1.0: Multivariate Linear Regression


In [12]:
import statsmodels.api as sm

In [13]:
#Inputs, Train and Test Data preparation
trainSize = np.floor(len(dfMaster['Date']) * trainingRatio) #60:40 ratio
dfMasterTrain = dfMaster[len(dfMaster)-np.int(trainSize):len(dfMaster)]
dfMasterTest = dfMaster[0:(len(dfMaster)-np.int(trainSize))-1]

In [14]:
#[2]NYSE [3]USD [4]EURO/USD — indices into candidatesList.
# NOTE(review): the original comment said "[2]USD [3]EURO/USD [4]OIL", which
# does not match candidatesList (['Gold','SP500','NYSE','USD','EURO/USD','Oil']).
# The regressors actually used are NYSE, USD and EURO/USD.
xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[2]]), \
         np.array(dfMasterTrain[candidatesList[3]]), \
         np.array(dfMasterTrain[candidatesList[4]]), \
         ]
xArrTrain = np.array(xArrTrain)
xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[2]]), \
         np.array(dfMasterTest[candidatesList[3]]), \
         np.array(dfMasterTest[candidatesList[4]]), \
         ]
xArrTest = np.array(xArrTest)

# Target: gold value (candidatesList[0]).
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

In [15]:
def mvRegress(y, x):
    """Fit OLS of y on the regressors given as the rows of x.

    The design matrix is built by prepending each successive regressor, so the
    final column order is [x[-1], ..., x[1], x[0], ones] and the fitted
    params come out in that order (intercept last).
    NOTE(review): a ones column is stacked in explicitly AND sm.add_constant
    is called on every iteration; with modern statsmodels (has_constant='skip'
    default) the add_constant calls are no-ops because a constant column is
    already present, but this may differ across statsmodels versions — confirm
    before restructuring.
    """
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def mvPredict(x,res):
    """Predict from a fitted OLS result `res` for regressors given as rows of x.

    Rebuilds the design matrix with exactly the same column order used at fit
    time ([x[-1], ..., x[0], ones]); the orders must match or coefficients
    would be applied to the wrong columns.
    """
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    return res.predict(X)

In [16]:
res = mvRegress(yArrTrain, xArrTrain)

In [17]:
res.summary()


Out[17]:
OLS Regression Results
Dep. Variable: y R-squared: 0.709
Model: OLS Adj. R-squared: 0.708
Method: Least Squares F-statistic: 2390.
Date: Sun, 02 Nov 2014 Prob (F-statistic): 0.00
Time: 18:31:32 Log-Likelihood: -15527.
No. Observations: 2952 AIC: 3.106e+04
Df Residuals: 2948 BIC: 3.109e+04
Df Model: 3
coef std err t P>|t| [95.0% Conf. Int.]
x1 -19.4471 24.616 -0.790 0.430 -67.713 28.819
x2 -8.3081 0.383 -21.684 0.000 -9.059 -7.557
x3 0.0189 0.001 26.340 0.000 0.018 0.020
const 1048.1661 65.417 16.023 0.000 919.899 1176.434
Omnibus: 342.707 Durbin-Watson: 0.008
Prob(Omnibus): 0.000 Jarque-Bera (JB): 514.982
Skew: 0.847 Prob(JB): 1.49e-112
Kurtosis: 4.148 Cond. No. 4.85e+05

In [18]:
res.params


Out[18]:
array([ -1.94470574e+01,  -8.30814665e+00,   1.89479629e-02,
         1.04816608e+03])

In [19]:
#res.params
# estY = res.params[3] + (res.params[2] * xArrTest[0]) + (res.params[1] * xArrTest[1]) + (res.params[0] * xArrTest[2])
yPred0 = mvPredict(xArrTest,res)
#yPred0

In [20]:
yArrTest


Out[20]:
array([ 1219.5,  1213.8,  1213.8, ...,   594. ,   590. ,   595.1])

In [21]:
def futurePredictMLR(res,usdi,eurousd,oil):
    # Applies the Model 1.0 coefficients by hand: params[3] is the intercept;
    # params[2], params[1], params[0] multiply the three inputs in turn.
    # NOTE(review): the parameter names look mislabeled — the model was fit on
    # candidatesList[2..4] = NYSE, USD, EURO/USD (see the training cell), not
    # USD/EURO-USD/OIL. Confirm which series callers actually pass in.
    return (res.params[3] + (res.params[2] * usdi) + (res.params[1] * eurousd) + (res.params[0] * oil))

Future Gold Price (By Model 1.0 - Multivariate Linear Regression)

GP(i) = 1048.1661 + (0.0189 * nyse(i)) + (-8.3081 * usd(i)) + (-19.4471 * eurousd(i))  — note: the regressors used to fit Model 1.0 are candidatesList indices 2–4 (NYSE, USD, EURO/USD), not USD/EURO-USD/Oil as previously labelled.

Model 1.1: Simple Autoregressive Model based on the last three day prices only.


In [22]:
# Model 1.1 inputs: target is gold; start from three identical copies of the
# gold series — the next cell shifts them into lag-1, lag-2 and lag-3 rows.
yArrTrain = np.array(dfMasterTrain[candidatesList[0]])
yArrTest = np.array(dfMasterTest[candidatesList[0]])

xArrTrain = [ \
         #np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         np.array(dfMasterTrain[candidatesList[0]]), \
         ]
xArrTrain = np.array(xArrTrain)
#train is essentially the same as test!! 

xArrTest = [ \
         #np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         np.array(dfMasterTest[candidatesList[0]]), \
         ]
xArrTest = np.array(xArrTest)




#print (xArrTrain)            
#print (xArrTrain)            
#print(xArrTrain[0][])

In [23]:
print (xArrTest)
print (xArrTrain)
#print (xArrTrain[0][5])
# Turn row i into the gold series lagged by (i+1) rows. Rows are newest-first,
# so each position j reads j+i+1 positions ahead (an older day); positions
# that would run past the end are padded with the oldest value x[i][n-1].
# The in-place update is safe because each write at j only reads index
# j+i+1 > j, which has not been overwritten yet.
n=len(xArrTrain[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTrain[i][j]=xArrTrain[i][j+i+1]
        else:
            xArrTrain[i][j]=xArrTrain[i][n-1]
            
# Same lagging applied to the test copies.
n=len(xArrTest[0])
for i in range(0,3,1):
    for j in range(0,n,1):
        if(j<n-i-1):
            xArrTest[i][j]=xArrTest[i][j+i+1]
        else:
            xArrTest[i][j]=xArrTest[i][n-1]   

print (xArrTrain)
print (xArrTest)


[[ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]
 [ 1219.5  1213.8  1213.8 ...,   594.    590.    595.1]]
[[ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]
 [ 573.    574.1   571.4  ...,  392.45  393.2   393.2 ]]
[[ 574.1  571.4  575.3 ...,  393.2  393.2  393.2]
 [ 571.4  575.3  573.3 ...,  393.2  393.2  393.2]
 [ 575.3  573.3  573.6 ...,  393.2  393.2  393.2]]
[[ 1213.8  1213.8  1217.3 ...,   590.    595.1   595.1]
 [ 1213.8  1217.3  1222.  ...,   595.1   595.1   595.1]
 [ 1217.3  1222.   1213.5 ...,   595.1   595.1   595.1]]

In [24]:
# Fit the 3-lag autoregressive model and predict on the test lags.
result = mvRegress(yArrTrain, xArrTrain)
result.summary()  # NOTE(review): mid-cell expression — its output is discarded
yPred1 = mvPredict(xArrTest,result)
# NOTE(review): magic bias constant added to every prediction; its provenance
# is undocumented (possibly an empirical mean-error correction) — confirm.
tweak = np.empty(len(yPred1))
tweak.fill(0.036515358062)
yPred1 = np.add(yPred1,tweak)
print (result.params[3])
print (result.params[2])
print (result.params[1])
print (result.params[0])


0.00705306597369
1.01014590019
-0.0574113039927
0.0474216676995

Time Series Analysis - Auto Regression Models

Model 1.2: Univariate Autogressive Processes (AR)

Model 1.3: Autogressive Moving-Average Processes (ARMA) and Kalman Filter

Model 1.4: Vector Autogressive Processes (VAR)

Error Metrics


In [25]:
from sklearn.metrics import mean_squared_error

In [26]:
# Error metrics for Model 1.0 (yPred0) and Model 1.1 (yPred1):
# percentage error, RMSE and absolute error against the held-out gold prices.
errVar_y0 = 100 * (np.absolute(yPred0 - yArrTest) / yArrTest)
errRMS_y0 = np.sqrt(mean_squared_error(yArrTest,yPred0))
errABS_y0= np.absolute(yPred0-yArrTest)

errVar_y1 = 100 * (np.absolute(yPred1 - yArrTest) / yArrTest)
errRMS_y1 = np.sqrt(mean_squared_error(yArrTest,yPred1))
errABS_y1= np.absolute(yPred1-yArrTest)

# Signed error distribution for Model 1.1. The original used a Python 2
# `print x` statement, which is a SyntaxError under Python 3.
err_y1 = yPred1 - yArrTest
print(np.mean(err_y1))

fig, axes = plt.subplots(1, 1, figsize=(12,4))
axes.hist(err_y1, color='g', bins=120)
axes.set_ylabel('Error Frequency')
axes.set_xlabel('Error')
axes.set_title("Error Variations Model-1.1")


-0.0988469883172
Out[26]:
<matplotlib.text.Text at 0x7f429a638a50>

In [27]:
# Summary table comparing both models, built in a single constructor instead
# of assigning length-2 tuples column-by-column to an empty frame (fragile
# across pandas versions).
dfErr = pd.DataFrame({
    'Model': ['Model 1.0', 'Model 1.1 (Simple Autoregressive)'],
    'Minimum % Error': [min(errVar_y0), min(errVar_y1)],
    'Maximum % Error': [max(errVar_y0), max(errVar_y1)],
    'RMSE Error': [errRMS_y0, errRMS_y1],
    'Mean Absolute Error': [np.mean(errABS_y0), np.mean(errABS_y1)],
    'Mean Percentage Error': [np.mean(errVar_y0), np.mean(errVar_y1)],
}, columns=['Model', 'Minimum % Error', 'Maximum % Error', 'RMSE Error',
            'Mean Absolute Error', 'Mean Percentage Error'])

In [28]:
dfErr


Out[28]:
Model Minimum % Error Maximum % Error RMSE Error Mean Absolute Error Mean Percentage Error
0 Model 1.0 12.658695 69.811997 723.910077 642.599783 49.591577
1 Model 1.1 (Simple Autoregressive) 0.001525 10.104877 15.519673 10.568039 0.905870

In [26]: